In [1]:
#!pip install -r requirements.txt

In [1]:
import pandas as pd
from pycaret.regression import setup, get_config


In [9]:
# Step 1: Load the Excel data
df = pd.read_excel("data/ISBSG2016R1.1-Formatted4CSVAgileOnly.xlsx")  # Replace with your file name

# Preview data
print(df.shape)
print(df.head())


(84, 51)
   ISBSG Project ID External (EEF) - Data Quality Rating  \
0             10279                                    B   
1             10317                                    B   
2             10572                                    B   
3             11278                                    A   
4             11497                                    B   

   Project (PRF) - Year of Project External (EEF) - Industry Sector  \
0                             2013                          Banking   
1                             2015                       Government   
2                             2014                       Government   
3                             2010                 Service Industry   
4                             2012                          Banking   

                  External (EEF) - Organisation Type  \
0  Government;Education Institution;Wholesale & R...   
1                                        Government;   
2                                  

In [10]:
# Analyse missing values
missing_pct = df.isnull().mean() * 100
print(missing_pct.sort_values(ascending=False))


People (PRF) - IT experience >3 yr                100.000000
People (PRF) - IT experience 1 to 3 yr            100.000000
People (PRF) - IT experience <1 yr                100.000000
People (PRF) - Project user involvement           100.000000
Tech (TF) - Type of Server                        100.000000
Process (PMF) - Prototyping Used                   94.047619
People (PRF) - BA team experience <1 yr            82.142857
 - CASE Tool Used                                  80.952381
People (PRF) - IT experience >9 yr                 79.761905
People (PRF) - BA team experience >3 yr            79.761905
People (PRF) - BA team experience 1 to 3 yr        78.571429
Project (PRF) - Currency multiple                  78.571429
People (PRF) - Project manage experience           77.380952
People (PRF) - IT experience 3 to 9 yr             75.000000
People (PRF) - IT experience <3 yr                 72.619048
Tech (TF) - Client Roles                           69.047619
Tech (TF) - Server Roles

In [11]:
high_missing_cols = missing_pct[missing_pct > 70].index.tolist()
print(f"Columns with >70% missing values: {high_missing_cols}")


Columns with >70% missing values: [' - CASE Tool Used', 'Process (PMF) - Prototyping Used', 'Tech (TF) - Type of Server', 'People (PRF) - Project user involvement', 'People (PRF) - BA team experience <1 yr', 'People (PRF) - BA team experience 1 to 3 yr', 'People (PRF) - BA team experience >3 yr', 'People (PRF) - IT experience <1 yr', 'People (PRF) - IT experience 1 to 3 yr', 'People (PRF) - IT experience >3 yr', 'People (PRF) - IT experience <3 yr', 'People (PRF) - IT experience 3 to 9 yr', 'People (PRF) - IT experience >9 yr', 'People (PRF) - Project manage experience', 'Project (PRF) - Currency multiple']


In [12]:
# List of columns with >70% missing values
high_missing_cols = [
    ' - CASE Tool Used',
    'Process (PMF) - Prototyping Used',
    'Tech (TF) - Type of Server',
    'People (PRF) - Project user involvement',
    'People (PRF) - BA team experience <1 yr',
    'People (PRF) - BA team experience 1 to 3 yr',
    'People (PRF) - BA team experience >3 yr',
    'People (PRF) - IT experience <1 yr',
    'People (PRF) - IT experience 1 to 3 yr',
    'People (PRF) - IT experience >3 yr',
    'People (PRF) - IT experience <3 yr',
    'People (PRF) - IT experience 3 to 9 yr',
    'People (PRF) - IT experience >9 yr',
    'People (PRF) - Project manage experience',
    'Project (PRF) - Currency multiple'
]

# Drop these columns from the DataFrame
df_clean = df.drop(columns=high_missing_cols)

print(f"Data shape after dropping high-missing columns: {df_clean.shape}")


Data shape after dropping high-missing columns: (84, 36)


In [24]:
# Select categorical columns only
cat_cols = df_clean.select_dtypes(include=['object', 'category']).columns

# Fill missing values in categorical columns with "Missing"
for col in cat_cols:
    df[col].fillna('Missing', inplace=True)


ISBSG Project ID                                   0
External (EEF) - Data Quality Rating               0
Project (PRF) - Year of Project                    0
External (EEF) - Industry Sector                   1
External (EEF) - Organisation Type                 0
Project (PRF) - Application Group                  5
Project (PRF) - Application Type                   0
Project (PRF) - Development Type                   0
Tech (TF) - Development Platform                  15
Tech (TF) - Language Type                          0
Tech (TF) - Primary Programming Language           0
Project (PRF) - Functional Size                    1
Project (PRF) - Relative Size                      1
Project (PRF) - Normalised Work Effort Level 1     0
Project (PRF) - Normalised Work Effort             0
Project (PRF) - Normalised Level 1 PDR (ufp)       1
Project (PRF) - Normalised PDR (ufp)               1
Project (PRF) - Defect Density                    53
Project (PRF) - Speed of Delivery             

In [25]:
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Data_Type': df.dtypes.values,
    'Missing_Count': df.isnull().sum().values
})

print(missing_summary)


                                            Column Data_Type  Missing_Count
0                                 ISBSG Project ID     int64              0
1             External (EEF) - Data Quality Rating    object              0
2                  Project (PRF) - Year of Project     int64              0
3                 External (EEF) - Industry Sector    object              0
4               External (EEF) - Organisation Type    object              0
5                Project (PRF) - Application Group    object              0
6                 Project (PRF) - Application Type    object              0
7                 Project (PRF) - Development Type    object              0
8                 Tech (TF) - Development Platform    object              0
9                        Tech (TF) - Language Type    object              0
10        Tech (TF) - Primary Programming Language    object              0
11                 Project (PRF) - Functional Size   float64              1
12          

In [26]:
# Save the entire cleaned DataFrame (not just the column names) to CSV
df_clean.to_csv('data/cleaned_data.csv', index=False)

In [8]:
def get_data_summary(df, n_unique_samples=5):
    # Summary dataframe with basic info
    summary = pd.DataFrame({
        'Feature': df.columns,
        'data_type': df.dtypes.values,
        'Null_number': df.isnull().sum().values,
        'Null_pct': (df.isnull().mean() * 100).values,
        'Unique_counts': df.nunique().values,
        'unique_value': [list(df[col].dropna().unique()[:n_unique_samples]) for col in df.columns]
    })
    
    return summary

# Usage
summary_df = get_data_summary(df)
print(summary_df)

# Also print descriptive stats
desc_stats = df.describe().T
print(desc_stats)

                                           Feature data_type  Null_number  \
0                                 ISBSG Project ID     int64            0   
1             External (EEF) - Data Quality Rating    object            0   
2                  Project (PRF) - Year of Project     int64            0   
3                 External (EEF) - Industry Sector    object            1   
4               External (EEF) - Organisation Type    object            0   
5                Project (PRF) - Application Group    object            5   
6                 Project (PRF) - Application Type    object            0   
7                 Project (PRF) - Development Type    object            0   
8                 Tech (TF) - Development Platform    object           15   
9                        Tech (TF) - Language Type    object            0   
10        Tech (TF) - Primary Programming Language    object            0   
11                 Project (PRF) - Functional Size   float64            1   

In [4]:
# Step 2: Initialise PyCaret setup for preprocessing


# the DataFrame is 'df' and target column is numeric
s = setup(
    data=df,
    target='Project (PRF) - Normalised Work Effort',
    preprocess=True,
    imputation_type='simple',  # mean/median imputation for numeric missing values
    numeric_imputation='mean',
    categorical_imputation='mode',
    profile=True,              # generates an automated profiling report
    session_id=123,
    verbose=True
)

# After setup, get the preprocessed dataset
processed_data = get_config('dataset')
print(processed_data.head())


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Project (PRF) - Normalised Work Effort
2,Target type,Regression
3,Original data shape,"(84, 51)"
4,Transformed data shape,"(84, 139)"
5,Transformed train set shape,"(58, 139)"
6,Transformed test set shape,"(26, 139)"
7,Numeric features,28
8,Categorical features,22
9,Rows with missing values,100.0%


Loading profile... Please Wait!


    ISBSG Project ID External (EEF) - Data Quality Rating  \
63             26861                                    B   
44             21796                                    B   
64             27063                                    B   
20             14621                                    B   
14             14319                                    B   

    Project (PRF) - Year of Project External (EEF) - Industry Sector  \
63                             2010          Electronics & Computers   
44                             2015                 Service Industry   
64                             2012                          Banking   
20                             2008                 Service Industry   
14                             2010                          Banking   

                   External (EEF) - Organisation Type  \
63                                         High Tech;   
44                          Art , Events , Ticketing;   
64  Government;Education Inst

In [5]:
# Save to CSV
processed_data.to_csv("data/preprocessed_data.csv", index=False)

In [6]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df, title="Dataset Profiling Report")
profile.to_file("data_profile.html")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|          | 0/51 [00:00<?, ?it/s]
[A%|▊         | 4/51 [00:00<00:02, 18.38it/s]
100%|██████████| 51/51 [00:00<00:00, 70.27it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
print(df['Project (PRF) - Normalised Work Effort'].nunique())
print(df['Project (PRF) - Normalised Work Effort'].value_counts())


4214
Project (PRF) - Normalised Work Effort
995     9
304     9
620     9
473     9
62      9
       ..
1030    1
1243    1
4213    1
2225    1
80      1
Name: count, Length: 4214, dtype: int64


In [23]:
import seaborn as sns
import matplotlib.pyplot as plt

df = get_config("dataset")
corr = df.corr()

# Plot heatmap of correlations
plt.figure(figsize=(10, 8))
sns.heatmap(corr[[get_config('target')]].sort_values(by=get_config('target'), ascending=False),
            annot=True, cmap='coolwarm')
plt.title('Correlation with Target')
plt.show()


ValueError: could not convert string to float: 'B'

In [12]:
# Step 3 : Get the preprocessed dataset
# preprocessed_df = pull()  # Pulls the setup data summary
processed_data = get_config('X_train')  # Or get_config('dataset') for full

# Step 6 (Optional): Save the processed data
processed_data.to_csv("data/preprocessed_data.csv", index=False)

TypeError: 'NoneType' object is not subscriptable