In [1]:
# # ISBSG Data Analysis and Regression Modeling
# 
# This notebook performs data cleaning, preprocessing, and regression modeling on the ISBSG dataset.

# ## Setup and Environment Configuration

# Install required packages (uncomment if needed)
# !pip install -r requirements.txt

In [2]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [3]:
# Configure timestamp callback for Jupyter cells
from IPython import get_ipython

def setup_timestamp_callback():
    """Setup a timestamp callback for Jupyter cells without clearing existing callbacks."""
    ip = get_ipython()
    if ip is not None:
        # Define timestamp function
        def print_timestamp(*args, **kwargs):
            """Print timestamp after cell execution."""
            print(f"Cell executed at: {datetime.now()}")
        
        # Check if our callback is already registered
        callbacks = ip.events.callbacks.get('post_run_cell', [])
        for cb in callbacks:
            if hasattr(cb, '__name__') and cb.__name__ == 'print_timestamp':
                # Already registered
                return
                
        # Register new callback if not already present
        ip.events.register('post_run_cell', print_timestamp)
        print("Timestamp printing activated.")
    else:
        print("Not running in IPython/Jupyter environment.")

In [4]:
# Setup timestamp callback
setup_timestamp_callback()

# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

Timestamp printing activated.
Cell executed at: 2025-05-15 17:53:03.321637


In [5]:
# ## Data Loading and Initial Exploration

Cell executed at: 2025-05-15 17:53:03.324304


In [11]:
# Load the data
print("Loading data...")
df = pd.read_excel("data/ISBSG2016R1.1-Formatted4CSVAgileOnly.xlsx")


Loading data...
Cell executed at: 2025-05-15 18:04:51.091933


In [12]:
import re

def clean_column_names(columns):
    cleaned_cols = []
    for col in columns:
        col_clean = re.sub(r'[^\w\s]', '', col)  # remove special chars
        col_clean = col_clean.replace(' ', '_')  # replace spaces with _
        cleaned_cols.append(col_clean)
    return cleaned_cols

df.columns = clean_column_names(df.columns)


Cell executed at: 2025-05-15 18:04:56.974719


In [13]:
# Display basic information
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())


Dataset shape: (84, 51)

First 5 rows:
   ISBSG_Project_ID External_EEF__Data_Quality_Rating  \
0             10279                                 B   
1             10317                                 B   
2             10572                                 B   
3             11278                                 A   
4             11497                                 B   

   Project_PRF__Year_of_Project External_EEF__Industry_Sector  \
0                          2013                       Banking   
1                          2015                    Government   
2                          2014                    Government   
3                          2010              Service Industry   
4                          2012                       Banking   

                     External_EEF__Organisation_Type  \
0  Government;Education Institution;Wholesale & R...   
1                                        Government;   
2                                        Government;   
3  

In [10]:
print(df.columns)
print(type(df.columns))


Index(['ISBSG_Project_ID', 'External_EEF__Data_Quality_Rating',
       'Project_PRF__Year_of_Project', 'External_EEF__Industry_Sector',
       'External_EEF__Organisation_Type', 'Project_PRF__Application_Group',
       'Project_PRF__Application_Type', 'Project_PRF__Development_Type',
       'Tech_TF__Development_Platform', 'Tech_TF__Language_Type',
       'Tech_TF__Primary_Programming_Language', 'Project_PRF__Functional_Size',
       'Project_PRF__Relative_Size',
       'Project_PRF__Normalised_Work_Effort_Level_1',
       'Project_PRF__Normalised_Work_Effort',
       'Project_PRF__Normalised_Level_1_PDR_ufp',
       'Project_PRF__Normalised_PDR_ufp', 'Project_PRF__Defect_Density',
       'Project_PRF__Speed_of_Delivery', 'Project_PRF__Manpower_Delivery_Rate',
       'Project_PRF__Project_Elapsed_Time', 'Project_PRF__Team_Size_Group',
       'Project_PRF__Max_Team_Size', '__CASE_Tool_Used',
       'Process_PMF__Development_Methodologies',
       'Process_PMF__Prototyping_Used', 'Proces

In [14]:
# Create a function to get comprehensive data summary
def get_data_summary(df, n_unique_samples=5):
    """
    Generate a comprehensive summary of the dataframe.
    
    Args:
        df: Pandas DataFrame
        n_unique_samples: Number of unique values to show as sample
        
    Returns:
        DataFrame with summary information
    """
    # Summary dataframe with basic info
    summary = pd.DataFrame({
        'Feature': df.columns,
        'data_type': df.dtypes.values,
        'Null_number': df.isnull().sum().values,
        'Null_pct': (df.isnull().mean() * 100).values,
        'Unique_counts': df.nunique().values,
        'unique_samples': [list(df[col].dropna().unique()[:n_unique_samples]) for col in df.columns]
    })
    
    return summary

# Generate and display data summary
summary_df = get_data_summary(df)
print("\nData Summary:")
print(summary_df)



Data Summary:
                                           Feature data_type  Null_number  \
0                                 ISBSG_Project_ID     int64            0   
1                External_EEF__Data_Quality_Rating    object            0   
2                     Project_PRF__Year_of_Project     int64            0   
3                    External_EEF__Industry_Sector    object            1   
4                  External_EEF__Organisation_Type    object            0   
5                   Project_PRF__Application_Group    object            5   
6                    Project_PRF__Application_Type    object            0   
7                    Project_PRF__Development_Type    object            0   
8                    Tech_TF__Development_Platform    object           15   
9                           Tech_TF__Language_Type    object            0   
10           Tech_TF__Primary_Programming_Language    object            0   
11                    Project_PRF__Functional_Size   float64 

In [15]:
# Display descriptive statistics for numeric columns
desc_stats = df.describe().T
print("\nDescriptive Statistics:")
print(desc_stats)


Descriptive Statistics:
                                                count           mean  \
ISBSG_Project_ID                                 84.0   20951.142857   
Project_PRF__Year_of_Project                     84.0    2011.369048   
Project_PRF__Functional_Size                     83.0     240.349398   
Project_PRF__Normalised_Work_Effort_Level_1      84.0    2935.773810   
Project_PRF__Normalised_Work_Effort              84.0    3219.678571   
Project_PRF__Normalised_Level_1_PDR_ufp          83.0      12.912048   
Project_PRF__Normalised_PDR_ufp                  83.0      12.944578   
Project_PRF__Defect_Density                      31.0      19.229032   
Project_PRF__Speed_of_Delivery                   81.0      83.383951   
Project_PRF__Manpower_Delivery_Rate              56.0      46.051786   
Project_PRF__Project_Elapsed_Time                82.0       6.420732   
Project_PRF__Max_Team_Size                       59.0       6.355932   
Process_PMF__Docs                      

In [8]:
# ## Data Cleaning and Preprocessing

Cell executed at: 2025-05-15 17:26:19.416517


In [16]:
# Analyze missing values
print("\nAnalyzing missing values...")
missing_pct = df.isnull().mean() * 100
missing_sorted = missing_pct.sort_values(ascending=False)
print(missing_sorted)


Analyzing missing values...
People_PRF__IT_experience_great_than_3_yr         100.000000
People_PRF__IT_experience_1_to_3_yr               100.000000
People_PRF__IT_experience_less_than_1_yr          100.000000
People_PRF__Project_user_involvement              100.000000
Tech_TF__Type_of_Server                           100.000000
Process_PMF__Prototyping_Used                      94.047619
People_PRF__BA_team_experience_less_than_1_yr      82.142857
__CASE_Tool_Used                                   80.952381
People_PRF__IT_experience_great_than_9_yr          79.761905
People_PRF__BA_team_experience_great_than_3_yr     79.761905
People_PRF__BA_team_experience_1_to_3_yr           78.571429
Project_PRF__Currency_multiple                     78.571429
People_PRF__Project_manage_experience              77.380952
People_PRF__IT_experience_3_to_9_yr                75.000000
People_PRF__IT_experience_less_than_3_yr           72.619048
Tech_TF__Client_Roles                              69.04

In [17]:
# Identify columns with high missing values (>70%)
high_missing_cols = missing_pct[missing_pct > 70].index.tolist()
print(f"\nColumns with >70% missing values ({len(high_missing_cols)} columns):")
for col in high_missing_cols:
    print(f"  - {col}: {missing_pct[col]:.2f}% missing")



Columns with >70% missing values (15 columns):
  - __CASE_Tool_Used: 80.95% missing
  - Process_PMF__Prototyping_Used: 94.05% missing
  - Tech_TF__Type_of_Server: 100.00% missing
  - People_PRF__Project_user_involvement: 100.00% missing
  - People_PRF__BA_team_experience_less_than_1_yr: 82.14% missing
  - People_PRF__BA_team_experience_1_to_3_yr: 78.57% missing
  - People_PRF__BA_team_experience_great_than_3_yr: 79.76% missing
  - People_PRF__IT_experience_less_than_1_yr: 100.00% missing
  - People_PRF__IT_experience_1_to_3_yr: 100.00% missing
  - People_PRF__IT_experience_great_than_3_yr: 100.00% missing
  - People_PRF__IT_experience_less_than_3_yr: 72.62% missing
  - People_PRF__IT_experience_3_to_9_yr: 75.00% missing
  - People_PRF__IT_experience_great_than_9_yr: 79.76% missing
  - People_PRF__Project_manage_experience: 77.38% missing
  - Project_PRF__Currency_multiple: 78.57% missing
Cell executed at: 2025-05-15 18:06:13.348556


In [18]:
# Create a clean dataframe by dropping high-missing columns
df_clean = df.drop(columns=high_missing_cols)
print(f"\nData shape after dropping high-missing columns: {df_clean.shape}")


Data shape after dropping high-missing columns: (84, 36)
Cell executed at: 2025-05-15 18:06:19.645175


In [19]:
# Handle remaining missing values
print("\nHandling remaining missing values...")


Handling remaining missing values...
Cell executed at: 2025-05-15 18:06:25.320719


In [20]:
# Fill missing values in categorical columns with "Missing"
cat_cols = df_clean.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    df_clean[col].fillna('Missing', inplace=True)

Cell executed at: 2025-05-15 18:06:31.755005


In [21]:
# Check remaining missing values
remaining_missing = df_clean.isnull().sum()
print("\nRemaining missing values after handling categorical columns:")
print(remaining_missing[remaining_missing > 0])


Remaining missing values after handling categorical columns:
Project_PRF__Functional_Size                1
Project_PRF__Normalised_Level_1_PDR_ufp     1
Project_PRF__Normalised_PDR_ufp             1
Project_PRF__Defect_Density                53
Project_PRF__Speed_of_Delivery              3
Project_PRF__Manpower_Delivery_Rate        28
Project_PRF__Project_Elapsed_Time           2
Project_PRF__Max_Team_Size                 25
People_PRF__Project_manage_changes         52
People_PRF__Personnel_changes              52
Project_PRF__Total_project_cost            55
dtype: int64
Cell executed at: 2025-05-15 18:06:37.940155


In [24]:
print(df_clean.columns.tolist())


['ISBSG_Project_ID', 'External_EEF__Data_Quality_Rating', 'Project_PRF__Year_of_Project', 'External_EEF__Industry_Sector', 'External_EEF__Organisation_Type', 'Project_PRF__Application_Group', 'Project_PRF__Application_Type', 'Project_PRF__Development_Type', 'Tech_TF__Development_Platform', 'Tech_TF__Language_Type', 'Tech_TF__Primary_Programming_Language', 'Project_PRF__Functional_Size', 'Project_PRF__Relative_Size', 'Project_PRF__Normalised_Work_Effort_Level_1', 'Project_PRF__Normalised_Work_Effort', 'Project_PRF__Normalised_Level_1_PDR_ufp', 'Project_PRF__Normalised_PDR_ufp', 'Project_PRF__Defect_Density', 'Project_PRF__Speed_of_Delivery', 'Project_PRF__Manpower_Delivery_Rate', 'Project_PRF__Project_Elapsed_Time', 'Project_PRF__Team_Size_Group', 'Project_PRF__Max_Team_Size', 'Process_PMF__Development_Methodologies', 'Process_PMF__Docs', 'Tech_TF__Architecture', 'Tech_TF__Client_Server', 'Tech_TF__Client_Roles', 'Tech_TF__Server_Roles', 'Tech_TF__Web_Development', 'Tech_TF__DBMS_Used',

In [23]:
# Verify target variable
target_col = 'Project_PRF_Normalised_Work_Effort'
print(f"\nTarget variable '{target_col}' summary:")
print(f"Unique values: {df_clean[target_col].nunique()}")
print(f"Missing values: {df_clean[target_col].isnull().sum()}")
print(f"Top value counts:")
print(df_clean[target_col].value_counts().head())



Target variable 'Project_PRF_Normalised_Work_Effort' summary:


KeyError: 'Project_PRF_Normalised_Work_Effort'

Cell executed at: 2025-05-15 18:08:02.298331


In [16]:
# Check for infinite values
inf_check = np.isinf(df_clean.select_dtypes(include=[np.number])).sum().sum()
print(f"\nNumber of infinite values: {inf_check}")


Number of infinite values: 0
Cell executed at: 2025-05-15 17:29:23.458692


In [17]:
# Save cleaned data
df_clean.to_csv('data/cleaned_data.csv', index=False)
print("\nCleaned data saved to 'data/cleaned_data.csv'")



Cleaned data saved to 'data/cleaned_data.csv'
Cell executed at: 2025-05-15 17:29:34.167274


In [18]:
# ## Feature Engineering and Encoding

Cell executed at: 2025-05-15 17:29:42.763836


In [19]:
# Identify categorical columns and check cardinality
print("\nCategorical columns and their cardinality:")
cat_cols = df_clean.select_dtypes(include=['object', 'category']).columns.tolist()
for col in cat_cols:
    print(f"  {col}: {df_clean[col].nunique()} unique values")


Categorical columns and their cardinality:
  External (EEF) - Data Quality Rating: 4 unique values
  External (EEF) - Industry Sector: 11 unique values
  External (EEF) - Organisation Type: 25 unique values
  Project (PRF) - Application Group: 5 unique values
  Project (PRF) - Application Type: 32 unique values
  Project (PRF) - Development Type: 3 unique values
  Tech (TF) - Development Platform: 5 unique values
  Tech (TF) - Language Type: 3 unique values
  Tech (TF) - Primary Programming Language: 9 unique values
  Project (PRF) - Relative Size: 8 unique values
  Project (PRF) - Team Size Group: 9 unique values
  Process (PMF) - Development Methodologies: 10 unique values
  Tech (TF) - Architecture: 5 unique values
  Tech (TF) - Client Server?: 3 unique values
  Tech (TF) - Client Roles: 11 unique values
  Tech (TF) - Server Roles: 13 unique values
  Tech (TF) - Web Development: 2 unique values
  Tech (TF) - DBMS Used: 2 unique values
  Project (PRF) - Cost currency: 4 unique value

In [20]:
# One-hot encode categorical columns with low cardinality (<10 unique values)
low_card_cols = [col for col in cat_cols if df_clean[col].nunique() < 10]
print(f"\nApplying one-hot encoding to {len(low_card_cols)} low-cardinality columns:")
for col in low_card_cols[:10]:  # Show first 10
    print(f"  - {col}")
if len(low_card_cols) > 10:
    print(f"  - ... and {len(low_card_cols) - 10} more columns")



Applying one-hot encoding to 13 low-cardinality columns:
  - External (EEF) - Data Quality Rating
  - Project (PRF) - Application Group
  - Project (PRF) - Development Type
  - Tech (TF) - Development Platform
  - Tech (TF) - Language Type
  - Tech (TF) - Primary Programming Language
  - Project (PRF) - Relative Size
  - Project (PRF) - Team Size Group
  - Tech (TF) - Architecture
  - Tech (TF) - Client Server?
  - ... and 3 more columns
Cell executed at: 2025-05-15 17:30:27.860918


In [21]:
# Create encoded dataframe
df_encoded = pd.get_dummies(df_clean, columns=low_card_cols, drop_first=True)
print(f"\nData shape after one-hot encoding: {df_encoded.shape}")


Data shape after one-hot encoding: (84, 72)
Cell executed at: 2025-05-15 17:31:07.539019


In [22]:
# Save encoded data
df_encoded.to_csv('data/encoded_data.csv', index=False)
print("\nEncoded data saved to 'data/encoded_data.csv'")


Encoded data saved to 'data/encoded_data.csv'
Cell executed at: 2025-05-15 17:31:20.723759


In [24]:
# ## Data Profiling (Optional)

try:
    from ydata_profiling import ProfileReport
    
    print("\nGenerating data profile report...")
    profile = ProfileReport(df_clean, title="ISBSG Dataset Profiling Report", minimal=True)
    profile.to_file("data_profile.html")
    print("Data profile report saved to 'data_profile.html'")
except ImportError:
    print("\nSkipping data profiling (ydata_profiling not installed)")
    print("To install: pip install ydata-profiling")


Generating data profile report...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


[A%|          | 0/36 [00:00<?, ?it/s]
[A%|█▍        | 5/36 [00:00<00:01, 25.78it/s]
100%|██████████| 36/36 [00:00<00:00, 85.11it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Data profile report saved to 'data_profile.html'
Cell executed at: 2025-05-15 17:32:12.321297


In [None]:
# ## Model Building with PyCaret

In [33]:
# Import PyCaret regression module
from pycaret.regression import setup, compare_models, create_model, pull, plot_model, tune_model, evaluate_model, save_model

# Setup PyCaret environment
print("\nSetting up PyCaret environment...")
try:
    setup_results = setup(
        data=df_encoded,
        target=target_col,
        session_id=123,
        preprocess=True,
        imputation_type='simple',
        numeric_imputation='mean',
        categorical_imputation='mode',
        encoding_method=None,  # Already encoded low-cardinality columns
        normalize=True,
        transformation=True,
        # remove_outliers=True,
        # outliers_threshold=0.05,
        feature_selection=True,
        # feature_selection_threshold=0.8,   # watch out: PyCaret will drop features with correlation above 0.8 (multicollinearity).
        ignore_features=None,
       verbose=False
    )
except Exception as e:
    print("Error during PyCaret setup:", e)


Setting up PyCaret environment...
Error during PyCaret setup: Do not support special JSON characters in feature name.
Cell executed at: 2025-05-15 17:47:16.129631


In [32]:
# Get preprocessed dataset
processed_data = pull('dataset')
processed_data.to_csv('data/pycaret_processed_data.csv', index=False)
print(f"PyCaret preprocessed data saved to 'data/pycaret_processed_data.csv'")
    
# Compare regression models
print("\nComparing regression models...")
best_models = compare_models(n_select=3)  # Select top 3 models
model_results = pull()
print("\nModel comparison results:")
print(model_results)

ValueError: _CURRENT_EXPERIMENT global variable is not set. Please run setup() first.

Cell executed at: 2025-05-15 17:44:28.346951


In [None]:
    # Select best model and create it
    best_model_name = model_results.index[0]
    print(f"\nCreating best model: {best_model_name}")
    model = create_model(best_model_name)

In [None]:
    # Tune the best model
    print("\nTuning the best model...")
    tuned_model = tune_model(model, n_iter=10)
    

In [None]:
    # Evaluate tuned model
    print("\nEvaluating tuned model...")
    evaluate_model(tuned_model)

In [None]:
    # Save the model
    save_model(tuned_model, 'best_model')
    print("\nBest model saved as 'best_model'")

In [None]:
    # Feature importance visualization
    print("\nGenerating feature importance plot...")
    try:
        plot_model(tuned_model, plot='feature', save=True)
        print("Feature importance plot saved")
    except Exception as e:
        print(f"Could not generate feature plot: {e}")

In [None]:
    # SHAP analysis (if applicable)
    print("\nAttempting SHAP analysis...")
    try:
        import shap
        
        # Use only numeric features for SHAP analysis
        X = processed_data.drop(columns=[target_col])
        X_numeric = X.select_dtypes(include=[np.number])
        
        # Check if any columns were dropped
        if X.shape[1] != X_numeric.shape[1]:
            print(f"Warning: {X.shape[1] - X_numeric.shape[1]} non-numeric columns excluded from SHAP analysis")
        
        # Create SHAP explainer
        explainer = shap.Explainer(tuned_model, X_numeric)
        shap_values = explainer(X_numeric)
        
        # Generate and save SHAP summary plot
        plt.figure(figsize=(12, 10))
        shap.summary_plot(shap_values, X_numeric, show=False)
        plt.tight_layout()
        plt.savefig('shap_summary.png')
        plt.close()
        print("SHAP analysis completed and saved as 'shap_summary.png'")
    except Exception as e:
        print(f"SHAP analysis failed: {e}")
        print("Consider using a different model or preprocessing approach.")

In [None]:
    # Extract feature importance directly (if available)
    print("\nExtracting direct feature importance...")
    try:
        if hasattr(tuned_model, 'feature_importances_'):
            fi = pd.DataFrame({
                'Feature': X.columns,
                'Importance': tuned_model.feature_importances_
            })
            fi = fi.sort_values('Importance', ascending=False)
            print("\nFeature importances:")
            print(fi.head(15))  # Show top 15 features
            fi.to_csv('feature_importance.csv', index=False)
            print("Feature importance saved to 'feature_importance.csv'")
        else:
            print("Feature importance attribute not available for this model")
    except Exception as e:
        print(f"Failed to extract feature importance: {e}")

except Exception as e:
    print(f"\nError in PyCaret workflow: {e}")
    print("Check if PyCaret is installed correctly: pip install pycaret")

print("\nAnalysis complete!")