In [1]:
# ================================
# 1. Import libraries
# ================================
import pandas as pd
import numpy as np
import re
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
from pathlib import Path
import os

In [2]:
# Sets up an automatic timestamp printout after each Jupyter cell execution 
# and configures the default visualization style.
from IPython import get_ipython

def setup_timestamp_callback():
    """Install a post-cell hook that prints when each cell finished running.

    Existing ``post_run_cell`` callbacks are left in place. If a callback named
    ``print_timestamp`` is already registered, nothing is added, so calling
    this function repeatedly does not stack duplicate printouts. Outside of
    IPython/Jupyter only a notice is printed.
    """
    shell = get_ipython()
    if shell is None:
        print("Not running in IPython/Jupyter environment.")
        return

    def print_timestamp(*args, **kwargs):
        """Print the current time; accepts and ignores any event arguments."""
        print(f"Cell executed at: {datetime.now()}")

    # The hook is recognised by its function name, so the nested function
    # must keep the name 'print_timestamp' for the duplicate check to work.
    registered_hooks = shell.events.callbacks.get('post_run_cell', [])
    already_installed = any(
        getattr(hook, '__name__', None) == 'print_timestamp'
        for hook in registered_hooks
    )
    if already_installed:
        return

    shell.events.register('post_run_cell', print_timestamp)
    print("Timestamp printing activated.")

# Activate the per-cell timestamp printout
setup_timestamp_callback()

# Plot defaults: seaborn "whitegrid" style and a (12, 8) default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

Timestamp printing activated.
Cell executed at: 2025-05-30 20:22:28.553065


In [3]:
# ================================
# 2. Set file paths and load data
# ================================
data_folder = Path("../data")
sample_file = "sample_clean_a_agile_only.xlsx"
data_file = ""  # placeholder: no full-data file name is set yet

# Load the data
print("Loading data...")

# Join with pathlib rather than a hard-coded "/" so the separator is right on
# every OS; the result is kept as a plain string, as before.
file_path = str(data_folder / sample_file)  # should use data_file for model training
file_name_no_ext = Path(file_path).stem     # file name without extension, e.g. 'sample_clean_a_agile_only'
print(file_name_no_ext)


df = pd.read_excel(file_path)

Loading data...
sample_clean_a_agile_only
Cell executed at: 2025-05-30 20:22:28.873662


In [4]:
# functions to standardise column names

def standardize_columns(df):
    """Return a new DataFrame whose string column labels are normalised.

    Each string label is stripped of surrounding whitespace, lower-cased, and
    has its spaces replaced by underscores. Labels that are not strings (for
    example integer headers) are passed through unchanged instead of raising
    ``AttributeError`` on ``.strip()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Result of ``df.rename`` with the normalised column labels.
    """
    def _normalise(label):
        # Only strings support the strip/lower/replace chain.
        if not isinstance(label, str):
            return label
        return label.strip().lower().replace(' ', '_')

    return df.rename(columns=_normalise)




Cell executed at: 2025-05-30 20:22:28.888761


In [5]:
# ================================
# 3. Identify columns with semicolons
# ================================
# Collect every column in which at least one non-null value, viewed as text,
# contains a semicolon.
semicolon_cols = []
for column_name in df.columns:
    text_values = df[column_name].dropna().astype(str)
    if text_values.str.contains(';').any():
        semicolon_cols.append(column_name)

print("Columns with semicolons:", semicolon_cols)


Columns with semicolons: ['External_EEF_Organisation Type', 'Project_PRF_Application Type', 'Process_PMF_Development Methodologies', 'Tech_TF_Client_Roles', 'Tech_TF_Server_Roles']
Cell executed at: 2025-05-30 20:22:28.922372


In [6]:
# ================================
# 4. Cleaning function for semicolon-separated columns
# ================================
def clean_and_sort_semicolon(val, apply_standardization=False, mapping=None):
    """
    Normalise one semicolon-separated cell value.

    Null values and the empty string are returned unchanged. Otherwise the
    value is converted to text and split on ';'. Each piece is stripped and
    lower-cased, and empty pieces are dropped. When ``apply_standardization``
    is true and a ``mapping`` is supplied, each piece is replaced by
    ``mapping[piece]`` if present. The distinct pieces are then sorted and
    joined with '; '.
    """
    # Null test comes first so that values such as pd.NA never reach the
    # equality comparison.
    if pd.isnull(val) or val == '':
        return val

    use_mapping = apply_standardization and mapping is not None

    distinct_tokens = set()
    for piece in str(val).split(';'):
        token = piece.strip().lower()
        if not token:
            continue
        if use_mapping:
            token = mapping.get(token, token)
        distinct_tokens.add(token)

    return '; '.join(sorted(distinct_tokens))

# Optional lookup table for extra standardization: each (already lower-cased)
# value on the left is replaced by the value on the right.
standardization_mapping = dict([
    ("scrum", "agile development"),
    ("file &/or print server", "file/print server"),
    # Add more business-specific mappings here!
])

Cell executed at: 2025-05-30 20:22:28.927074


In [7]:
# ================================
# 5. Apply cleaning to each semicolon column
# ================================
# Columns whose values are additionally passed through the standardization
# mapping (edit this tuple to change the per-column behaviour).
columns_with_mapping = ('Process_PMF_Development Methodologies', 'Tech_TF_Server_Roles')

for col in semicolon_cols:
    apply_mapping = col in columns_with_mapping
    mapping = standardization_mapping if apply_mapping else None
    # The cleaned values go into a new "<col>_cleaned" column; the original
    # column is left as it was.
    cleaned_values = df[col].map(
        lambda raw: clean_and_sort_semicolon(raw, apply_standardization=apply_mapping, mapping=mapping)
    )
    df[col + "_cleaned"] = cleaned_values

Cell executed at: 2025-05-30 20:22:28.942482


In [8]:
# ================================
# 6. Show before/after for each column (first 3 examples)
# ================================
for col in semicolon_cols:
    print(f"\nColumn: {col}")

    # First three distinct non-null values of the raw column
    raw_examples = df[col].dropna().astype(str).unique()[:3]
    print("BEFORE:", list(raw_examples))

    # First three distinct non-null values of its cleaned counterpart
    cleaned_examples = df[col + "_cleaned"].dropna().astype(str).unique()[:3]
    print("AFTER:", list(cleaned_examples))



Column: External_EEF_Organisation Type
BEFORE: ['Government;Education Institution;Wholesale & Retail Trade;Transport & Storage;Communications;Medical and Health Care;Banking;', 'Government;', 'Community Services;']
AFTER: ['banking; communications; education institution; government; medical and health care; transport & storage; wholesale & retail trade', 'government', 'community services']

Column: Project_PRF_Application Type
BEFORE: ['Surveillance and security;', 'Business Application;', 'Workflow support & management;Complex process control;']
AFTER: ['surveillance and security', 'business application', 'complex process control; workflow support & management']

Column: Process_PMF_Development Methodologies
BEFORE: ['Agile Development;', 'Agile Development;Unified Process;', 'Agile Development;Personal Software Process (PSP);Unified Process;']
AFTER: ['agile development', 'agile development; unified process', 'agile development; personal software process (psp); unified process']

Co

In [9]:
# ================================
# 7. One-hot encode cleaned columns & show unique categories
# ================================
# unique_values: original column name -> list of categories found
# mlb_results:   "<col>_cleaned" name  -> one-hot DataFrame for that column
unique_values, mlb_results = {}, {}

for col in semicolon_cols:
    cleaned_col = col + "_cleaned"

    # Turn every non-null cleaned cell into a list of its individual labels.
    non_null_text = df[cleaned_col].dropna().astype(str)
    values = non_null_text.apply(
        lambda cell: [label.strip() for label in cell.split(';') if label.strip()]
    )

    mlb = MultiLabelBinarizer()
    indicator_matrix = mlb.fit_transform(values)
    indicator_names = [f"{cleaned_col}__{cat}" for cat in mlb.classes_]

    # The frame is indexed like `values`, i.e. rows with a null cell are absent.
    onehot = pd.DataFrame(indicator_matrix, index=values.index, columns=indicator_names)

    # Merge one-hot with main df if needed: df = df.join(onehot)
    mlb_results[cleaned_col] = onehot
    unique_values[col] = list(mlb.classes_)
    print(f"\nUnique categories in '{col}':\n", mlb.classes_)



Unique categories in 'External_EEF_Organisation Type':
 ['aerospace / automotive' 'all industry organization types' 'banking'
 'biotech' 'communications' 'community services' 'construction'
 'consumer goods' 'defence' 'education institution'
 'electricity, gas, water' 'energy'
 'financial, property & business services' 'food processing' 'government'
 'high tech' 'ieee' 'information technology'
 'institutions eg. kindergartens' 'manufacturing'
 'medical and health care' 'professional' 'public administration'
 'public sector' 'real estate & property' 'surveillance & security'
 'transport & storage' 'wholesale & retail trade']

Unique categories in 'Project_PRF_Application Type':
 ['airport management' 'analysis management' 'auditing management'
 'automated data acquisition' 'business application'
 'catalogue/register of things or events' 'clinical archive'
 'complex process control' 'content management system' 'course management'
 'customer billing' 'customer billing/relationship manage

In [10]:
# ================================
# 8. (Optional) Export cleaned data & one-hot encoded columns
# ================================
# Export the frame as it stands now, i.e. including the "<col>_cleaned" columns.
df.to_csv(data_folder / (file_name_no_ext + "_cleaned_data.csv"), index=False)

# For one-hot:
# Append the indicator frames of EVERY semicolon column stored in mlb_results.
# The bare `onehot` variable only holds the frame of the last column handled
# by the encoding loop, so using it would leave all other encodings out.
# Rows whose cleaned cell was null have no indicator row and therefore come
# out as NaN in the indicator columns after index alignment.
df_with_onehot = pd.concat([df, *mlb_results.values()], axis=1)
df_with_onehot.to_csv(data_folder / (file_name_no_ext + "_cleaned_data_with_onehot.csv"), index=False)

Cell executed at: 2025-05-30 20:22:29.002007


In [11]:
# Pair each semicolon column with the name of its cleaned counterpart.
cleaned_pairs = [(col, col + "_cleaned") for col in semicolon_cols]

# Step 1: Overwrite each original column with its cleaned version (when present)
for col, cleaned_col in cleaned_pairs:
    if cleaned_col in df.columns:
        df[col] = df[cleaned_col]

# Step 2: Remove the helper "_cleaned" columns, which are now duplicates
redundant_cols = [cleaned_col for _, cleaned_col in cleaned_pairs if cleaned_col in df.columns]
df = df.drop(columns=redundant_cols)

# Normalise the column labels on a separate frame; `df` keeps its labels.
df_cleaned = standardize_columns(df)

# Step 3: Write the cleaned DataFrame to CSV
output_path = data_folder / (file_name_no_ext + "_cleaned_no_add.csv")
df_cleaned.to_csv(output_path, index=False)

Cell executed at: 2025-05-30 20:22:29.020968


In [12]:
# List the column labels of the cleaned frame.
print("Current columns:", df_cleaned.columns.to_list())

Current columns: ['isbsg_project_id', 'external_eef_data_quality_rating', 'project_prf_year_of_project', 'external_eef_industry_sector', 'external_eef_organisation_type', 'project_prf_application_group', 'project_prf_application_type', 'project_prf_development_type', 'tech_tf_development_platform', 'tech_tf_language_type', 'tech_tf_primary_programming_language', 'project_prf_functional_size', 'project_prf_relative_size', 'project_prf_normalised_work_effort_level_1', 'project_prf_normalised_work_effort', 'project_prf_normalised_level_1_pdr_ufp', 'project_prf_normalised_pdr_ufp', 'project_prf_defect_density', 'project_prf_speed_of_delivery', 'project_prf_manpower_delivery_rate', 'project_prf_project_elapsed_time', 'project_prf_team_size_group', 'project_prf_max_team_size', 'project_prf_case_tool_used', 'process_pmf_development_methodologies', 'process_pmf_prototyping_used', 'process_pmf_docs', 'tech_tf_architecture', 'tech_tf_client_server', 'tech_tf_client_roles', 'tech_tf_server_roles'