In [1]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


In [2]:
# Configure timestamp callback for Jupyter cells
from IPython import get_ipython

def setup_timestamp_callback():
    """Register a post-cell hook that prints the wall-clock time of each run.

    Existing 'post_run_cell' callbacks are left in place. If a callback named
    'print_timestamp' is already registered, nothing is added, so calling this
    function repeatedly does not stack duplicate hooks. Outside IPython/Jupyter
    (``get_ipython()`` returns None) only an informational message is printed.
    """
    shell = get_ipython()
    if shell is None:
        print("Not running in IPython/Jupyter environment.")
        return

    def print_timestamp(*args, **kwargs):
        """Print timestamp after cell execution."""
        print(f"Cell executed at: {datetime.now()}")

    # Callbacks are matched by function name because every call to this
    # function creates a fresh closure object.
    registered = shell.events.callbacks.get('post_run_cell', [])
    already_present = any(
        getattr(callback, '__name__', None) == 'print_timestamp'
        for callback in registered
    )
    if already_present:
        return

    shell.events.register('post_run_cell', print_timestamp)
    print("Timestamp printing activated.")

In [3]:
# Activate the per-cell timestamp hook defined above
setup_timestamp_callback()

# Plot defaults: seaborn white grid and a 12x8 inch figure size
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 8))

Timestamp printing activated.
Cell executed at: 2025-05-17 16:06:05.258151


In [4]:
# Read the Excel workbook into the working DataFrame `df`
print("Loading data...")
source_workbook = "data/ISBSG2016R1.1-Formatted4CSVAgileOnly.xlsx"
df = pd.read_excel(source_workbook)


Loading data...
Cell executed at: 2025-05-17 16:06:05.871305


In [5]:
# Clean up categorical values: lowercase, strip spaces, remove trailing punctuation

import re

def clean_category(val):
    """Normalise a single categorical cell value.

    Strings are stripped, lower-cased, have internal whitespace runs collapsed
    to one space, have runs of semicolons (optionally separated by whitespace)
    collapsed to a single semicolon, have whitespace after a semicolon
    normalised to one space, and finally lose any trailing ';', ',', '.' or
    space characters.

    Parameters
    ----------
    val : object
        Cell value. Anything that is not a ``str`` (None, NaN, numbers,
        timestamps, ...) is returned unchanged, so object columns with mixed
        content do not raise.

    Returns
    -------
    object
        The cleaned string, or ``val`` itself when it is not a string.
    """
    # Object-dtype columns can hold non-string values; only strings are cleaned.
    if not isinstance(val, str):
        return val

    val = val.strip().lower()
    val = re.sub(r'\s+', ' ', val)  # collapse multiple spaces
    # Collapse any run of semicolons (";;", "; ;", ";;;") into one
    val = re.sub(r';(?:\s*;)+', ';', val)
    # Exactly one space after each separating semicolon
    val = re.sub(r';\s+', '; ', val)
    # Strip trailing punctuation last, together with spaces, so that inputs
    # such as "java ;" or "a; ;" do not leave a dangling space or separator.
    return val.rstrip(';,. ')

# Run clean_category over every object/category column, one column at a time
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    df[col] = df[col].map(clean_category)

# Normalise column labels: trim, lowercase, then turn every space or hyphen
# into an underscore in a single pass
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(r'[ -]', '_', regex=True)
)


Cell executed at: 2025-05-17 16:06:05.902238


In [6]:
# Show the normalised column labels as a plain Python list
print("Current columns:", list(df.columns))


Current columns: ['isbsg_project_id', 'external_eef_data_quality_rating', 'project_prf_year_of_project', 'external_eef_industry_sector', 'external_eef_organisation_type', 'project_prf_application_group', 'project_prf_application_type', 'project_prf_development_type', 'tech_tf_development_platform', 'tech_tf_language_type', 'tech_tf_primary_programming_language', 'project_prf_functional_size', 'project_prf_relative_size', 'project_prf_normalised_work_effort', 'project_prf_normalised_level_1_pdr_ufp', 'project_prf_normalised_pdr_ufp', 'project_prf_defect_density', 'project_prf_speed_of_delivery', 'project_prf_manpower_delivery_rate', 'project_prf_project_elapsed_time', 'project_prf_team_size_group', 'project_prf_max_team_size', 'project_prf_case_tool_used', 'process_pmf_development_methodologies', 'process_pmf_prototyping_used', 'process_pmf_docs', 'tech_tf_architecture', 'tech_tf_client_server', 'tech_tf_client_roles', 'tech_tf_server_roles', 'tech_tf_type_of_server', 'tech_tf_web_devel

In [7]:
# Clean and standardise a semicolon-separated categorical string

def clean_and_sort_semicolon(val):
    """Clean and standardize a semicolon-separated categorical string.

    The value is split on ';'. Each piece is stripped, lower-cased and has
    trailing ';', ',', '.' and whitespace removed. Pieces that end up empty are
    dropped, duplicates are removed, and the remainder is sorted and re-joined
    with '; '.

    Parameters
    ----------
    val : object
        Cell value. Anything that is not a ``str`` (None, NaN, numbers, ...)
        is returned unchanged.

    Returns
    -------
    object
        The canonical '; '-joined string (possibly empty), or ``val`` itself
        when it is not a string.
    """
    if not isinstance(val, str):
        return val

    # Strip trailing punctuation and whitespace together so "server ." becomes
    # "server" rather than "server ".
    tokens = (
        part.strip().lower().rstrip(';,. \t\r\n')
        for part in val.split(';')
    )
    # Filter AFTER cleaning: a piece like "." is only known to be empty once
    # its punctuation has been removed.
    unique_tokens = {token for token in tokens if token}
    return '; '.join(sorted(unique_tokens))

# Columns whose cells hold several labels separated by ';'
cols_with_semicolons = [
    'project_prf_application_group',
    'external_eef_organisation_type',
    'process_pmf_development_methodologies',
    'project_prf_application_type',
    'tech_tf_client_roles',
    'tech_tf_server_roles',
    'tech_tf_type_of_server',
]

# Canonicalise each multi-label column element-wise, column by column
df[cols_with_semicolons] = df[cols_with_semicolons].apply(
    lambda column: column.map(clean_and_sort_semicolon)
)


Cell executed at: 2025-05-17 16:06:05.923442


In [8]:
# Standardise some columns with mixed cases

def standardize_value(val):
    """Map known spelling variants of a category onto one canonical form.

    Null values are returned untouched. Other values are stripped and
    lower-cased, then:

    * 'stand alone' / 'stand-alone'  -> 'stand-alone'
    * 'client server'                -> 'client-server'
    * 'web' with any '?' characters  -> 'web'

    Anything else is returned in its stripped, lower-cased form.
    """
    if pd.isnull(val):
        return val

    normalized = val.strip().lower()

    # Known aliases and their canonical spelling; extend as needed.
    aliases = {
        'stand alone': 'stand-alone',
        'stand-alone': 'stand-alone',
        'client server': 'client-server',
    }
    if normalized in aliases:
        return aliases[normalized]

    # "web?", "web ?" and similar uncertain entries all count as plain "web"
    without_question_marks = normalized.replace('?', '').strip()
    return 'web' if without_question_marks == 'web' else normalized

# Canonicalise the spelling variants in the architecture / web columns
for col in ('tech_tf_architecture', 'tech_tf_web_development'):
    df[col] = df[col].map(standardize_value)

# Language type codes are kept upper-case and trimmed
df['tech_tf_language_type'] = df['tech_tf_language_type'].str.strip().str.upper()


Cell executed at: 2025-05-17 16:06:05.934305


In [9]:
# Save the cleaned unique multilabels for some categorical columns

import os

output_file = "temp/cleaned_unique_multilabels.txt"

# open(..., "w") does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# One section per multi-label column: the column name followed by the sorted,
# non-null unique values, written with repr() so whitespace stays visible.
with open(output_file, "w", encoding="utf-8") as f:
    for col in cols_with_semicolons:
        if col in df.columns:
            uniques = sorted(df[col].dropna().unique())
            f.write(f"Column: {col}\n")
            for val in uniques:
                f.write(f"    {repr(val)}\n")
            f.write("\n" + "-"*40 + "\n\n")
print(f"Cleaned and sorted unique values written to '{output_file}'")



Cleaned and sorted unique values written to 'temp/cleaned_unique_multilabels.txt'
Cell executed at: 2025-05-17 16:06:05.952878


In [11]:
# Save categorical unique values for all categorical columns

# Recompute the categorical column list: the earlier `cat_cols` was captured
# before the column labels were normalised.
cat_cols = df.select_dtypes(include=['object', 'category']).columns

# Write with an explicit UTF-8 encoding so the result does not depend on the
# platform's default text encoding (and matches the other report file).
with open('temp/categorical_unique_values.txt', 'w', encoding='utf-8') as f:
    for col in cat_cols:
        # Header line with the number of distinct non-null values, then the
        # array of unique values as rendered by pandas/numpy.
        f.write(f"Column: {col} (n_unique = {df[col].nunique()})\n")
        f.write(f"{df[col].unique()}\n")
        f.write('-' * 40 + '\n')

print("Unique values for categorical columns saved to 'temp/categorical_unique_values.txt'")

Unique values for categorical columns saved to 'temp/categorical_unique_values.txt'
Cell executed at: 2025-05-17 16:21:01.366309


In [10]:
# Persist the fully cleaned frame as CSV, without the row index
cleaned_csv_path = "data/ISBSG2016R1.1-Formatted4CSVAgileOnly_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)
print(f"Whole cleaned DataFrame written to '{cleaned_csv_path}'")


Whole cleaned DataFrame written to 'data/ISBSG2016R1.1-Formatted4CSVAgileOnly_cleaned.csv'
Cell executed at: 2025-05-17 16:06:05.970032


In [13]:
# For each relative-size group, take the mode(s) of functional size and average
# them. Series.mode() returns every tied value, so a group with several equally
# frequent sizes yields the mean of those values rather than a single mode.
mean_of_modes = df.groupby('project_prf_relative_size')['project_prf_functional_size'].agg(lambda x: x.mode().mean())

# Label the output as what it is: a mean of modes, not a plain mode.
print("Mean of the mode(s) of Functional Size for each Relative Size:")
print(mean_of_modes)


Mode of Functional Size for each Relative Size:
project_prf_relative_size
l      1687.000000
m1      190.500000
m2      616.272727
s        56.000000
xs       15.500000
xxs       5.166667
Name: project_prf_functional_size, dtype: float64
Cell executed at: 2025-05-17 20:11:14.735004
