In [1]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


In [2]:
# Define the folder paths (all relative, one level above the current working directory)
models_folder = '../models'
plots_folder = '../plots'
temp_folder = '../temp'    # scratch/audit text files are written here
data_folder = '../data'    # input workbook is read from here and CSV exports are written here
logs_folder = '../logs'

In [3]:
# Sets up an automatic timestamp printout after each Jupyter cell execution 
# and configures the default visualization style.
from IPython import get_ipython

def setup_timestamp_callback():
    """Attach a post-cell hook that prints the time each cell finished running.

    Hooks that are already registered are left in place. When the
    ``post_run_cell`` event already carries a callback named
    ``print_timestamp`` the function returns without registering a second
    one. Outside IPython/Jupyter it only prints a notice.
    """
    shell = get_ipython()
    if shell is None:
        print("Not running in IPython/Jupyter environment.")
        return

    def print_timestamp(*args, **kwargs):
        """Print timestamp after cell execution."""
        print(f"Cell executed at: {datetime.now()}")

    # Duplicate detection is by function name, because every call to this
    # setup function creates a fresh closure object.
    existing_hooks = shell.events.callbacks.get('post_run_cell', [])
    already_registered = any(
        getattr(hook, '__name__', None) == 'print_timestamp'
        for hook in existing_hooks
    )
    if already_registered:
        return

    shell.events.register('post_run_cell', print_timestamp)
    print("Timestamp printing activated.")

# Register the post-cell timestamp callback (returns early if one named
# 'print_timestamp' is already registered, so re-running this cell is safe)
setup_timestamp_callback()

# Set visualization style: seaborn "whitegrid" and a default figure size of (12, 8)
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

Timestamp printing activated.
Cell executed at: 2025-05-25 10:48:49.565215


In [4]:
# Load the data: read the Excel workbook under data_folder into `df`


from pathlib import Path

print("Loading data...")

file_path = f"{data_folder}/ISBSG2016R1.1_Formatted4CSVAgileOnly.xlsx"
file_name_no_ext = Path(file_path).stem                # 'ISBSG2016R1.1_Formatted4CSVAgileOnly' (reused below to name the CSV exports)
print(file_name_no_ext)


df = pd.read_excel(file_path)


Loading data...
ISBSG2016R1.1_Formatted4CSVAgileOnly
Cell executed at: 2025-05-25 10:48:49.935422


In [5]:
# Cleans and standardizes string columns and column names by removing spaces, 
# converting to lowercase, and normalizing formatting.

import re

def clean_category(val):
    """Normalize a single categorical cell value.

    Strings are stripped, lower-cased, have parentheses removed, internal
    whitespace collapsed to single spaces, trailing ``;``, ``,`` and ``.``
    dropped, and semicolon separators tidied. Anything that is not a string
    (``None``, ``NaN``, numbers, ...) is returned unchanged.

    Parameters
    ----------
    val : object
        The cell value to clean.

    Returns
    -------
    object
        The cleaned string, or ``val`` itself when it is not a string.
    """
    # Object columns can hold missing values or stray numbers; only text is cleaned.
    if not isinstance(val, str):
        return val

    val = val.strip().lower()
    # Drop parentheses first so the whitespace and punctuation passes below
    # operate on the final text (removing them later can leave stray spaces).
    val = val.replace('(', '').replace(')', '')
    val = re.sub(r'\s+', ' ', val).strip()  # collapse multiple spaces
    val = val.rstrip(';,.').rstrip()        # remove trailing punctuation
    # Collapse any run of semicolons (optionally separated by spaces) into one
    val = re.sub(r';(?:\s*;)+', ';', val)
    # Normalize whitespace that follows a semicolon to exactly one space
    val = re.sub(r';\s+', '; ', val)
    return val

# Apply the value cleaner to every object/category column.
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    df[col] = df[col].map(clean_category)

# Clean column names: lowercase, replace spaces with underscores, strip.
# regex=False makes every replacement a literal substring match, so patterns
# such as '(' , ')' and '?' do not depend on the pandas-version-specific
# default of the `regex` argument.
# NOTE: the '__' -> '_' step runs once, so a run of three underscores still
# leaves '__'. The 'great_than_' spelling is kept as-is because the resulting
# column names may be referenced elsewhere.
df.columns = (
    df.columns
    .str.strip()                                  # remove leading/trailing spaces
    .str.lower()                                  # make lowercase
    .str.replace(' ', '_', regex=False)           # replace spaces with underscores
    .str.replace('-', '_', regex=False)           # replace hyphens with underscores
    .str.replace('__', '_', regex=False)          # collapse doubled underscores
    .str.replace('(', '', regex=False)            # drop opening parentheses
    .str.replace(')', '', regex=False)            # drop closing parentheses
    .str.replace('<', 'less_than_', regex=False)
    .str.replace('>', 'great_than_', regex=False)
    .str.replace('?', '', regex=False)            # drop question marks
)


Cell executed at: 2025-05-25 10:48:49.957952


In [6]:
# Inspect the column names after the normalization step
print("Current columns:", df.columns.tolist())

Current columns: ['isbsg_project_id', 'external_eef_data_quality_rating', 'project_prf_year_of_project', 'external_eef_industry_sector', 'external_eef_organisation_type', 'project_prf_application_group', 'project_prf_application_type', 'project_prf_development_type', 'tech_tf_development_platform', 'tech_tf_language_type', 'tech_tf_primary_programming_language', 'project_prf_functional_size', 'project_prf_relative_size', 'project_prf_normalised_work_effort', 'project_prf_normalised_level_1_pdr_ufp', 'project_prf_normalised_pdr_ufp', 'project_prf_defect_density', 'project_prf_speed_of_delivery', 'project_prf_manpower_delivery_rate', 'project_prf_project_elapsed_time', 'project_prf_team_size_group', 'project_prf_max_team_size', 'project_prf_case_tool_used', 'process_pmf_development_methodologies', 'process_pmf_prototyping_used', 'process_pmf_docs', 'tech_tf_architecture', 'tech_tf_client_server', 'tech_tf_client_roles', 'tech_tf_server_roles', 'tech_tf_type_of_server', 'tech_tf_web_devel

In [7]:
# Save a CSV snapshot of the DataFrame after the first cleaning pass (string
# values and column names normalized). Uses data_folder, like the other
# reads/writes in this notebook, instead of a hard-coded '../data'.
df.to_csv(f"{data_folder}/{file_name_no_ext}.csv", index=False)

Cell executed at: 2025-05-25 10:48:49.973626


In [8]:
# Clean data
# Cleans, de-duplicates, and sorts semicolon-separated categorical values in specified columns.

def clean_and_sort_semicolon(val):
    """Clean and standardise a semicolon-separated categorical string.

    Each ``;``-separated part is stripped, has internal whitespace collapsed,
    is lower-cased and loses trailing ``;``, ``,`` and ``.`` characters.
    Empty parts are dropped, duplicates removed, and the remaining parts are
    sorted and re-joined with ``'; '``.

    Parameters
    ----------
    val : object
        The cell value to clean.

    Returns
    -------
    object
        The normalized string (possibly ``''`` when no part survives), or
        ``val`` unchanged when it is not a string (``None``, ``NaN``, numbers).
    """
    # Missing values and non-text cells pass through untouched.
    if not isinstance(val, str):
        return val

    unique_parts = set()
    for raw_part in val.split(';'):
        # Normalize internal multiple spaces to a single space
        part = re.sub(r'\s+', ' ', raw_part.strip())
        # Remove trailing punctuation, then any space it exposed
        part = part.lower().rstrip(';,.').rstrip()
        # Check emptiness AFTER cleaning so a part like "." is not kept as ""
        if part:
            unique_parts.add(part)

    return '; '.join(sorted(unique_parts))

# Columns treated as semicolon-separated multi-value fields.
# Each column is listed once; a repeated entry would only redo the same work.
cols_with_semicolons = [
    'project_prf_application_group',
    'external_eef_organisation_type',
    'project_prf_application_type',
    'project_prf_development_type',
    'process_pmf_development_methodologies',
    'tech_tf_client_server',
    'tech_tf_client_roles',
    'tech_tf_server_roles'
]
for col in cols_with_semicolons:
    df[col] = df[col].map(clean_and_sort_semicolon)


Cell executed at: 2025-05-25 10:48:49.987212


In [9]:
# Standardizes specific categorical columns by normalizing case and correcting inconsistent formatting.

def standardize_value(val):
    """Map known spelling variants of a categorical value to one canonical form.

    Null values are returned as-is. Other values are stripped and lower-cased,
    then looked up in a small alias table; 'web' with any question marks
    removed becomes 'web'. Values with no special rule are returned in their
    stripped, lower-cased form.
    """
    if pd.isnull(val):
        return val

    # Variant spelling -> canonical spelling. Extend this table as needed.
    canonical_forms = {
        'stand alone': 'stand-alone',
        'stand-alone': 'stand-alone',
        'client server': 'client-server',
        'mathematically intensive': 'mathematically-intensive',
    }

    normalized = val.strip().lower()
    if normalized in canonical_forms:
        return canonical_forms[normalized]

    # Remove question mark from web dev
    without_question_marks = normalized.replace('?', '').strip()
    return 'web' if without_question_marks == 'web' else normalized

# Apply the standardizer to the columns whose values have known spelling variants.
# NOTE(review): standardize_value compares the whole cell value, so a cell in
# project_prf_application_group that holds several '; '-joined values will not
# match any rule and is only stripped/lower-cased — confirm this is intended.
df['tech_tf_architecture'] = df['tech_tf_architecture'].map(standardize_value)
df['tech_tf_web_development'] = df['tech_tf_web_development'].map(standardize_value)
df['project_prf_application_group'] = df['project_prf_application_group'].map(standardize_value)
# Language type is the one column kept in upper case (the earlier cleaning pass lower-cased it).
df['tech_tf_language_type'] = df['tech_tf_language_type'].str.upper().str.strip()


Cell executed at: 2025-05-25 10:48:50.000737


In [10]:
# Writes the unique values of all categorical columns to a text file for reference or auditing.

cat_cols = df.select_dtypes(include=['object', 'category']).columns

# Build the path once so the file that is written and the message that is
# printed always agree.
unique_values_path = f"{temp_folder}/all_categorical_unique_values_beforeDropping.txt"

# Explicit UTF-8 so the output does not depend on the platform's default
# encoding (which can fail on non-ASCII category values).
with open(unique_values_path, 'w', encoding='utf-8') as f:
    for col in cat_cols:
        f.write(f"Column: {col} (n_unique = {df[col].nunique()})\n")
        f.write(f"{df[col].unique()}\n")
        f.write('-' * 40 + '\n')

print(f"Before Dropping: Unique values for categorical columns saved to '{unique_values_path}'")

Before Dropping: Unique values for categorical columns saved to '../temp/all_categorical_unique_values_beforeDropping.txt'
Cell executed at: 2025-05-25 10:48:50.021628


In [11]:
# Save the fully cleaned DataFrame (all rows and columns) to '<data_folder>/<file_name_no_ext>_cleaned.csv',
# without writing the index as a column
df.to_csv(f"{data_folder}/{file_name_no_ext}_cleaned.csv", index=False)

Cell executed at: 2025-05-24 19:08:06.763176
