In [1]:
"""
Complete Scikit-Learn Preprocessing Pipeline for ISBSG Data
===========================================================

This module provides a comprehensive preprocessing pipeline that handles:
1. Data loading and initial cleaning
2. Column name standardization
3. Missing value handling
4. Semicolon-separated value processing
5. One-hot encoding for categorical variables
6. Multi-label binarization for multi-value columns
7. Feature selection and filtering
8. Data validation and export

Based on the preprocessing steps from the provided notebooks.
"""



In [2]:
# === Imports ===

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from pathlib import Path
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.compose import ColumnTransformer
import joblib
import os
from collections import Counter
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Union, Any
import warnings
warnings.filterwarnings('ignore')



In [3]:
# Sets up an automatic timestamp printout after each Jupyter cell execution 
# and configures the default visualization style.
from IPython import get_ipython

def setup_timestamp_callback():
    """Setup a timestamp callback for Jupyter cells without clearing existing callbacks."""
    ip = get_ipython()
    if ip is not None:
        # Define timestamp function
        def print_timestamp(*args, **kwargs):
            """Print timestamp after cell execution."""
            print(f"Cell executed at: {datetime.now()}")
        
        # Check if our callback is already registered
        callbacks = ip.events.callbacks.get('post_run_cell', [])
        for cb in callbacks:
            if hasattr(cb, '__name__') and cb.__name__ == 'print_timestamp':
                # Already registered
                return
                
        # Register new callback if not already present
        ip.events.register('post_run_cell', print_timestamp)
        print("Timestamp printing activated.")
    else:
        print("Not running in IPython/Jupyter environment.")

# Setup timestamp callback
setup_timestamp_callback()

# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

Timestamp printing activated.
Cell executed at: 2025-06-02 16:14:37.931984


In [4]:
# Configuration
DATA_FOLDER = "../data"
SAMPLE_FILE = "ISBSG2016R1_1_agile_dataset_only.xlsx"
FULL_FILE = "ISBSG2016R1_1_full_dataset.xlsx"
TARGET_COL = "project_prf_normalised_work_effort"  # be careful about case sensitive
EXCLUDE_FROM_CATEGORY_AUGMENT = ['project_prf_year_of_project']

Cell executed at: 2025-06-02 16:14:37.947790


In [5]:
"""
    As of 20250602:
    Data preprocessing pipeline for machine learning using scikit-learn transformers: designed specifically for processing project effort estimation data with a target column like 'project_prf_normalised_work_effort':
        - DataFrame validation
        - Column standardization, missing value imputation, categorical encoding (including multi-value semicolon-separated columns)
        - Data cleaning in a systematic way
        - The main entry point is preprocess_dataframe() which applies all transformations and returns a clean, ML-ready dataset
        - The pipeline uses one-hot encoding for both regular categorical variables and multi-label columns (semicolon-separated values).
"""

# === 1. DataFrameValidator: Validate input DataFrame and check target column ===
class DataFrameValidator(BaseEstimator, TransformerMixin):
    """
    Validate input DataFrame and perform initial checks:
    - Ensures input is a DataFrame
    - Validates target column exists with smart matching
    - Stores original shape and target column info
    """
    
    def __init__(self, target_col: str = 'project_prf_normalised_work_effort'):
        self.target_col = target_col
        self.original_shape = None
        self.original_target_col = None
        
    def fit(self, X, y=None):
        print(f"{self.__class__.__name__}.fit() CALLED")
        return self
    
    def _standardize_column_name(self, col_name: str) -> str:
        """Convert column name to standardized format"""
        return str(col_name).strip().lower().replace(' ', '_')
    
    def _find_target_column(self, df_columns) -> Union[str, Tuple[None, List[str]]]:
        """Smart target column finder - handles various formats"""
        target_standardized = self._standardize_column_name(self.target_col)
        
        # Try exact match first
        if self.target_col in df_columns:
            return self.target_col
            
        # Try standardized versions of all columns
        for col in df_columns:
            col_standardized = self._standardize_column_name(col)
            if col_standardized == target_standardized:
                return col
                
        # If not found, look for partial matches
        similar_cols = []
        target_words = set(target_standardized.split('_'))
        for col in df_columns:
            col_words = set(self._standardize_column_name(col).split('_'))
            if len(target_words.intersection(col_words)) >= 2:
                similar_cols.append(col)
                
        return None, similar_cols
    
    def transform(self, X):
        """Validate DataFrame and find target column"""
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")
        
        df = X.copy()
        self.original_shape = df.shape
        print(f"Processing DataFrame with shape: {df.shape}")

        # Standardize ALL object/categorical columns: lowercase and strip
        for col in df.select_dtypes(include='object').columns:
            df[col] = df[col].astype(str).str.lower().str.strip()

        # Smart target column finding
        result = self._find_target_column(df.columns)
        
        if isinstance(result, tuple):  # Not found
            actual_col, similar_cols = result
            error_msg = f"Target column '{self.target_col}' not found in DataFrame."
            if similar_cols:
                error_msg += f" Similar columns found: {similar_cols}"
            else:
                error_msg += f" Available columns: {list(df.columns)}"
            raise ValueError(error_msg)
        else:
            actual_col = result
            
        # Store the original column name we found
        self.original_target_col = actual_col
        
        if actual_col != self.target_col:
            print(f"Target column found: '{actual_col}' -> will be standardized to '{self.target_col}'")
            
        return df

# === 2. ColumnNameStandardizer: Clean and standardize column names ===
class ColumnNameStandardizer(BaseEstimator, TransformerMixin):
    """
    Standardize column names for consistency:
    - Strips spaces, lowercases, replaces & with _&_, removes special chars
    - Maintains mapping for reference
    """
    
    def __init__(self, target_col: Optional[str] = None, original_target_col: Optional[str] = None):
        self.column_mapping = {}
        self.target_col = target_col
        self.original_target_col = original_target_col
        
    def fit(self, X, y=None):
        print(f"{self.__class__.__name__}.fit() CALLED")
        return self
    
    def _standardize_columns(self, columns) -> List[str]:
        """Standardize column names"""
        return [str(col).strip().lower().replace(' ', '_') for col in columns]
    
    def _clean_column_names(self, columns) -> List[str]:
        """Clean column names for compatibility"""
        cleaned_cols = []
        for col in columns:
            # Replace ampersands with _&_ to match expected transformations
            col_clean = str(col).replace(' & ', '_&_')
            # Remove special characters except underscores and ampersands
            col_clean = re.sub(r'[^\w\s&]', '', col_clean)
            # Replace spaces with underscores
            col_clean = col_clean.replace(' ', '_')
            cleaned_cols.append(col_clean)
        return cleaned_cols
    
    def transform(self, X):
        """Apply column name standardization"""
        df = X.copy()
        
        # Store original column names
        original_columns = df.columns.tolist()
        
        # Apply standardization
        standardized_cols = self._standardize_columns(original_columns)
        cleaned_cols = self._clean_column_names(standardized_cols)

        # Special handling for target column
        if self.original_target_col and self.target_col:
            try:
                target_index = original_columns.index(self.original_target_col)
                cleaned_cols[target_index] = self.target_col
                print(f"Target column '{self.original_target_col}' -> '{self.target_col}'")
            except ValueError:
                pass  # Original target col not found, proceed normally
        
        # Create mapping
        self.column_mapping = dict(zip(original_columns, cleaned_cols))
        
        # Apply new column names
        df.columns = cleaned_cols
        
        # Report changes
        changed_cols = sum(1 for orig, new in self.column_mapping.items() if orig != new)
        print(f"Standardized {changed_cols} column names")
        
        return df

# === 3. CategoricalValueStandardizer: Apply standardization mapping to categorical values ===
class CategoricalValueStandardizer(BaseEstimator, TransformerMixin):
    """Apply standardization mapping to categorical column values"""
    
    def __init__(self, mapping: Optional[Dict[str, str]] = None, columns: Optional[List[str]] = None):
        self.mapping = mapping or {}
        self.columns = columns

    def fit(self, X, y=None):
        print(f"{self.__class__.__name__}.fit() CALLED")
        print(f"single-value columns are: {self.columns}")
        if self.columns is None:
            possible_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
            single_value_cols = []
            for col in possible_cols:
                if not X[col].dropna().astype(str).str.contains(';').any():
                    single_value_cols.append(col)
            self.columns = single_value_cols
            print(f"CategoricalValueStandardizer[fit] Single-value categorical columns selected for mapping: {self.columns}")
        return self

    def transform(self, X):
        #print(f"Single-value categorical columns mapping: {self.mapping}")
        print(f"[transform] Single-value Columns to be mapped: {self.columns}")
        df = X.copy()
        for col in self.columns:
            if col in df.columns:
                #print(f"[transform] Processing column: '{col}'")
                original_values = df[col].copy()  # assign before overwrite
                #print(f"  Unique values before mapping: {original_values.unique()[:10]}")
                df[col] = (
                    df[col].astype(str)
                    .str.strip()
                    .str.lower()
                    .map(lambda x: self.mapping.get(x, x))
                )
            # Print changes only (where original != new)
            #print(f"  Unique values after mapping: {df[col].unique()[:10]}")
            changed = original_values[original_values != df[col]]
            #if not changed.empty:
            #    print(f"\nColumn '{col}':")
            #    for idx in changed.index:
            #        print(f"  {original_values[idx]!r} -> {df[col][idx]!r}")
        return df

# === 4. CategoricalValueCleaner: Clean categorical values ===
class CategoricalValueCleaner(BaseEstimator, TransformerMixin):
    """
    Clean categorical column values:
    - Replace '-' with '_'
    - Lowercase and strip whitespace
    """
    
    def fit(self, X, y=None):
        print(f"{self.__class__.__name__}.fit() CALLED")
        return self
        
    def transform(self, X):
        df = X.copy()
        cat_cols = df.select_dtypes(include=['object', 'category']).columns
        for col in cat_cols:
            df[col] = (
                df[col].astype(str)
                .str.replace('-', '_')
                .str.lower()
                .str.strip()
            )
        return df

# === 5. MissingValueAnalyzer: Analyze and handle missing values ===
class MissingValueAnalyzer(BaseEstimator, TransformerMixin):
    """
    Analyze and handle missing values:
    - Reports missing value statistics
    - Drops high-missing columns (except protected ones)
    - Fills remaining missing values appropriately
    """
    
    def __init__(self, high_missing_threshold: float = 0.7, cols_to_keep: Optional[List[str]] = None):
        self.high_missing_threshold = high_missing_threshold
        self.cols_to_keep = cols_to_keep or []
        self.high_missing_cols = []
        self.missing_stats = {}
        self.fill_values = {}
        
    def fit(self, X, y=None):
        # Pre-calculate fill values for numeric columns
        num_cols = X.select_dtypes(include=['number']).columns
        self.fill_values = {col: X[col].median() for col in num_cols}
        print(f"{self.__class__.__name__}.fit() CALLED")
        return self
    
    def transform(self, X):
        """Analyze and handle missing values"""
        df = X.copy()
        
        # Calculate missing percentages
        missing_pct = df.isnull().mean()
        self.missing_stats = missing_pct.sort_values(ascending=False)
        
        print(f"\nMissing value analysis:")
        print(f"Columns with >50% missing: {sum(missing_pct > 0.5)}")
        print(f"Columns with >70% missing: {sum(missing_pct > self.high_missing_threshold)}")
        
        # Identify high missing columns
        self.high_missing_cols = missing_pct[missing_pct > self.high_missing_threshold].index.tolist()
        
        # Filter out protected columns
        cols_to_drop = [col for col in self.high_missing_cols if col not in self.cols_to_keep]
        
        if cols_to_drop:
            print(f"Dropping {len(cols_to_drop)} columns with >{self.high_missing_threshold*100}% missing values")
            print(f"Columns to be dropped due to high missing values: {cols_to_drop}")
            df = df.drop(columns=cols_to_drop)
        
        # Fill missing values efficiently
        # Categorical columns
        cat_cols = df.select_dtypes(include=['object', 'category']).columns
        df[cat_cols] = df[cat_cols].fillna('missing')
        
        # Numeric columns
        for col in df.select_dtypes(include=['number']).columns:
            if col in self.fill_values and df[col].isnull().any():
                df[col] = df[col].fillna(self.fill_values[col])
        
        print(f"Data shape after missing value handling: {df.shape}")
        return df

# === 6. SemicolonProcessor: Process multi-value columns ===
class SemicolonProcessor(BaseEstimator, TransformerMixin):
    """
    Process semicolon-separated values in columns:
    - Identifies columns with semicolons
    - Cleans, deduplicates, and sorts values
    - Applies standardization mapping where specified
    """
    
    def __init__(self, standardization_mapping: Optional[Dict[str, str]] = None):
        self.semicolon_cols = []
        self.standardization_mapping = standardization_mapping or {}
        
    def fit(self, X, y=None):
        print(f"{self.__class__.__name__}.fit() CALLED")
        return self
    
    def _clean_and_sort_semicolon(self, val, apply_standardization: bool = False, 
                                 mapping: Optional[Dict[str, str]] = None) -> str:
        """Clean, deduplicate, sort, and standardize semicolon-separated values"""
        if pd.isnull(val) or val == '':
            return val
        
        parts = [x.strip().lower() for x in str(val).split(';') if x.strip()]
        
        if apply_standardization and mapping is not None:
            parts = [mapping.get(part, part) for part in parts]
        
        unique_cleaned = sorted(set(parts))
        return '; '.join(unique_cleaned)
    
    def transform(self, X):
        """Process semicolon-separated columns"""
        df = X.copy()
        # Identify columns with semicolons
        self.semicolon_cols = [
            col for col in df.columns
            if df[col].dropna().astype(str).str.contains(';').any()
        ]
        print(f"Found {len(self.semicolon_cols)} columns with semicolons: {self.semicolon_cols}")
        #print(f"Semicolon mapping: {self.standardization_mapping}")
        # Process each semicolon column
        for col in self.semicolon_cols:
            apply_mapping = col in self.semicolon_cols
            mapping = self.standardization_mapping if apply_mapping else None
            df[col] = df[col].apply(
                lambda x: self._clean_and_sort_semicolon(
                    x, apply_standardization=apply_mapping, mapping=mapping
                )
            )
        return df

# === 7. MultiValueEncoder: Encode semicolon columns using MultiLabelBinarizer ===
class MultiValueEncoder(BaseEstimator, TransformerMixin):
    """
    Handle multi-value columns using MultiLabelBinarizer:
    - Only processes columns with manageable cardinality
    - Creates binary columns for each unique value
    """
    
    def __init__(self, max_cardinality: int = 10):
        self.max_cardinality = max_cardinality
        self.multi_value_cols = []
        self.mlb_transformers = {}
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        """Encode multi-value columns"""
        df = X.copy()
        
        # Identify semicolon columns
        semicolon_cols = [
            col for col in df.columns
            if df[col].dropna().astype(str).str.contains(';').any()
        ]
        
        # Filter for low cardinality multi-value columns
        self.multi_value_cols = []
        for col in semicolon_cols:
            # Get unique values across all entries
            all_values = set()
            for val in df[col].dropna().astype(str):
                values = [v.strip() for v in val.split(';') if v.strip()]
                all_values.update(values)
            
            if len(all_values) <= self.max_cardinality:
                self.multi_value_cols.append(col)
        
        print(f"Encoding {len(self.multi_value_cols)} multi-value columns: {self.multi_value_cols}")
        
        # Process each multi-value column
        for col in self.multi_value_cols:
            # Prepare data for MultiLabelBinarizer
            values_list = []
            for idx in df.index:
                val = df.loc[idx, col]
                if pd.notna(val) and str(val).strip():
                    values_list.append([item.strip() for item in str(val).split(';') if item.strip()])
                else:
                    values_list.append([])
            
            if not any(values_list):  # Skip if no valid values
                continue
                
            # Fit and transform
            mlb = MultiLabelBinarizer()
            onehot_array = mlb.fit_transform(values_list)
            
            # Create DataFrame with proper column names
            onehot_df = pd.DataFrame(
                onehot_array,
                columns=[f"{col}__{cat}" for cat in mlb.classes_],
                index=df.index
            )
            
            # Store transformer
            self.mlb_transformers[col] = mlb
            
            # Check for column conflicts and resolve
            overlap = df.columns.intersection(onehot_df.columns)
            if not overlap.empty:
                print(f"Resolving column conflicts for {col}: {list(overlap)}")
                onehot_df = onehot_df.drop(columns=overlap)
            
            # Join with main dataframe
            df = pd.concat([df, onehot_df], axis=1)
            
            print(f"Encoded {col} into {len(mlb.classes_)} binary columns")
        
        # Remove original multi-value columns
        df = df.drop(columns=self.multi_value_cols)

        print("Columns after multi-value encoding:")
        print(df.columns.tolist())
        
        return df

# === 8. CategoricalEncoder: One-hot encode regular categorical columns ===
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """
    Handle single-value categorical columns:
    - Excludes semicolon columns
    - Only encodes low-cardinality columns
    - Optionally drops first category to avoid multicollinearity
    """
    
    def __init__(self, max_cardinality: int = 10, drop_first: bool = False):
        self.max_cardinality = max_cardinality
        self.drop_first = drop_first
        self.categorical_cols = []
        
    def fit(self, X, y=None):
        print(f"{self.__class__.__name__}.fit() CALLED")
        return self
    
    def transform(self, X):
        """Encode categorical columns"""
        df = X.copy()
        
        # Identify categorical columns
        cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
        print(f"Found all categorical columns for single-valued encoding: {cat_cols}")
        
        # Exclude semicolon(multi-value) columns 
        semicolon_cols = [
            col for col in df.columns
            if df[col].dropna().astype(str).str.contains(';').any()
        ]
        #print(f"Found multi-value columns are: {semicolon_cols}")
        
        # Filter for low cardinality single-value categorical columns
        self.categorical_cols = [
            col for col in cat_cols 
            if col not in semicolon_cols and df[col].nunique() <= self.max_cardinality
        ]
        
        print(f"One-hot encoding for single value categorical data{len(self.categorical_cols)} categorical columns: {self.categorical_cols}")

        # Apply one-hot encoding
        if self.categorical_cols:
            df = pd.get_dummies(df, columns=self.categorical_cols, drop_first=self.drop_first)

        #print("****Columns after one-hot encoding:")
        #print(df.columns.tolist())
        
        return df

# === 9. ColumnNameFixer: Final column name cleanup ===
class ColumnNameFixer(BaseEstimator, TransformerMixin):
    """
    Fix column names for compatibility:
    - Removes illegal characters
    - Handles duplicates
    - Ensures clean, consistent naming
    """
    
    def __init__(self):
        self.column_transformations = {}
        
    def fit(self, X, y=None):
        print(f"{self.__class__.__name__}.fit() CALLED")
        return self
    
    def transform(self, X):
        """Fix problematic column names"""
        df = X.copy()
        original_cols = df.columns.tolist()
        fixed_columns = []
        seen_columns = set()
        
        for col in original_cols:
            # Clean column name
            fixed_col = str(col).replace(' ', '_').replace('&', 'and')
            fixed_col = ''.join(c if c.isalnum() or c == '_' else '_' for c in fixed_col)
            fixed_col = re.sub('_+', '_', fixed_col).strip('_')
            
            # Handle duplicates
            base_col = fixed_col
            suffix = 1
            while fixed_col in seen_columns:
                fixed_col = f"{base_col}_{suffix}"
                suffix += 1
            
            seen_columns.add(fixed_col)
            fixed_columns.append(fixed_col)
        
        # Store transformations
        self.column_transformations = dict(zip(original_cols, fixed_columns))
        df.columns = fixed_columns
        
        n_changed = sum(1 for old, new in self.column_transformations.items() if old != new)
        print(f"Fixed {n_changed} column names for compatibility")
        
        return df

# === 10. DataValidator: Final validation and summary ===
class DataValidator(BaseEstimator, TransformerMixin):
    """
    Validate final dataset:
    - Check shape, missing values, data types
    - Provide target variable summary
    - Report any issues
    """
    
    def __init__(self, target_col: str):
        self.target_col = target_col
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        """Validate the processed dataset"""
        df = X.copy()
        
        print(f"\n=== Final Data Validation ===")
        print(f"Final shape: {df.shape}")
        print(f"Target column: {self.target_col}")
        
        # Check for missing values
        missing_count = df.isnull().sum().sum()
        print(f"Total missing values: {missing_count}")
        
        # Check for infinite values
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            inf_count = np.isinf(df[numeric_cols].values).sum()
            print(f"Total infinite values: {inf_count}")
        
        # Data types summary
        print(f"\nData types:")
        print(f"  Numeric columns: {len(df.select_dtypes(include=[np.number]).columns)}")
        print(f"  Categorical columns: {len(df.select_dtypes(include=['object', 'category']).columns)}")
        
        # Target variable summary
        if self.target_col in df.columns:
            target_stats = df[self.target_col].describe()
            print(f"\nTarget variable '{self.target_col}' statistics:")
            print(f"  Count: {target_stats['count']}")
            print(f"  Mean: {target_stats['mean']:.2f}")
            print(f"  Std: {target_stats['std']:.2f}")
            print(f"  Min: {target_stats['min']:.2f}")
            print(f"  Max: {target_stats['max']:.2f}")
            print(f"  Missing: {df[self.target_col].isnull().sum()}")
        else:
            print(f"WARNING: Target column '{self.target_col}' not found!")
        
        return df

# === Pipeline creation function ===
def create_preprocessing_pipeline(
    target_col: str = 'project_prf_normalised_work_effort',
    high_missing_threshold: float = 0.7,
    cols_to_keep: Optional[List[str]] = None,
    max_categorical_cardinality: int = 10,
    standardization_mapping: Optional[Dict[str, str]] = None
) -> Pipeline:
    """
    Create complete preprocessing pipeline for DataFrame input
    
    Parameters:
    -----------
    target_col : str
        Name of target column
    high_missing_threshold : float
        Threshold for dropping columns with high missing values
    cols_to_keep : list
        Columns to keep even if they have high missing values
    max_categorical_cardinality : int
        Maximum number of unique values for categorical encoding
    standardization_mapping : dict
        Custom mapping for standardizing values
    
    Returns:
    --------
    Pipeline
        Complete preprocessing pipeline
    """
    
    if cols_to_keep is None:
        cols_to_keep = [
            'project_prf_case_tool_used', 
            'process_pmf_prototyping_used',
            'tech_tf_client_roles', 
            'tech_tf_type_of_server', 
            'tech_tf_clientserver_description'
        ]
    
    pipeline = Pipeline([
        ('validator', DataFrameValidator(target_col)),
        ('column_standardizer', ColumnNameStandardizer()),
        ('missing_handler', MissingValueAnalyzer(
            high_missing_threshold=high_missing_threshold,
            cols_to_keep=cols_to_keep
        )),
        ('cat_value_cleaner', CategoricalValueCleaner()),
        ('semicolon_processor', SemicolonProcessor(standardization_mapping=standardization_mapping)),
        ('cat_value_standardizer', CategoricalValueStandardizer(
            mapping=standardization_mapping,
            columns=None
        )),
        ('multi_value_encoder', MultiValueEncoder(max_cardinality=max_categorical_cardinality)),
        ('categorical_encoder', CategoricalEncoder(max_cardinality=max_categorical_cardinality)),
        ('column_fixer', ColumnNameFixer()),
        ('final_validator', DataValidator(target_col))
    ])
    
    return pipeline

# === Simplified preprocessing function ===
def preprocess_dataframe(
    df: pd.DataFrame,
    target_col: str = 'project_prf_normalised_work_effort',
    **pipeline_kwargs
) -> Tuple[pd.DataFrame, Dict]:
    """
    Preprocess a DataFrame using the complete pipeline
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input DataFrame to preprocess
    target_col : str
        Name of target column
    **pipeline_kwargs : dict
        Additional arguments for pipeline creation
    
    Returns:
    --------
    pd.DataFrame
        Processed DataFrame ready for modeling
    dict
        Processing metadata and statistics
    """
    
    print("="*60)
    print("DataFrame Preprocessing Pipeline")
    print("="*60)
    print(f"Input shape: {df.shape}")
    print(f"Target column: {target_col}")
    print(f"Timestamp: {datetime.now()}")
    
    # Create and apply pipeline
    pipeline = create_preprocessing_pipeline(target_col=target_col, **pipeline_kwargs)
    #print(f"Pipeline is: {pipeline}")
    
    # Get original target column info from validator
    validator = pipeline.named_steps['validator']
    df_processed = pipeline.fit_transform(df)
    
    # Prepare metadata
    metadata = {
        'original_shape': df.shape,
        'processed_shape': df_processed.shape,
        'processing_timestamp': datetime.now().isoformat(),
        'target_column_standardized': target_col,
        'target_column_original': validator.original_target_col,
        'pipeline_steps': [step[0] for step in pipeline.steps]
    }
    
    print("\n" + "="*60)
    print("Preprocessing completed successfully!")
    print(f"Shape: {df.shape} -> {df_processed.shape}")
    print("="*60)
    
    return df_processed, metadata


Cell executed at: 2025-06-02 16:14:38.032588


In [6]:
# Summary
# As of 20250206

def integrated_categorical_preprocessing(
    sample_df: pd.DataFrame,
    full_df: pd.DataFrame,
    target_col: str,
    cols_to_keep: List[str] = None,
    high_card_columns: List[str] = None,
    max_categorical_cardinality: int = 10,
    samples_per_category: int = 3,
    standardization_mapping: Dict[str, str] = None,
    high_missing_threshold: float = 0.7,
    separator: str = ';',
    strategy: str = 'top_k',
    k: int = 20,
    exclude_from_augment: List[str] = None
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Efficient in-memory categorical preprocessing and augmentation pipeline.
    Args:
        sample_df, full_df: DataFrames
        target_col: target column
        cols_to_keep: columns to retain (optional)
        high_card_columns: list of high-cardinality categorical columns
        ... [other params as above]
    Returns:
        enhanced_sample: DataFrame
        metadata: dict
    """
    # Step 0: (Optional) Column Filtering
    #if cols_to_keep:
    #    sample_df = sample_df[cols_to_keep + [target_col]].copy()
    #    full_df = full_df[cols_to_keep + [target_col]].copy()

    # Step 1: Standardize/clean categorical columns (optional, minimal here)
    if standardization_mapping:
        for col, mapped_col in standardization_mapping.items():
            if col in sample_df and mapped_col in sample_df:
                sample_df[col] = sample_df[mapped_col]
            if col in full_df and mapped_col in full_df:
                full_df[col] = full_df[mapped_col]

    # Step 2: Detect categorical columns (vectorized, robust)
    # Only include columns with "object" or "category" dtype and limited cardinality
    potential_cats = [
        col for col in sample_df.columns
        if col != target_col and (
            sample_df[col].dtype == "object"
            or pd.api.types.is_categorical_dtype(sample_df[col])
        )
    ]
    categorical_columns = [
        col for col in potential_cats
        if sample_df[col].nunique(dropna=True) <= max_categorical_cardinality
    ]

    # Step 3: Handle high-cardinality multi-value columns, both in full/sample
    if high_card_columns is None:
        high_card_columns = []

    col_mapping = {}
    for df in [full_df, sample_df]:
        for col in high_card_columns:
            if col in df.columns:
                # Fast top-k binarization of multi-value column
                all_vals = df[col].dropna().astype(str).str.split(separator).explode().str.strip()
                top_values = [v for v, c in Counter(all_vals).most_common(k)]
                for v in top_values:
                    df[f"{col}_top_{v}"] = df[col].fillna("").astype(str).apply(lambda x: int(v in [e.strip() for e in x.split(separator)]))
                # Add 'other' bin
                df[f"{col}_other"] = df[col].fillna("").astype(str).apply(
                    lambda x: int(any(e.strip() not in top_values for e in x.split(separator) if e.strip()))
                )
                col_mapping[col] = [f"{col}_top_{v}" for v in top_values] + [f"{col}_other"]
                # Remove original
                df.drop(columns=[col], inplace=True)
    # Update list of categorical columns after processing
    updated_cats = [
        col for col in sample_df.columns
        if (sample_df[col].dtype == "object" or pd.api.types.is_categorical_dtype(sample_df[col]))
        and sample_df[col].nunique(dropna=True) <= max_categorical_cardinality
        and col != target_col
    ]
    # Remove excluded columns (if any)
    final_cats = [col for col in updated_cats if not (exclude_from_augment and col in exclude_from_augment)]

    # Step 4: Augment sample_df with missing categories found only in full_df
    # For each categorical column, sample a few rows from full_df for each missing category
    additional_rows = []
    missing_report = {}
    for col in final_cats:
        full_cats = set(full_df[col].dropna().unique())
        sample_cats = set(sample_df[col].dropna().unique())
        missing = full_cats - sample_cats
        missing_report[col] = list(missing)
        for val in missing:
            matches = full_df[full_df[col] == val]
            if not matches.empty:
                additional = matches.sample(
                    n=min(samples_per_category, len(matches)),
                    random_state=42
                )
                additional_rows.append(additional)
    # Combine new rows with original sample
    if additional_rows:
        df_aug = pd.concat([sample_df] + additional_rows, ignore_index=True).drop_duplicates()
    else:
        df_aug = sample_df.copy()

    # Step 5: Clean up duplicate columns
    df_aug = df_aug.loc[:, ~df_aug.columns.duplicated()]

    # Step 6: Compile metadata
    metadata = {
        "original_sample_shape": sample_df.shape,
        "original_full_shape": full_df.shape,
        "final_shape": df_aug.shape,
        "categorical_columns_detected": categorical_columns,
        "updated_categorical_columns": updated_cats,
        "final_augmented_categorical_columns": final_cats,
        "high_cardinality_columns_processed": high_card_columns,
        "column_mapping": col_mapping,
        "missing_categories_report": missing_report,
        "rows_added_from_full_dataset": df_aug.shape[0] - sample_df.shape[0]
    }

    return df_aug, metadata


Cell executed at: 2025-06-02 16:14:38.052775


In [7]:
# Main execution function
def main():
    """
    Main function to run the integrated pipeline
    """
    
    # Configuration
    sample_file_path = os.path.join(DATA_FOLDER, SAMPLE_FILE)
    full_file_path = os.path.join(DATA_FOLDER, FULL_FILE)

    sample_df = pd.read_excel(sample_file_path)
    full_df = pd.read_excel(full_file_path)
    
    # Columns to keep (customize as needed)
    cols_to_keep = [
        'project_prf_case_tool_used', 
        'process_pmf_prototyping_used',
        'tech_tf_client_roles', 
        'tech_tf_type_of_server', 
        'tech_tf_clientserver_description',
        'people_prf_project_user_involvement'
    ]
    
    # High-cardinality multi-value columns
    high_card_columns = [
        'external_eef_organisation_type', 
        'project_prf_application_type'
    ]

   
    # Standardization rules
    standardization_map = {
        # Programming languages
        '.net': 'dotnet',
        'c': 'c_lang',         # or simply 'c'
        'c++': 'cpp',
        'c#': 'csharp',
        # Architecture
        'stand alone': 'standalone',
        'stand-alone': 'standalone',
        'client server': 'client-server',
        # Application group
        'mathematically_intensive application': 'mathematically_intensive_application',
        # Web development
        "Web?": "web",
        # Server roles
        "file &/or print server": "file/print server",
        # Add others as needed
    }
    

    try:
        # Create and apply pipeline for sample
        processed_sample_df, sample_metadata = preprocess_dataframe(
            sample_df, 
            target_col='project_prf_normalised_work_effort',
            cols_to_keep=cols_to_keep,
            max_categorical_cardinality=10,
            standardization_mapping=standardization_map,
            high_missing_threshold=0.7
        )
        
        # Create and apply pipeline for full dataset
        processed_full_df, full_metadata = preprocess_dataframe(
            full_df, 
            target_col='project_prf_normalised_work_effort',
            cols_to_keep=cols_to_keep,
            max_categorical_cardinality=10,
            standardization_mapping=standardization_map,
            high_missing_threshold=0.7
        )

        # adding back missing categorical values to the sample dataset
        final_df, meta = integrated_categorical_preprocessing(
            sample_df=processed_sample_df,
            full_df=processed_full_df,
            target_col=TARGET_COL,
            cols_to_keep=cols_to_keep,
            high_card_columns=high_card_columns,
            max_categorical_cardinality=10,
            samples_per_category=3,
            standardization_mapping=standardization_map,
            high_missing_threshold=0.7,
            separator=';',
            strategy='top_k',
            k=20
        )

       
        # Save results
        output_path = os.path.join(DATA_FOLDER, 'enhanced_sample_final.csv')
        final_df.to_csv(output_path, index=False)
        
        print(f"\n" + "="*60)
        print("PIPELINE COMPLETED SUCCESSFULLY!")
        print("="*60)
        print(f"Final dataset saved to: {output_path}")
        print(f"Final shape: {final_df.shape}")
        print(f"Final meta: {meta}")
        print(f"Ready for PyCaret setup!")
        
        # Print summary of changes
        print(f"\nSUMMARY:")
        #print(f"- Original sample rows: {metadata['original_sample_shape'][0]}")
        #print(f"- Rows added from full dataset: {metadata['rows_added_from_full_dataset']}")
        #print(f"- Final rows: {metadata['final_shape'][0]}")
        #print(f"- Original columns: {metadata['original_sample_shape'][1]}")
        #print(f"- Final columns: {metadata['final_shape'][1]}")
        
        return final_df, meta
        
    except Exception as e:
        print(f"Error in integrated pipeline: {e}")
        raise

Cell executed at: 2025-06-02 16:14:38.068806


In [8]:
if __name__ == "__main__":
    final_df, metadata = main()

DataFrame Preprocessing Pipeline
Input shape: (3786, 52)
Target column: project_prf_normalised_work_effort
Timestamp: 2025-06-02 16:14:41.629858
DataFrameValidator.fit() CALLED
Processing DataFrame with shape: (3786, 52)
Target column found: 'Project_PRF_Normalised Work Effort' -> will be standardized to 'project_prf_normalised_work_effort'
ColumnNameStandardizer.fit() CALLED
Standardized 52 column names
MissingValueAnalyzer.fit() CALLED

Missing value analysis:
Columns with >50% missing: 16
Columns with >70% missing: 15
Dropping 15 columns with >70.0% missing values
Columns to be dropped due to high missing values: ['project_prf_defect_density', 'project_prf_manpower_delivery_rate', 'people_prf_ba_team_experience_less_than_1_yr', 'people_prf_ba_team_experience_1_to_3_yr', 'people_prf_ba_team_experience_great_than_3_yr', 'people_prf_it_experience_less_than_1_yr', 'people_prf_it_experience_1_to_3_yr', 'people_prf_it_experience_great_than_3_yr', 'people_prf_it_experience_less_than_3_yr',