In [1]:
"""
Complete Scikit-Learn Preprocessing Pipeline for ISBSG Data
===========================================================

This module provides a comprehensive preprocessing pipeline that handles:
1. Data loading and initial cleaning
2. Column name standardization
3. Missing value handling
4. Semicolon-separated value processing
5. One-hot encoding for categorical variables
6. Multi-label binarization for multi-value columns
7. Feature selection and filtering
8. Data validation and export

Based on the preprocessing steps from the provided notebooks.
"""



In [2]:
# === Imports ===

import pandas as pd
import numpy as np
import re
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.compose import ColumnTransformer
import joblib
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Configuration
DATA_FOLDER = "../data"
SAMPLE_FILE = "sample_clean_a_agile_only.xlsx"
DATA_FILE = ""
TARGET_COL = "Project_PRF_Normalised_Work_Effort"  # be careful about case sensitive


In [4]:
# === 1. DataLoader: Load data and check target column ===

class DataLoader(BaseEstimator, TransformerMixin):
    """
        Load and perform initial data validation whether the target col exists:
        - Handles both .xlsx and .csv.
        - Stores the original shape of the data.
        - Raises an error if the target column is missing.

    """
    
    def __init__(self, file_path, target_col='project_prf_normalised_work_effort'):
        self.file_path = file_path
        self.target_col = target_col  # This should be the standardized form
        self.original_shape = None
        self.original_target_col = None  # Store what we actually found
        
    def fit(self, X=None, y=None):
        return self
    
    def _standardize_column_name(self, col_name):
        """Convert column name to standardized format"""
        return col_name.strip().lower().replace(' ', '_')
    
    def _find_target_column(self, df_columns):
        """
        Smart target column finder - handles various formats
        Returns the actual column name from the dataframe
        """
        target_standardized = self.target_col.lower().replace(' ', '_')
        
        # Try exact match first
        if self.target_col in df_columns:
            return self.target_col
            
        # Try standardized versions of all columns
        for col in df_columns:
            col_standardized = self._standardize_column_name(col)
            if col_standardized == target_standardized:
                return col
                
        # If still not found, look for partial matches (for debugging)
        similar_cols = []
        target_words = set(target_standardized.split('_'))
        for col in df_columns:
            col_words = set(self._standardize_column_name(col).split('_'))
            if len(target_words.intersection(col_words)) >= 2:  # At least 2 words match
                similar_cols.append(col)
                
        return None, similar_cols
    
    def transform(self, X=None):
        """Load data from file with smart column handling"""

        print(f"Loading data from: {self.file_path}")
        
        # Determine file type and load accordingly; support for Excel or CSV
        if self.file_path.endswith('.xlsx'):
            df = pd.read_excel(self.file_path)
        elif self.file_path.endswith('.csv'):
            df = pd.read_csv(self.file_path)
        else:
            raise ValueError("Unsupported file format. Use .xlsx or .csv")
        
        self.original_shape = df.shape
        print(f"Loaded data with shape: {df.shape}")
        
        # Smart target column finding
        result = self._find_target_column(df.columns)
        
        if isinstance(result, tuple):  # Not found, got similar columns
            actual_col, similar_cols = result
            error_msg = f"Target column '{self.target_col}' not found in data."
            if similar_cols:
                error_msg += f" Similar columns found: {similar_cols}"
            else:
                error_msg += f" Available columns: {list(df.columns)}"
            raise ValueError(error_msg)
        else:
            actual_col = result
            
        # Store the original column name we found
        self.original_target_col = actual_col
        
        if actual_col != self.target_col:
            print(f"Target column found: '{actual_col}' -> will be standardized to '{self.target_col}'")
            
        return df

# === 2. ColumnNameStandardizer: Clean and standardize column names ===
class ColumnNameStandardizer(BaseEstimator, TransformerMixin):
    """
        Standardize column names for consistency (lowercase, underscores, removes odd chars):
        - Strips spaces, lowercases, replaces & with _&_, removes special chars.
        - Useful for later steps and compatibility with modeling libraries.)
        
    """
    
    def __init__(self, target_col=None, original_target_col=None):
        self.column_mapping = {}
        self.target_col = target_col
        self.original_target_col = original_target_col
        
    def fit(self, X, y=None):
        return self
    
    def _standardize_columns(self, columns):
        """Standardize column names"""
        return [col.strip().lower().replace(' ', '_') for col in columns]
    
    def _clean_column_names(self, columns):
        """Clean column names for compatibility"""
        cleaned_cols = []
        for col in columns:
            # Replace ampersands with _&_ to match expected transformations
            col_clean = col.replace(' & ', '_&_')
            # Remove special characters except underscores and ampersands
            col_clean = re.sub(r'[^\w\s&]', '', col_clean)
            # Replace spaces with underscores
            col_clean = col_clean.replace(' ', '_')
            cleaned_cols.append(col_clean)
        return cleaned_cols
    
    def transform(self, X):
        """Apply column name standardization"""
        df = X.copy()
        
        # Store original column names
        original_columns = df.columns.tolist()
        
        # Apply standardization
        standardized_cols = self._standardize_columns(original_columns)
        cleaned_cols = self._clean_column_names(standardized_cols)

        # Special handling for target column
        if self.original_target_col and self.target_col:
            target_index = None
            try:
                target_index = original_columns.index(self.original_target_col)
                cleaned_cols[target_index] = self.target_col
                print(f"Target column '{self.original_target_col}' -> '{self.target_col}'")
            except ValueError:
                pass  # Original target col not found, proceed normally
        
        
        # Create mapping
        self.column_mapping = dict(zip(original_columns, cleaned_cols))
        
        # Apply new column names
        df.columns = cleaned_cols
        
        # Report changes
        changed_cols = sum(1 for orig, new in self.column_mapping.items() if orig != new)
        print(f"Standardized {changed_cols} column names")
        
        return df

# === 3. MissingValueAnalyzer: Analyze and handle missing values ===
class MissingValueAnalyzer(BaseEstimator, TransformerMixin):
    """
        Analyze and handle missing values
        - Reports number of columns with >50% and >70% missing.
        - Drops columns with a high proportion of missing data, except those you want to keep.
        - Fills remaining missing values:
            - Categorical: Fills with "Missing".
            - Numeric: Fills with column median.
    """
    
    def __init__(self, high_missing_threshold=0.7, cols_to_keep=None):
        self.high_missing_threshold = high_missing_threshold
        self.cols_to_keep = cols_to_keep or []
        self.high_missing_cols = []
        self.missing_stats = {}
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        """Analyze and handle missing values"""
        df = X.copy()
        
        # Calculate missing percentages
        missing_pct = df.isnull().mean()
        self.missing_stats = missing_pct.sort_values(ascending=False)
        
        print(f"\nMissing value analysis:")
        print(f"Columns with >50% missing: {sum(missing_pct > 0.5)}")
        print(f"Columns with >70% missing: {sum(missing_pct > self.high_missing_threshold)}")
        
        # Identify high missing columns
        self.high_missing_cols = missing_pct[missing_pct > self.high_missing_threshold].index.tolist()
        
        # Filter out columns we want to keep
        final_high_missing_cols = [col for col in self.high_missing_cols if col not in self.cols_to_keep]
        
        print(f"Dropping {len(final_high_missing_cols)} columns with >{self.high_missing_threshold*100}% missing values")
        
        # Drop high missing columns
        df_clean = df.drop(columns=final_high_missing_cols)
        
        # Fill remaining missing values in categorical columns
        cat_cols = df_clean.select_dtypes(include=['object', 'category']).columns
        for col in cat_cols:
            df_clean[col] = df_clean[col].fillna('Missing')
        
        # Fill remaining missing values in numerical columns with median
        num_cols = df_clean.select_dtypes(include=['number']).columns
        for col in num_cols:
            if df_clean[col].isnull().sum() > 0:
                median_val = df_clean[col].median()
                df_clean[col] = df_clean[col].fillna(median_val)
                print(f"Filled {col} missing values with median: {median_val}")
        
        print(f"Data shape after missing value handling: {df_clean.shape}")
        return df_clean

# === 4. SemicolonProcessor: Process multi-value columns (semicolon-separated) ===
class SemicolonProcessor(BaseEstimator, TransformerMixin):
    """
        Process semicolon-separated values in columns (e.g., “Python; Java; SQL”)
        - Identifies columns with semicolons.
        - Cleans: lowercases, strips, deduplicates, sorts, optionally standardizes values (e.g., "stand alone" → "stand-alone").
        - Useful for multi-value categorical features.
        
    """
    
    def __init__(self, standardization_mapping=None):
        self.semicolon_cols = []
        self.standardization_mapping = standardization_mapping or {
            "scrum": "agile development",
            "file &/or print server": "file/print server",
        }
        
    def fit(self, X, y=None):
        return self
    
    def _clean_and_sort_semicolon(self, val, apply_standardization=False, mapping=None):
        """Clean, deduplicate, sort, and standardize semicolon-separated values"""
        if pd.isnull(val) or val == '':
            return val
        
        parts = [x.strip().lower() for x in str(val).split(';') if x.strip()]
        
        if apply_standardization and mapping is not None:
            parts = [mapping.get(part, part) for part in parts]
        
        unique_cleaned = sorted(set(parts))
        return '; '.join(unique_cleaned)
    
    def transform(self, X):
        """Process semicolon-separated columns"""
        df = X.copy()
        
        # Identify columns with semicolons
        self.semicolon_cols = [
            col for col in df.columns
            if df[col].dropna().astype(str).str.contains(';').any()
        ]
        
        print(f"Found {len(self.semicolon_cols)} columns with semicolons: {self.semicolon_cols}")
        
        # Process each semicolon column
        for col in self.semicolon_cols:
            # Apply mapping for specific columns
            apply_mapping = col in ['process_pmf_development_methodologies', 'tech_tf_server_roles']
            mapping = self.standardization_mapping if apply_mapping else None
            
            # Clean the column
            df[col] = df[col].apply(
                lambda x: self._clean_and_sort_semicolon(x, apply_standardization=apply_mapping, mapping=mapping)
            )
        
        return df

# === 5. MultiValueEncoder: Encode semicolon columns using MultiLabelBinarizer ===
class MultiValueEncoder(BaseEstimator, TransformerMixin):
    """
        Handle multi-value columns using MultiLabelBinarizer
        - Only processes columns with a manageable number of unique values (max_cardinality).
        - Each semicolon column becomes several binary columns (e.g., "lang__python", "lang__java", ...).     
    """
    
    def __init__(self, max_cardinality=10):
        # Ensure max_cardinality is always an integer
        self.max_cardinality = int(max_cardinality) if max_cardinality is not None else 10
        self.multi_value_cols = []
        self.mlb_transformers = {}
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        """Encode multi-value columns"""
        df = X.copy()
        
        # Identify semicolon columns (multi-value)
        semicolon_cols = [
            col for col in df.columns
            if df[col].dropna().astype(str).str.contains(';').any()
        ]
        
        # Filter for low cardinality multi-value columns
        self.multi_value_cols = []
        for col in semicolon_cols:
            # Get unique values across all entries
            all_values = set()
            for val in df[col].dropna().astype(str):
                values = [v.strip() for v in val.split(';') if v.strip()]
                all_values.update(values)
            
            # Check cardinality (max_cardinality is already an integer from __init__)
            if len(all_values) <= self.max_cardinality:
                self.multi_value_cols.append(col)
        
        print(f"Encoding {len(self.multi_value_cols)} multi-value columns: {self.multi_value_cols}")
        
        # Process each multi-value column
        for col in self.multi_value_cols:
            # Prepare data for MultiLabelBinarizer
            values = df[col].dropna().astype(str).apply(
                lambda x: [item.strip() for item in x.split(';') if item.strip()]
            )
            
            # Handle empty values - fill with empty list for MultiLabelBinarizer
            if len(values) == 0:
                continue
                
            # Fit and transform
            mlb = MultiLabelBinarizer()
            
            # Convert to list of lists, handling NaN/empty cases
            values_list = []
            for idx in df.index:
                if idx in values.index and values[idx]:
                    values_list.append(values[idx])
                else:
                    values_list.append([])  # Empty list for missing values
            
            onehot = pd.DataFrame(
                mlb.fit_transform(values_list),
                columns=[f"{col}__{cat}" for cat in mlb.classes_],
                index=df.index
            )
            
            # Store transformer for later use
            self.mlb_transformers[col] = mlb
            
            # Join with main dataframe
            df = df.join(onehot, how='left')
            
            print(f"Encoded {col} into {len(mlb.classes_)} binary columns")
        
        # Remove original multi-value columns
        df = df.drop(columns=self.multi_value_cols)
        
        return df

# === 6. CategoricalEncoder: One-hot encode regular categorical columns ===
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """
        Handle single-value categorical columns
        - Ignores semicolon columns.
        - Only encodes columns with a number of categories ≤ max_cardinality (to avoid high-dimensional explosion).
        - Can drop the first category for each variable to avoid multicollinearity.
        
    """
    
    def __init__(self, max_cardinality=10, drop_first=True):
        self.max_cardinality = max_cardinality
        self.drop_first = drop_first
        self.categorical_cols = []
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        """Encode categorical columns"""
        df = X.copy()
        
        # Identify categorical columns
        cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
        
        # Identify semicolon columns to exclude
        semicolon_cols = [
            col for col in df.columns
            if df[col].dropna().astype(str).str.contains(';').any()
        ]
        
        # Filter for low cardinality single-value categorical columns
        self.categorical_cols = [
            col for col in cat_cols 
            if col not in semicolon_cols and df[col].nunique() <= self.max_cardinality
        ]
        
        print(f"One-hot encoding {len(self.categorical_cols)} categorical columns: {self.categorical_cols}")
        
        # Apply one-hot encoding
        if self.categorical_cols:
            df = pd.get_dummies(df, columns=self.categorical_cols, drop_first=self.drop_first)
        
        return df

# === 7. ColumnNameFixer: Final column name cleanup for PyCaret etc ===
class ColumnNameFixer(BaseEstimator, TransformerMixin):
    """
        Fix column names for PyCaret compatibility (removes illegal characters, replaces spaces/ampersands, handles duplicates):
        - No duplicate column names after encoding.
        - Only alphanumeric and underscores. 
        
    """
    
    def __init__(self):
        self.column_transformations = {}
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        """Fix problematic column names"""
        df = X.copy()
        original_cols = df.columns.tolist()
        fixed_columns = []
        seen_columns = set()
        
        for col in original_cols:
            # Replace spaces with underscores
            fixed_col = col.replace(' ', '_')
            # Replace ampersands
            fixed_col = fixed_col.replace('&', 'and')
            # Remove other problematic characters
            fixed_col = ''.join(c if c.isalnum() or c == '_' else '_' for c in fixed_col)
            # Remove multiple consecutive underscores
            fixed_col = re.sub('_+', '_', fixed_col)
            # Remove leading/trailing underscores
            fixed_col = fixed_col.strip('_')
            
            # Handle duplicates
            base_col = fixed_col
            suffix = 1
            while fixed_col in seen_columns:
                fixed_col = f"{base_col}_{suffix}"
                suffix += 1
            
            seen_columns.add(fixed_col)
            fixed_columns.append(fixed_col)
        
        # Store transformations
        self.column_transformations = dict(zip(original_cols, fixed_columns))
        
        # Apply new column names
        df.columns = fixed_columns
        
        # Check for duplicates
        dup_check = [item for item, count in pd.Series(fixed_columns).value_counts().items() if count > 1]
        if dup_check:
            print(f"WARNING: Found {len(dup_check)} duplicate column names: {dup_check}")
        else:
            print("No duplicate column names after fixing")
        
        n_changed = sum(1 for old, new in self.column_transformations.items() if old != new)
        print(f"Fixed {n_changed} column names for PyCaret compatibility")
        
        return df

# === 8. DataValidator: Final summary and checks ===
class DataValidator(BaseEstimator, TransformerMixin):
    """
        Validate final dataset
        - Shape, missing values, infinities.
        - Data types (numeric, categorical).
        - Stats on the target column (mean, std, min, max, missing).
        - Report issues if any.
        
    """
    
    def __init__(self, target_col):
        self.target_col = target_col
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        """Validate the processed dataset"""
        df = X.copy()
        
        print(f"\n=== Final Data Validation ===")
        print(f"Final shape: {df.shape}")
        print(f"Target column: {self.target_col}")
        
        # Check for missing values
        missing_count = df.isnull().sum().sum()
        print(f"Total missing values: {missing_count}")
        
        # Check for infinite values
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        inf_count = np.isinf(df[numeric_cols].values).sum()
        print(f"Total infinite values: {inf_count}")
        
        # Data types summary
        print(f"\nData types:")
        print(f"  Numeric columns: {len(df.select_dtypes(include=[np.number]).columns)}")
        print(f"  Categorical columns: {len(df.select_dtypes(include=['object', 'category']).columns)}")
        
        # Target variable summary
        if self.target_col in df.columns:
            target_stats = df[self.target_col].describe()
            print(f"\nTarget variable '{self.target_col}' statistics:")
            print(f"  Mean: {target_stats['mean']:.2f}")
            print(f"  Std: {target_stats['std']:.2f}")
            print(f"  Min: {target_stats['min']:.2f}")
            print(f"  Max: {target_stats['max']:.2f}")
            print(f"  Missing: {df[self.target_col].isnull().sum()}")
        else:
            print(f"WARNING: Target column '{self.target_col}' not found!")
        
        return df

# === Pipeline creation function: returns the Scikit-learn pipeline ===
def create_isbsg_preprocessing_pipeline(
    target_col='project_prf_normalised_work_effort',
    original_target_col=None,
    high_missing_threshold=0.7,
    cols_to_keep=None,
    max_categorical_cardinality=10,
    standardization_mapping=None
):
    """
    Create complete preprocessing pipeline with smart target column handling
    
    Parameters:
    -----------
    target_col : str
        Name of target column
    original_target_col : str
        Original target column name found in data
    high_missing_threshold : float
        Threshold for dropping columns with high missing values
    cols_to_keep : list
        Columns to keep even if they have high missing values
    max_categorical_cardinality : int
        Maximum number of unique values for categorical encoding
    standardization_mapping : dict
        Custom mapping for standardizing semicolon-separated values
    
    Returns:
    --------
    sklearn.pipeline.Pipeline
        Complete preprocessing pipeline
    """
    
    if cols_to_keep is None:
        cols_to_keep = [
            'project_prf_case_tool_used', 
            'process_pmf_prototyping_used',
            'tech_tf_client_roles', 
            'tech_tf_type_of_server', 
            'tech_tf_clientserver_description'
        ]
    
    # Ensure max_categorical_cardinality is an integer
    if not isinstance(max_categorical_cardinality, int):
        max_categorical_cardinality = 10
        print(f"Warning: max_categorical_cardinality was not an integer, defaulting to {max_categorical_cardinality}")
    
    pipeline = Pipeline([
        ('column_standardizer', ColumnNameStandardizer(target_col, original_target_col)),
        ('missing_handler', MissingValueAnalyzer(
            high_missing_threshold=high_missing_threshold,
            cols_to_keep=cols_to_keep
        )),
        ('semicolon_processor', SemicolonProcessor(standardization_mapping=standardization_mapping)),
        ('multi_value_encoder', MultiValueEncoder(max_cardinality=max_categorical_cardinality)),
        ('categorical_encoder', CategoricalEncoder(max_cardinality=max_categorical_cardinality)),
        ('column_fixer', ColumnNameFixer()),
        ('validator', DataValidator(target_col))
    ])
    
    return pipeline

# === Full workflow function: orchestrates loading, pipeline, and saving ===
def preprocess_isbsg_data(
    file_path,
    target_col='project_prf_normalised_work_effort',  # Always use standardized form
    output_dir='../data',
    save_intermediate=True,
    **pipeline_kwargs
):
    """
    Complete preprocessing workflow for ISBSG data: loads the data, runs 
      the full preprocessing pipeline, saves processed data, pipeline 
      object, and a metadata report to disk, and returns the processed 
      DataFrame and metadata
    
    Parameters:
    -----------
    file_path : str
        Path to input data file
    target_col : str
        Name of target column
    output_dir : str
        Directory to save processed data
    save_intermediate : bool
        Whether to save intermediate processing steps
    **pipeline_kwargs : dict
        Additional arguments for pipeline creation
    
    Returns:
    --------
    pandas.DataFrame
        Processed dataframe ready for modeling
    dict
        Processing metadata and statistics
    """

    # print pipeline header
    print("="*60)
    print("ISBSG Data Preprocessing Pipeline")
    print("="*60)
    print(f"Processing file: {file_path}")
    print(f"Target column (standardized): {target_col}")
    print(f"Timestamp: {datetime.now()}")
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Load data with smart column detection
    loader = DataLoader(file_path, target_col)
    df_raw = loader.transform(X = None)
    
    # Create and fit preprocessing pipeline
    pipeline = create_isbsg_preprocessing_pipeline(
        target_col=target_col,
        original_target_col=loader.original_target_col,  # Pass the found column name
        **pipeline_kwargs
    )
    
    # Apply preprocessing in order of ColumnNameStandardizer=> MissingValueAnalyzer =>
    # SemicolonProcessor=> MultiValueEncoder=> CategoricalEncoder => ColumnNameFixer

    # Apply preprocessing
    df_processed = pipeline.fit_transform(df_raw)
    
    # Prepare metadata
    metadata = {
        'original_shape': loader.original_shape,
        'processed_shape': df_processed.shape,
        'processing_timestamp': datetime.now().isoformat(),
        'target_column_standardized': target_col,
        'target_column_original': loader.original_target_col,
        'pipeline_steps': [step[0] for step in pipeline.steps]
    }
    
    # Save processed data
    file_stem = Path(file_path).stem
    output_path = os.path.join(output_dir, f"{file_stem}_preprocessed.csv")
    df_processed.to_csv(output_path, index=False)
    print(f"\nProcessed data saved to: {output_path}")
    
    # Save pipeline
    pipeline_path = os.path.join(output_dir, f"{file_stem}_preprocessing_pipeline.pkl")
    joblib.dump(pipeline, pipeline_path)
    print(f"Pipeline saved to: {pipeline_path}")
    
    # Save metadata
    metadata_path = os.path.join(output_dir, f"{file_stem}_preprocessing_metadata.txt")
    with open(metadata_path, 'w') as f:
        f.write("ISBSG Data Preprocessing Metadata\n")
        f.write("="*40 + "\n")
        for key, value in metadata.items():
            f.write(f"{key}: {value}\n")
    
    print(f"Metadata saved to: {metadata_path}")

    # Print completion & return results
    print("\n" + "="*60)
    print("Preprocessing completed successfully!")
    print("="*60)
    
    return df_processed, metadata




In [5]:
# Execution: usage and testing
if __name__ == "__main__":
    
    # File path
    file_path = os.path.join(DATA_FOLDER, SAMPLE_FILE)
    
    # Custom configuration
    cols_to_keep = [
        'Project_PRF_CASE_Tool_Used', 
        'Process_PMF_Prototyping_Used',
        'Tech_TF_Client_Roles', 
        'Tech_TF_Type_of_Server', 
        'Tech_TF_ClientServer_Description'
    ]
    
    # Specific standardization rules for individual components (after cleaning)
    standardization_map = {
        'stand alone': 'stand-alone',
        'client server': 'client-server',
        'mathematically intensive': 'mathematically-intensive',
        'mathematically intensive application': 'mathematically-intensive application',
        "file &/or print server": "file/print server",
    }
    
    try:
        # Run preprocessing
        df_processed, metadata = preprocess_isbsg_data(
            file_path=file_path,
            target_col=TARGET_COL,
            output_dir=DATA_FOLDER,
            cols_to_keep=cols_to_keep,
            max_categorical_cardinality=10,
            standardization_mapping=standardization_map,
            high_missing_threshold=0.7
        )
        
        print(f"\nFinal dataset shape: {df_processed.shape}")
        print(f"Columns: {list(df_processed.columns[:10])}...")  # Show first 10 columns
        
        # Ready for PyCaret setup
        print("\nDataset is now ready for PyCaret setup!")
        
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        raise

ISBSG Data Preprocessing Pipeline
Processing file: ../data\sample_clean_a_agile_only.xlsx
Target column (standardized): Project_PRF_Normalised_Work_Effort
Timestamp: 2025-05-31 17:32:41.635778
Loading data from: ../data\sample_clean_a_agile_only.xlsx
Loaded data with shape: (78, 52)
Target column found: 'Project_PRF_Normalised Work Effort' -> will be standardized to 'Project_PRF_Normalised_Work_Effort'
Target column 'Project_PRF_Normalised Work Effort' -> 'Project_PRF_Normalised_Work_Effort'
Standardized 52 column names

Missing value analysis:
Columns with >50% missing: 25
Columns with >70% missing: 18
Dropping 18 columns with >70.0% missing values
Filled project_prf_functional_size missing values with median: 82.0
Filled project_prf_normalised_level_1_pdr_ufp missing values with median: 3.5
Filled project_prf_normalised_pdr_ufp missing values with median: 3.5
Filled project_prf_defect_density missing values with median: 0.0
Filled project_prf_speed_of_delivery missing values with med

In [None]:
# Quick preprocessing
df_processed, metadata = preprocess_isbsg_data(
    file_path="your_data.xlsx",
    target_col="Project_PRF_Normalised_Work_Effort"
)

In [None]:
# With custom settings
df_processed, metadata = preprocess_isbsg_data(
    file_path="your_data.xlsx",
    target_col="Project_PRF_Normalised_Work_Effort",
    high_missing_threshold=0.8,  # More lenient with missing values
    max_categorical_cardinality=15,  # Allow higher cardinality
    cols_to_keep=['specific_column_to_keep']
)