# **<span style="color:#4682B4"><u>Data Exploration and Preprocessing</u></span>**

---
## **<span style="color:#483D8B"><u>Workflow</u></span>**
- Import Libraries
- Load Data
- Describe Features and Target
- Data Visualization
- Preprocessing and Feature Engineering

---
### **<span style="color:#8B008B"><u>Import Libraries</u></span>**

In [1]:
# Core Libraries
import numpy as np
import pandas as pd
import json
import os

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Scipy: Statistical & Transformation Tools
from scipy import stats
from scipy.stats import boxcox, skew
from scipy.special import inv_boxcox

# Scikit-Learn: Data Preparation
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split

# System
import warnings
warnings.filterwarnings('ignore')

### **<span style="color:#8B008B"><u>Load Data</u></span>**

In [2]:
def load_data(dataset_path='../dataset'):
    """
    Load training, test, submission data and metadata from dataset directory.

    File names are read from ``data_files[<split>]['filename']`` in
    ``dataset.json`` when available; otherwise the defaults ``train.csv``,
    ``test.csv`` and ``sample_submission.csv`` are used. Metadata that is
    missing, unreadable, or not a JSON object is replaced by an empty dict so
    the default file names still apply instead of crashing.

    ### Parameters:
        dataset_path : str
            Path to the dataset directory containing CSV files and JSON metadata

    ### Returns:
        tuple
            (train_df, test_df, sample_submission_df, metadata).
            All four elements are None when any CSV file cannot be loaded.
    """
    # Construct metadata file path
    metadata_file = os.path.join(dataset_path, 'dataset.json')

    # Load metadata (best effort: any failure falls back to an empty dict)
    try:
        with open(metadata_file, 'r', encoding='utf-8') as file:
            metadata = json.load(file)
        # Everything below relies on dict access, so reject lists/strings/numbers here
        if not isinstance(metadata, dict):
            raise ValueError(f'expected a JSON object, got {type(metadata).__name__}')
        print(f'Metadata loaded from: {metadata_file}')
    except FileNotFoundError:
        print('Metadata file not found, using default file names')
        metadata = {}
    except Exception as e:
        print(f'Error loading metadata: {e}')
        metadata = {}

    # Get file names (use defaults if metadata not available or malformed)
    data_files = metadata.get('data_files', {})
    if not isinstance(data_files, dict):
        data_files = {}

    def resolve_path(split, default_name):
        """Return the CSV path for `split`, falling back to `default_name`."""
        entry = data_files.get(split)
        filename = entry.get('filename') if isinstance(entry, dict) else None
        return os.path.join(dataset_path, filename or default_name)

    train_file = resolve_path('train', 'train.csv')
    test_file = resolve_path('test', 'test.csv')
    submission_file = resolve_path('sample_submission', 'sample_submission.csv')

    # Load CSV files
    try:
        print(f'Loading {metadata.get("dataset_name", "dataset")}...')

        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)
        sample_submission_df = pd.read_csv(submission_file)

        print(f'Training: {train_df.shape[0]} samples, {train_df.shape[1]} columns')
        print(f'Test: {test_df.shape[0]} samples, {test_df.shape[1]} columns')
        print(f'Submission: {sample_submission_df.shape[0]} entries')
        print('Data loaded successfully!')

        return train_df, test_df, sample_submission_df, metadata

    except FileNotFoundError as e:
        print(f'ERROR: CSV files not found - {e}')
        return None, None, None, None
    except Exception as e:
        # Parsing/permission problems are reported the same way so the caller
        # only has to check for None
        print(f'ERROR: Failed to load data - {e}')
        return None, None, None, None


# Read the train / test / sample-submission frames and the dataset metadata
train_df, test_df, sample_submission_df, dataset_metadata = load_data(dataset_path='../dataset')

Metadata loaded from: ../dataset/dataset.json
Loading Multi-Class Prediction of Obesity Risk...
Training: 20758 samples, 18 columns
Test: 13840 samples, 17 columns
Submission: 13840 entries
Data loaded successfully!


---
### **<span style="color:#8B008B"><u>Describe Features and Target</u></span>**
#### **<span style="color:#8B4513"><u>Exploratory Data Analysis</u></span>**


In [3]:
def _print_section(title, rule='-'):
    """Print `title` framed above and below by an 80-character rule."""
    print('\n' + rule * 80)
    print(title)
    print(rule * 80)


def _describe_data_quality(df):
    """Print the per-column quality table and return it as JSON-ready dicts."""
    n_rows = len(df)
    # Each aggregate is computed once and reused for the count and the percentage
    missing = df.isnull().sum()
    zeros = (df == 0).sum()
    uniques = df.nunique()

    data_quality = pd.DataFrame({
        'dtype': df.dtypes,
        'missing_count': missing,
        'missing_pct': (missing / n_rows * 100).round(2),
        'zero_count': zeros,
        'zero_pct': (zeros / n_rows * 100).round(2),
        'unique_values': uniques,
        'unique_pct': (uniques / n_rows * 100).round(2)
    })
    print(data_quality)

    # Cast to builtin types so the result can be dumped to JSON
    return {
        col: {
            'dtype': str(data_quality.loc[col, 'dtype']),
            'missing_count': int(data_quality.loc[col, 'missing_count']),
            'missing_pct': float(data_quality.loc[col, 'missing_pct']),
            'zero_count': int(data_quality.loc[col, 'zero_count']),
            'zero_pct': float(data_quality.loc[col, 'zero_pct']),
            'unique_values': int(data_quality.loc[col, 'unique_values']),
            'unique_pct': float(data_quality.loc[col, 'unique_pct'])
        }
        for col in df.columns
    }


def _save_sample_rows(head_rows, tail_rows, name, short_name, output_dir):
    """Write head/tail samples to CSV; return their locations, or None on failure."""
    try:
        os.makedirs(output_dir, exist_ok=True)

        # Use short_name for file naming, fallback to cleaned name if not provided
        clean_name = short_name if short_name else name.lower().replace(' ', '_').replace('-', '_')

        head_filename = f"{clean_name}_sample_head_10.csv"
        head_filepath = os.path.join(output_dir, head_filename)
        head_rows.to_csv(head_filepath, index=False)

        tail_filename = f"{clean_name}_sample_tail_10.csv"
        tail_filepath = os.path.join(output_dir, tail_filename)
        tail_rows.to_csv(tail_filepath, index=False)

        print(f"\nSample data saved:")
        print(f"- Head (10 rows): {head_filepath}")
        print(f"- Tail (10 rows): {tail_filepath}")

        return {
            'head_file': head_filepath,
            'tail_file': tail_filepath,
            'head_filename': head_filename,
            'tail_filename': tail_filename,
        }
    except Exception as e:
        # Saving samples is optional; report the problem and carry on with the EDA
        print(f"Warning: Could not save sample data - {e}")
        return None


def _analyze_target(df, target_col):
    """Print and return class distribution and balance metrics for `target_col`."""
    _print_section(f'TARGET ANALYSIS: {target_col}')

    n_rows = len(df)
    target_counts = df[target_col].value_counts().sort_index()
    n_classes = df[target_col].nunique()
    target_info = {
        'column': target_col,
        'unique_classes': int(n_classes),
        'class_distribution': {}
    }

    print(f'Unique Classes: {n_classes}')
    print('\nClass Distribution:')
    for class_name, count in target_counts.items():
        percentage = (count / n_rows) * 100
        print(f'  {class_name}: {count:,} ({percentage:.2f}%)')
        target_info['class_distribution'][str(class_name)] = {
            'count': int(count),
            'percentage': float(percentage)
        }

    # Class balance analysis: ratio of the rarest class to the most common one
    smallest = target_counts.min()
    largest = target_counts.max()
    min_class_pct = smallest / n_rows * 100
    max_class_pct = largest / n_rows * 100
    balance_ratio = smallest / largest

    print(f'\nClass Balance Analysis:')
    print(f'- Min class: {min_class_pct:.2f}%')
    print(f'- Max class: {max_class_pct:.2f}%')
    print(f'- Balance ratio: {balance_ratio:.3f}')

    if balance_ratio < 0.1:
        print('Severely imbalanced dataset!')
    elif balance_ratio < 0.5:
        print('Moderately imbalanced dataset')
    else:
        print('Relatively balanced dataset')

    target_info['balance_metrics'] = {
        'min_class_pct': float(min_class_pct),
        'max_class_pct': float(max_class_pct),
        'balance_ratio': float(balance_ratio),
        'is_balanced': bool(balance_ratio >= 0.5)
    }
    return target_info


def _analyze_numeric(df, numeric_cols):
    """Print descriptive statistics; return (per-column stats, skewed column names)."""
    _print_section('NUMERICAL FEATURES ANALYSIS')
    print(df[numeric_cols].describe())

    analysis = {}
    skew_by_col = {}
    for col in numeric_cols:
        col_stats = df[col].describe()
        # Skewness feeds both the stats dict and the report below; compute it once
        skewness = df[col].skew()
        skew_by_col[col] = skewness
        analysis[col] = {
            'count': float(col_stats['count']),
            'mean': float(col_stats['mean']),
            'std': float(col_stats['std']),
            'min': float(col_stats['min']),
            'q1': float(col_stats['25%']),
            'median': float(col_stats['50%']),
            'q3': float(col_stats['75%']),
            'max': float(col_stats['max']),
            'skewness': float(skewness),
            'kurtosis': float(df[col].kurtosis())
        }

    # Report features with |skewness| > 1 (labelled "highly skewed" above 2)
    print('\nSkewness Analysis:')
    skewed_features = []
    for col in numeric_cols:
        skewness = skew_by_col[col]
        if abs(skewness) > 1:
            print(f'  {col}: {skewness:.3f} ({"highly skewed" if abs(skewness) > 2 else "moderately skewed"})')
            skewed_features.append(col)

    return analysis, skewed_features


def _analyze_categorical(df, categorical_cols, target_col):
    """Print and return value distributions for non-target categorical columns."""
    _print_section('CATEGORICAL FEATURES ANALYSIS')

    n_rows = len(df)
    analysis = {}
    for col in categorical_cols:
        if col == target_col:
            continue  # The target column is analyzed separately

        print(f'\n{col}:')
        value_counts = df[col].value_counts()
        unique_count = df[col].nunique()

        print(f'  Unique values: {unique_count}')
        if unique_count <= 20:  # Show distribution for low cardinality features
            print('  Distribution:')
            for value, count in value_counts.head(10).items():
                pct = (count / n_rows) * 100
                print(f'    {value}: {count} ({pct:.2f}%)')

        analysis[col] = {
            'unique_count': int(unique_count),
            'top_values': {str(k): int(v) for k, v in value_counts.head(10).items()},
            'cardinality_level': 'low' if unique_count <= 10 else 'medium' if unique_count <= 50 else 'high'
        }
    return analysis


def examine_dataset(df, name='DATASET', target_col=None, save_samples=True, output_dir='../artifacts/data', short_name=None):
    """
    Comprehensive dataset examination for EDA

    Prints a report (shape, data quality, duplicates, memory, sample rows,
    target balance, numerical and categorical summaries) and returns the same
    information as a dictionary of builtin types.

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset to examine
    name : str
        Name/identifier for the dataset
    target_col : str, optional
        Name of the target column for classification tasks. Ignored when it is
        not a column of `df`.
    save_samples : bool, default True
        Whether to save sample data (head/tail) to separate files
    output_dir : str, default '../artifacts/data'
        Directory to save sample data files
    short_name : str, optional
        Short name for file naming convention. If None, derived from name.

    Returns:
    --------
    dict : Summary statistics and information. Optional sections
        ('sample_data_files', 'target_analysis', 'numerical_analysis',
        'skewed_features', 'categorical_analysis') are present only when the
        corresponding analysis ran.
    """

    _print_section(f'{name} OVERVIEW', rule='=')

    # Initialize summary dict for JSON storage
    summary_info = {
        'dataset_name': name,
        'timestamp': pd.Timestamp.now().isoformat()
    }

    # Shape & Columns
    n_rows, n_columns = df.shape
    column_names = df.columns.tolist()
    print(f'\nShape: {df.shape}')
    print(f'Rows: {n_rows:,}')
    print(f'Columns: {n_columns}')
    print('\nColumn Names:')
    print(column_names)

    summary_info['shape'] = {'rows': int(n_rows), 'columns': int(n_columns)}
    summary_info['columns'] = column_names

    # Data Types, Missing, Zeros, Uniques
    _print_section('DATA QUALITY ANALYSIS')
    summary_info['data_quality'] = _describe_data_quality(df)

    # Duplicate rows analysis
    duplicate_count = df.duplicated().sum()
    duplicate_pct = (duplicate_count / len(df) * 100)
    print(f'\nDuplicate Rows: {duplicate_count:,} ({duplicate_pct:.2f}%)')
    summary_info['duplicates'] = {
        'count': int(duplicate_count),
        'percentage': float(duplicate_pct)
    }

    # Memory usage
    memory_usage = df.memory_usage(deep=True).sum()
    print(f'Memory Usage: {memory_usage / (1024**2):.2f} MB')
    summary_info['memory_usage_mb'] = float(memory_usage / (1024**2))

    # Sample rows - Show first and last 10 rows
    _print_section('SAMPLE ROWS (HEAD & TAIL - 10 ROWS EACH)')
    head_10 = df.head(10)
    tail_10 = df.tail(10)

    print("HEAD (First 10 rows):")
    print(head_10)
    print("\nTAIL (Last 10 rows):")
    print(tail_10)

    # Save sample data for frontend if requested
    if save_samples:
        summary_info['sample_data_files'] = _save_sample_rows(head_10, tail_10, name, short_name, output_dir)

    # Target analysis (if specified)
    if target_col and target_col in df.columns:
        summary_info['target_analysis'] = _analyze_target(df, target_col)

    # Numerical features analysis
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        numerical_analysis, skewed_features = _analyze_numeric(df, numeric_cols)
        summary_info['numerical_analysis'] = numerical_analysis
        summary_info['skewed_features'] = skewed_features

    # Categorical features analysis
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if categorical_cols:
        summary_info['categorical_analysis'] = _analyze_categorical(df, categorical_cols, target_col)

    print('\n' + '='*80)

    return summary_info

In [4]:
def _looks_like_datetime(series):
    """Return True when the first non-null values of a text column parse as datetimes."""
    dtype = series.dtype
    if not (pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)):
        return False

    # Sample only non-null values: pd.to_datetime happily "parses" an all-null
    # sample, which would mislabel any column that merely starts with missing data
    sample = series.dropna().head()
    if sample.empty:
        return False

    try:
        pd.to_datetime(sample)
    except Exception:
        # Best-effort probe: any parsing failure simply means "not a datetime column"
        return False
    return True


def categorize_dataset_features(df, metadata=None, auto_detect=True, target_col=None):
    """
    Categorize dataset features into numerical and categorical with proper data type conversion

    Categories listed in ``metadata['feature_categories']`` take precedence.
    With `auto_detect`, every remaining non-target column is assigned to
    exactly one category:

    - 'datetime'    : native datetime64 columns, or text columns whose first
                      non-null values parse with ``pd.to_datetime``
    - 'binary'      : numeric columns holding exactly the values {0, 1}, or
                      non-numeric columns with exactly two distinct values
    - 'numerical'   : any other numeric (non-boolean) dtype
    - 'categorical' : everything else

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset to categorize
    metadata : dict, optional
        Dataset metadata containing feature categories
    auto_detect : bool, default True
        Whether to auto-detect feature types if metadata not available
    target_col : str, optional
        Name of the target column to exclude from feature categorization

    Returns:
    --------
    dict : Dictionary containing categorized feature lists (excludes target column)
    """

    feature_categories = {
        'numerical': [],
        'categorical': [],
        'datetime': [],
        'binary': [],
        'ordinal': [],
        'exclude': []
    }

    # Use metadata if available (unknown categories and unknown columns are ignored)
    if metadata and 'feature_categories' in metadata:
        for category, features in metadata['feature_categories'].items():
            if category in feature_categories:
                feature_categories[category] = [f for f in features if f in df.columns]

    if not auto_detect:
        return feature_categories

    # Track assigned columns in a set instead of re-concatenating every list per column
    assigned = {col for cols in feature_categories.values() for col in cols}

    for col in df.columns:
        # Skip target column - it's not a feature
        if target_col and col == target_col:
            continue
        if col in assigned:
            continue  # Skip if already categorized

        series = df[col]
        dtype = series.dtype

        if pd.api.types.is_datetime64_any_dtype(dtype) or _looks_like_datetime(series):
            category = 'datetime'
        elif pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            # Covers every numeric width (int8..int64, unsigned, floats, nullable
            # integers), not only a fixed list of dtype names
            unique_vals = series.dropna().unique()
            is_zero_one = len(unique_vals) == 2 and set(unique_vals).issubset({0, 1})
            category = 'binary' if is_zero_one else 'numerical'
        else:
            # Text, category, boolean and any other dtype: never leave a column unassigned
            category = 'binary' if series.nunique() == 2 else 'categorical'

        feature_categories[category].append(col)
        assigned.add(col)

    return feature_categories


def apply_feature_categorization(df, feature_categories, inplace=True):
    """
    Cast each categorized column to the dtype that matches its category

    Conversions run in this order: 'numerical' -> numeric (unparseable values
    become NaN), 'categorical' and 'binary' -> pandas category,
    'datetime' -> datetime (unparseable values become NaT). Listed columns
    that are absent from the dataframe are skipped; other categories and
    unlisted columns (including the target) are left untouched.

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset to modify
    feature_categories : dict
        Dictionary containing categorized feature lists
    inplace : bool, default True
        Whether to modify the dataframe in place

    Returns:
    --------
    pandas.DataFrame : Modified copy when inplace=False; None when inplace=True
    """

    frame = df if inplace else df.copy()

    # (category key, converter) pairs, applied top to bottom
    conversions = (
        ('numerical', lambda values: pd.to_numeric(values, errors='coerce')),
        ('categorical', lambda values: values.astype('category')),
        ('binary', lambda values: values.astype('category')),
        ('datetime', lambda values: pd.to_datetime(values, errors='coerce')),
    )

    for category, convert in conversions:
        for column in feature_categories.get(category, []):
            if column not in frame.columns:
                continue
            frame[column] = convert(frame[column])

    return None if inplace else frame


def _json_default(value):
    """Fallback encoder for objects the json module cannot serialise natively."""
    # numpy scalars (np.bool_, np.int64, ...) become their builtin equivalents so
    # they are written as JSON true/false/numbers instead of quoted strings
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    # Anything else (timestamps, dtypes, ...) keeps its string representation
    return str(value)


def save_eda_results(results_dict, output_dir, filename):
    """
    Save EDA results to JSON file

    numpy scalars and arrays are written as native JSON values; any other
    object that is not JSON-serialisable is stored as its ``str()`` form.

    Parameters:
    -----------
    results_dict : dict
        Dictionary containing EDA results
    output_dir : str
        Directory to save the results (created if it does not exist)
    filename : str
        Custom filename for the JSON file

    Returns:
    --------
    str or None : Path of the written file, or None when writing failed
    """

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    filepath = os.path.join(output_dir, filename)

    # Save to JSON
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(results_dict, f, indent=4, default=_json_default)
        print(f"EDA results saved to: {filepath}")
        return filepath
    except Exception as e:
        # Persisting the report is best effort; the caller checks for None
        print(f"Error saving EDA results: {e}")
        return None

In [5]:
# Execute EDA if training data was loaded successfully
# (load_data returns None for every element when the CSV files could not be read)
if train_df is not None and dataset_metadata is not None:

    # Get dataset name and target column from metadata
    dataset_name = dataset_metadata.get('dataset_name', 'TRAINING DATASET')
    target_col = dataset_metadata.get('target', 'TARGET')

    # Get short_name from metadata for file naming convention
    clean_dataset_name = dataset_metadata.get('short_name', dataset_name.lower().replace(' ', '_').replace('-', '_'))

    print(f"Dataset: {dataset_name}")
    print(f"Dataset Short Name (for files): {clean_dataset_name}")
    print(f"Target column: {target_col}")

    # Focus EDA only on training data (test data will be used only for final predictions)
    print("Performing comprehensive EDA on TRAINING data...")

    # Training data EDA with dataset name and short_name for file naming
    train_eda = examine_dataset(train_df, dataset_name, target_col, short_name=clean_dataset_name)

    # Feature categorization for training data - exclude target column
    print("\nCategorizing features...")
    train_features = categorize_dataset_features(train_df, dataset_metadata, auto_detect=True, target_col=target_col)

    print("Feature Categories:")
    for category, features in train_features.items():
        if features:
            print(f"  {category.title()}: {features}")

    # Apply categorization to training data (mutates train_df in place)
    apply_feature_categorization(train_df, train_features, inplace=True)

    # Create comprehensive EDA results
    eda_results = {
        'dataset_metadata': {
            'name': dataset_name,
            'short_name': clean_dataset_name,
            'source': dataset_metadata.get('repository_shortname', 'N/A'),
            'tasks': dataset_metadata.get('tasks', []),
            'license': dataset_metadata.get('license', 'N/A'),
            'num_features': dataset_metadata.get('num_features', 'N/A'),
            'num_classes': dataset_metadata.get('num_classes', 'N/A')
        },
        'training_analysis': train_eda,
        'feature_categorization': train_features,
        'analysis_scope': {
            'datasets_analyzed': ['training'],
            'note': 'Test data excluded from EDA - reserved for final predictions only',
            'training_shape': train_df.shape,
            'test_shape': test_df.shape if test_df is not None else None
        },
        'overall_summary': {
            'dataset_name': dataset_name,
            'short_name': clean_dataset_name,
            # First declared task, or 'classification' when metadata lists none
            'task_type': dataset_metadata.get('tasks', ['classification'])[0] if dataset_metadata.get('tasks') else 'classification',
            'target_column': target_col,
            'num_classes': dataset_metadata.get('num_classes', train_df[target_col].nunique() if target_col and target_col in train_df.columns else None),
            'total_features': len(train_df.columns) - (1 if target_col and target_col in train_df.columns else 0),
            'training_samples': train_df.shape[0],
            # pandas returns numpy.bool_ here; cast to builtin bool so the JSON
            # report contains true/false rather than the strings "True"/"False"
            'has_missing_values': bool(train_df.isnull().any().any()),
            'has_duplicates': bool(train_df.duplicated().any()),
            'is_balanced': train_eda.get('target_analysis', {}).get('balance_metrics', {}).get('is_balanced', None)
        }
    }

    # Save comprehensive EDA results using short_name
    eda_filename = f"{clean_dataset_name}_training_eda_analysis.json"
    eda_filepath = save_eda_results(eda_results, output_dir='../artifacts/data', filename=eda_filename)

    print("\nTraining data EDA analysis completed!")
else:
    print("Cannot perform EDA - training data loading failed. Please check the data loading section above.")

Dataset: Multi-Class Prediction of Obesity Risk
Dataset Short Name (for files): kaggle_obesity_prediction
Target column: NObeyesdad
Performing comprehensive EDA on TRAINING data...

Multi-Class Prediction of Obesity Risk OVERVIEW

Shape: (20758, 18)
Rows: 20,758
Columns: 18

Column Names:
['id', 'Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad']

--------------------------------------------------------------------------------
DATA QUALITY ANALYSIS
--------------------------------------------------------------------------------
                                  dtype  missing_count  missing_pct  \
id                                int64              0          0.0   
Gender                           object              0          0.0   
Age                             float64              0          0.0   
Height                          float64              0        

---

### **<span style="color:#8B008B"><u>Data Visualization & Analysis</u></span>**

**Key Visualization Components:**
- **Distribution Analysis**: Boxplots, histograms, Q-Q plots for understanding data distributions
- **Feature Relationships**: Numerical and categorical features vs target variable analysis
- **Correlation Analysis**: Heatmaps and correlation matrices to identify multicollinearity
- **Target Variable Analysis**: Class distribution, imbalance detection, and proportions
- **Statistical Insights**: Skewness analysis, transformation recommendations

In [6]:
def create_target_distribution_plot(df, target_col, short_name, output_dir='../artifacts/reports'):
    """
    Create interactive target variable distribution bar chart

    One bar is drawn per class (classes sorted by label). Bar colors are taken
    from a fixed palette that is cycled when there are more classes than
    palette entries, so every bar gets an explicit color.

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset containing the target column
    target_col : str
        Name of the target column
    short_name : str
        Short name for file naming convention
    output_dir : str
        Directory to save the HTML plot (created if it does not exist)

    Returns:
    --------
    plotly.graph_objects.Figure : The created figure
    """

    os.makedirs(output_dir, exist_ok=True)

    # Get target distribution
    target_counts = df[target_col].value_counts().sort_index()

    # Cycle the palette so the color list always matches the number of bars
    palette = ['#00D9FF', '#FF6B9D', '#FFC233', '#00E396', '#775DD0', '#FF4560', '#00E3F0']
    bar_colors = [palette[i % len(palette)] for i in range(len(target_counts))]

    # Create bar chart
    fig = go.Figure(data=[
        go.Bar(
            x=target_counts.index.astype(str),
            y=target_counts.values,
            marker_color=bar_colors,
            opacity=0.8
        )
    ])

    fig.update_layout(
        title=f"Target Variable Distribution: {target_col}",
        title_font=dict(size=20, color='white'),
        xaxis_title="Classes",
        yaxis_title="Count",
        template='plotly_dark',
        paper_bgcolor='#1e1e1e',
        plot_bgcolor='#2d2d2d',
        font=dict(color='white'),
        height=500
    )

    # Save as HTML with short_name prefix
    filename = f'{short_name}_target_distribution.html'
    output_path = os.path.join(output_dir, filename)
    fig.write_html(output_path)
    print(f"Saved: {filename}")

    return fig


def create_numerical_distributions_plot(df, numerical_features, short_name, output_dir='../artifacts/reports'):
    """
    Create interactive histograms for numerical features

    Features are laid out on a grid with at most 3 columns, one histogram per
    subplot. The vertical gap between rows is capped so the grid stays valid
    for any number of features.

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset containing numerical features
    numerical_features : list
        List of numerical feature column names
    short_name : str
        Short name for file naming convention
    output_dir : str
        Directory to save the HTML plot (created if it does not exist)

    Returns:
    --------
    plotly.graph_objects.Figure : The created figure, or None when
        `numerical_features` is empty
    """

    if not numerical_features:
        print("No numerical features to plot")
        return None

    os.makedirs(output_dir, exist_ok=True)

    # Create subplots
    n_cols = min(3, len(numerical_features))
    n_rows = (len(numerical_features) + n_cols - 1) // n_cols  # ceiling division

    # make_subplots rejects vertical_spacing > 1 / (rows - 1). Cap the gap so that
    # tall grids remain valid and the plots keep at least half of the figure height.
    vertical_spacing = 0.12 if n_rows == 1 else min(0.12, 0.5 / (n_rows - 1))

    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        subplot_titles=numerical_features,
        vertical_spacing=vertical_spacing,
        horizontal_spacing=0.08
    )

    colors = ['#00D9FF', '#FF6B9D', '#FFC233', '#00E396', '#775DD0', '#FF4560', '#00E3F0', '#FEB019']

    # Add one histogram per feature and style the axes of its subplot
    for idx, col in enumerate(numerical_features):
        row, col_pos = divmod(idx, n_cols)
        row += 1
        col_pos += 1

        fig.add_trace(
            go.Histogram(
                x=df[col],
                name=col,
                marker_color=colors[idx % len(colors)],
                opacity=0.8,
                showlegend=False
            ),
            row=row, col=col_pos
        )
        fig.update_xaxes(title_text=col, gridcolor='#444444', row=row, col=col_pos)
        fig.update_yaxes(title_text="Frequency", gridcolor='#444444', row=row, col=col_pos)

    fig.update_layout(
        title_text="Numerical Features Distribution",
        title_font=dict(size=20, color='white'),
        height=300 * n_rows,
        template='plotly_dark',
        paper_bgcolor='#1e1e1e',
        plot_bgcolor='#2d2d2d',
        font=dict(color='white'),
        showlegend=False
    )

    # Save as HTML with short_name prefix
    filename = f'{short_name}_numerical_distributions.html'
    output_path = os.path.join(output_dir, filename)
    fig.write_html(output_path)
    print(f"Saved: {filename}")

    return fig


def create_categorical_distributions_plot(df, categorical_features, short_name, output_dir='../artifacts/reports'):
    """
    Create interactive bar charts for categorical features

    Features are laid out on a grid with at most 2 columns, one bar chart of
    value counts per subplot. The vertical gap between rows is capped so the
    grid stays valid for any number of features.

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset containing categorical features
    categorical_features : list
        List of categorical feature column names
    short_name : str
        Short name for file naming convention
    output_dir : str
        Directory to save the HTML plot (created if it does not exist)

    Returns:
    --------
    plotly.graph_objects.Figure : The created figure, or None when
        `categorical_features` is empty
    """

    if not categorical_features:
        print("No categorical features to plot")
        return None

    os.makedirs(output_dir, exist_ok=True)

    # Create subplots
    n_cols = min(2, len(categorical_features))
    n_rows = (len(categorical_features) + n_cols - 1) // n_cols  # ceiling division

    # make_subplots rejects vertical_spacing > 1 / (rows - 1). Cap the gap so that
    # tall grids remain valid and the plots keep at least half of the figure height.
    vertical_spacing = 0.15 if n_rows == 1 else min(0.15, 0.5 / (n_rows - 1))

    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        subplot_titles=categorical_features,
        vertical_spacing=vertical_spacing,
        horizontal_spacing=0.10
    )

    colors = ['#00D9FF', '#FF6B9D', '#FFC233', '#00E396', '#775DD0', '#FF4560', '#00E3F0', '#FEB019']

    # Add one bar chart per feature and style the axes of its subplot
    for idx, col in enumerate(categorical_features):
        row, col_pos = divmod(idx, n_cols)
        row += 1
        col_pos += 1

        value_counts = df[col].value_counts()

        fig.add_trace(
            go.Bar(
                x=value_counts.index.astype(str),
                y=value_counts.values,
                name=col,
                marker_color=colors[idx % len(colors)],
                opacity=0.8,
                showlegend=False
            ),
            row=row, col=col_pos
        )
        fig.update_xaxes(title_text="Categories", gridcolor='#444444', row=row, col=col_pos)
        fig.update_yaxes(title_text="Count", gridcolor='#444444', row=row, col=col_pos)

    fig.update_layout(
        title_text="Categorical Features Distribution",
        title_font=dict(size=20, color='white'),
        height=400 * n_rows,
        template='plotly_dark',
        paper_bgcolor='#1e1e1e',
        plot_bgcolor='#2d2d2d',
        font=dict(color='white'),
        showlegend=False
    )

    # Save as HTML with short_name prefix
    filename = f'{short_name}_categorical_distributions.html'
    output_path = os.path.join(output_dir, filename)
    fig.write_html(output_path)
    print(f"Saved: {filename}")

    return fig


def create_correlation_matrix_plot(df, numerical_features, short_name, output_dir='../artifacts/reports'):
    """
    Build an annotated heatmap of pairwise correlations and save it as HTML

    The correlation matrix comes from ``df[numerical_features].corr()``; each
    cell is labelled with its value rounded to two decimals. The figure is
    written to ``<output_dir>/<short_name>_correlation_matrix.html``.

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset containing numerical features
    numerical_features : list
        List of numerical feature column names
    short_name : str
        Short name for file naming convention
    output_dir : str
        Directory to save the HTML plot (created if it does not exist)

    Returns:
    --------
    plotly.graph_objects.Figure : The created figure, or None when fewer than
        two features are given
    """

    feature_count = len(numerical_features)
    if feature_count < 2:
        print("Need at least 2 numerical features for correlation matrix")
        return None

    os.makedirs(output_dir, exist_ok=True)

    # Pairwise correlations; the same labels are used on both axes
    correlations = df[numerical_features].corr()
    axis_labels = correlations.columns

    heatmap = go.Heatmap(
        z=correlations.values,
        x=axis_labels,
        y=axis_labels,
        colorscale='RdBu_r',
        zmid=0,
        text=correlations.round(2).values,
        texttemplate="%{text}",
        textfont={"size": 10},
        colorbar=dict(title="Correlation")
    )
    fig = go.Figure(data=heatmap)

    # Grow the figure with the number of features, but never below 500px
    figure_height = max(500, feature_count * 30)
    fig.update_layout(
        title="Feature Correlation Matrix",
        title_font=dict(size=20, color='white'),
        xaxis_title="Features",
        yaxis_title="Features",
        template='plotly_dark',
        paper_bgcolor='#1e1e1e',
        font=dict(color='white'),
        height=figure_height
    )

    # Save as HTML with short_name prefix
    filename = f'{short_name}_correlation_matrix.html'
    fig.write_html(os.path.join(output_dir, filename))
    print(f"Saved: {filename}")

    return fig


def create_dataset_overview_table(df, dataset_name, target_col, short_name, output_dir='../artifacts/reports'):
    """
    Create interactive dataset overview summary table
    
    The target column is treated as optional: when it is not present in df the
    feature count is not reduced and "Target Classes" is reported as "N/A"
    instead of raising a KeyError.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset to summarize
    dataset_name : str
        Name of the dataset
    target_col : str
        Name of the target column (may be absent from df)
    short_name : str
        Short name for file naming convention
    output_dir : str
        Directory to save the HTML table
    
    Returns:
    --------
    plotly.graph_objects.Figure : The created figure
    """
    
    os.makedirs(output_dir, exist_ok=True)
    
    # This table can be requested before the target has been validated,
    # so never assume the column exists.
    has_target = target_col in df.columns
    
    # Gather dataset statistics
    total_samples = len(df)
    total_features = len(df.columns) - (1 if has_target else 0)  # exclude target when present
    numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
    if target_col in numerical_features:
        numerical_features.remove(target_col)
    
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if target_col in categorical_features:
        categorical_features.remove(target_col)
    
    missing_values = df.isnull().sum().sum()
    duplicate_rows = df.duplicated().sum()
    target_classes = df[target_col].nunique() if has_target else "N/A"
    
    # Create summary statistics table (one [attribute, value] pair per row)
    summary_stats = [
        ["Dataset Name", dataset_name],
        ["Total Samples", f"{total_samples:,}"],
        ["Total Features", total_features],
        ["Numerical Features", len(numerical_features)],
        ["Categorical Features", len(categorical_features)],
        ["Target Variable", target_col],
        ["Target Classes", target_classes],
        ["Missing Values", missing_values],
        ["Duplicate Rows", duplicate_rows],
        ["Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB"]
    ]
    
    # Alternate two dark shades so rows are visually separated
    row_colors = ['#2d2d2d' if i % 2 == 0 else '#3a3a3a' for i in range(len(summary_stats))]
    
    # Create overview table
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=['<b>Attribute</b>', '<b>Value</b>'],
            fill_color='#00D9FF',
            font=dict(color='#1e1e1e', size=14, family='Arial Black'),
            align='center',
            height=40
        ),
        cells=dict(
            values=[[stat[0] for stat in summary_stats], [stat[1] for stat in summary_stats]],
            fill_color=[row_colors] * 2,
            align=['left', 'center'],
            font=dict(color='white', size=12, family='Arial'),
            height=35
        )
    )])
    
    fig.update_layout(
        title=f"{dataset_name} - Dataset Overview",
        title_font=dict(size=24, color='white'),
        height=min(600, 150 + len(summary_stats) * 40),
        template='plotly_dark',
        paper_bgcolor='#1e1e1e',
        font=dict(color='white')
    )
    
    # Save HTML with short_name prefix
    filename = f'{short_name}_dataset_overview.html'
    output_path = os.path.join(output_dir, filename)
    fig.write_html(output_path)
    print(f"Saved: {filename}")
    
    return fig


def create_feature_summary_table(df, dataset_name, target_col, short_name, output_dir='../artifacts/reports'):
    """
    Render a per-feature summary (type, dtype, missing count, key statistics)
    as a Plotly table and export it as HTML
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset to summarize
    dataset_name : str
        Name shown in the table title
    target_col : str
        Target column name; it is left out of the feature rows
    short_name : str
        Prefix used for the exported file name
    output_dir : str
        Directory that receives '<short_name>_feature_summary.html'
    
    Returns:
    --------
    plotly.graph_objects.Figure : The created figure
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Numeric columns, target excluded when it is one of them
    numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
    try:
        numerical_features.remove(target_col)
    except ValueError:
        pass
    
    # Object / category columns, target excluded when it is one of them
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    try:
        categorical_features.remove(target_col)
    except ValueError:
        pass
    
    # One row per numerical feature: mean and standard deviation as the statistic
    feature_details = [
        [
            name,
            "Numerical",
            f"{df[name].dtype}",
            f"{df[name].isnull().sum()}",
            f"μ={df[name].mean():.2f}, σ={df[name].std():.2f}"
        ]
        for name in numerical_features
    ]
    
    # One row per categorical feature: cardinality and most frequent value
    for name in categorical_features:
        column = df[name]
        unique_vals = column.nunique()
        modes = column.mode()
        most_common = modes.iloc[0] if len(modes) > 0 else "N/A"
        feature_details.append([
            name,
            "Categorical",
            f"{column.dtype}",
            f"{column.isnull().sum()}",
            f"{unique_vals} unique, mode: {most_common}"
        ])
    
    header_labels = ['<b>Feature</b>', '<b>Type</b>', '<b>Data Type</b>', '<b>Missing</b>', '<b>Statistics</b>']
    
    # Transpose rows into the column-wise layout expected by go.Table
    column_values = [[row[position] for row in feature_details] for position in range(5)]
    
    # Zebra striping shared by all five columns
    stripes = ['#2d2d2d' if i % 2 == 0 else '#3a3a3a' for i in range(len(feature_details))]
    
    table = go.Table(
        header=dict(
            values=header_labels,
            fill_color='#FF6B9D',
            font=dict(color='#1e1e1e', size=12, family='Arial Black'),
            align='center',
            height=35
        ),
        cells=dict(
            values=column_values,
            fill_color=[stripes] * 5,
            align=['left', 'center', 'center', 'center', 'left'],
            font=dict(color='white', size=10, family='Arial'),
            height=30
        )
    )
    fig = go.Figure(data=[table])
    
    # Height scales with the number of rows, capped at 800px
    fig.update_layout(
        title=f"Feature Summary - {dataset_name}",
        title_font=dict(size=20, color='white'),
        height=min(800, 150 + len(feature_details) * 35),
        template='plotly_dark',
        paper_bgcolor='#1e1e1e',
        font=dict(color='white')
    )
    
    # Export using the short_name prefix
    filename = f'{short_name}_feature_summary.html'
    fig.write_html(os.path.join(output_dir, filename))
    print(f"Saved: {filename}")
    
    return fig


def save_analysis_summary_json(df, dataset_name, target_col, short_name, output_dir='../artifacts/data'):
    """
    Write a JSON summary of the dataset (sizes, feature split, data quality,
    target distribution and the report file names) to disk
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset to summarize; must contain target_col
    dataset_name : str
        Name of the dataset
    target_col : str
        Name of the target column
    short_name : str
        Prefix used for the JSON file name and the listed report names
    output_dir : str
        Directory that receives '<short_name>_visualization_analysis.json'
    
    Returns:
    --------
    str : Path to the saved JSON file
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Basic size figures; the feature count leaves out the target column
    total_samples = len(df)
    column_count = len(df.columns)
    total_features = column_count - 1
    
    # Split the columns by dtype family, dropping the target from either list
    numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
    try:
        numerical_features.remove(target_col)
    except ValueError:
        pass
    
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    try:
        categorical_features.remove(target_col)
    except ValueError:
        pass
    
    # Data quality counters
    missing_values = df.isnull().sum().sum()
    duplicate_rows = df.duplicated().sum()
    total_cells = total_samples * column_count
    completeness = float((total_cells - missing_values) / total_cells * 100)
    
    # Class frequencies with keys and counts coerced to JSON-friendly types
    class_counts = df[target_col].value_counts().to_dict()
    class_distribution = {str(label): int(count) for label, count in class_counts.items()}
    
    json_filename = f'{short_name}_visualization_analysis.json'
    report_suffixes = (
        'dataset_overview',
        'feature_summary',
        'target_distribution',
        'numerical_distributions',
        'categorical_distributions',
        'correlation_matrix',
    )
    
    # Assemble the summary section by section
    dataset_info = {
        'name': dataset_name,
        'short_name': short_name,
        'total_samples': int(total_samples),
        'total_features': int(total_features),
        'target_column': target_col,
        'analysis_timestamp': pd.Timestamp.now().isoformat()
    }
    feature_breakdown = {
        'numerical_features': {
            'count': len(numerical_features),
            'names': numerical_features
        },
        'categorical_features': {
            'count': len(categorical_features),
            'names': categorical_features
        }
    }
    data_quality = {
        'missing_values': int(missing_values),
        'duplicate_rows': int(duplicate_rows),
        'completeness_rate': completeness
    }
    target_analysis = {
        'type': 'categorical',
        'num_classes': int(df[target_col].nunique()),
        'class_distribution': class_distribution
    }
    generated_files = {
        'html_reports': [f'{short_name}_{suffix}.html' for suffix in report_suffixes],
        'json_data': json_filename
    }
    
    analysis_summary = {
        'dataset_info': dataset_info,
        'feature_breakdown': feature_breakdown,
        'data_quality': data_quality,
        'target_analysis': target_analysis,
        'generated_files': generated_files
    }
    
    # Persist; default=str stringifies anything json cannot serialise natively
    json_filepath = os.path.join(output_dir, json_filename)
    with open(json_filepath, 'w') as handle:
        json.dump(analysis_summary, handle, indent=2, default=str)
    
    print(f"Saved: {json_filename}")
    
    return json_filepath


def run_visualization_analysis(df, metadata, output_dir='../artifacts/reports', save_json=True):
    """
    Generate every report figure for the dataset, display each one and
    optionally write the JSON analysis summary
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset to analyze
    metadata : dict
        Dataset metadata; the keys 'dataset_name', 'target' and 'short_name'
        are read, each with a fallback default
    output_dir : str
        Directory that receives the HTML reports
    save_json : bool
        Whether to also write the JSON summary (always to '../artifacts/data')
    
    Returns:
    --------
    dict : Generated figures keyed by report name. When the target column is
        missing from df, only the figures created before that check are returned.
    """
    
    # Resolve naming information, falling back to generic defaults
    dataset_name = metadata.get('dataset_name', 'TRAINING DATASET')
    target_col = metadata.get('target', 'TARGET')
    derived_short_name = dataset_name.lower().replace(' ', '_').replace('-', '_')
    short_name = metadata.get('short_name', derived_short_name)
    
    print(f"Starting comprehensive visualization analysis...")
    print(f"Dataset: {dataset_name}")
    print(f"Short Name (for files): {short_name}")
    print(f"Target: {target_col}")
    print(f"Data shape: {df.shape}")
    
    # Make sure the report directory exists before any figure is written
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"\nVISUALIZATION ANALYSIS")
    print("="*100)
    
    figures = {}
    
    def _publish(key, figure):
        # Show and remember a figure only when the builder actually produced one
        if figure:
            display(figure)
            figures[key] = figure
    
    # 1. Dataset Overview Table
    print(f"\nCreating dataset overview...")
    _publish('dataset_overview',
             create_dataset_overview_table(df, dataset_name, target_col, short_name, output_dir))
    
    # 2. Feature Summary Table
    print(f"\nCreating feature summary...")
    _publish('feature_summary',
             create_feature_summary_table(df, dataset_name, target_col, short_name, output_dir))
    
    # 3. Target distribution - everything below requires the target column
    target_available = target_col and target_col in df.columns
    if not target_available:
        print(f"Target column '{target_col}' not found in dataset")
        return figures
    
    print(f"\nCreating target distribution...")
    _publish('target_distribution',
             create_target_distribution_plot(df, target_col, short_name, output_dir))
    
    # 4. Numerical features distribution
    numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
    try:
        numerical_features.remove(target_col)
    except ValueError:
        pass
    
    if numerical_features:
        print(f"\nCreating numerical features distribution plots...")
        _publish('numerical_distributions',
                 create_numerical_distributions_plot(df, numerical_features, short_name, output_dir))
    
    # 5. Categorical features distribution
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    try:
        categorical_features.remove(target_col)
    except ValueError:
        pass
    
    if categorical_features:
        print(f"\nCreating categorical features distribution plots...")
        _publish('categorical_distributions',
                 create_categorical_distributions_plot(df, categorical_features, short_name, output_dir))
    
    # 6. Correlation matrix (needs at least one pair of numerical features)
    if len(numerical_features) > 1:
        print(f"\nCreating correlation matrix...")
        _publish('correlation_matrix',
                 create_correlation_matrix_plot(df, numerical_features, short_name, output_dir))
    
    # 7. Save JSON summary
    if save_json:
        print(f"\nSaving analysis summary JSON...")
        save_analysis_summary_json(df, dataset_name, target_col, short_name, output_dir='../artifacts/data')
    
    print(f"\nComprehensive visualization analysis completed!")
    print(f"All files saved to: {output_dir}/ and ../artifacts/data/")
    
    return figures


# Confirmation message emitted once the cell defining the visualization helpers has run
print("Visualization functions loaded successfully!")

Visualization functions loaded successfully!


In [7]:
# Run the full visualization suite, but only when both the training frame and
# the dataset metadata are available
if train_df is None or dataset_metadata is None:
    print("Cannot perform visualization analysis - training data or metadata not available")
else:
    # Produces every report and keeps the figures for later inspection
    visualization_figures = run_visualization_analysis(
        train_df,
        dataset_metadata,
        output_dir='../artifacts/reports'
    )

Starting comprehensive visualization analysis...
Dataset: Multi-Class Prediction of Obesity Risk
Short Name (for files): kaggle_obesity_prediction
Target: NObeyesdad
Data shape: (20758, 18)

VISUALIZATION ANALYSIS

Creating dataset overview...
Saved: kaggle_obesity_prediction_dataset_overview.html



Creating feature summary...
Saved: kaggle_obesity_prediction_feature_summary.html



Creating target distribution...
Saved: kaggle_obesity_prediction_target_distribution.html



Creating numerical features distribution plots...
Saved: kaggle_obesity_prediction_numerical_distributions.html



Creating categorical features distribution plots...
Saved: kaggle_obesity_prediction_categorical_distributions.html



Creating correlation matrix...
Saved: kaggle_obesity_prediction_correlation_matrix.html



Saving analysis summary JSON...
Saved: kaggle_obesity_prediction_visualization_analysis.json

Comprehensive visualization analysis completed!
All files saved to: ../artifacts/reports/ and ../artifacts/data/


---

### **<span style="color:#8B008B"><u>Preprocessing and Feature Engineering</u></span>**

- **Missing Value Handling**: Imputation strategies for numerical and categorical features
- **Categorical Encoding**: One-hot encoding for nominal features, label encoding for ordinal features
- **Feature Scaling**: Standardization/normalization based on distribution analysis
- **Feature Engineering**: Create new features, polynomial features, interaction terms
- **Feature Selection**: Remove multicollinear features, select important features based on correlation analysis
- **Data Splitting**: Stratified train-validation split to maintain class distribution
- **Skewness Correction**: Apply transformations (log, Box-Cox) to highly skewed features
- **Outlier Detection**: Identify and handle outliers in numerical features

In [8]:
def split_test_set_first(df, target_col, test_size=0.10, random_state=42):
    """
    Carve a stratified hold-out test set out of the labelled data before any
    preprocessing is applied, so the test rows stay in their original form
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Original labelled dataset
    target_col : str
        Name of target column (used for stratification)
    test_size : float
        Proportion of rows placed in the test set (0-1)
    random_state : int
        Random seed for reproducibility
    
    Returns:
    --------
    tuple : (df_trainval, test_set)
        df_trainval: rows kept for training and validation
        test_set: held-out rows, untouched, with labels
        In both frames the target column is placed last.
    """
    features = df.drop(columns=[target_col])
    labels = df[target_col]
    
    # Stratified split keeps the class proportions identical in both parts
    feat_trainval, feat_test, lab_trainval, lab_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels
    )
    
    def _with_labels(feature_part, label_part):
        # Re-attach the labels to a private copy of the feature rows
        combined = feature_part.copy()
        combined[target_col] = label_part
        return combined
    
    df_trainval = _with_labels(feat_trainval, lab_trainval)
    test_set = _with_labels(feat_test, lab_test)
    
    # Nominal percentages derived from the requested test_size
    test_pct = test_size * 100
    trainval_pct = (1 - test_size) * 100
    
    print(f"Initial split (BEFORE preprocessing):")
    print(f"   Train+Val: {len(df_trainval):5d} samples ({trainval_pct:.1f}%) - will be preprocessed")
    print(f"   Test:      {len(test_set):5d} samples ({test_pct:.1f}%) - kept unprocessed with labels")
    
    return df_trainval, test_set


def handle_missing_values(df, numerical_features, categorical_features):
    """
    Handle missing values in numerical and categorical columns
    
    Numerical columns are filled with their median and categorical columns
    with their most frequent value. Columns that are entirely missing are left
    untouched (there is nothing to derive a fill value from) and get no entry
    in the statistics. The input dataframe is never modified.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe
    numerical_features : list
        List of numerical feature names
    categorical_features : list
        List of categorical feature names
    
    Returns:
    --------
    tuple : (df_imputed, imputation_stats)
        df_imputed: copy of df with the gaps filled
        imputation_stats: per imputed column, a dict with 'missing_count',
        'imputation_method' and 'imputation_value'
    """
    df_imputed = df.copy()
    imputation_stats = {}
    
    def _impute(col, method):
        series = df_imputed[col]
        missing_count = int(series.isnull().sum())
        if missing_count == 0:
            return
        
        observed = series.dropna()
        if observed.empty:
            # No observed value exists, so median/mode are undefined
            print(f"   Skipping '{col}': all values are missing, nothing to impute from")
            return
        
        if method == 'median':
            # Median is more robust to outliers than the mean
            fill_value = observed.median()
            recorded_value = float(fill_value)
        else:
            fill_value = observed.mode().iloc[0]
            recorded_value = str(fill_value)
        
        # Assign the filled column back explicitly: `df[col].fillna(..., inplace=True)`
        # operates on a temporary object and does not update the frame under
        # pandas Copy-on-Write.
        df_imputed[col] = series.fillna(fill_value)
        imputation_stats[col] = {
            'missing_count': missing_count,
            'imputation_method': method,
            'imputation_value': recorded_value
        }
    
    # Handle numerical features
    for col in numerical_features:
        _impute(col, 'median')
    
    # Handle categorical features
    for col in categorical_features:
        _impute(col, 'mode')
    
    total_imputed = sum(stat['missing_count'] for stat in imputation_stats.values())
    print(f"Missing values: {total_imputed} values imputed across {len(imputation_stats)} features")
    
    return df_imputed, imputation_stats


def encode_categorical_features(df, categorical_features, encoding_type='onehot'):
    """
    Encode categorical features using one-hot or label encoding
    
    For one-hot encoding each feature is expanded separately, so the columns
    reported for a feature are exactly the ones it produced - even when one
    feature name is a prefix of another column name (e.g. 'A' and 'A_B').
    The first category of every feature is dropped. The encoded columns are
    appended after the untouched columns, in the order of categorical_features.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe (not modified)
    categorical_features : list
        List of categorical feature names
    encoding_type : str
        'onehot' for one-hot encoding; any other value selects label encoding
    
    Returns:
    --------
    tuple : (df_encoded, encoding_info)
        encoding_info maps each feature to its 'encoding_type' plus either
        'new_columns'/'num_categories' (one-hot) or 'classes' (label)
    """
    df_encoded = df.copy()
    encoding_info = {}
    
    if encoding_type == 'onehot':
        # Expand every feature on its own so its dummy columns are known exactly
        # instead of being guessed from a name prefix afterwards.
        dummy_blocks = []
        for col in categorical_features:
            dummies = pd.get_dummies(df_encoded[col], prefix=col, drop_first=True)
            dummy_blocks.append(dummies)
            
            new_cols = dummies.columns.tolist()
            encoding_info[col] = {
                'encoding_type': 'onehot',
                'new_columns': new_cols,
                'num_categories': len(new_cols) + 1  # +1 for dropped first category
            }
        
        # Untouched columns first, followed by the dummy columns of each feature
        untouched = df_encoded.drop(columns=list(categorical_features))
        df_encoded = pd.concat([untouched, *dummy_blocks], axis=1)
        
        total_new_features = sum(len(info['new_columns']) for info in encoding_info.values())
        print(f"Categorical encoding: {len(categorical_features)} features encoded to {total_new_features} binary features")
    
    else:  # label encoding
        for col in categorical_features:
            le = LabelEncoder()
            df_encoded[col] = le.fit_transform(df_encoded[col])
            encoding_info[col] = {
                'encoding_type': 'label',
                'classes': le.classes_.tolist()
            }
        print(f"Categorical encoding: {len(categorical_features)} features label-encoded")
    
    return df_encoded, encoding_info


def correct_skewness(df, numerical_features, skewness_threshold=1.0, method='log'):
    """
    Correct skewness in numerical features
    
    A feature is transformed when the absolute skewness of its non-missing
    values exceeds skewness_threshold. Missing values stay missing and are
    ignored both when measuring skewness and when fitting Box-Cox.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe (not modified)
    numerical_features : list
        List of numerical feature names
    skewness_threshold : float
        Absolute skewness value above which to apply transformation
    method : str
        Transformation method - 'log', 'sqrt', or 'boxcox'
    
    Returns:
    --------
    tuple : (df_transformed, transformation_info)
        transformation_info maps each transformed feature to 'method',
        'original_skewness', 'transformed_skewness' and, for Box-Cox, 'lambda'
    
    Raises:
    -------
    ValueError : If method is not one of 'log', 'sqrt', 'boxcox'
    """
    supported_methods = ('log', 'sqrt', 'boxcox')
    if method not in supported_methods:
        # Previously an unknown method changed nothing yet was recorded as applied
        raise ValueError(f"Unknown skewness correction method '{method}'; expected one of {supported_methods}")
    
    df_transformed = df.copy()
    transformation_info = {}
    
    for col in numerical_features:
        original_skew = skew(df_transformed[col].dropna())
        
        # NaN skewness (e.g. constant or empty column) compares False and is skipped
        if not abs(original_skew) > skewness_threshold:
            continue
        
        values = df_transformed[col]
        min_val = values.min()
        record = {
            'method': method,
            'original_skewness': float(original_skew)
        }
        
        if method == 'log':
            # Shift so the smallest value becomes 1 when non-positive values exist
            if min_val <= 0:
                values = values + abs(min_val) + 1
            values = np.log1p(values)
        
        elif method == 'sqrt':
            # Shift so the smallest value becomes 0 when negative values exist
            if min_val < 0:
                values = values + abs(min_val)
            values = np.sqrt(values)
        
        else:  # boxcox
            # Box-Cox requires strictly positive data
            if min_val <= 0:
                values = values + abs(min_val) + 1
            # Fit on observed values only; NaNs would otherwise break the lambda search
            observed_mask = values.notna()
            fitted_values, fitted_lambda = boxcox(values[observed_mask].to_numpy(dtype=float))
            values = values.astype(float).copy()
            values.loc[observed_mask] = fitted_values
            record['lambda'] = float(fitted_lambda)
        
        df_transformed[col] = values
        
        # Measure on non-missing values, consistent with the original skewness
        record['transformed_skewness'] = float(skew(values.dropna()))
        
        # Keep 'lambda' as the last key for Box-Cox records
        if 'lambda' in record:
            record['lambda'] = record.pop('lambda')
        transformation_info[col] = record
    
    print(f"Skewness correction: {len(transformation_info)} features transformed using {method} method")
    
    return df_transformed, transformation_info


def detect_and_handle_outliers(df, numerical_features, method='iqr', threshold=1.5):
    """
    Detect and handle outliers in numerical features
    
    Values outside the computed bounds are capped (clipped) to the bounds.
    Features whose spread is zero (IQR == 0 or std == 0) are skipped: their
    bounds collapse to a single value, so capping would turn the whole
    feature into a constant.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe (not modified)
    numerical_features : list
        List of numerical feature names
    method : str
        Detection method - 'iqr'; any other value selects the z-score rule
    threshold : float
        Threshold multiplier (1.5 for IQR, 3 for z-score)
    
    Returns:
    --------
    tuple : (df_cleaned, outlier_stats)
        outlier_stats maps each capped feature to 'method', 'outlier_count',
        'lower_bound' and 'upper_bound'
    """
    df_cleaned = df.copy()
    outlier_stats = {}
    
    for col in numerical_features:
        values = df_cleaned[col]
        
        if method == 'iqr':
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            spread = q3 - q1
            lower_bound = q1 - threshold * spread
            upper_bound = q3 + threshold * spread
        else:  # zscore
            center = values.mean()
            spread = values.std()
            lower_bound = center - threshold * spread
            upper_bound = center + threshold * spread
        
        if spread == 0:
            # Degenerate bounds (lower == upper): capping would erase every
            # value that differs from the dominant one, so leave the feature as is
            print(f"   Skipping '{col}': zero spread, outlier bounds are degenerate")
            continue
        
        # Count outliers
        outlier_count = int(((values < lower_bound) | (values > upper_bound)).sum())
        
        if outlier_count > 0:
            # Cap outliers
            df_cleaned[col] = values.clip(lower=lower_bound, upper=upper_bound)
            
            outlier_stats[col] = {
                'method': method,
                'outlier_count': outlier_count,
                'lower_bound': float(lower_bound),
                'upper_bound': float(upper_bound)
            }
    
    total_outliers = sum(stat['outlier_count'] for stat in outlier_stats.values())
    print(f"Outlier handling: {total_outliers} outliers capped across {len(outlier_stats)} features")
    
    return df_cleaned, outlier_stats


def encode_target_variable(y_train, y_val):
    """
    Label-encode the target for the training and validation sets
    
    The encoder is fitted on the training labels only and then applied to
    the validation labels.
    
    Parameters:
    -----------
    y_train : pandas.Series
        Training target values
    y_val : pandas.Series
        Validation target values
    
    Returns:
    --------
    tuple : (y_train_encoded, y_val_encoded, target_encoding_map)
        The encoded series keep the index of their inputs; the map holds
        'encoder_classes', 'class_to_label', 'label_to_class' and 'num_classes'
    """
    encoder = LabelEncoder()
    
    # Learn the class -> integer mapping from the training labels only
    train_codes = encoder.fit_transform(y_train)
    val_codes = encoder.transform(y_val)
    
    # Wrap the raw arrays so row alignment with the feature frames is preserved
    y_train_encoded = pd.Series(train_codes, index=y_train.index)
    y_val_encoded = pd.Series(val_codes, index=y_val.index)
    
    # Build both lookup directions in one pass over the learned classes
    learned_classes = encoder.classes_
    class_to_label = {}
    label_to_class = {}
    for position, class_name in enumerate(learned_classes):
        class_to_label[str(class_name)] = int(position)
        label_to_class[int(position)] = str(class_name)
    
    target_encoding_map = {
        'encoder_classes': learned_classes.tolist(),
        'class_to_label': class_to_label,
        'label_to_class': label_to_class,
        'num_classes': len(learned_classes)
    }
    
    print(f"Target encoding: {target_encoding_map['num_classes']} classes encoded")
    print(f"   Classes: {target_encoding_map['encoder_classes']}")
    
    return y_train_encoded, y_val_encoded, target_encoding_map


def split_train_validation(df, target_col, val_size=0.15, random_state=42):
    """
    Perform a stratified train/validation split of the data that remains
    after the test set has been separated
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Train+Val dataset, including the target column
    target_col : str
        Name of target column (used for stratification)
    val_size : float
        Proportion of df placed in the validation set (0-1)
    random_state : int
        Random seed for reproducibility
    
    Returns:
    --------
    tuple : (X_train, X_val, y_train, y_val)
    """
    labels = df[target_col]
    features = df.drop(columns=[target_col])
    
    # Stratify on the labels so both parts keep the same class proportions
    split_parts = train_test_split(
        features,
        labels,
        test_size=val_size,
        random_state=random_state,
        stratify=labels
    )
    X_train, X_val, y_train, y_val = split_parts
    
    # Nominal percentages derived from the requested val_size
    val_pct = val_size * 100
    train_pct = (1 - val_size) * 100
    
    print(f"Train/Val split:")
    print(f"   Train:      {X_train.shape[0]:5d} samples ({train_pct:.1f}%)")
    print(f"   Validation: {X_val.shape[0]:5d} samples ({val_pct:.1f}%)")
    
    return X_train, X_val, y_train, y_val


def scale_features(X_train, X_val, numerical_features, method='standard'):
    """
    Scale numerical features using standardization or normalization
    
    The scaler is fitted on the training data only and then applied to both
    sets. When numerical_features is empty nothing is scaled and copies of the
    inputs are returned, instead of the scaler raising on a zero-column frame.
    
    Parameters:
    -----------
    X_train : pandas.DataFrame
        Training features
    X_val : pandas.DataFrame
        Validation features
    numerical_features : list
        List of numerical feature names to scale
    method : str
        Scaling method - 'standard'; any other value selects min-max scaling
    
    Returns:
    --------
    tuple : (X_train_scaled, X_val_scaled, scaling_params)
        scaling_params maps each scaled feature to its fitted parameters
        ('mean'/'std' for standard, 'min'/'max' for min-max)
    """
    X_train_scaled = X_train.copy()
    X_val_scaled = X_val.copy()
    scaling_params = {}
    
    # Nothing to fit: e.g. a dataset made of categorical features only
    if len(numerical_features) == 0:
        print("Feature scaling: no numerical features to scale")
        return X_train_scaled, X_val_scaled, scaling_params
    
    use_standard = method == 'standard'
    scaler = StandardScaler() if use_standard else MinMaxScaler()
    
    # Fit on training data and transform both sets
    X_train_scaled[numerical_features] = scaler.fit_transform(X_train[numerical_features])
    X_val_scaled[numerical_features] = scaler.transform(X_val[numerical_features])
    
    # Store scaling parameters for each feature (same order as numerical_features)
    for i, col in enumerate(numerical_features):
        if use_standard:
            scaling_params[col] = {
                'method': 'standard',
                'mean': float(scaler.mean_[i]),
                'std': float(scaler.scale_[i])
            }
        else:
            scaling_params[col] = {
                'method': 'minmax',
                'min': float(scaler.data_min_[i]),
                'max': float(scaler.data_max_[i])
            }
    
    print(f"Feature scaling: {len(numerical_features)} features scaled using {method} method")
    
    return X_train_scaled, X_val_scaled, scaling_params


def save_preprocessed_data(X_train, X_val, y_train, y_val, test_set,
                          scaling_params, preprocessing_stats, 
                          output_dir='../artifacts/data', metadata=None):
    """
    Save preprocessed training and validation data, plus unprocessed test set
    
    Parameters:
    -----------
    X_train, X_val : pandas.DataFrame
        Processed feature sets
    y_train, y_val : pandas.Series
        Target variables (encoded)
    test_set : pandas.DataFrame
        Unprocessed test data with original labels
    scaling_params : dict
        Scaling parameters for inverse transformation
    preprocessing_stats : dict
        All preprocessing statistics
    output_dir : str
        Output directory path
    metadata : dict, optional
        Dataset metadata supplying 'short_name' / 'dataset_name' for the file
        prefix. When omitted, the module-level `dataset_metadata` is used if
        it exists; otherwise the prefix is derived from 'TRAINING DATASET'.
    
    Returns:
    --------
    dict : Dictionary of saved file paths with the keys
        'train', 'validation', 'test' and 'analysis'
    """
    os.makedirs(output_dir, exist_ok=True)

    # Prefer explicitly passed metadata; fall back to the notebook-level global
    # so existing callers that rely on it keep working.
    if metadata is None:
        metadata = globals().get('dataset_metadata') or {}

    # Get short_name from metadata for file naming convention
    dataset_name = metadata.get('dataset_name', 'TRAINING DATASET')
    clean_dataset_name = metadata.get('short_name', dataset_name.lower().replace(' ', '_').replace('-', '_'))
    
    saved_files = {}
    
    # Save train data (preprocessed); .values attaches the labels by position
    train_data = X_train.copy()
    train_data['target'] = y_train.values
    train_path = os.path.join(output_dir, f'{clean_dataset_name}_train_preprocessed.csv')
    train_data.to_csv(train_path, index=False)
    saved_files['train'] = train_path
    print(f"Saved training data (preprocessed): {train_path}")
    
    # Save validation data (preprocessed)
    val_data = X_val.copy()
    val_data['target'] = y_val.values
    val_path = os.path.join(output_dir, f'{clean_dataset_name}_validation_preprocessed.csv')
    val_data.to_csv(val_path, index=False)
    saved_files['validation'] = val_path
    print(f"Saved validation data (preprocessed): {val_path}")
    
    # Save test data (UNPROCESSED with original labels)
    test_path = os.path.join(output_dir, f'{clean_dataset_name}_test.csv')
    test_set.to_csv(test_path, index=False)
    saved_files['test'] = test_path
    print(f"Saved test data (UNPROCESSED with labels): {test_path}")
    
    # Save complete preprocessing metadata
    analysis = {
        'timestamp': pd.Timestamp.now().isoformat(),
        'train_shape': X_train.shape,
        'validation_shape': X_val.shape,
        'test_shape': test_set.shape,
        'test_has_labels': True,
        'test_is_preprocessed': False,
        'feature_names': X_train.columns.tolist(),
        'preprocessing_stats': preprocessing_stats,
        'scaling_params': scaling_params,
    }
    
    # default=str stringifies values json cannot serialise natively
    analysis_path = os.path.join(output_dir, f'{clean_dataset_name}_preprocessing_analysis.json')
    with open(analysis_path, 'w') as f:
        json.dump(analysis, f, indent=2, default=str)
    saved_files['analysis'] = analysis_path
    print(f"Saved preprocessing analysis: {analysis_path}")
    
    return saved_files


def run_preprocessing_pipeline(df, target_col, numerical_features, categorical_features,
                               handle_missing=True, encode_categorical=True,
                               correct_skew=True, handle_outliers=True,
                               test_size=0.10, val_size=0.15, scaling_method='standard',
                               output_dir='../artifacts/data'):
    """
    Run the complete preprocessing pipeline and save the results.

    CRITICAL: The test set is separated FIRST, before any preprocessing, and is
    kept in its original form with labels for final verification.

    Order of operations: test split -> missing values -> skewness -> outliers ->
    categorical encoding -> train/validation split -> target encoding ->
    scaling -> save. Steps 1-4 run on the combined train+val frame, i.e. before
    the train/validation split.

    Parameters:
    -----------
    df : pandas.DataFrame
        Raw dataset (train.csv with labels)
    target_col : str
        Target column name
    numerical_features : list
        Numerical feature names
    categorical_features : list
        Categorical feature names
    handle_missing : bool
        Whether to handle missing values
    encode_categorical : bool
        Whether to encode categorical features (skipped when
        ``categorical_features`` is empty)
    correct_skew : bool
        Whether to correct skewness
    handle_outliers : bool
        Whether to handle outliers
    test_size : float
        Test set proportion of the full dataset (default: 0.10 = 10%)
    val_size : float
        Validation proportion, forwarded to ``split_train_validation``. It is
        applied to the train+val data that remains after the test split
        (default: 0.15 = 15% of that remainder, not 15% of the full dataset;
        see the split report printed by Step 5).
    scaling_method : str
        'standard' or 'minmax'
    output_dir : str
        Output directory for preprocessed data

    Returns:
    --------
    dict : Dictionary containing all outputs and statistics, with keys
        'X_train', 'X_val'      - feature sets returned by ``scale_features``
        'y_train', 'y_val'      - targets returned by ``encode_target_variable``
        'test_set'              - held-out rows, unprocessed, with labels
        'preprocessing_stats'   - per-step info collected from the helpers
        'scaling_params'        - parameters returned by ``scale_features``
        'saved_files'           - value returned by ``save_preprocessed_data``
    """
    rule = "=" * 100

    print(rule)
    print("STARTING PREPROCESSING PIPELINE")
    print(rule)
    print(f"CRITICAL: Test set ({test_size*100:.0f}%) will be separated FIRST and kept UNPROCESSED")
    print("          Test set will retain original labels for final verification")
    print(rule)

    # STEP 0: Split test set BEFORE any preprocessing
    print("\nStep 0: Separating Test Set (BEFORE preprocessing)")
    df_trainval, test_set = split_test_set_first(df, target_col, test_size=test_size, random_state=42)

    # Only the train+val portion is preprocessed; work on a copy so the
    # frame returned by the split is left untouched.
    df_processed = df_trainval.copy()
    preprocessing_stats = {}

    # Step 1: Handle Missing Values
    if handle_missing:
        print("\nStep 1: Handling Missing Values (Train+Val only)")
        df_processed, imputation_stats = handle_missing_values(
            df_processed, numerical_features, categorical_features
        )
        preprocessing_stats['imputation'] = imputation_stats

    # Step 2: Correct Skewness (before outlier handling)
    if correct_skew:
        print("\nStep 2: Correcting Skewness")
        df_processed, transformation_info = correct_skewness(
            df_processed, numerical_features, skewness_threshold=1.0, method='log'
        )
        preprocessing_stats['skewness_correction'] = transformation_info

    # Step 3: Handle Outliers
    if handle_outliers:
        print("\nStep 3: Handling Outliers")
        df_processed, outlier_stats = detect_and_handle_outliers(
            df_processed, numerical_features, method='iqr', threshold=1.5
        )
        preprocessing_stats['outlier_handling'] = outlier_stats

    # Step 4: Encode Categorical Features
    # Remember whether encoding really ran: the feature list handed to the
    # scaler must only be rebuilt when the column set has actually changed.
    encoding_applied = False
    if encode_categorical and len(categorical_features) > 0:
        print("\nStep 4: Encoding Categorical Features (Train+Val only)")
        df_processed, encoding_info = encode_categorical_features(
            df_processed, categorical_features, encoding_type='onehot'
        )
        preprocessing_stats['encoding'] = encoding_info
        encoding_applied = True

    # Step 5: Split Train and Validation
    print("\nStep 5: Splitting Train and Validation")
    X_train, X_val, y_train, y_val = split_train_validation(
        df_processed, target_col, val_size=val_size, random_state=42
    )

    # Step 6: Encode Target Variable (only train and val)
    print("\nStep 6: Encoding Target Variable (Train+Val only)")
    y_train_encoded, y_val_encoded, target_encoding_map = encode_target_variable(y_train, y_val)
    preprocessing_stats['target_encoding'] = target_encoding_map

    # Columns to scale. After encoding, every column of X_train that is not one
    # of the original categorical names is scaled (this includes the columns
    # the encoder produced). When no encoding took place the caller's
    # numerical_features list is respected as given, so columns the caller
    # left out are not swept into the scaler.
    # NOTE(review): scaling encoder output columns looks intentional here —
    # confirm against encode_categorical_features / scale_features.
    features_to_scale = numerical_features
    if encoding_applied:
        features_to_scale = [col for col in X_train.columns if col not in categorical_features]

    # Step 7: Scale Features (only train and val)
    print("\nStep 7: Scaling Numerical Features (Train+Val only)")
    X_train_scaled, X_val_scaled, scaling_params = scale_features(
        X_train, X_val, features_to_scale, method=scaling_method
    )
    preprocessing_stats['scaling'] = scaling_params

    # Step 8: Save Preprocessed Data
    print("\nStep 8: Saving Preprocessed Data")
    saved_files = save_preprocessed_data(
        X_train_scaled, X_val_scaled, y_train_encoded, y_val_encoded,
        test_set, scaling_params, preprocessing_stats, output_dir=output_dir
    )

    print("\n" + rule)
    print("PREPROCESSING PIPELINE COMPLETED")
    print(rule)
    print("\nSummary:")
    print(f"   Original data (train.csv): {df.shape}")
    print(f"   Training set (preprocessed): {X_train_scaled.shape}")
    print(f"   Validation set (preprocessed): {X_val_scaled.shape}")
    print(f"   Test set (UNPROCESSED with labels): {test_set.shape}")
    print(f"   Total features (after preprocessing): {X_train_scaled.shape[1]}")
    print(f"   Saved files: {len(saved_files)}")
    print("\n   Test set is UNPROCESSED - apply same preprocessing before prediction!")
    print("   Test set has labels - can verify predictions at the end")
    print("\n   Note: Original test.csv (unlabeled) is separate for final submission")

    return {
        'X_train': X_train_scaled,
        'X_val': X_val_scaled,
        'test_set': test_set,  # Unprocessed with labels
        'y_train': y_train_encoded,
        'y_val': y_val_encoded,
        'preprocessing_stats': preprocessing_stats,
        'scaling_params': scaling_params,
        'saved_files': saved_files
    }


# Reached only when the cell has executed to its end, i.e. the definitions above ran without raising.
print("Preprocessing functions loaded successfully!")

Preprocessing functions loaded successfully!


In [9]:
# Execute Preprocessing Pipeline
# Guard first: both the training frame and the dataset metadata must be loaded.
if train_df is None or dataset_metadata is None:
    print("Error: Training data or metadata not loaded!")
else:
    # Target column name comes from the metadata ('TARGET' when the key is absent)
    target_col = dataset_metadata.get('target', 'TARGET')

    # Numeric-dtype columns, with the target left out
    numerical_features = [
        col
        for col in train_df.select_dtypes(include=[np.number]).columns.tolist()
        if col != target_col
    ]

    # Object/category-dtype columns, with the target left out
    categorical_features = [
        col
        for col in train_df.select_dtypes(include=['object', 'category']).columns.tolist()
        if col != target_col
    ]

    print("Features to preprocess:")
    print(f"    Numerical: {len(numerical_features)} features")
    print(f"    Categorical: {len(categorical_features)} features")
    print(f"    Target: {target_col}")

    # The pipeline carves off the test set before anything else and leaves it
    # unprocessed (labels included); only the train+val portion is transformed.
    preprocessing_results = run_preprocessing_pipeline(
        df=train_df,
        target_col=target_col,
        numerical_features=numerical_features,
        categorical_features=categorical_features,
        # Step toggles
        handle_missing=True,
        encode_categorical=True,
        correct_skew=False,      # switched off for now
        handle_outliers=False,   # switched off for now
        # Split sizes
        test_size=0.10,          # 10% of all rows held out as the raw test set
        val_size=0.15,           # 15% of the remaining train+val rows
        # Scaling and output
        scaling_method='standard',
        output_dir='../artifacts/data',
    )

    # Pull the individual pieces out of the result dict for later cells
    X_train_processed = preprocessing_results['X_train']
    y_train_processed = preprocessing_results['y_train']
    X_val_processed = preprocessing_results['X_val']
    y_val_processed = preprocessing_results['y_val']
    test_set_raw = preprocessing_results['test_set']  # raw rows, labels kept

    print("\nPreprocessing completed successfully!")
    print(f"   Train (processed): {X_train_processed.shape}")
    print(f"   Val (processed): {X_val_processed.shape}")
    print(f"   Test (raw with labels): {test_set_raw.shape}")

Features to preprocess:
    Numerical: 9 features
    Categorical: 8 features
    Target: NObeyesdad
STARTING PREPROCESSING PIPELINE
CRITICAL: Test set (10%) will be separated FIRST and kept UNPROCESSED
          Test set will retain original labels for final verification

Step 0: Separating Test Set (BEFORE preprocessing)
Initial split (BEFORE preprocessing):
   Train+Val: 18682 samples (90.0%) - will be preprocessed
   Test:       2076 samples (10.0%) - kept unprocessed with labels

Step 1: Handling Missing Values (Train+Val only)
Missing values: 0 values imputed across 0 features

Step 4: Encoding Categorical Features (Train+Val only)
Categorical encoding: 8 features encoded to 14 binary features

Step 5: Splitting Train and Validation
Train/Val split:
   Train:      15879 samples (85.0%)
   Validation:  2803 samples (15.0%)

Step 6: Encoding Target Variable (Train+Val only)
Target encoding: 7 classes encoded
   Classes: ['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Ob