# 🚀 Enhanced Machine Learning Model Training and Evaluation

This notebook provides a comprehensive framework for training and evaluating multiple ML models with advanced features including:
- Interactive data selection and exploration
- Automated handling of class imbalance
- Advanced visualizations (2D and 3D)
- Feature importance and selection
- Model optimization tracking
- Performance comparison and ensemble analysis

## 📁 Expected Project Structure
```
Your Project/
├── 02_data/
│   └── Processed_data/             ← Pre-scaled data
├── 03_notebooks/                   ← Run notebooks from here
│   └── src/                        ← Custom modules
└── 05_results/                     ← Output files
```

## 📚 1. Import Libraries and Setup

In [None]:
# Core libraries
import sys
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
import json
import joblib
from typing import Dict, List, Tuple, Optional

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import (
    train_test_split, 
    cross_val_score, 
    GridSearchCV,
    StratifiedKFold,
    learning_curve,
    validation_curve
)
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    f1_score,
    balanced_accuracy_score,
    make_scorer
)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (
    SelectKBest, 
    f_classif,
    mutual_info_classif,
    RFE
)
from sklearn.utils.class_weight import compute_class_weight

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    VotingClassifier
)
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.offline as pyo

# Switch plotly into notebook rendering mode
pyo.init_notebook_mode(connected=True)

# Defaults for static figures: matplotlib style sheet, then seaborn palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print("‚úÖ All libraries imported successfully!")
# Timestamp the run (minute resolution) so saved outputs can be traced back
print(f"üìÖ Analysis date: {datetime.now():%Y-%m-%d %H:%M}")

## 🔧 2. Helper Functions

In [None]:
def check_scaling(df: pd.DataFrame, sample_cols: int = 5) -> Dict:
    """Heuristically judge whether the numeric columns of ``df`` look scaled.

    Up to ``sample_cols`` numeric columns are drawn at random (without
    replacement, using NumPy's global RNG) and their mean, standard
    deviation, minimum and maximum are recorded.

    Args:
        df: Frame to inspect; non-numeric columns are ignored.
        sample_cols: Maximum number of numeric columns to sample.

    Returns:
        A dict with two keys:
        ``'appears_scaled'`` - ``False`` as soon as one sampled column has
        ``|mean| > 10`` or ``std > 10``, otherwise ``True``;
        ``'details'`` - per-column dict of ``mean``/``std``/``min``/``max``.
    """
    candidates = df.select_dtypes(include=[np.number]).columns
    n_to_sample = min(sample_cols, len(candidates))
    chosen = np.random.choice(candidates, n_to_sample, replace=False)

    details = {}
    looks_scaled = True

    for name in chosen:
        series = df[name]
        summary = {
            'mean': series.mean(),
            'std': series.std(),
            'min': series.min(),
            'max': series.max(),
        }
        details[name] = summary

        # A large centre or spread is taken as a sign of unscaled data.
        if abs(summary['mean']) > 10 or summary['std'] > 10:
            looks_scaled = False

    return {'appears_scaled': looks_scaled, 'details': details}

def handle_missing_data(df: pd.DataFrame, strategy: str = 'auto') -> pd.DataFrame:
    """Report missing values and, for ``strategy='auto'``, repair them.

    With the ``'auto'`` strategy each column containing missing values is
    treated according to its share of missing rows:

    * more than 50% missing - the column is dropped;
    * more than 20% missing and numeric - filled with the column median;
    * otherwise - forward-filled, then back-filled for leading gaps.
      Non-numeric columns in the 20-50% band also take this path because a
      median cannot be computed for them.

    Args:
        df: Input frame. It is never modified; repairs are made on a copy.
        strategy: Only ``'auto'`` performs repairs. Any other value just
            prints the missing-value report and returns ``df`` unchanged.

    Returns:
        ``df`` itself when nothing is missing or no repair is requested,
        otherwise a repaired copy.
    """
    missing_info = df.isnull().sum()
    missing_cols = missing_info[missing_info > 0]

    if missing_cols.empty:
        print("‚úÖ No missing data detected!")
        return df

    print(f"‚ö†Ô∏è Missing data found in {len(missing_cols)} columns:")
    print(missing_cols.head(10))

    if strategy != 'auto':
        return df

    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    df = df.copy()
    n_rows = len(df)

    for col, n_missing in missing_cols.items():
        missing_pct = n_missing / n_rows * 100

        if missing_pct > 50:
            print(f"  ‚ùå Dropping {col} ({missing_pct:.1f}% missing)")
            df = df.drop(columns=[col])
        elif missing_pct > 20 and pd.api.types.is_numeric_dtype(df[col]):
            print(f"  üìä Filling {col} with median ({missing_pct:.1f}% missing)")
            df[col] = df[col].fillna(df[col].median())
        else:
            print(f"  üìà Forward filling {col} ({missing_pct:.1f}% missing)")
            # ffill()/bfill() replace the deprecated fillna(method=...) form.
            df[col] = df[col].ffill().bfill()

    return df

def check_class_balance(y: pd.Series) -> Dict:
    """Summarise the class distribution of ``y`` and flag imbalance.

    The imbalance cut-offs are expressed relative to a perfectly uniform
    split so that they stay meaningful for multi-class targets: the smallest
    class must hold at least 40% of its uniform share to count as balanced,
    and below 20% of that share the stronger remedy is suggested. For a
    binary target these are exactly the 20% / 10% limits; with fixed limits
    a perfectly balanced target with six or more classes would always be
    reported as imbalanced.

    Args:
        y: Target labels. Missing values are ignored by ``value_counts``.

    Returns:
        Dict with keys ``balanced``, ``class_counts``, ``class_proportions``,
        ``minority_class``, ``majority_class``, ``imbalance_ratio`` and
        ``suggested_strategy`` (``None`` when balanced).

    Raises:
        ValueError: If ``y`` contains no non-missing labels.
    """
    class_counts = y.value_counts()
    if class_counts.empty:
        raise ValueError("Cannot assess class balance: target has no non-missing labels.")

    class_props = class_counts / class_counts.sum()
    smallest_share = class_props.min()

    n_classes = len(class_counts)
    mild_limit = 0.4 / n_classes    # 0.20 for a binary target
    severe_limit = 0.2 / n_classes  # 0.10 for a binary target

    imbalance_info = {
        'balanced': True,
        'class_counts': class_counts.to_dict(),
        'class_proportions': class_props.to_dict(),
        'minority_class': class_props.idxmin(),
        'majority_class': class_props.idxmax(),
        'imbalance_ratio': class_props.max() / smallest_share,
        'suggested_strategy': None
    }

    if smallest_share < mild_limit:
        imbalance_info['balanced'] = False
        if smallest_share < severe_limit:
            imbalance_info['suggested_strategy'] = 'SMOTE or class weights'
        else:
            imbalance_info['suggested_strategy'] = 'class weights'

    return imbalance_info

def create_ensemble_model(best_models: Dict, voting: str = 'soft') -> VotingClassifier:
    """Combine the given models into a single voting ensemble.

    Args:
        best_models: Mapping of model name to estimator; each pair becomes
            one named member of the ensemble.
        voting: Voting mode forwarded to ``VotingClassifier``.

    Returns:
        A new ``VotingClassifier`` built from the supplied members.
    """
    return VotingClassifier(estimators=list(best_models.items()), voting=voting)

## 📥 3. Load and Explore Data

In [None]:
# Interactive file selection
print("üìÅ Please select your data file:")
print("Enter the full path to your scaled dataset, or press Enter to use file browser")

# Pasted paths are often wrapped in quotes (e.g. "Copy as path"); strip them.
file_path = input("\n üëâ File path: ").strip().strip('"\'')

if not file_path:
    # Nothing typed: fall back to a native file-open dialog.
    from tkinter import filedialog
    import tkinter as tk
    root = tk.Tk()
    root.withdraw()
    try:
        file_path = filedialog.askopenfilename(
            title="Select your scaled dataset",
            filetypes=[("CSV files", "*.csv"), ("All files", "*.*")]
        )
    finally:
        # Release the hidden Tk root window even if the dialog fails.
        root.destroy()

if not file_path:
    # The dialog was cancelled; fail here with a clear message instead of
    # letting read_csv choke on an empty path.
    raise FileNotFoundError("No data file selected. Re-run this cell and choose a CSV file.")

# Load the dataset
print(f"\nüìä Loading dataset from: {file_path}")
df = pd.read_csv(file_path)

# Display basic information
print("\nüìã Dataset Information:")
print(f"   Shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")
print(f"   Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB")
print("\n   Column types:")
print(df.dtypes.value_counts())

# Check for missing data
missing_summary = df.isnull().sum().sum()
print(f"\n   Missing values: {missing_summary:,} total")

üìÅ Please select your data file:
Enter the full path to your scaled dataset, or press Enter to use file browser



 üëâ File path:  


### 📊 Data Quality Checks

In [None]:
# Scaling sanity check on a random sample of numeric columns
print("\nüîç Checking if data appears to be scaled...")
scaling_info = check_scaling(df)

scaling_message = (
    "‚úÖ Data appears to be properly scaled"
    if scaling_info['appears_scaled']
    else "‚ö†Ô∏è Data may not be scaled. Consider scaling before model training."
)
print(scaling_message)

# Show the statistics behind the verdict for at most three sampled columns
print("\nSample statistics from random columns:")
for shown, (col, stats) in enumerate(scaling_info['details'].items()):
    if shown == 3:
        break
    value_range = f"range=[{stats['min']:.3f}, {stats['max']:.3f}]"
    print(f"  {col}: mean={stats['mean']:.3f}, std={stats['std']:.3f}, " + value_range)

# Repair missing values using the automatic per-column strategy
df = handle_missing_data(df, strategy='auto')

### 📅 Interactive Time Period Selection

In [None]:
# Check for date columns (the first column whose name contains "date" is used)
date_cols = [col for col in df.columns if 'date' in col.lower()]
if date_cols:
    date_col = date_cols[0]

    # Extract years into a temporary helper column (dates are parsed as YYYYMMDD)
    df['_year'] = pd.to_datetime(df[date_col], format='%Y%m%d').dt.year
    year_counts = df['_year'].value_counts().sort_index()

    print("\nüìÖ TIME PERIOD SELECTION")
    print("="*60)
    print(f"Date column found: {date_col}")
    print("\nAvailable years and data points:")

    # Create a visual representation
    fig = px.bar(x=year_counts.index.tolist(), y=year_counts.values.tolist(),
                 labels={'x': 'Year', 'y': 'Number of Records'},
                 title='Data Distribution by Year')
    fig.show()

    # Print year options
    for year, count in year_counts.items():
        print(f"  {year}: {count:,} records")

    print("\n" + "-"*60)
    print("Options:")
    print("  ‚Ä¢ Enter a specific year (e.g., 2020)")
    print("  ‚Ä¢ Enter a range (e.g., 2018-2020)")
    print("  ‚Ä¢ Enter 'all' to use entire dataset")
    print("  ‚Ä¢ Enter 'last5' for last 5 years")

    time_selection = input("\nüëâ Your selection: ").strip().lower()

    # Translate the selection into a boolean row mask; None means "keep all".
    year_mask = None
    selection_note = "‚úÖ Using entire dataset"
    try:
        if time_selection == 'last5':
            last_year = df['_year'].max()
            year_mask = df['_year'] >= last_year - 4
            selection_note = f"‚úÖ Using data from {last_year-4} to {last_year}"
        elif '-' in time_selection:
            start_text, end_text = time_selection.split('-')
            start, end = int(start_text), int(end_text)
            year_mask = (df['_year'] >= start) & (df['_year'] <= end)
            selection_note = f"‚úÖ Using data from {start} to {end}"
        elif time_selection != 'all':
            year_mask = df['_year'] == int(time_selection)
            selection_note = f"‚úÖ Using data from year {time_selection}"
    except ValueError:
        # Non-numeric year or malformed range: keep everything rather than
        # aborting the cell with the helper column still attached.
        print(f"‚ö†Ô∏è Could not interpret '{time_selection}'. Using entire dataset.")
        year_mask = None
        selection_note = "‚úÖ Using entire dataset"

    if year_mask is not None and not year_mask.any():
        # An empty frame would only fail later, far from the cause.
        print("‚ö†Ô∏è No records match that period. Using entire dataset.")
        year_mask = None
        selection_note = "‚úÖ Using entire dataset"

    if year_mask is not None:
        # Copy so later column assignments do not write into a slice view.
        df = df[year_mask].copy()
    print(selection_note)

    # Clean up temporary column
    df = df.drop('_year', axis=1)
    print(f"   Final dataset size: {len(df):,} records")

## 🎯 4. Interactive Feature and Target Selection

In [None]:
# Comprehensive column analysis
print("\nüìä COLUMN ANALYSIS AND GROUPING")
print("="*80)

# Create column info DataFrame (one row of summary statistics per numeric column)
numeric_cols = df.select_dtypes(include=[np.number]).columns
column_info = pd.DataFrame({
    'Column Name': numeric_cols,
    'Data Type': df[numeric_cols].dtypes.values,
    'Non-Null Count': df[numeric_cols].count().values,
    'Null %': (df[numeric_cols].isnull().sum() / len(df) * 100).round(2).values,
    'Mean': df[numeric_cols].mean().round(3).values,
    'Std': df[numeric_cols].std().round(3).values
})

# Group columns by patterns (case-insensitive substring match on the name)
patterns = {
    'Statistical Measures': ['mean', 'max', 'min', 'std', 'avg', 'median', 'sum', 'count'],
    'Temperature Related': ['temp', 'temperature', 'celsius', 'fahrenheit'],
    'Humidity/Pressure': ['humid', 'pressure', 'precip', 'rain'],
    'Wind Related': ['wind', 'gust', 'speed'],
    'Time Related': ['date', 'time', 'year', 'month', 'day', 'hour'],
    'Location Related': ['station', 'city', 'location', 'lat', 'lon']
}

# A column may belong to several groups; anything matching no group at all
# is collected under "Other Columns".
grouped_columns = {}
matched_anywhere = set()

for group_name, keywords in patterns.items():
    matched = [col for col in numeric_cols
               if any(keyword in col.lower() for keyword in keywords)]
    if matched:
        grouped_columns[group_name] = matched
        matched_anywhere.update(matched)

unmatched_columns = [col for col in numeric_cols if col not in matched_anywhere]
if unmatched_columns:
    grouped_columns["Other Columns"] = unmatched_columns

# Display grouped columns
print("\nColumn Groups Found:")
for i, (group_name, cols) in enumerate(grouped_columns.items(), 1):
    print(f"\n{i}. üè∑Ô∏è {group_name} ({len(cols)} columns):")
    for col in cols[:5]:
        print(f"   ‚Ä¢ {col}")
    if len(cols) > 5:
        print(f"   ... and {len(cols) - 5} more")

# Interactive feature selection
print("\n\nüéØ FEATURE SELECTION")
print("="*80)
print("\nSelect column groups to include as features:")
print("Enter numbers separated by commas (e.g., 1,3,5) or 'all':")

for i, group in enumerate(grouped_columns.keys(), 1):
    print(f"  {i}. {group} ({len(grouped_columns[group])} columns)")

user_groups = input("\nüëâ Your selection: ").strip()

# Process selection
if user_groups.lower() == 'all':
    selected_columns = list(numeric_cols)
else:
    selected_groups = []
    selected_columns = []
    group_list = list(grouped_columns.keys())
    try:
        indices = [int(x.strip()) - 1 for x in user_groups.split(',')]
    except ValueError:
        # Only a non-numeric entry is treated as invalid input; anything else
        # (including Ctrl-C) propagates instead of being swallowed.
        indices = []

    for idx in indices:
        if 0 <= idx < len(group_list):
            group = group_list[idx]
            selected_groups.append(group)
            selected_columns.extend(grouped_columns[group])

    # Columns can sit in several groups; keep the first occurrence only so a
    # feature is never duplicated (a duplicated target would leak into X).
    selected_columns = list(dict.fromkeys(selected_columns))

    if not selected_columns:
        print("‚ö†Ô∏è Invalid input. Using all columns.")
        selected_columns = list(numeric_cols)

print(f"\n‚úÖ Selected {len(selected_columns)} features")

# Keyword filtering (a column is kept only if its name contains every keyword)
keyword_filter = input("\nüëâ Filter by keywords (optional, press Enter to skip): ").strip()
if keyword_filter:
    keywords = [k.strip().lower() for k in keyword_filter.split(',')]
    filtered_columns = [col for col in selected_columns
                        if all(kw in col.lower() for kw in keywords)]
    if filtered_columns:
        selected_columns = filtered_columns
        print(f"‚úÖ Filtered to {len(selected_columns)} columns")
    else:
        print("‚ö†Ô∏è No columns matched the keywords. Keeping current selection.")

### 🎯 Target Variable Creation

In [None]:
# Interactive target variable creation
def _choose_column(prompt: str, columns: List[str]) -> str:
    """Ask for a 1-based column number and return the matching column name.

    Raises:
        ValueError: If the entry is not an integer.
        IndexError: If the number is outside ``1..len(columns)``. Without
            this check an entry of 0 would silently pick the last column.
    """
    position = int(input(prompt)) - 1
    if not 0 <= position < len(columns):
        raise IndexError(f"Column number must be between 1 and {len(columns)}.")
    return columns[position]


print("\n\nüéØ TARGET VARIABLE CREATION")
print("="*80)

print("\nOptions for target variable:")
print("1. Select an existing column to predict")
print("2. Create a binary target from a column (above/below threshold)")
print("3. Create multi-class target (binning)")
print("4. Skip (for unsupervised learning)")

target_option = input("\nüëâ Your choice (1-4): ").strip()

if target_option == '1':
    # Show available columns for target
    print("\nAvailable columns for target:")
    for i, col in enumerate(selected_columns[:20], 1):
        print(f"  {i}. {col}")
    if len(selected_columns) > 20:
        print(f"  ... and {len(selected_columns) - 20} more")

    target_column = _choose_column("\nüëâ Select target column number: ", selected_columns)
    df['target'] = df[target_column]

elif target_option == '2':
    # Binary target creation
    print("\nSelect column for binary target creation:")
    for i, col in enumerate(selected_columns[:20], 1):
        sample_vals = df[col].describe()[['25%', '50%', '75%']].round(3)
        print(f"  {i}. {col} (Q1={sample_vals['25%']}, Median={sample_vals['50%']}, Q3={sample_vals['75%']})")

    target_column = _choose_column("\nüëâ Select column number: ", selected_columns)

    print("\nThreshold options:")
    print("1. Median")
    print("2. Mean")
    print("3. Custom value")

    threshold_option = input("\nüëâ Your choice (1-3): ").strip()

    if threshold_option == '1':
        threshold = df[target_column].median()
    elif threshold_option == '2':
        threshold = df[target_column].mean()
    else:
        threshold = float(input("üëâ Enter threshold value: "))

    # Class 1 = strictly above the threshold, class 0 = at or below it
    df['target'] = (df[target_column] > threshold).astype(int)
    print(f"\n‚úÖ Created binary target (above/below {threshold:.3f})")

elif target_option == '3':
    # Multi-class target
    print("\nSelect column for multi-class target:")
    for i, col in enumerate(selected_columns[:20], 1):
        print(f"  {i}. {col}")

    target_column = _choose_column("\nüëâ Select column number: ", selected_columns)

    n_classes = int(input("\nüëâ Number of classes (3-10): "))
    # labels=False yields integer bin codes, so the label count can never
    # disagree with the number of bins left after duplicate edges are
    # dropped (explicit labels raise in that case).
    df['target'] = pd.qcut(df[target_column], q=n_classes,
                           labels=False, duplicates='drop')
    n_classes = int(df['target'].nunique())
    print(f"\n‚úÖ Created {n_classes}-class target")

else:
    print("\n‚úÖ No target variable created - ready for unsupervised learning")
    target_column = None

if target_column is not None:
    # Drop every occurrence of the source column from the feature list so
    # the target cannot leak into the features.
    selected_columns = [col for col in selected_columns if col != target_column]

# Display target distribution if created
if 'target' in df.columns:
    print("\nüìä Target Distribution:")
    target_dist = df['target'].value_counts().sort_index()
    for class_val, count in target_dist.items():
        pct = count / len(df) * 100
        print(f"   Class {class_val}: {count:,} ({pct:.1f}%)")

    # Check class balance
    balance_info = check_class_balance(df['target'])
    if not balance_info['balanced']:
        print("\n‚ö†Ô∏è Class imbalance detected!")
        print(f"   Imbalance ratio: {balance_info['imbalance_ratio']:.2f}")
        print(f"   Suggested strategy: {balance_info['suggested_strategy']}")

## 📈 5. Data Visualization and Exploration

In [None]:
# 3D scatter: the target plotted against its two most correlated features
if len(selected_columns) >= 2 and 'target' in df.columns:
    print("\nüìä Creating 3D visualization of feature relationships...")

    # Rank features by absolute correlation with the target, keep the top 2
    correlations = (
        df[selected_columns]
        .corrwith(df['target'])
        .abs()
        .sort_values(ascending=False)
    )
    top_features = correlations.index[:2].tolist()
    x_name, y_name = top_features

    # Hover text: target value plus both feature values to 3 decimals
    hover_lines = (
        '%{text}<br>'
        + f'{x_name}: %{{x:.3f}}<br>'
        + f'{y_name}: %{{y:.3f}}<extra></extra>'
    )
    marker_style = dict(
        size=5,
        color=df['target'],
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title="Target")
    )
    scatter = go.Scatter3d(
        x=df[x_name],
        y=df[y_name],
        z=df['target'],
        mode='markers',
        marker=marker_style,
        text=[f"Target: {value}" for value in df['target']],
        hovertemplate=hover_lines
    )

    fig = go.Figure(data=[scatter])
    fig.update_layout(
        title='3D Visualization: Target vs Top 2 Features',
        scene=dict(
            xaxis_title=x_name,
            yaxis_title=y_name,
            zaxis_title='Target'
        ),
        height=600
    )
    fig.show()

# Pairwise correlation heatmap of the selected features
if len(selected_columns) > 1:
    print("\nüìä Creating feature correlation heatmap...")

    # With more than 20 features only the first 20 are shown
    sample_features = selected_columns[:20] if len(selected_columns) > 20 else selected_columns

    corr_matrix = df[sample_features].corr()

    fig = px.imshow(
        corr_matrix,
        text_auto='.2f',
        color_continuous_scale='RdBu_r',
        title='Feature Correlation Heatmap'
    )
    fig.update_layout(width=800, height=600)
    fig.show()

## 🔄 6. Data Preparation

In [None]:
# Prepare features and target
X = df[selected_columns]
y = df['target'] if 'target' in df.columns else None

print("\nüìä Data shapes:")
print(f"   Features (X): {X.shape}")
if y is not None:
    print(f"   Target (y): {y.shape}")

# Feature selection option (only offered for wide datasets)
if X.shape[1] > 50:
    print(f"\n‚ö†Ô∏è You have {X.shape[1]} features. Consider feature selection?")
    do_feature_selection = input("üëâ Perform feature selection? (y/n): ").strip().lower()

    if do_feature_selection == 'y':
        print("\nFeature selection methods:")
        print("1. SelectKBest (statistical)")
        print("2. Mutual Information")
        print("3. Recursive Feature Elimination (slower)")

        method = input("üëâ Select method (1-3): ").strip()
        k = int(input("üëâ Number of features to keep: "))

        if method == '1':
            selector = SelectKBest(f_classif, k=k)
        elif method == '2':
            selector = SelectKBest(mutual_info_classif, k=k)
        else:
            # Use a simple model for RFE
            estimator = RandomForestClassifier(n_estimators=50, random_state=42)
            selector = RFE(estimator, n_features_to_select=k)

        X_selected = selector.fit_transform(X, y)
        selected_features = X.columns[selector.get_support()]
        # Keep the original row index so X stays label-aligned with y
        # (the index is not contiguous after year filtering).
        X = pd.DataFrame(X_selected, columns=selected_features, index=X.index)
        print(f"\n‚úÖ Selected {k} features")
        selected_columns = list(selected_features)

# Train-test split (stratified on the target; blank input keeps the 0.2 default)
test_size = float(input("\nüëâ Test set size (0.1-0.4, default=0.2): ").strip() or "0.2")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42, stratify=y
)

print("\n‚úÇÔ∏è Data split completed:")
print(f"   Training set: {X_train.shape[0]:,} samples")
print(f"   Test set: {X_test.shape[0]:,} samples")

## 🤖 7. Model Definition and Training

In [None]:
# Define models with class weight handling for imbalanced data.
# class_weights is always bound (None = no re-weighting), so the model
# definitions below can pass it straight through.
class_weights = None
if y is not None and not balance_info['balanced']:
    # Calculate 'balanced' class weights from the training labels
    classes = np.unique(y_train)
    weights = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))
    print(f"\n‚öñÔ∏è Using class weights: {class_weights}")

# Performance mode selection
print("\nüöÄ PERFORMANCE MODE SELECTION")
print("="*60)
print("1. Quick Mode (fewer hyperparameters, faster)")
print("2. Standard Mode (balanced)")
print("3. Thorough Mode (more hyperparameters, slower)")

mode = input("\nüëâ Select mode (1-3, default=2): ").strip() or "2"

# Define models based on mode. Each entry pairs an unfitted estimator with
# the hyperparameter grid searched for it. Any entry other than '1' or '3'
# falls through to standard mode.
if mode == '1':
    # Quick mode - minimal parameters
    models = {
        'Logistic Regression': {
            'model': LogisticRegression(max_iter=1000, random_state=42,
                                        class_weight=class_weights),
            'params': {
                'C': [0.1, 1, 10],
                'solver': ['lbfgs']
            }
        },
        'Decision Tree': {
            'model': DecisionTreeClassifier(random_state=42,
                                            class_weight=class_weights),
            'params': {
                'max_depth': [5, 10],
                'min_samples_split': [2, 10]
            }
        },
        'Random Forest': {
            'model': RandomForestClassifier(random_state=42, n_jobs=-1,
                                            class_weight=class_weights),
            'params': {
                'n_estimators': [50, 100],
                'max_depth': [10, 20]
            }
        }
    }
elif mode == '3':
    # Thorough mode - extensive parameters
    models = {
        'Logistic Regression': {
            'model': LogisticRegression(max_iter=2000, random_state=42,
                                        class_weight=class_weights),
            'params': {
                'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            }
        },
        'Decision Tree': {
            'model': DecisionTreeClassifier(random_state=42,
                                            class_weight=class_weights),
            'params': {
                'max_depth': [3, 5, 7, 10, 15, None],
                'min_samples_split': [2, 5, 10, 20],
                'min_samples_leaf': [1, 2, 4, 8],
                'criterion': ['gini', 'entropy']
            }
        },
        'Random Forest': {
            'model': RandomForestClassifier(random_state=42, n_jobs=-1,
                                            class_weight=class_weights),
            'params': {
                'n_estimators': [50, 100, 200, 300],
                'max_depth': [5, 10, 15, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2', None]
            }
        },
        'Gradient Boosting': {
            'model': GradientBoostingClassifier(random_state=42),
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'subsample': [0.8, 1.0]
            }
        },
        'SVM': {
            'model': SVC(random_state=42, probability=True,
                         class_weight=class_weights),
            'params': {
                'C': [0.1, 1, 10, 100],
                'kernel': ['rbf', 'linear', 'poly'],
                'gamma': ['scale', 'auto', 0.001, 0.01]
            }
        },
        'Neural Network': {
            'model': MLPClassifier(random_state=42, max_iter=1000),
            'params': {
                'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
                'activation': ['relu', 'tanh'],
                'alpha': [0.0001, 0.001, 0.01],
                'learning_rate': ['constant', 'adaptive']
            }
        }
    }
else:
    # Standard mode - balanced parameters
    models = {
        'Logistic Regression': {
            'model': LogisticRegression(max_iter=1000, random_state=42,
                                        class_weight=class_weights),
            'params': {
                'C': [0.01, 0.1, 1, 10],
                'penalty': ['l2'],
                'solver': ['lbfgs', 'liblinear']
            }
        },
        'Decision Tree': {
            'model': DecisionTreeClassifier(random_state=42,
                                            class_weight=class_weights),
            'params': {
                'max_depth': [3, 5, 7, 10],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
        },
        'Random Forest': {
            'model': RandomForestClassifier(random_state=42, n_jobs=-1,
                                            class_weight=class_weights),
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [5, 10, 15],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
        },
        'Gradient Boosting': {
            'model': GradientBoostingClassifier(random_state=42),
            'params': {
                'n_estimators': [50, 100],
                'learning_rate': [0.01, 0.1],
                'max_depth': [3, 5]
            }
        },
        'SVM': {
            'model': SVC(random_state=42, probability=True,
                         class_weight=class_weights),
            'params': {
                'C': [0.1, 1, 10],
                'kernel': ['rbf', 'linear'],
                'gamma': ['scale', 'auto']
            }
        }
    }

print(f"\n‚úÖ {len(models)} models configured for training")

## 🏃‍♂️ 8. Model Training with Optimization Tracking

In [None]:
# Initialize storage (all three dicts are keyed by model name)
results = {}
best_models = {}
training_histories = {}

# Define scoring metrics based on class balance.
# balance_info only exists when a target was created, hence the lookup guard.
if 'balance_info' in locals() and not balance_info['balanced']:
    scoring_metrics = ['balanced_accuracy', 'f1_weighted', 'roc_auc_ovr_weighted']
    primary_metric = 'balanced_accuracy'
else:
    scoring_metrics = ['accuracy', 'f1_weighted', 'roc_auc_ovr']
    primary_metric = 'accuracy'

# Cross-validation setup
cv_folds = 5
cv_strategy = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

print(f"\nüîÑ Starting model training with {cv_folds}-fold cross-validation...")
print(f"   Primary metric: {primary_metric}")
print(f"   Additional metrics: {', '.join(scoring_metrics[1:])}")

# Train each model
for model_name, model_info in models.items():
    print(f"\n{'='*60}")
    print(f"ü§ñ Training {model_name}...")
    start_time = datetime.now()

    # Grid search optimised on the primary metric only; the additional
    # metrics listed above are informational and are not passed to the search.
    grid_search = GridSearchCV(
        estimator=model_info['model'],
        param_grid=model_info['params'],
        cv=cv_strategy,
        scoring=primary_metric,
        n_jobs=-1,
        verbose=1,
        return_train_score=True
    )

    # Fit the model
    grid_search.fit(X_train, y_train)
    cv_results = grid_search.cv_results_

    # Store the best model (refit on the full training set by GridSearchCV)
    best_models[model_name] = grid_search.best_estimator_

    # Make predictions
    y_pred = grid_search.predict(X_test)
    y_pred_proba = None
    if hasattr(grid_search.best_estimator_, 'predict_proba'):
        y_pred_proba = grid_search.predict_proba(X_test)

    # Calculate comprehensive metrics on the held-out test set
    test_accuracy = accuracy_score(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Per-fold scores of the winning parameter set. The grid search already
    # evaluated it on these exact folds (same splitter, fixed random_state),
    # so read them back instead of refitting the model cv_folds more times.
    best_idx = grid_search.best_index_
    cv_scores = np.array([
        cv_results[f'split{fold}_test_score'][best_idx]
        for fold in range(cv_folds)
    ])

    # Store results
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'best_cv_score': grid_search.best_score_,
        'cv_scores': cv_scores,
        'test_accuracy': test_accuracy,
        'balanced_accuracy': balanced_acc,
        'f1_score': f1,
        'predictions': y_pred,
        'pred_proba': y_pred_proba,
        'training_time': (datetime.now() - start_time).total_seconds(),
        'grid_search_results': cv_results
    }

    # Store training history (one entry per evaluated parameter combination)
    training_histories[model_name] = {
        'mean_train_scores': cv_results['mean_train_score'],
        'mean_test_scores': cv_results['mean_test_score'],
        'params': cv_results['params']
    }

    print(f"\n‚úÖ {model_name} training completed!")
    print(f"   Best parameters: {grid_search.best_params_}")
    print(f"   Best CV score: {grid_search.best_score_:.4f}")
    print(f"   Test accuracy: {test_accuracy:.4f}")
    print(f"   Balanced accuracy: {balanced_acc:.4f}")
    print(f"   F1 score: {f1:.4f}")
    print(f"   Training time: {results[model_name]['training_time']:.2f} seconds")

print(f"\n{'='*60}")
print("‚úÖ All models trained successfully!")

## 📊 9. Model Comparison and Visualization

In [None]:
# One summary row per trained model
comparison_data = [
    {
        'Model': model_name,
        'CV Mean Score': result['cv_scores'].mean(),
        'CV Std': result['cv_scores'].std(),
        'Test Accuracy': result['test_accuracy'],
        'Balanced Accuracy': result['balanced_accuracy'],
        'F1 Score': result['f1_score'],
        'Training Time (s)': result['training_time'],
    }
    for model_name, result in results.items()
]

# Rank models by balanced accuracy on the test set, best first
comparison_df = pd.DataFrame(comparison_data).sort_values(
    'Balanced Accuracy', ascending=False
)

print("\nüìä Model Performance Comparison:")
print("=" * 100)
print(comparison_df.to_string(index=False, float_format='%.4f'))

# The top row after sorting is the winner
best_model_name = comparison_df['Model'].iloc[0]
print(f"\nüèÜ Best Model: {best_model_name}")

### 📈 Advanced Visualizations

In [None]:
# Create interactive comparison plots (2x2 grid of subplots)
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Model Accuracy Comparison', 'Training Time vs Performance',
                    'Cross-Validation Consistency', 'Metric Comparison'),
    specs=[[{"type": "bar"}, {"type": "scatter"}],
           [{"type": "box"}, {"type": "bar"}]]
)

# 1. Accuracy comparison
fig.add_trace(
    go.Bar(x=comparison_df['Model'], y=comparison_df['Test Accuracy'],
           name='Test Accuracy', marker_color='lightblue'),
    row=1, col=1
)
fig.add_trace(
    go.Bar(x=comparison_df['Model'], y=comparison_df['Balanced Accuracy'],
           name='Balanced Accuracy', marker_color='darkblue'),
    row=1, col=1
)

# 2. Training time vs performance
fig.add_trace(
    go.Scatter(x=comparison_df['Training Time (s)'],
               y=comparison_df['Balanced Accuracy'],
               mode='markers+text',
               text=comparison_df['Model'],
               textposition="top center",
               marker=dict(size=10)),
    row=1, col=2
)

# 3. CV consistency (box plot of the per-fold scores of each model)
for model_name in results.keys():
    fig.add_trace(
        go.Box(y=results[model_name]['cv_scores'],
               name=model_name),
        row=2, col=1
    )

# 4. Multiple metrics comparison: one accuracy bar and one F1 bar per model
metrics_data = []
for _, model_row in comparison_df.iterrows():
    model = model_row['Model']
    metrics_data.extend([
        go.Bar(name=f'{model} - Accuracy',
               x=['Accuracy'], y=[model_row['Test Accuracy']]),
        go.Bar(name=f'{model} - F1',
               x=['F1 Score'], y=[model_row['F1 Score']])
    ])

# The traces must be added to the figure; without this step the
# 'Metric Comparison' subplot stays empty.
for metric_trace in metrics_data:
    fig.add_trace(metric_trace, row=2, col=2)

fig.update_layout(height=800, showlegend=True,
                  title_text="Comprehensive Model Performance Analysis")
fig.show()

### 🎯 GridSearch Optimization Landscape

In [None]:
# Visualize hyperparameter optimization for the best model as a 3D surface
# over the first two tuned hyperparameters.
best_model_history = training_histories[best_model_name]


def _landscape_axis(values):
    """Prepare one hyperparameter axis of the optimization surface.

    Args:
        values: the value of one hyperparameter for every evaluated
            candidate, in evaluation order.

    Returns:
        A tuple ``(cell_keys, ordered_keys, coordinates, tick_labels)``:
        ``cell_keys`` has one grouping key per candidate, ``ordered_keys``
        lists the distinct keys in axis order, ``coordinates`` are the
        positions to plot them at, and ``tick_labels`` is ``None`` for a
        purely numeric axis or the text labels for a categorical one.
    """
    is_numeric = all(
        isinstance(v, (int, float, np.integer, np.floating))
        and not isinstance(v, bool)
        and v == v  # NaN is the only number that differs from itself
        for v in values
    )
    if is_numeric:
        ordered = sorted(set(values))
        return list(values), ordered, ordered, None

    # Values such as None, strings or tuples cannot be sorted together or
    # compared reliably with `==` inside a DataFrame, so they are shown as
    # text categories in first-seen order at integer positions.
    labels = [str(v) for v in values]
    ordered = list(dict.fromkeys(labels))
    return labels, ordered, list(range(len(ordered))), ordered


# Create optimization landscape visualization
if len(models[best_model_name]['params']) >= 2:
    print(f"\nüìä Hyperparameter Optimization Landscape for {best_model_name}")

    # Get parameter names (only the first two are plotted)
    param_names = list(models[best_model_name]['params'].keys())[:2]
    tried_params = best_model_history['params']
    first_col = 'param_' + param_names[0]
    second_col = 'param_' + param_names[1]

    first_keys, param1_values, y_coords, y_ticks = _landscape_axis(
        [p[param_names[0]] for p in tried_params])
    second_keys, param2_values, x_coords, x_ticks = _landscape_axis(
        [p[param_names[1]] for p in tried_params])

    results_df = pd.DataFrame({
        first_col: first_keys,
        second_col: second_keys,
        'score': best_model_history['mean_test_scores']
    })

    # When more than two hyperparameters were tuned, several candidates share
    # the same (param1, param2) cell; show the best score reached in that
    # cell rather than whichever candidate happens to come first.
    score_grid = (
        results_df
        .groupby([first_col, second_col])['score']
        .max()
        .unstack(second_col)
        .reindex(index=param1_values, columns=param2_values)
    )
    # Rows follow param1, columns follow param2; untested cells are NaN
    heatmap_data = score_grid.values.tolist()

    # Create 3D surface plot
    fig = go.Figure(data=[go.Surface(
        x=x_coords,
        y=y_coords,
        z=heatmap_data,
        colorscale='Viridis'
    )])

    x_axis = dict(title=dict(text=param_names[1]))
    y_axis = dict(title=dict(text=param_names[0]))
    if x_ticks is not None:
        x_axis.update(tickvals=x_coords, ticktext=x_ticks)
    if y_ticks is not None:
        y_axis.update(tickvals=y_coords, ticktext=y_ticks)

    fig.update_layout(
        title=f'Hyperparameter Optimization Landscape - {best_model_name}',
        scene=dict(
            xaxis=x_axis,
            yaxis=y_axis,
            zaxis=dict(title=dict(text='CV Score'))
        ),
        height=600
    )
    fig.show()

## 🔍 10. Best Model Analysis

In [None]:
# Detailed evaluation of the top-ranked model: text reports followed by a
# 2x2 figure (confusion matrix, ROC curve, precision-recall curve and
# learning curves).
best_model = best_models[best_model_name]
best_result = results[best_model_name]

print(f"\nüèÜ Detailed Analysis of {best_model_name}")
print("="*60)

# Classification Report
print("\nüìã Classification Report:")
print(classification_report(y_test, best_result['predictions']))

# Confusion Matrix
cm = confusion_matrix(y_test, best_result['predictions'])
print("\nüìä Confusion Matrix:")
print(cm)

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Confusion Matrix Heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 0])
axes[0, 0].set_title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Predicted')
axes[0, 0].set_ylabel('Actual')

# 2. ROC and precision-recall curves need a binary target AND class probabilities
is_binary = len(np.unique(y_test)) == 2
pred_proba = best_result['pred_proba'] if is_binary else None

if is_binary and pred_proba is not None:
    # Column 1 holds the scores passed to the curve functions
    positive_scores = pred_proba[:, 1]

    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc_score = roc_auc_score(y_test, positive_scores)

    axes[0, 1].plot(fpr, tpr, 'b-', linewidth=2, label=f'ROC Curve (AUC = {auc_score:.3f})')
    axes[0, 1].plot([0, 1], [0, 1], 'k--', alpha=0.5)  # chance-level diagonal
    axes[0, 1].set_xlabel('False Positive Rate')
    axes[0, 1].set_ylabel('True Positive Rate')
    axes[0, 1].set_title(f'ROC Curve - {best_model_name}', fontsize=14, fontweight='bold')
    axes[0, 1].legend(loc='lower right')
    axes[0, 1].grid(True, alpha=0.3)

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, positive_scores)
    axes[1, 0].plot(recall, precision, 'g-', linewidth=2)
    axes[1, 0].set_xlabel('Recall')
    axes[1, 0].set_ylabel('Precision')
    axes[1, 0].set_title(f'Precision-Recall Curve - {best_model_name}', fontsize=14, fontweight='bold')
    axes[1, 0].grid(True, alpha=0.3)
else:
    # Name the actual reason: a binary model without stored probabilities is
    # not a multi-class problem.
    reason = 'for multi-class' if not is_binary else 'without predicted probabilities'
    axes[0, 1].text(0.5, 0.5, f'ROC Curve not available\n{reason}',
                    ha='center', va='center', transform=axes[0, 1].transAxes)
    axes[1, 0].text(0.5, 0.5, f'Precision-Recall Curve\nnot available {reason}',
                    ha='center', va='center', transform=axes[1, 0].transAxes)

# 3. Learning Curves (10 training-set fractions from 10% to 100%; no
#    `scoring` is passed, so the estimator's default scorer is used)
train_sizes, train_scores, val_scores = learning_curve(
    best_model, X_train, y_train, cv=cv_strategy,
    train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1
)

# Aggregate across CV folds once instead of recomputing for every artist
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)
val_std = np.std(val_scores, axis=1)

axes[1, 1].plot(train_sizes, train_mean, 'o-', color='r', label='Training score')
axes[1, 1].plot(train_sizes, val_mean, 'o-', color='g', label='Validation score')
# Shaded bands: +/- one standard deviation across folds
axes[1, 1].fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                        alpha=0.1, color='r')
axes[1, 1].fill_between(train_sizes, val_mean - val_std, val_mean + val_std,
                        alpha=0.1, color='g')
axes[1, 1].set_xlabel('Training Set Size')
axes[1, 1].set_ylabel('Score')
axes[1, 1].set_title(f'Learning Curves - {best_model_name}', fontsize=14, fontweight='bold')
axes[1, 1].legend(loc='best')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 🌟 11. Feature Importance Analysis

In [None]:
# Rank the input features by how much the best model relies on them.
# Tree-based models expose `feature_importances_`; logistic regression is
# ranked by coefficient magnitude. Other model types are skipped.
tree_based_models = ('Random Forest', 'Gradient Boosting', 'Decision Tree')

if best_model_name in tree_based_models:
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': best_model.feature_importances_,
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    top_importances = feature_importance.head(20)

    # Text summary of the strongest features
    print(f"\nüìä Top 20 Most Important Features ({best_model_name}):")
    print(top_importances.to_string(index=False))

    # Horizontal bar chart of the same 20 features
    fig = px.bar(
        top_importances,
        y='feature',
        x='importance',
        orientation='h',
        title=f'Top 20 Feature Importances - {best_model_name}',
        labels={'importance': 'Feature Importance', 'feature': 'Feature'},
    )
    fig.update_layout(height=600, yaxis={'categoryorder':'total ascending'})
    fig.show()

elif best_model_name == 'Logistic Regression':
    # Binary target: use the first (only) coefficient row, signs preserved.
    # Otherwise: average the absolute coefficients over all classes.
    binary_target = len(np.unique(y_train)) == 2
    if binary_target:
        coefficient_values = best_model.coef_[0]
    else:
        coefficient_values = np.mean(np.abs(best_model.coef_), axis=0)

    coefficients = pd.DataFrame({
        'feature': X_train.columns,
        'coefficient': coefficient_values,
    })
    # Rank by magnitude so strong negative effects are not pushed to the bottom
    coefficients['abs_coefficient'] = coefficients['coefficient'].abs()
    coefficients = coefficients.sort_values('abs_coefficient', ascending=False)
    top_coefficients = coefficients.head(20)

    print(f"\nüìä Top 20 Most Important Features ({best_model_name}):")
    print(top_coefficients[['feature', 'coefficient']].to_string(index=False))

    # Diverging colour scale centred on zero separates positive from negative
    fig = px.bar(
        top_coefficients,
        y='feature',
        x='coefficient',
        orientation='h',
        title=f'Top 20 Feature Coefficients - {best_model_name}',
        color='coefficient',
        color_continuous_scale='RdBu_r',
        color_continuous_midpoint=0,
    )
    fig.update_layout(height=600, yaxis={'categoryorder':'total ascending'})
    fig.show()

## 🤝 12. Ensemble Model Creation

In [None]:
# Create a voting ensemble from the top performing models and compare it
# with the best single model on the held-out test set.
print("\nü§ù Creating Ensemble Model...")
print("="*60)

# Select top 3 models (comparison_df is already sorted best-first)
top_models = comparison_df.head(3)['Model'].tolist()
print(f"Selected models for ensemble: {', '.join(top_models)}")

# Create ensemble
ensemble_estimators = [(name, best_models[name]) for name in top_models]

# Soft voting averages class probabilities, so every member must expose
# `predict_proba` (an SVC built without probability=True does not). Fall back
# to majority (hard) voting instead of failing when one member lacks it.
all_have_proba = all(
    hasattr(estimator, 'predict_proba') for _, estimator in ensemble_estimators
)
voting_mode = 'soft' if all_have_proba else 'hard'
if not all_have_proba:
    print("Not all selected models provide predict_proba; using hard voting instead of soft voting")

ensemble_model = VotingClassifier(estimators=ensemble_estimators, voting=voting_mode)

# Train ensemble
ensemble_model.fit(X_train, y_train)

# Evaluate ensemble
ensemble_pred = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
ensemble_balanced = balanced_accuracy_score(y_test, ensemble_pred)
ensemble_f1 = f1_score(y_test, ensemble_pred, average='weighted')

print(f"\n‚úÖ Ensemble Model Performance:")
print(f"   Test Accuracy: {ensemble_accuracy:.4f}")
print(f"   Balanced Accuracy: {ensemble_balanced:.4f}")
print(f"   F1 Score: {ensemble_f1:.4f}")

# Compare with best single model. The value is the difference in balanced
# accuracy expressed in percentage points (positive = ensemble is better).
improvement = (ensemble_balanced - comparison_df.iloc[0]['Balanced Accuracy']) * 100
print(f"\nüìä Improvement over best single model: {improvement:+.2f}%")

## 💾 13. Save Results and Models

In [None]:
# Persist everything produced by the analysis into a fresh, timestamped
# directory: a JSON summary, every tuned model, the ensemble, and the
# test-set predictions.
def _to_json_native(value):
    """Fallback for json.dump: convert NumPy scalars and arrays to built-ins.

    Raises:
        TypeError: for any other unsupported object, so unexpected values
            are reported instead of being silently stringified.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Create output directory
output_dir = Path('model_results') / datetime.now().strftime('%Y%m%d_%H%M%S')
output_dir.mkdir(parents=True, exist_ok=True)

# Prepare comprehensive results
final_results = {
    'analysis_date': datetime.now().strftime('%Y-%m-%d %H:%M'),
    'dataset_info': {
        'shape': df.shape,
        'features_used': len(selected_columns),
        'train_size': X_train.shape[0],
        'test_size': X_test.shape[0],
        'class_distribution': y.value_counts().to_dict() if y is not None else None,
        # balance_info only exists if the class-balance cell was executed
        'class_balance': balance_info if 'balance_info' in locals() else None
    },
    'model_comparison': comparison_df.to_dict('records'),
    'best_model': {
        'name': best_model_name,
        'parameters': results[best_model_name]['best_params'],
        'test_accuracy': results[best_model_name]['test_accuracy'],
        'balanced_accuracy': results[best_model_name]['balanced_accuracy'],
        'f1_score': results[best_model_name]['f1_score'],
        'cv_mean_score': results[best_model_name]['cv_scores'].mean(),
        'cv_std_score': results[best_model_name]['cv_scores'].std()
    },
    'ensemble_performance': {
        'models': top_models,
        'test_accuracy': ensemble_accuracy,
        'balanced_accuracy': ensemble_balanced,
        'f1_score': ensemble_f1,
        'improvement': improvement
    }
}

# Save results. Hyperparameters and metrics may be NumPy values (for example
# an np.int64 picked from a NumPy-generated grid), which the json module
# rejects unless a `default` converter is supplied.
results_file = output_dir / 'analysis_results.json'
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(final_results, f, indent=4, default=_to_json_native)
print(f"\n‚úÖ Results saved to: {results_file}")

# Save models (file name derived from the model's display name)
for model_name, model in best_models.items():
    model_file = output_dir / f"{model_name.lower().replace(' ', '_')}_model.pkl"
    joblib.dump(model, model_file)
print(f"‚úÖ Individual models saved to: {output_dir}")

# Save ensemble model
ensemble_file = output_dir / 'ensemble_model.pkl'
joblib.dump(ensemble_model, ensemble_file)
print(f"‚úÖ Ensemble model saved to: {ensemble_file}")

# Save predictions (true label next to both sets of predictions)
predictions_df = pd.DataFrame({
    'actual': y_test,
    'predicted_best': best_result['predictions'],
    'predicted_ensemble': ensemble_pred
})
predictions_file = output_dir / 'predictions.csv'
predictions_df.to_csv(predictions_file, index=False)
print(f"‚úÖ Predictions saved to: {predictions_file}")

## 🎯 14. Final Summary and Recommendations

In [None]:
# Console recap of the whole analysis: best single model, ensemble result,
# optional insights (only for cells that were actually run) and next steps.
banner = "=" * 80
print("\n" + banner)
print("üìä FINAL ANALYSIS SUMMARY")
print(banner)

best_summary = results[best_model_name]
print(f"\nüéØ Best Single Model: {best_model_name}")
print(f"   - Balanced Accuracy: {best_summary['balanced_accuracy']:.4f}")
print(f"   - F1 Score: {best_summary['f1_score']:.4f}")
print(f"   - Training Time: {best_summary['training_time']:.2f}s")

print("\nü§ù Ensemble Model Performance:")
print(f"   - Balanced Accuracy: {ensemble_balanced:.4f}")
print(f"   - Improvement: {improvement:+.2f}%")

print("\nüí° Key Insights:")
# balance_info is only defined when the class-balance cell was executed
if 'balance_info' in locals():
    if not balance_info['balanced']:
        print("   ‚ö†Ô∏è Class imbalance handled with class weights")
        print(f"   - Imbalance ratio: {balance_info['imbalance_ratio']:.2f}")

# feature_importance is only defined when the best model is tree-based
if 'feature_importance' in locals():
    top_features = feature_importance.head(5)['feature'].tolist()
    print(f"   üìä Most important features: {', '.join(top_features)}")

print("\nüöÄ Recommendations for Next Steps:")
recommendations = (
    "   1. Deploy the ensemble model for best performance",
    "   2. Monitor model performance on new data",
    "   3. Consider collecting more data for minority classes",
    "   4. Experiment with feature engineering based on importance",
    "   5. Set up automated retraining pipeline",
)
for recommendation in recommendations:
    print(recommendation)

print("\n‚úÖ Analysis completed successfully!")
print(f"üìÅ All results saved to: {output_dir}")

## 📚 15. Manual vs Automated Optimization Comparison

This section provides an educational comparison between manual parameter tuning (as in gradient descent) and automated methods like GridSearchCV.

In [None]:
print("\nüéØ OPTIMIZATION METHODS COMPARISON")
print("="*80)

# Create comparison visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Manual optimization illustration
x = np.linspace(-10, 10, 100)
y = np.linspace(-10, 10, 100)
X_mesh, Y_mesh = np.meshgrid(x, y)
Z = np.sin(np.sqrt(X_mesh**2 + Y_mesh**2))

# Manual path
manual_path_x = [8, 6, 4, 2, 0.5, 0.1]
manual_path_y = [8, 5, 3, 1, 0.3, 0.1]

ax1.contour(X_mesh, Y_mesh, Z, levels=20, alpha=0.6)
ax1.plot(manual_path_x, manual_path_y, 'ro-', linewidth=2, markersize=8)
ax1.set_title('Manual Optimization\n(Like Gradient Descent)', fontsize=14)
ax1.set_xlabel('Parameter 1')
ax1.set_ylabel('Parameter 2')
ax1.grid(True, alpha=0.3)

# Grid search illustration
grid_x = np.linspace(-10, 10, 10)
grid_y = np.linspace(-10, 10, 10)
grid_points_x, grid_points_y = np.meshgrid(grid_x, grid_y)

ax2.contour(X_mesh, Y_mesh, Z, levels=20, alpha=0.6)
ax2.scatter(grid_points_x, grid_points_y, c='red', s=50, alpha=0.8)
ax2.set_title('Grid Search Optimization\n(Systematic Exploration)', fontsize=14)
ax2.set_xlabel('Parameter 1')
ax2.set_ylabel('Parameter 2')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nüìä Comparison Summary:")
print("\nManual Optimization (Gradient Descent Style):")
print("  ‚úì Provides intuition about parameter space")
print("  ‚úì Can be faster for simple problems")
print("  ‚úì Allows for custom optimization strategies")
print("  ‚úó Requires expertise and manual tuning")
print("  ‚úó May get stuck in local optima")
print("  ‚úó Time-consuming for multiple parameters")

print("\nAutomated Optimization (GridSearchCV):")
print("  ‚úì Systematic and reproducible")
print("  ‚úì Explores entire parameter space")
print("  ‚úì Finds global optimum within search space")
print("  ‚úì Handles multiple parameters easily")
print("  ‚úó Can be computationally expensive")
print("  ‚úó Limited to predefined parameter grid")

print("\nüí° Best Practice: Use GridSearchCV for model selection, ")
print("   then fine-tune with manual methods if needed!")

print("\n" + "="*80)
print("üéâ Enhanced ML Training Notebook Complete!")
print("="*80)