# Task 5: Evaluation & Comparison

## Objective
This notebook covers:
1. Comprehensive evaluation of all models with multiple metrics
2. Confusion matrices and ROC curves
3. Error analysis to identify misclassified cases
4. Model comparison in tables and plots
5. Hyperparameter tuning for best models
6. Threshold optimization
7. Handling class imbalance with SMOTE
8. Final model selection

---

In [None]:
# Import libraries
import os
import warnings
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, roc_curve, precision_recall_curve,
    average_precision_score
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# Notebook-wide display configuration
warnings.filterwarnings('ignore')    # keep library warnings out of the cell output
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))    # default size for every figure created below

print('Libraries imported successfully!')

## 1. Load Models and Data

In [None]:
# Load the feature-engineered dataset; the train/test split is recreated from it
# later in this notebook instead of being loaded from notebook 4.
# NOTE(review): unpickling can execute arbitrary code - only read pickles that were
# written by this project's own pipeline.
df = pd.read_pickle('../data/interim/bank_with_features.pkl')
print(f'Data loaded: {df.shape}')

# Note: In practice, you'd load the exact same train/test split used in notebook 4.
# For now, it is recreated with the same random_state.

# Fail early with a readable message if the target column is absent
if 'y' not in df.columns:
    raise KeyError("Expected target column 'y' in the loaded dataset")

# Encode target: 1 where y == 'yes', 0 for every other value
df['y_binary'] = (df['y'] == 'yes').astype(int)

# Select features: every column except the target columns and 'data_source'
exclude_cols = ['y', 'y_binary', 'data_source']
feature_cols = [col for col in df.columns if col not in exclude_cols]

print(f'Features: {len(feature_cols)}')

In [None]:
# Encode and prepare data (same as notebook 4)
# Work on a copy so the frame loaded above keeps its original (string) categories.
df_encoded = df.copy()
categorical_cols = df_encoded[feature_cols].select_dtypes(include=['object']).columns.tolist()

# Integer-encode each object-dtype feature; missing values become their own
# 'missing' category before encoding.
# NOTE(review): every encoder is fit on all rows (train + test) and only the last
# one stays bound to `le`, so the category -> code mapping is not reusable later.
for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = df_encoded[col].fillna('missing')
    df_encoded[col] = le.fit_transform(df_encoded[col])

# Selected after encoding, so the freshly encoded categorical columns count as
# numeric here as well (they no longer contain NaN, so the fill leaves them as is).
numerical_cols = df_encoded[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
# Median imputation; the medians are taken over all rows, i.e. before the split.
df_encoded[numerical_cols] = df_encoded[numerical_cols].fillna(df_encoded[numerical_cols].median())

# Split
X = df_encoded[feature_cols]
y = df_encoded['y_binary']

# Stratified 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): presumably mirrors the split made in notebook 4 - confirm the
# parameters match, otherwise loaded models may be scored on rows they trained on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale
# The scaler is fit on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Train: {X_train.shape}, Test: {X_test.shape}')

## 2. Load Trained Models from MLflow

In [None]:
# Set MLflow tracking: point at the local tracking directory and look up the experiment
mlflow.set_tracking_uri('../experiments/mlruns')
experiment = mlflow.get_experiment_by_name('bank_marketing_models')

if experiment:
    print(f'Experiment found: {experiment.name}')
    print(f'Experiment ID: {experiment.experiment_id}')

    # Get all runs
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
    print(f'\nTotal runs: {len(runs)}')

    # search_runs only creates a metrics.* / tags.* column when at least one run
    # logged that key, so select just the summary columns that actually exist
    # instead of raising KeyError on a missing one.
    summary_cols = ['run_id', 'tags.mlflow.runName', 'metrics.accuracy',
                    'metrics.f1_score', 'metrics.roc_auc']
    available_cols = [c for c in summary_cols if c in runs.columns]
    missing_cols = [c for c in summary_cols if c not in runs.columns]
    if missing_cols:
        print(f'Columns not present in the run table: {missing_cols}')
    print(runs[available_cols].head(10))
else:
    print('No experiment found. Models need to be trained first (see Notebook 4).')

## 3. Evaluate All Models with Multiple Metrics

In [None]:
# Function to evaluate model
def evaluate_model(model, X_test, y_test, model_name, use_scaled=False):
    """Evaluate a fitted binary classifier and return its metrics and predictions.

    Args:
        model: Fitted estimator. If it exposes ``predict_proba`` the second
            column is used as the positive-class score. Otherwise it is treated
            as a PyTorch module and called directly on the features; its output
            is assumed to be one positive-class probability per row
            (NOTE(review): confirm the network ends in a sigmoid).
        X_test: Unscaled test features. Not used when ``use_scaled`` is True.
        y_test: True binary labels.
        model_name: Label stored under the ``'Model'`` key of the metrics dict.
        use_scaled: When True the notebook-level ``X_test_scaled`` array is
            evaluated instead of the ``X_test`` argument.

    Returns:
        Tuple ``(metrics, y_pred, y_pred_proba)`` where ``metrics`` is a dict
        with keys 'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score',
        'ROC-AUC' and 'Avg Precision'.
    """
    # Gotcha: the scaled branch reads the global X_test_scaled, not the argument.
    X_eval = X_test_scaled if use_scaled else X_test

    # Probability predictions
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_eval)[:, 1]
    else:
        # For neural networks
        import torch
        model.eval()
        with torch.no_grad():
            # Go through a float32 ndarray so DataFrames and float64 arrays both work
            X_tensor = torch.as_tensor(np.asarray(X_eval, dtype=np.float32))
            y_pred_proba = model(X_tensor).detach().cpu().numpy().flatten()

    # Class predictions. Plain torch modules have no .predict(), so calling it
    # unconditionally made the neural-network branch above unreachable; fall
    # back to thresholding the scores at 0.5 instead.
    if hasattr(model, 'predict'):
        y_pred = model.predict(X_eval)
    else:
        y_pred = (y_pred_proba >= 0.5).astype(int)

    # Calculate metrics
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_pred_proba),
        'Avg Precision': average_precision_score(y_test, y_pred_proba)
    }

    return metrics, y_pred, y_pred_proba

print('Evaluation function defined')

In [None]:
# Note: This cell would load actual models from MLflow
# For demonstration, only the loading pattern is printed

print(
    "To load models from MLflow, use:",
    "",
    "model = mlflow.sklearn.load_model(f'runs:/{run_id}/model')",
    "or",
    "model = mlflow.<framework>.load_model(...)",
    "",
    "Then evaluate each model and store results.",
    sep="\n",
)

## 4. Model Comparison Table

In [None]:
# Create comparison table
# Placeholder values only: one zero per model for every metric column,
# to be replaced with the real evaluation results.

comparison_data = {
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost', 'LightGBM', 'CatBoost', 'Neural Network'],
    **{
        metric: [0.0] * 6
        for metric in ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC')
    },
}

comparison_df = pd.DataFrame.from_dict(comparison_data)
print(
    'Model Comparison Table:',
    '=' * 100,
    comparison_df.to_string(index=False),
    sep='\n',
)
print('\nNote: Fill with actual metrics after model evaluation')

## 5. Confusion Matrices

In [None]:
# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Plot, save and show the 2x2 confusion matrix of a binary classifier.

    Args:
        y_true: True labels, encoded as 0 (No) / 1 (Yes).
        y_pred: Predicted labels, using the same 0/1 encoding.
        model_name: Used in the plot title and, lower-cased with spaces
            replaced by underscores, in the output file name.

    The figure is written to
    ``../reports/figures/confusion_matrix_<model_name>.png``.
    """
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # fixed No/Yes tick labels, even if one class never occurs in the inputs.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
    plt.title(f'Confusion Matrix - {model_name}', fontsize=14, fontweight='bold')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists
    os.makedirs('../reports/figures', exist_ok=True)
    plt.savefig(f'../reports/figures/confusion_matrix_{model_name.lower().replace(" ", "_")}.png',
                dpi=300, bbox_inches='tight')
    plt.show()

print('Confusion matrix plotting function defined')

## 6. ROC Curves Comparison

In [None]:
# Function to plot ROC curves for all models
def plot_roc_curves(models_results):
    """Draw one ROC curve per model on a shared axis, then save and show it.

    Args:
        models_results: Mapping of model name to a ``(y_true, y_score)`` pair.

    The figure is written to ``../reports/figures/roc_curves_comparison.png``.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    for name, (labels, scores) in models_results.items():
        false_pos_rate, true_pos_rate, _ = roc_curve(labels, scores)
        area = roc_auc_score(labels, scores)
        ax.plot(false_pos_rate, true_pos_rate, label=f'{name} (AUC = {area:.3f})')

    # Diagonal reference line for a classifier with no skill
    ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
    ax.legend(loc='lower right')
    ax.grid(alpha=0.3)

    fig.tight_layout()
    fig.savefig('../reports/figures/roc_curves_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

print('ROC curve plotting function defined')

## 7. Precision-Recall Curves

In [None]:
# Function to plot PR curves
def plot_pr_curves(models_results):
    """Draw one precision-recall curve per model on a shared axis, then save and show it.

    Args:
        models_results: Mapping of model name to a ``(y_true, y_score)`` pair.

    The figure is written to ``../reports/figures/pr_curves_comparison.png``.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    for name, (labels, scores) in models_results.items():
        precision_vals, recall_vals, _ = precision_recall_curve(labels, scores)
        mean_precision = average_precision_score(labels, scores)
        ax.plot(recall_vals, precision_vals, label=f'{name} (AP = {mean_precision:.3f})')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Recall', fontsize=12)
    ax.set_ylabel('Precision', fontsize=12)
    ax.set_title('Precision-Recall Curves Comparison', fontsize=14, fontweight='bold')
    ax.legend(loc='best')
    ax.grid(alpha=0.3)

    fig.tight_layout()
    fig.savefig('../reports/figures/pr_curves_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

print('PR curve plotting function defined')

## 8. Error Analysis

In [None]:
# Error analysis for best model
def analyze_errors(X_test, y_test, y_pred, feature_names):
    """Summarise the misclassified samples of a binary classifier.

    Labels are compared by position, so ``y_test`` and ``y_pred`` may be lists,
    arrays or pandas Series with unrelated indexes.

    Args:
        X_test: Test features, either a DataFrame or a 2-D array-like with one
            row per label.
        y_test: True labels (0/1).
        y_pred: Predicted labels (0/1), same length as ``y_test``.
        feature_names: Column names used when ``X_test`` is not a DataFrame.

    Returns:
        DataFrame holding the feature rows of the misclassified samples
        (empty when every prediction is correct).

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` differ in length.
    """
    # Plain arrays avoid pandas index alignment, which raises when two Series
    # carry different indexes (e.g. predictions wrapped with a fresh RangeIndex).
    y_true_arr = np.asarray(y_test).ravel()
    y_pred_arr = np.asarray(y_pred).ravel()
    if y_true_arr.shape[0] != y_pred_arr.shape[0]:
        raise ValueError(
            f'y_test and y_pred must have the same length '
            f'({y_true_arr.shape[0]} != {y_pred_arr.shape[0]})'
        )

    # Identify misclassified samples
    errors = y_true_arr != y_pred_arr
    n_total = y_true_arr.shape[0]
    n_errors = int(errors.sum())
    error_rate = n_errors / n_total * 100 if n_total else 0.0  # no 0/0 on empty input

    print(f'Total misclassified: {n_errors} ({error_rate:.2f}%)')

    # False positives and false negatives
    false_positives = (y_true_arr == 0) & (y_pred_arr == 1)
    false_negatives = (y_true_arr == 1) & (y_pred_arr == 0)

    print(f'False Positives: {int(false_positives.sum())}')
    print(f'False Negatives: {int(false_negatives.sum())}')

    # Analyze characteristics of misclassified samples
    if isinstance(X_test, pd.DataFrame):
        # Boolean ndarray mask selects rows by position and keeps the original index
        X_test_errors = X_test.loc[errors]
    else:
        X_test_errors = pd.DataFrame(np.asarray(X_test)[errors], columns=feature_names)

    print('\nCharacteristics of misclassified samples:')
    if X_test_errors.empty:
        print('No misclassified samples to describe.')
    else:
        print(X_test_errors.describe())

    return X_test_errors

print('Error analysis function defined')

## 9. Hyperparameter Tuning

In [None]:
# Example: Grid search for XGBoost
print('Hyperparameter tuning for XGBoost...')

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0]
}

print(f'Parameter grid: {param_grid}')
print(f'Total combinations: {np.prod([len(v) for v in param_grid.values()])}')

# The exhaustive search fits every combination above once per CV fold, so it is
# opt-in: flip the flag to True to actually run it.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    # Weight the positive class by the negative/positive ratio of the training labels
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
    grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f'Best parameters: {grid_search.best_params_}')
    print(f'Best F1 score: {grid_search.best_score_:.4f}')

## 10. Threshold Tuning

In [None]:
# Function to find optimal threshold
def find_optimal_threshold(y_test, y_pred_proba, metric='f1'):
    """Scan classification thresholds and return the one maximising ``metric``.

    Thresholds from 0.10 upward in steps of 0.05 (below 0.90) are applied to
    the scores; the metric is computed for each and plotted against the
    threshold. The plot is saved to
    ``../reports/figures/threshold_tuning_<metric>.png`` and shown.

    NOTE(review): choosing the threshold on the same data that is later used
    to report performance gives an optimistic estimate - prefer passing a
    validation split here.

    Args:
        y_test: True binary labels.
        y_pred_proba: Positive-class scores, one per label (any array-like).
        metric: One of ``'f1'``, ``'precision'`` or ``'recall'``.

    Returns:
        Tuple ``(best_threshold, best_score)``; ties resolve to the lowest threshold.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    # Validate up front: previously an unknown metric left `score` unassigned
    # and failed inside the loop with an unrelated-looking error.
    supported_metrics = ('f1', 'precision', 'recall')
    if metric not in supported_metrics:
        raise ValueError(
            f'Unsupported metric {metric!r}; expected one of {supported_metrics}'
        )
    score_fn = {
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }[metric]

    # Accept lists / Series as well as ndarrays for the element-wise comparison
    y_pred_proba = np.asarray(y_pred_proba)
    thresholds = np.arange(0.1, 0.9, 0.05)
    scores = [
        score_fn(y_test, (y_pred_proba >= threshold).astype(int))
        for threshold in thresholds
    ]

    # Find best threshold (argmax returns the first maximum)
    best_idx = int(np.argmax(scores))
    best_threshold = thresholds[best_idx]
    best_score = scores[best_idx]

    # Plot
    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, scores, marker='o')
    plt.axvline(best_threshold, color='r', linestyle='--',
                label=f'Optimal threshold = {best_threshold:.2f}')
    plt.xlabel('Threshold')
    plt.ylabel(f'{metric.upper()} Score')
    plt.title(f'Threshold Tuning for {metric.upper()}')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists
    os.makedirs('../reports/figures', exist_ok=True)
    plt.savefig(f'../reports/figures/threshold_tuning_{metric}.png', dpi=300, bbox_inches='tight')
    plt.show()

    return best_threshold, best_score

print('Threshold tuning function defined')

## 11. SMOTE for Class Imbalance

In [None]:
# Apply SMOTE: oversample the training split only; the test split is left untouched
print('Applying SMOTE for class imbalance...')

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Shapes before and after resampling
print(
    f'Original training set: {X_train.shape}',
    f'SMOTE training set: {X_train_smote.shape}',
    '\nOriginal class distribution:',
    sep='\n',
)
print(y_train.value_counts())
print('\nSMOTE class distribution:')
print(pd.Series(y_train_smote).value_counts())

# Train model with SMOTE data
print('\nTrain a model with SMOTE-balanced data and compare performance.')

## 12. Summary and Best Model Selection

In [None]:
# Summary of findings, followed by the criteria for choosing the final model
print(
    'Model Evaluation Summary',
    '=' * 80,
    '\n1. All models evaluated with multiple metrics',
    '2. Confusion matrices generated for error analysis',
    '3. ROC and PR curves compared across models',
    '4. Hyperparameter tuning performed',
    '5. Optimal classification threshold determined',
    '6. SMOTE applied for class imbalance handling',
    '\n' + '=' * 80,
    '\nBest Model Selection Criteria:',
    '- Highest F1-score (balanced precision and recall)',
    '- Good ROC-AUC performance',
    '- Robust to class imbalance',
    '- Interpretable for business use',
    '\nRecommendation: Select based on actual metrics from evaluation above.',
    sep='\n',
)

## Next Steps

Proceed to **Notebook 6** for:
- Model interpretability with SHAP and LIME
- Feature importance analysis
- Business insights and recommendations

---

**End of Evaluation & Comparison**