# NIBSS Fraud Detection: Model Optimization and Hyperparameter Tuning

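This notebook tunes the hyperparameters of each fraud-detection model with cross-validated grid search, compares the best configurations across several metrics (AUC-ROC, AUC-PR, F1, precision, recall), and checks them against the hold-out validation set.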

## Setup and Load Data

In [None]:
import pandas as pd
import numpy as np
import joblib
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    make_scorer, precision_score, recall_score
)
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global generator once; the same constant is reused for the
# cross-validation splitter further down.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Artifacts persisted by the previous notebook. joblib.load unpickles these
# files, so they must come from a trusted source.
data_splits = joblib.load('../data/processed/data_splits.pkl')
pipelines = joblib.load('../models/base_pipelines.pkl')
param_grids = joblib.load('../config/param_grids.pkl')
data_info = joblib.load('../data/processed/data_info.pkl')

# Pull the training and validation partitions out of the splits mapping
X_train, y_train = data_splits['X_train'], data_splits['y_train']
X_val, y_val = data_splits['X_val'], data_splits['y_val']

# Quick sanity readout of partition sizes
n_train_rows = len(X_train)
n_val_rows = len(X_val)
print("Data loaded successfully!")
print(f"Training samples: {n_train_rows}")
print(f"Validation samples: {n_val_rows}")

## Define Custom Scoring Functions

In [None]:
# Multi-metric scoring for the grid search, keyed by the short names that the
# rest of the notebook uses to index cv_results_ (e.g. 'mean_test_auc_roc').
#
# The values are scikit-learn's built-in scorer names rather than
# make_scorer(..., needs_proba=True): the needs_proba keyword was deprecated in
# scikit-learn 1.4 and removed in later releases, and the deprecation warning is
# hidden by the blanket warnings filter at the top of this notebook. The string
# names work on old and new versions alike.
#   - 'roc_auc' and 'average_precision' score continuous outputs
#     (decision_function or predict_proba); both metrics are rank-based.
#   - 'f1', 'precision' and 'recall' score the hard predictions for the
#     positive class, matching f1_score/precision_score/recall_score defaults.
scoring = {
    'auc_roc': 'roc_auc',
    'auc_pr': 'average_precision',
    'f1': 'f1',
    'precision': 'precision',
    'recall': 'recall',
}

# Primary metric for optimization (must be one of the keys of `scoring`;
# it is passed as `refit` to GridSearchCV)
primary_metric = 'auc_roc'

print("Scoring functions defined")

## Hyperparameter Optimization for All Models

In [None]:
# Per-model outputs of the search, each keyed by model name
cv_results, best_models, training_times = {}, {}, {}

# Stratified, shuffled 3-fold splitter shared by every search
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)

# Settings common to every GridSearchCV instance. return_train_score=True is
# required by the per-fold train/validation plot later in the notebook.
search_options = dict(
    cv=cv,
    scoring=scoring,
    refit=primary_metric,
    n_jobs=-1,
    verbose=3,
    return_train_score=True,
)

# (display label, key in `scoring`) for the scores echoed after each search
reported_metrics = (
    ('AUC-ROC', primary_metric),
    ('AUC-PR', 'auc_pr'),
    ('F1-Score', 'f1'),
)
banner = '=' * 50

for model_name, pipeline in pipelines.items():
    print(f"\n{banner}")
    print(f"Optimizing {model_name.upper()}")
    print(banner)

    start_time = time.time()

    # Exhaustive search over this model's grid, refit on the primary metric
    grid_search = GridSearchCV(pipeline, param_grids[model_name], **search_options)
    grid_search.fit(X_train, y_train)

    # Keep the full CV table, the refitted winner and the wall-clock cost
    cv_results[model_name] = pd.DataFrame(grid_search.cv_results_)
    best_models[model_name] = grid_search.best_estimator_
    training_times[model_name] = time.time() - start_time

    # Report the winning configuration and its mean/std test scores
    fold_stats = grid_search.cv_results_
    winner = grid_search.best_index_
    print(f"\nBest parameters for {model_name}:")
    print(grid_search.best_params_)
    print("\nBest CV scores:")
    for label, metric in reported_metrics:
        mean_score = fold_stats[f'mean_test_{metric}'][winner]
        std_score = fold_stats[f'std_test_{metric}'][winner]
        print(f"{label}: {mean_score:.4f} (+/- {std_score:.4f})")
    print(f"\nTraining time: {training_times[model_name]:.2f} seconds")

    # Persist the fitted search object; later cells reload it from disk
    joblib.dump(grid_search, f'../models/grid_search_{model_name}.pkl')

print("\n" + banner)
print("All models optimized successfully!")

## Create Table 4.4 - Optimal Hyperparameters and Cross-Validation Performance

In [None]:
# Build Table 4.4: one row per model with the winning hyperparameters and the
# cross-validated AUC / F1 (mean ± SD) of that winning configuration.
table_data = []

for model_name in ['logistic_regression', 'random_forest', 'xgboost']:
    # Reload the fitted search saved by the optimization cell
    grid_search = joblib.load(f'../models/grid_search_{model_name}.pkl')
    best_idx = grid_search.best_index_

    # Tuned values come from best_params_; settings that were fixed in the
    # pipeline (class weighting, scale_pos_weight) are read from the refitted
    # estimator so the table reports what was actually used rather than an
    # assumed value.
    params = grid_search.best_params_
    fitted_params = grid_search.best_estimator_.get_params()

    if model_name == 'logistic_regression':
        # Falls back to the previously hard-coded 'balanced' only if the
        # classifier exposes no class_weight parameter at all.
        class_weight = fitted_params.get('classifier__class_weight', 'balanced')
        param_str = (f"C = {params['classifier__C']}, "
                     f"penalty = '{params['classifier__penalty']}', "
                     f"class_weight = {class_weight!r}")
    elif model_name == 'random_forest':
        class_weight = fitted_params.get('classifier__class_weight', 'balanced')
        param_str = (f"n_estimators = {params['classifier__n_estimators']}, "
                     f"max_depth = {params['classifier__max_depth']}, "
                     f"min_samples_split = {params['classifier__min_samples_split']}, "
                     f"max_features = '{params['classifier__max_features']}', "
                     f"class_weight = {class_weight!r}")
    else:  # xgboost
        scale_pos_weight = fitted_params.get('classifier__scale_pos_weight')
        if scale_pos_weight is None:
            # Parameter absent or unset on the estimator: fall back to the
            # negative/positive class ratio derived from data_info, as before.
            n_legit = data_info['n_samples'] * (1 - data_info['fraud_rate'])
            n_fraud = data_info['n_samples'] * data_info['fraud_rate']
            scale_pos_weight = n_legit / n_fraud
        param_str = (f"learning_rate = {params['classifier__learning_rate']}, "
                     f"max_depth = {params['classifier__max_depth']}, "
                     f"n_estimators = {params['classifier__n_estimators']}, "
                     f"subsample = {params['classifier__subsample']}, "
                     f"colsample_bytree = {params['classifier__colsample_bytree']}, "
                     f"scale_pos_weight = {float(scale_pos_weight):.0f}")

    # CV scores of the selected configuration (row best_idx of cv_results_)
    auc_mean = grid_search.cv_results_['mean_test_auc_roc'][best_idx]
    auc_std = grid_search.cv_results_['std_test_auc_roc'][best_idx]
    f1_mean = grid_search.cv_results_['mean_test_f1'][best_idx]
    f1_std = grid_search.cv_results_['std_test_f1'][best_idx]

    table_data.append({
        'Model': model_name.replace('_', ' ').title(),
        'Optimal Parameters': param_str,
        'CV AUC (Mean ± SD)': f"{auc_mean:.3f} ± {auc_std:.3f}",
        'CV F1-Score (Mean ± SD)': f"{f1_mean:.3f} ± {f1_std:.3f}"
    })

# Create and display table
table_4_4 = pd.DataFrame(table_data)
print("\nTable 4.4: Optimal Hyperparameters and Cross-Validation Performance")
print("="*100)
print(table_4_4.to_string(index=False))

# Save table
table_4_4.to_csv('../data/processed/table_4_4.csv', index=False)

## Training Convergence Analysis

In [None]:
# Per-fold training vs. validation scores of each model's best configuration.
# Each model gets two panels (AUC-ROC and F1); a large, consistent gap between
# the training and validation lines points to overfitting.

# Two panels per model laid out three to a row; for three models this is the
# same 2x3 grid at 15x10 inches as before, but the grid now grows with the
# number of models instead of overflowing.
n_models = len(best_models)
n_cols = 3
n_panels = 2 * n_models
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.flatten()

# (key in `scoring`, y-axis label, short name used in legend and title)
panels = (
    ('auc_roc', 'AUC-ROC', 'AUC'),
    ('f1', 'F1-Score', 'F1'),
)

for idx, model_name in enumerate(best_models):
    # Reload the fitted search saved by the optimization cell
    grid_search = joblib.load(f'../models/grid_search_{model_name}.pkl')
    cv_result = grid_search.cv_results_
    best_idx = grid_search.best_index_
    # Take the fold count from the fitted search rather than hard-coding 3
    fold_ids = list(range(grid_search.n_splits_))
    pretty_name = model_name.replace("_", " ").title()

    for offset, (metric_key, y_label, short_name) in enumerate(panels):
        ax = axes[idx*2 + offset]
        train_scores = [cv_result[f'split{i}_train_{metric_key}'][best_idx] for i in fold_ids]
        test_scores = [cv_result[f'split{i}_test_{metric_key}'][best_idx] for i in fold_ids]
        ax.plot(fold_ids, train_scores, 'b-', label=f'Training {short_name}')
        ax.plot(fold_ids, test_scores, 'r-', label=f'Validation {short_name}')
        ax.set_xticks(fold_ids)  # folds are discrete; avoid fractional ticks
        ax.set_xlabel('CV Fold')
        ax.set_ylabel(y_label)
        ax.set_title(f'{pretty_name} - {short_name} Convergence')
        ax.legend()
        ax.grid(True, alpha=0.3)

# Hide any panels left over when 2 * n_models does not fill the last row
for spare_ax in axes[n_panels:]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.savefig('../docs/images/training_convergence.png', dpi=300, bbox_inches='tight')
plt.show()

## Validate on Hold-out Validation Set

In [None]:
# Metrics computed from the positive-class probability ...
ranking_metrics = (
    ('auc_roc', roc_auc_score),
    ('auc_pr', average_precision_score),
)
# ... and metrics computed from the hard class predictions
label_metrics = (
    ('f1', f1_score),
    ('precision', precision_score),
    ('recall', recall_score),
)

validation_scores = {}

for model_name, model in best_models.items():
    # Column 1 of predict_proba is taken as the positive-class probability
    y_val_proba = model.predict_proba(X_val)[:, 1]
    y_val_pred = model.predict(X_val)

    # Score the hold-out set; key order is auc_roc, auc_pr, f1, precision, recall
    val_scores = {name: metric_fn(y_val, y_val_proba) for name, metric_fn in ranking_metrics}
    val_scores.update((name, metric_fn(y_val, y_val_pred)) for name, metric_fn in label_metrics)
    validation_scores[model_name] = val_scores

    print(f"\n{model_name.upper()} Validation Set Performance:")
    for metric, score in val_scores.items():
        print(f"{metric}: {score:.4f}")

# Persist the hold-out scores for later notebooks
joblib.dump(validation_scores, '../results/validation_scores.pkl')

## Save Final Models and Results

In [None]:
# Bundle everything produced by this notebook into a single artifact
results = dict(
    best_models=best_models,
    cv_results=cv_results,
    training_times=training_times,
    validation_scores=validation_scores,
    table_4_4=table_4_4,
)
joblib.dump(results, '../models/optimization_results.pkl')

# Also write each refitted best estimator to its own file
for model_name in best_models:
    joblib.dump(best_models[model_name], f'../models/best_model_{model_name}.pkl')

# Total wall-clock search time across all models, in minutes
total_minutes = sum(training_times.values()) / 60

print("All models and results saved successfully!")
print(f"\nTotal optimization time: {total_minutes:.2f} minutes")

## Summary Statistics

In [None]:
# Final recap per model: search cost, cross-validated scores of the selected
# configuration, and the scores of that configuration on the hold-out set.
print("\nSUMMARY OF HYPERPARAMETER OPTIMIZATION")
print("="*50)

for model_name in ['logistic_regression', 'random_forest', 'xgboost']:
    # Row of the CV table that the search selected: the first row ranked best
    # on the primary metric (the same rule GridSearchCV uses for best_index_).
    cv_table = cv_results[model_name]
    best_row = cv_table.loc[cv_table[f'rank_test_{primary_metric}'].idxmin()]

    print(f"\n{model_name.upper()}:")
    print(f"Training time: {training_times[model_name]:.2f} seconds")
    # The CV lines report cross-validation means; previously they printed the
    # hold-out validation scores under a "Best CV" label. The F1 shown is that
    # of the AUC-selected configuration, not the maximum F1 over the grid.
    print(f"Best CV AUC-ROC: {best_row['mean_test_auc_roc']:.4f}")
    print(f"Best CV F1-Score: {best_row['mean_test_f1']:.4f}")
    print(f"Validation AUC-ROC: {validation_scores[model_name]['auc_roc']:.4f}")
    print(f"Validation F1-Score: {validation_scores[model_name]['f1']:.4f}")
    print(f"Configurations tested: {len(cv_table)}")

print("\nReady for test set evaluation in next notebook!")