# 🤖 Fraud Detection Pipeline - Part 4: ML Model Training

## 📋 Overview
This notebook covers training multiple Machine Learning models:
- Logistic Regression (Baseline)
- Random Forest
- XGBoost
- LightGBM
- CatBoost

Including:
- Class imbalance handling (SMOTE, class weights)
- Hyperparameter tuning
- Cross-validation
- Model evaluation

---

## 1️⃣ Setup & Load Data

In [None]:
!pip install -q xgboost lightgbm catboost imbalanced-learn shap

In [None]:
import os
BASE_PATH = os.path.abspath('.')

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_curve, roc_curve, average_precision_score,
    f1_score, precision_score, recall_score, accuracy_score
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

import pickle
import json
import time
import warnings
import gc
warnings.filterwarnings('ignore')

COLORS = {
    'primary': '#3498db',
    'secondary': '#2ecc71',
    'danger': '#e74c3c',
    'warning': '#f39c12',
    'info': '#9b59b6',
    'dark': '#2c3e50'
}

print('✅ Libraries imported!')

✅ Libraries imported!


In [6]:
X_train = np.load(f'{BASE_PATH}/data/splits/X_train.npy')
X_val = np.load(f'{BASE_PATH}/data/splits/X_val.npy')
X_holdout = np.load(f'{BASE_PATH}/data/splits/X_holdout.npy')
y_train = np.load(f'{BASE_PATH}/data/splits/y_train.npy')
y_val = np.load(f'{BASE_PATH}/data/splits/y_val.npy')
y_holdout = np.load(f'{BASE_PATH}/data/splits/y_holdout.npy')

with open(f'{BASE_PATH}/reports/metrics/feature_engineering_info.json', 'r') as f:
    feat_info = json.load(f)

FEATURE_NAMES = feat_info['final_features']

print(f'📊 Train: {X_train.shape}')
print(f'📊 Validation: {X_val.shape}')
print(f'📊 Holdout: {X_holdout.shape}')
print(f'📊 Features: {len(FEATURE_NAMES)}')
print(f'\n🎯 Fraud rate - Train: {y_train.mean()*100:.2f}%, Val: {y_val.mean()*100:.2f}%')

📊 Train: (472432, 91)
📊 Validation: (59054, 91)
📊 Holdout: (59054, 91)
📊 Features: 91

🎯 Fraud rate - Train: 3.50%, Val: 3.50%


## 2️⃣ Handle Class Imbalance

In [7]:
n_samples = len(y_train)
n_fraud = y_train.sum()
n_legit = n_samples - n_fraud

class_weights = {
    0: n_samples / (2 * n_legit),
    1: n_samples / (2 * n_fraud)
}

scale_pos_weight = n_legit / n_fraud

print(f'⚖️ Class Weights:')
print(f'  Class 0 (Legit): {class_weights[0]:.4f}')
print(f'  Class 1 (Fraud): {class_weights[1]:.4f}')
print(f'\n📊 Scale Pos Weight (for XGBoost): {scale_pos_weight:.2f}')

⚖️ Class Weights:
  Class 0 (Legit): 0.5181
  Class 1 (Fraud): 14.2901

📊 Scale Pos Weight (for XGBoost): 27.58


In [8]:
print('🔧 Applying SMOTE...')

smote = SMOTE(random_state=42, sampling_strategy=0.3)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f'\n📊 Original train shape: {X_train.shape}')
print(f'📊 SMOTE train shape: {X_train_smote.shape}')
print(f'\n🎯 Original fraud rate: {y_train.mean()*100:.2f}%')
print(f'🎯 SMOTE fraud rate: {y_train_smote.mean()*100:.2f}%')

🔧 Applying SMOTE...

📊 Original train shape: (472432, 91)
📊 SMOTE train shape: (592672, 91)

🎯 Original fraud rate: 3.50%
🎯 SMOTE fraud rate: 23.08%


## 3️⃣ Utility Functions

In [9]:
def evaluate_model(model, X, y, model_name, threshold=0.5):
    """
    Comprehensive model evaluation
    """
    y_pred_proba = model.predict_proba(X)[:, 1]
    y_pred = (y_pred_proba >= threshold).astype(int)
    
    metrics = {
        'model': model_name,
        'accuracy': accuracy_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'f1': f1_score(y, y_pred),
        'roc_auc': roc_auc_score(y, y_pred_proba),
        'pr_auc': average_precision_score(y, y_pred_proba)
    }
    
    return metrics, y_pred, y_pred_proba

In [10]:
def plot_model_performance(y_true, y_pred, y_proba, model_name):
    """
    Plot comprehensive model performance
    """
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Confusion Matrix',
            'ROC Curve',
            'Precision-Recall Curve',
            'Prediction Distribution'
        ),
        specs=[[{'type': 'heatmap'}, {'type': 'scatter'}],
               [{'type': 'scatter'}, {'type': 'histogram'}]]
    )
    
    cm = confusion_matrix(y_true, y_pred)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    
    fig.add_trace(
        go.Heatmap(
            z=cm_normalized,
            x=['Predicted Legit', 'Predicted Fraud'],
            y=['Actual Legit', 'Actual Fraud'],
            colorscale='Blues',
            text=cm,
            texttemplate='%{text:,}',
            showscale=False
        ),
        row=1, col=1
    )
    
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    roc_auc = roc_auc_score(y_true, y_proba)
    
    fig.add_trace(
        go.Scatter(
            x=fpr, y=tpr,
            mode='lines',
            name=f'ROC (AUC={roc_auc:.4f})',
            line=dict(color=COLORS['primary'], width=2)
        ),
        row=1, col=2
    )
    fig.add_trace(
        go.Scatter(
            x=[0, 1], y=[0, 1],
            mode='lines',
            line=dict(color='gray', dash='dash'),
            showlegend=False
        ),
        row=1, col=2
    )
    
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    pr_auc = average_precision_score(y_true, y_proba)
    
    fig.add_trace(
        go.Scatter(
            x=recall, y=precision,
            mode='lines',
            name=f'PR (AUC={pr_auc:.4f})',
            line=dict(color=COLORS['secondary'], width=2)
        ),
        row=2, col=1
    )
    
    fig.add_trace(
        go.Histogram(
            x=y_proba[y_true == 0],
            name='Legitimate',
            marker_color=COLORS['secondary'],
            opacity=0.7,
            nbinsx=50
        ),
        row=2, col=2
    )
    fig.add_trace(
        go.Histogram(
            x=y_proba[y_true == 1],
            name='Fraud',
            marker_color=COLORS['danger'],
            opacity=0.7,
            nbinsx=50
        ),
        row=2, col=2
    )
    
    fig.update_layout(
        title=f'📊 {model_name} Performance',
        title_font_size=20,
        height=700,
        showlegend=True
    )
    
    fig.update_xaxes(title_text='False Positive Rate', row=1, col=2)
    fig.update_yaxes(title_text='True Positive Rate', row=1, col=2)
    fig.update_xaxes(title_text='Recall', row=2, col=1)
    fig.update_yaxes(title_text='Precision', row=2, col=1)
    fig.update_xaxes(title_text='Predicted Probability', row=2, col=2)
    
    return fig

In [11]:
all_results = []
trained_models = {}

## 4️⃣ Model 1: Logistic Regression (Baseline)

In [12]:
print('🔧 Training Logistic Regression...')
start_time = time.time()

lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    n_jobs=-1
)

lr_model.fit(X_train, y_train)
train_time = time.time() - start_time

lr_metrics, lr_pred, lr_proba = evaluate_model(lr_model, X_val, y_val, 'Logistic Regression')
lr_metrics['train_time'] = train_time
all_results.append(lr_metrics)
trained_models['logistic_regression'] = lr_model

print(f'\n✅ Training complete in {train_time:.2f}s')
print(f'\n📊 Validation Results:')
for k, v in lr_metrics.items():
    if k != 'model':
        print(f'  {k}: {v:.4f}' if isinstance(v, float) else f'  {k}: {v}')

🔧 Training Logistic Regression...

✅ Training complete in 68.23s

📊 Validation Results:
  accuracy: 0.8186
  precision: 0.1281
  recall: 0.7209
  f1: 0.2176
  roc_auc: 0.8453
  pr_auc: 0.3484
  train_time: 68.2256


In [13]:
fig = plot_model_performance(y_val, lr_pred, lr_proba, 'Logistic Regression')
fig.show()
fig.write_html(f'{BASE_PATH}/reports/figures/15_lr_performance.html')

## 5️⃣ Model 2: Random Forest

In [14]:
print('🔧 Training Random Forest...')
start_time = time.time()

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)
train_time = time.time() - start_time

rf_metrics, rf_pred, rf_proba = evaluate_model(rf_model, X_val, y_val, 'Random Forest')
rf_metrics['train_time'] = train_time
all_results.append(rf_metrics)
trained_models['random_forest'] = rf_model

print(f'\n✅ Training complete in {train_time:.2f}s')
print(f'\n📊 Validation Results:')
for k, v in rf_metrics.items():
    if k != 'model':
        print(f'  {k}: {v:.4f}' if isinstance(v, float) else f'  {k}: {v}')

🔧 Training Random Forest...

✅ Training complete in 50.78s

📊 Validation Results:
  accuracy: 0.9305
  precision: 0.2924
  recall: 0.6942
  f1: 0.4115
  roc_auc: 0.9078
  pr_auc: 0.5724
  train_time: 50.7756


In [15]:
fig = plot_model_performance(y_val, rf_pred, rf_proba, 'Random Forest')
fig.show()
fig.write_html(f'{BASE_PATH}/reports/figures/16_rf_performance.html')

In [16]:
rf_importance = pd.DataFrame({
    'Feature': FEATURE_NAMES,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

top_20 = rf_importance.head(20)

fig = go.Figure(go.Bar(
    x=top_20['Importance'],
    y=top_20['Feature'],
    orientation='h',
    marker=dict(
        color=top_20['Importance'],
        colorscale='Viridis'
    )
))

fig.update_layout(
    title='🌲 Random Forest - Top 20 Feature Importance',
    title_font_size=18,
    xaxis_title='Importance',
    height=600,
    margin=dict(l=200)
)

fig.show()

## 6️⃣ Model 3: XGBoost

In [17]:
print('🔧 Training XGBoost...')
start_time = time.time()

xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='auc',
    early_stopping_rounds=30,
    n_jobs=-1
)

xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)
train_time = time.time() - start_time

xgb_metrics, xgb_pred, xgb_proba = evaluate_model(xgb_model, X_val, y_val, 'XGBoost')
xgb_metrics['train_time'] = train_time
all_results.append(xgb_metrics)
trained_models['xgboost'] = xgb_model

print(f'\n✅ Training complete in {train_time:.2f}s')
print(f'📊 Best iteration: {xgb_model.best_iteration}')
print(f'\n📊 Validation Results:')
for k, v in xgb_metrics.items():
    if k != 'model':
        print(f'  {k}: {v:.4f}' if isinstance(v, float) else f'  {k}: {v}')

🔧 Training XGBoost...

✅ Training complete in 23.85s
📊 Best iteration: 299

📊 Validation Results:
  accuracy: 0.9473
  precision: 0.3799
  recall: 0.7978
  f1: 0.5147
  roc_auc: 0.9537
  pr_auc: 0.7299
  train_time: 23.8546


In [18]:
fig = plot_model_performance(y_val, xgb_pred, xgb_proba, 'XGBoost')
fig.show()
fig.write_html(f'{BASE_PATH}/reports/figures/17_xgb_performance.html')

## 7️⃣ Model 4: LightGBM

In [19]:
print('🔧 Training LightGBM...')
start_time = time.time()

lgb_model = LGBMClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.1,
    num_leaves=50,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[]
)
train_time = time.time() - start_time

lgb_metrics, lgb_pred, lgb_proba = evaluate_model(lgb_model, X_val, y_val, 'LightGBM')
lgb_metrics['train_time'] = train_time
all_results.append(lgb_metrics)
trained_models['lightgbm'] = lgb_model

print(f'\n✅ Training complete in {train_time:.2f}s')
print(f'\n📊 Validation Results:')
for k, v in lgb_metrics.items():
    if k != 'model':
        print(f'  {k}: {v:.4f}' if isinstance(v, float) else f'  {k}: {v}')

🔧 Training LightGBM...

✅ Training complete in 6.41s

📊 Validation Results:
  accuracy: 0.9163
  precision: 0.2694
  recall: 0.8128
  f1: 0.4047
  roc_auc: 0.9435
  pr_auc: 0.6646
  train_time: 6.4086


In [20]:
fig = plot_model_performance(y_val, lgb_pred, lgb_proba, 'LightGBM')
fig.show()
fig.write_html(f'{BASE_PATH}/reports/figures/18_lgb_performance.html')

## 8️⃣ Model 5: CatBoost

In [21]:
print('🔧 Training CatBoost...')
start_time = time.time()

cat_model = CatBoostClassifier(
    iterations=300,
    depth=8,
    learning_rate=0.1,
    auto_class_weights='Balanced',
    random_state=42,
    verbose=False,
    early_stopping_rounds=30
)

cat_model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    verbose=False
)
train_time = time.time() - start_time

cat_metrics, cat_pred, cat_proba = evaluate_model(cat_model, X_val, y_val, 'CatBoost')
cat_metrics['train_time'] = train_time
all_results.append(cat_metrics)
trained_models['catboost'] = cat_model

print(f'\n✅ Training complete in {train_time:.2f}s')
print(f'📊 Best iteration: {cat_model.best_iteration_}')
print(f'\n📊 Validation Results:')
for k, v in cat_metrics.items():
    if k != 'model':
        print(f'  {k}: {v:.4f}' if isinstance(v, float) else f'  {k}: {v}')

🔧 Training CatBoost...

✅ Training complete in 32.74s
📊 Best iteration: 299

📊 Validation Results:
  accuracy: 0.9009
  precision: 0.2340
  recall: 0.8060
  f1: 0.3627
  roc_auc: 0.9336
  pr_auc: 0.6249
  train_time: 32.7388


In [22]:
fig = plot_model_performance(y_val, cat_pred, cat_proba, 'CatBoost')
fig.show()
fig.write_html(f'{BASE_PATH}/reports/figures/19_cat_performance.html')

## 9️⃣ XGBoost with Hyperparameter Tuning

In [23]:
print('🔧 Hyperparameter Tuning for XGBoost...')

param_grid = {
    'max_depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.15],
    'n_estimators': [200, 300],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'min_child_weight': [1, 3, 5]
}

xgb_base = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='auc',
    n_jobs=-1
)

sample_size = min(50000, len(X_train))
sample_idx = np.random.choice(len(X_train), sample_size, replace=False)
X_sample = X_train[sample_idx]
y_sample = y_train[sample_idx]

random_search = RandomizedSearchCV(
    xgb_base,
    param_distributions=param_grid,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

random_search.fit(X_sample, y_sample)

print(f'\n✅ Best parameters: {random_search.best_params_}')
print(f'📊 Best CV score: {random_search.best_score_:.4f}')

🔧 Hyperparameter Tuning for XGBoost...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

✅ Best parameters: {'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
📊 Best CV score: 0.8819


In [24]:
print('\n🔧 Training XGBoost with best parameters...')
start_time = time.time()

xgb_tuned = XGBClassifier(
    **random_search.best_params_,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='auc',
    early_stopping_rounds=30,
    n_jobs=-1
)

xgb_tuned.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)
train_time = time.time() - start_time

xgb_tuned_metrics, xgb_tuned_pred, xgb_tuned_proba = evaluate_model(
    xgb_tuned, X_val, y_val, 'XGBoost (Tuned)'
)
xgb_tuned_metrics['train_time'] = train_time
all_results.append(xgb_tuned_metrics)
trained_models['xgboost_tuned'] = xgb_tuned

print(f'\n✅ Training complete in {train_time:.2f}s')
print(f'\n📊 Validation Results:')
for k, v in xgb_tuned_metrics.items():
    if k != 'model':
        print(f'  {k}: {v:.4f}' if isinstance(v, float) else f'  {k}: {v}')


🔧 Training XGBoost with best parameters...

✅ Training complete in 18.83s

📊 Validation Results:
  accuracy: 0.9715
  precision: 0.5694
  recall: 0.7605
  f1: 0.6512
  roc_auc: 0.9571
  pr_auc: 0.7666
  train_time: 18.8275


## 🔟 Model Comparison

In [25]:
results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values('roc_auc', ascending=False)

print('📊 Model Comparison (Sorted by ROC-AUC):')
results_df

📊 Model Comparison (Sorted by ROC-AUC):


Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc,pr_auc,train_time
5,XGBoost (Tuned),0.971484,0.569359,0.760522,0.651201,0.957106,0.766584,18.827508
2,XGBoost,0.947336,0.379866,0.797775,0.514669,0.953661,0.72989,23.854616
3,LightGBM,0.916314,0.269447,0.812772,0.404722,0.94354,0.664598,6.408592
4,CatBoost,0.90087,0.234022,0.805999,0.362726,0.933579,0.624946,32.738792
1,Random Forest,0.930504,0.292439,0.694243,0.411529,0.907826,0.572382,50.77561
0,Logistic Regression,0.818573,0.12815,0.720851,0.217614,0.845284,0.348376,68.22561


In [26]:
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('ROC-AUC Score', 'PR-AUC Score', 'F1 Score', 'Training Time (s)')
)

colors = px.colors.qualitative.Set2[:len(results_df)]

fig.add_trace(
    go.Bar(
        x=results_df['model'],
        y=results_df['roc_auc'],
        marker_color=colors,
        text=[f'{x:.4f}' for x in results_df['roc_auc']],
        textposition='outside'
    ),
    row=1, col=1
)

fig.add_trace(
    go.Bar(
        x=results_df['model'],
        y=results_df['pr_auc'],
        marker_color=colors,
        text=[f'{x:.4f}' for x in results_df['pr_auc']],
        textposition='outside'
    ),
    row=1, col=2
)

fig.add_trace(
    go.Bar(
        x=results_df['model'],
        y=results_df['f1'],
        marker_color=colors,
        text=[f'{x:.4f}' for x in results_df['f1']],
        textposition='outside'
    ),
    row=2, col=1
)

fig.add_trace(
    go.Bar(
        x=results_df['model'],
        y=results_df['train_time'],
        marker_color=colors,
        text=[f'{x:.1f}s' for x in results_df['train_time']],
        textposition='outside'
    ),
    row=2, col=2
)

fig.update_layout(
    title='📊 ML Models Comparison',
    title_font_size=20,
    height=700,
    showlegend=False
)

fig.update_xaxes(tickangle=45)

fig.show()
fig.write_html(f'{BASE_PATH}/reports/figures/20_ml_model_comparison.html')

In [27]:
fig = go.Figure()

models_probas = {
    'Logistic Regression': lr_proba,
    'Random Forest': rf_proba,
    'XGBoost': xgb_proba,
    'LightGBM': lgb_proba,
    'CatBoost': cat_proba,
    'XGBoost (Tuned)': xgb_tuned_proba
}

colors = px.colors.qualitative.Set2

for idx, (name, proba) in enumerate(models_probas.items()):
    fpr, tpr, _ = roc_curve(y_val, proba)
    auc = roc_auc_score(y_val, proba)
    
    fig.add_trace(
        go.Scatter(
            x=fpr, y=tpr,
            mode='lines',
            name=f'{name} (AUC={auc:.4f})',
            line=dict(width=2, color=colors[idx])
        )
    )

fig.add_trace(
    go.Scatter(
        x=[0, 1], y=[0, 1],
        mode='lines',
        line=dict(dash='dash', color='gray'),
        showlegend=False
    )
)

fig.update_layout(
    title='📈 ROC Curves Comparison',
    title_font_size=18,
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    height=500,
    legend=dict(x=0.6, y=0.1)
)

fig.show()
fig.write_html(f'{BASE_PATH}/reports/figures/21_roc_comparison.html')

## 1️⃣1️⃣ Save Models & Results

In [28]:
print('💾 Saving models...')

for name, model in trained_models.items():
    model_path = f'{BASE_PATH}/models/ml/{name}.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    print(f'  ✓ Saved {name}')

print('\n✅ All models saved!')

💾 Saving models...
  ✓ Saved logistic_regression
  ✓ Saved random_forest
  ✓ Saved xgboost
  ✓ Saved lightgbm
  ✓ Saved catboost
  ✓ Saved xgboost_tuned

✅ All models saved!


In [29]:
results_df.to_csv(f'{BASE_PATH}/reports/metrics/ml_model_results.csv', index=False)

best_model_name = results_df.iloc[0]['model']
best_model_info = {
    'best_ml_model': best_model_name,
    'roc_auc': float(results_df.iloc[0]['roc_auc']),
    'pr_auc': float(results_df.iloc[0]['pr_auc']),
    'f1': float(results_df.iloc[0]['f1'])
}

with open(f'{BASE_PATH}/reports/metrics/best_ml_model.json', 'w') as f:
    json.dump(best_model_info, f, indent=2)

print(f'\n🏆 Best ML Model: {best_model_name}')
print(f'   ROC-AUC: {best_model_info["roc_auc"]:.4f}')


🏆 Best ML Model: XGBoost (Tuned)
   ROC-AUC: 0.9571
