# Model Training and Explainability Analysis
## Tasks 2 Model Building, Training, and SHAP Explainability

In [1]:
# Import required libraries
import sys
import os
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import custom utilities
from components.dataUtilities import DataLoader, DataCleaner, merge_with_geolocation
from components.featureEngineering import FeatureEngineer, create_all_features
from components.preprocessing import full_preprocessing_pipeline
from components.model_training import ModelTrainer, cross_validate_models
from components.model_evaluation import evaluate_models_comprehensive
from components.model_explainability import explain_best_model

# Set display options
pd.set_option('display.max_columns', None)
plt.style.use('default')
sns.set_palette("husl")

print("All libraries imported successfully!")

All libraries imported successfully!


1. Data Preparation

In [2]:
# Load and prepare data (using preprocessed data from previous analysis)
print("=== DATA PREPARATION ===")

# Load data
data_loader = DataLoader(data_path='../data/raw/')
fraud_df = data_loader.load_fraud_data()
creditcard_df = data_loader.load_creditcard_data()

# Clean and engineer features for fraud data
if not fraud_df.empty:
    cleaner = DataCleaner()
    fraud_df_clean = cleaner.handle_missing_values(fraud_df, strategy='drop')
    fraud_df_clean = cleaner.remove_duplicates(fraud_df_clean)
    fraud_df_clean = cleaner.correct_data_types(fraud_df_clean)
    
    # Feature engineering
    fraud_df_features = create_all_features(fraud_df_clean)
    
    print(f"Fraud dataset prepared: {fraud_df_features.shape}")
else:
    print("Fraud dataset not available")

# Prepare creditcard data if available
if not creditcard_df.empty:
    print(f"Credit card dataset available: {creditcard_df.shape}")
else:
    print("Credit card dataset not available")

=== DATA PREPARATION ===
Fraud data loaded: (151112, 11)
Credit card data loaded: (284807, 31)
Missing values before cleaning:
user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64
Missing values after cleaning:
user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64
Duplicates removed: 0 rows
Shape before: (151112, 11), Shape after: (151112, 11)
Starting comprehensive feature engineering...
Time features created: hour_of_day, day_of_week, time_period, is_weekend
Time since signup calculated (in hours)
Transaction features created for users, devices, and IP addresses
Purchase value features created
Categorical features encoded: ['source

2. Model Training Pipeline

In [3]:
# Preprocessing pipeline for fraud data
if not fraud_df_features.empty:
    print("=== PREPROCESSING FRAUD DATA ===")
    
    # Full preprocessing pipeline
    fraud_processed = full_preprocessing_pipeline(
        fraud_df_features,
        target_col='class',
        sampling_strategy='smote',
        scaling_method='standard'
    )
    
    print("Fraud data preprocessing completed!")
    print(f"Training set: {fraud_processed['X_train'].shape}")
    print(f"Test set: {fraud_processed['X_test'].shape}")
    print(f"Features: {len(fraud_processed['feature_names'])}")

=== PREPROCESSING FRAUD DATA ===
Starting preprocessing pipeline...
Preparing features...
Removing datetime columns: ['signup_time', 'purchase_time', 'user_first_transaction', 'user_last_transaction']
Removing datetime columns: ['signup_time', 'purchase_time', 'user_first_transaction', 'user_last_transaction']
Dropping all-NaN columns: ['user_transaction_std']
Dropping all-NaN columns: ['user_transaction_std']
Applying SMOTE...
✅ Preprocessing completed successfully!
Fraud data preprocessing completed!
Training set: (219136, 47)
Test set: (30223, 47)
Features: 47


In [4]:
# Train models on fraud data
if 'fraud_processed' in locals():
    print("=== TRAINING MODELS ON FRAUD DATA ===")
    
    # Initialize model trainer
    trainer = ModelTrainer(random_state=42)
    
    # Train models
    fraud_models = trainer.train_all_models(
        fraud_processed['X_train'],
        fraud_processed['y_train'],
        models_to_train=['logistic_regression', 'random_forest', 'xgboost'],
        hyperparameter_tuning=True
    )
    
    print("\nModel training completed for fraud data!")
    print(f"Models trained: {list(fraud_models.keys())}")

=== TRAINING MODELS ON FRAUD DATA ===
Training 3 models...

LOGISTIC REGRESSION
----------------------------------------
Training Logistic Regression model...
Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

3. Model Evaluation

In [5]:
# Comprehensive model evaluation for fraud data
if 'fraud_models' in locals():
    print("=== COMPREHENSIVE MODEL EVALUATION ===")
    
    # Evaluate all models
    fraud_comparison, fraud_best_model = evaluate_models_comprehensive(
        fraud_models,
        fraud_processed['X_train'],
        fraud_processed['y_train'],
        fraud_processed['X_test'],
        fraud_processed['y_test'],
        fraud_processed['feature_names']
    )
    
    print(f"\nBest model for fraud detection: {fraud_best_model}")
    
    # Display comparison results
    print("\nModel Comparison Results:")
    display(fraud_comparison[['f1_score', 'precision', 'recall', 'pr_auc', 'roc_auc']].round(4))

4. Model Selection Justification

In [6]:
# Detailed justification for best model selection
if 'fraud_comparison' in locals():
    print("=== MODEL SELECTION JUSTIFICATION ===")
    print("\nFor fraud detection, we prioritize:")
    print("1. F1-Score: Balance between precision and recall")
    print("2. PR-AUC: Performance on imbalanced data")
    print("3. Recall: Catching actual fraud cases")
    print("4. Precision: Minimizing false positives")
    
    # Rank models by key metrics
    key_metrics = ['f1_score', 'pr_auc', 'recall', 'precision']
    
    print("\nModel Rankings by Key Metrics:")
    print("-" * 50)
    
    for metric in key_metrics:
        ranked = fraud_comparison.sort_values(metric, ascending=False)
        print(f"\n{metric.upper()}:")
        for i, (model, score) in enumerate(ranked[metric].items(), 1):
            print(f"  {i}. {model:15}: {score:.4f}")
    
    # Business impact analysis
    print("\n" + "="*60)
    print("BUSINESS IMPACT ANALYSIS")
    print("="*60)
    
    best_model_metrics = fraud_comparison.loc[fraud_best_model]
    
    print(f"\nSelected Model: {fraud_best_model.upper()}")
    print(f"F1-Score: {best_model_metrics['f1_score']:.4f}")
    print(f"Precision: {best_model_metrics['precision']:.4f} (False Positive Rate: {1-best_model_metrics['precision']:.4f})")
    print(f"Recall: {best_model_metrics['recall']:.4f} (False Negative Rate: {1-best_model_metrics['recall']:.4f})")
    print(f"PR-AUC: {best_model_metrics['pr_auc']:.4f}")
    
    print("\nBusiness Justification:")
    print(f"• Balanced performance with F1-Score of {best_model_metrics['f1_score']:.4f}")
    print(f"• {best_model_metrics['recall']*100:.1f}% of fraud cases detected")
    print(f"• {(1-best_model_metrics['precision'])*100:.1f}% false positive rate (acceptable for fraud detection)")
    print(f"• Strong performance on imbalanced data (PR-AUC: {best_model_metrics['pr_auc']:.4f})")

5. SHAP Explainability Analysis

In [7]:
# SHAP explainability for best model
if 'fraud_best_model' in locals():
    print("=== SHAP EXPLAINABILITY ANALYSIS ===")
    
    # Get best model
    best_model = fraud_models[fraud_best_model]
    
    # Select sample indices for detailed explanation
    fraud_indices = fraud_processed['y_test'][fraud_processed['y_test'] == 1].index[:3].tolist()
    legit_indices = fraud_processed['y_test'][fraud_processed['y_test'] == 0].index[:3].tolist()
    sample_indices = fraud_indices + legit_indices
    
    print(f"Analyzing {len(sample_indices)} sample predictions...")
    
    # Comprehensive SHAP analysis
    fraud_insights = explain_best_model(
        best_model,
        fraud_best_model,
        fraud_processed['X_train'],
        fraud_processed['X_test'],
        fraud_processed['y_test'],
        sample_indices
    )
    
    print("\nSHAP analysis completed!")

6. Credit Card Data Analysis

In [8]:
# Process credit card data if available
if not creditcard_df.empty:
    print("=== CREDIT CARD DATA ANALYSIS ===")
    
    # Preprocessing for credit card data
    creditcard_processed = full_preprocessing_pipeline(
        creditcard_df,
        target_col='Class',  # Note: Capital 'C' for creditcard data
        sampling_strategy='smote',
        scaling_method='standard'
    )
    
    # Train models
    cc_trainer = ModelTrainer(random_state=42)
    cc_models = cc_trainer.train_all_models(
        creditcard_processed['X_train'],
        creditcard_processed['y_train'],
        models_to_train=['logistic_regression', 'random_forest', 'xgboost'],
        hyperparameter_tuning=True
    )
    
    # Evaluate models
    cc_comparison, cc_best_model = evaluate_models_comprehensive(
        cc_models,
        creditcard_processed['X_train'],
        creditcard_processed['y_train'],
        creditcard_processed['X_test'],
        creditcard_processed['y_test'],
        creditcard_processed['feature_names']
    )
    
    print(f"\nBest model for credit card fraud: {cc_best_model}")
    display(cc_comparison[['f1_score', 'precision', 'recall', 'pr_auc', 'roc_auc']].round(4))
    
else:
    print("Credit card dataset not available for analysis")

=== CREDIT CARD DATA ANALYSIS ===
Starting preprocessing pipeline...
Preparing features...
Applying SMOTE...
✅ Preprocessing completed successfully!
Training 3 models...

LOGISTIC REGRESSION
----------------------------------------
Training Logistic Regression model...
Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

7. Key Findings and Insights

In [9]:
# Summary of key findings
print("=== KEY FINDINGS AND INSIGHTS ===")
print("\n1. MODEL PERFORMANCE SUMMARY")
print("-" * 40)

if 'fraud_comparison' in locals():
    print(f"\nFraud Detection Dataset:")
    print(f"• Best Model: {fraud_best_model}")
    print(f"• F1-Score: {fraud_comparison.loc[fraud_best_model, 'f1_score']:.4f}")
    print(f"• PR-AUC: {fraud_comparison.loc[fraud_best_model, 'pr_auc']:.4f}")
    print(f"• Recall: {fraud_comparison.loc[fraud_best_model, 'recall']:.4f}")

if 'cc_comparison' in locals():
    print(f"\nCredit Card Dataset:")
    print(f"• Best Model: {cc_best_model}")
    print(f"• F1-Score: {cc_comparison.loc[cc_best_model, 'f1_score']:.4f}")
    print(f"• PR-AUC: {cc_comparison.loc[cc_best_model, 'pr_auc']:.4f}")
    print(f"• Recall: {cc_comparison.loc[cc_best_model, 'recall']:.4f}")

print("\n2. FRAUD DRIVER INSIGHTS (from SHAP analysis)")
print("-" * 50)

if 'fraud_insights' in locals() and fraud_insights:
    if 'top_fraud_drivers' in fraud_insights:
        print("\nTop Risk Factors:")
        for driver in fraud_insights['top_fraud_drivers'][:5]:
            print(f"• {driver['feature']}: {driver['interpretation']}")
    
    if 'protective_factors' in fraud_insights:
        print("\nProtective Factors:")
        for factor in fraud_insights['protective_factors'][:5]:
            print(f"• {factor['feature']}: {factor['interpretation']}")

print("\n3. BUSINESS RECOMMENDATIONS")
print("-" * 30)
print("• Implement real-time scoring using the best performing model")
print("• Focus monitoring on high-risk features identified by SHAP")
print("• Set appropriate thresholds balancing fraud detection vs customer experience")
print("• Regular model retraining to adapt to new fraud patterns")
print("• Use SHAP explanations for fraud investigation and rule creation")

=== KEY FINDINGS AND INSIGHTS ===

1. MODEL PERFORMANCE SUMMARY
----------------------------------------

2. FRAUD DRIVER INSIGHTS (from SHAP analysis)
--------------------------------------------------

3. BUSINESS RECOMMENDATIONS
------------------------------
• Implement real-time scoring using the best performing model
• Focus monitoring on high-risk features identified by SHAP
• Set appropriate thresholds balancing fraud detection vs customer experience
• Regular model retraining to adapt to new fraud patterns
• Use SHAP explanations for fraud investigation and rule creation


8. Model Deployment Preparation

In [None]:
# Save best models and preprocessing objects for deployment
import joblib
import os

# Create models directory
os.makedirs('../models', exist_ok=True)

if 'fraud_best_model' in locals():
    # Save fraud detection model and preprocessing objects
    joblib.dump(fraud_models[fraud_best_model], f'../models/fraud_best_model_{fraud_best_model}.pkl')
    joblib.dump(fraud_processed['scaler'], '../models/fraud_scaler.pkl')
    
    # Save feature names
    with open('../models/fraud_feature_names.txt', 'w') as f:
        for feature in fraud_processed['feature_names']:
            f.write(f"{feature}\n")
    
    print(f"Fraud detection model saved: {fraud_best_model}")

if 'cc_best_model' in locals():
    # Save credit card model and preprocessing objects
    joblib.dump(cc_models[cc_best_model], f'../models/creditcard_best_model_{cc_best_model}.pkl')
    joblib.dump(creditcard_processed['scaler'], '../models/creditcard_scaler.pkl')
    
    # Save feature names
    with open('../models/creditcard_feature_names.txt', 'w') as f:
        for feature in creditcard_processed['feature_names']:
            f.write(f"{feature}\n")
    
    print(f"Credit card model saved: {cc_best_model}")

print("\nModels and preprocessing objects saved for deployment!")


Models and preprocessing objects saved for deployment!
