# Task 3: Model Explainability with SHAP

This notebook demonstrates training a fraud detection model using XGBoost with GPU acceleration, balancing the dataset with SMOTE, and interpreting model predictions using SHAP explainability techniques.


In [None]:
# 📦 Import libraries
import pandas as pd
import shap
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import HistGradientBoostingClassifier
import xgboost as xgb
import joblib


## Load Processed Data and Prepare Features

- Load the processed fraud dataset.
- Separate features (`X`) and target (`y`).
- Split into train and test sets, stratifying on the target so both splits preserve the original class proportions.


In [None]:
# Read the fraud dataset from disk; the label is stored in the 'class' column.
fraud_data = pd.read_csv('Fraud_Data.csv')

# Target vector first, then the feature matrix (every column except the label).
TARGET_COLUMN = 'class'
y = fraud_data[TARGET_COLUMN]
X = fraud_data.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'stratify': y, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


## Convert datetime columns to numeric timestamps

- Convert `signup_time` and `purchase_time` to datetime objects.
- Then convert them to integer timestamps for model compatibility.


In [None]:
# Columns holding timestamps that must become numeric before modelling.
datetime_columns = ['signup_time', 'purchase_time']

_EPOCH = pd.Timestamp('1970-01-01', tz='UTC')
_ONE_SECOND = pd.Timedelta(seconds=1)


def _to_epoch_seconds(values):
    """Return *values* as whole seconds since the Unix epoch.

    Unparseable entries become NaN (the result is then float64) instead of
    being cast from NaT to an integer, which either raises or produces the
    int64-min sentinel depending on the pandas version. Dividing by a
    Timedelta keeps the result independent of the datetime resolution that
    pandas picked while parsing.
    """
    parsed = pd.to_datetime(values, errors='coerce', utc=True)
    return (parsed - _EPOCH) // _ONE_SECOND


# Work on explicit copies: the frames returned by train_test_split are
# derived from X, and assigning columns on them can trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

for col in datetime_columns:
    if col not in X_train.columns:
        continue

    train_seconds = _to_epoch_seconds(X_train[col])
    test_seconds = _to_epoch_seconds(X_test[col])

    # Impute unparseable timestamps with the *training* median so that no
    # information from the test split leaks into preprocessing. When nothing
    # is missing, fillna is a no-op and the column stays int64.
    # NOTE: if every training value is unparseable the median is NaN and the
    # column remains NaN.
    fill_value = train_seconds.median()
    X_train[col] = train_seconds.fillna(fill_value)
    X_test[col] = test_seconds.fillna(fill_value)


## Keep only numeric features for model training

- Drop any non-numeric columns after conversion.


In [None]:
# Restrict both splits to their numeric columns; anything still stored as
# text/object at this point is discarded.
X_train, X_test = (
    frame.select_dtypes(include='number') for frame in (X_train, X_test)
)


## Handle class imbalance by applying SMOTE to the training data


In [None]:
# Oversample the minority class with SMOTE, on the training split only, so
# the test split keeps its original class distribution.
# NOTE(review): SMOTE runs here on the unscaled features (scaling happens in
# the next step), so large-magnitude columns dominate its neighbour search.
SMOTE_SEED = 42
smote = SMOTE(random_state=SMOTE_SEED)
X_resampled, y_resampled = smote.fit_resample(X=X_train, y=y_train)


## Scale features using StandardScaler

- Also, if the resampled training set has more than 10,000 rows, randomly subsample 10,000 of them to speed up training.


In [None]:
# Standardise the features. The scaler is fitted on the (resampled) training
# data only and then applied unchanged to the test split.
# Both results are NumPy arrays, so column names are not carried along.
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# Cap the training set at `sample_size` rows to keep training fast.
sample_size = 10000
n_rows = X_resampled.shape[0]
if n_rows > sample_size:
    # Seeded generator: every other stochastic step in this notebook pins its
    # seed to 42, so the subsample must be reproducible as well.
    rng = np.random.default_rng(42)
    indices = rng.choice(n_rows, size=sample_size, replace=False)
    X_resampled = X_resampled[indices]
    # Positional indexing keeps the labels aligned with the selected rows.
    y_resampled = y_resampled.iloc[indices]


## Train the final model using XGBoost with GPU acceleration


In [None]:
# Train the final model (XGBoost on GPU).
# XGBoost 2.0 replaced `tree_method='gpu_hist'` / `gpu_id` with
# `tree_method='hist'` plus `device='cuda'`, and `use_label_encoder` is no
# longer a recognised parameter there. Pick the spelling that matches the
# installed version so the cell runs without deprecation warnings on either.
xgb_major_version = int(xgb.__version__.split('.')[0])
if xgb_major_version >= 2:
    gpu_params = {'tree_method': 'hist', 'device': 'cuda'}
else:
    gpu_params = {'tree_method': 'gpu_hist', 'gpu_id': 0, 'use_label_encoder': False}

model = xgb.XGBClassifier(eval_metric='logloss', **gpu_params)
model.fit(X_resampled, y_resampled)


## Interpret model predictions using SHAP

- Generate SHAP values for the first 100 test samples.
- Create summary, bar, force, and waterfall plots.


In [None]:
# Initialize the SHAP explainer, using the scaled training data as background.
explainer = shap.Explainer(model, X_resampled)

# Explain the first `n_explained` test samples.
n_explained = 100
X_explain_raw = X_test.iloc[:n_explained]  # unscaled values, used for display
# Wrap the scaled array in a DataFrame so the resulting Explanation carries
# the real column names; a bare ndarray makes the bar and waterfall plots
# label features as "Feature 0", "Feature 1", ...
X_explain = pd.DataFrame(
    X_test_scaled[:n_explained],
    columns=X_test.columns,
    index=X_explain_raw.index,
)
shap_values = explainer(X_explain)

# SHAP summary plot (points coloured by the original, unscaled feature values)
shap.summary_plot(shap_values, X_explain_raw)

# SHAP bar plot (mean absolute SHAP value per feature)
shap.plots.bar(shap_values)

# SHAP force plot for a single prediction
sample_index = 0
sample_explanation = shap_values[sample_index]
shap.initjs()  # only needed for JavaScript-based SHAP plots
# A JavaScript force plot is displayed only when it is the last expression of
# a cell; here its return value was discarded, so nothing was shown. Drawing
# it with matplotlib renders it immediately. The base value is taken from the
# explanation itself instead of relying on `explainer.expected_value`.
shap.force_plot(
    float(sample_explanation.base_values),
    sample_explanation.values,
    X_explain_raw.iloc[sample_index],
    matplotlib=True,
)

# SHAP waterfall plot for the same sample
shap.plots.waterfall(sample_explanation)
