In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    roc_curve, precision_recall_curve,
    confusion_matrix, classification_report
)
import joblib
import shap

In [3]:
# Read the engineered transaction table and the list of model input columns.
# Both paths are relative to the notebook's working directory.

models_dir = "models"
feature_list_path = os.path.join(models_dir, "feature_list.csv")

df = pd.read_csv("data/processed/transactions_features.csv")
# The feature names live in the first column of feature_list.csv.
feature_cols = pd.read_csv(feature_list_path).iloc[:, 0].tolist()

# Design matrix and binary fraud label used by every later cell.
X, y = df[feature_cols], df["TX_FRAUD"]

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/transactions_features.csv'

In [None]:
# Choose which persisted model to evaluate and fetch its decision threshold.
model_name = "XGBoost"  # or "LightGBM", "Ensemble", etc.
model_path = os.path.join(models_dir, f"{model_name}.pkl")
thresholds_path = os.path.join(models_dir, "thresholds.npy")

# NOTE(review): joblib.load and np.load(allow_pickle=True) both unpickle data,
# which can execute arbitrary code -- only load artifacts you produced yourself.
model = joblib.load(model_path)
# thresholds.npy holds a pickled dict; .item() unwraps it from the 0-d array.
thresholds = np.load(thresholds_path, allow_pickle=True).item()
# Falls back to 0.5 when no threshold was stored under this model name.
threshold = thresholds.get(model_name, 0.5)

In [None]:
# Predict probabilities & classes
#
# Dispatch on what the loaded object can do rather than on the model_name
# string: estimators exposing predict_proba (sklearn-style wrappers) return a
# per-class probability matrix, whose last column is the positive class.
# Objects without predict_proba are assumed to return positive-class scores
# directly from predict() -- presumably a native LightGBM Booster; verify
# against how the model was saved.

if hasattr(model, "predict_proba"):
    y_proba = np.asarray(model.predict_proba(X))[:, 1]
else:
    y_proba = np.asarray(model.predict(X))

# Flag a transaction as fraud when its score reaches the selected threshold.
y_pred = (y_proba >= threshold).astype(int)

In [None]:
# Headline metrics: ranking quality (ROC-AUC, PR-AUC) from the raw scores,
# then per-class precision/recall and the confusion matrix from the
# thresholded predictions.

roc_auc, pr_auc = roc_auc_score(y, y_proba), average_precision_score(y, y_proba)
print(f"{model_name} ROC-AUC: {roc_auc:.4f}, PR-AUC: {pr_auc:.4f}")

report_text = classification_report(y, y_pred)
print("Classification Report:\n", report_text)

# cm is reused by the confusion-matrix heatmap cell.
cm = confusion_matrix(y, y_pred)
print("Confusion Matrix:\n", cm)

In [None]:
# Heatmap of the confusion matrix computed in the metrics cell.

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title(f"{model_name} Confusion Matrix (Threshold={threshold:.2f})")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# ROC curve of the model scores, with the chance diagonal for reference.

fpr, tpr, roc_thresholds = roc_curve(y, y_proba)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"ROC-AUC={roc_auc:.4f}")
ax.plot([0, 1], [0, 1], 'k--')  # dashed black diagonal
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title(f"{model_name} ROC Curve")
ax.legend()
plt.show()

In [None]:
# Precision-recall curve of the model scores (recall on x, precision on y).

precision, recall, pr_thresholds = precision_recall_curve(y, y_proba)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(recall, precision, label=f"PR-AUC={pr_auc:.4f}")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title(f"{model_name} Precision-Recall Curve")
ax.legend()
plt.show()

In [None]:
# Cost vs Threshold Curve
#
# Sweep 1000 evenly spaced thresholds in [0, 1] and, for each, total the
# misclassification cost: every false positive costs cost_fp and every
# false negative costs cost_fn.

fpr_limit = 0.005  # NOTE(review): not read anywhere in this cell; kept in case another cell uses it
cost_fp = 1
cost_fn = 25
threshold_list = np.linspace(0,1,1000)

# The label masks do not depend on the threshold, so build them once
# instead of on every iteration.
is_negative = np.asarray(y == 0)
is_positive = np.asarray(y == 1)

costs = []
for t in threshold_list:
    y_p = (y_proba >= t).astype(int)
    fp = np.sum(is_negative & (y_p == 1))
    fn = np.sum(is_positive & (y_p == 0))
    cost = fp*cost_fp + fn*cost_fn
    costs.append(cost)

# Plot total cost against threshold and mark the threshold currently in use.
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(threshold_list, costs)
ax.axvline(x=threshold, color='r', linestyle='--', label=f"Selected Threshold={threshold:.3f}")
ax.set_xlabel("Threshold")
ax.set_ylabel("Cost")
ax.set_title(f"{model_name} Cost vs Threshold")
ax.legend()
plt.show()

In [None]:
# SHAP Explainability
#
# Build a TreeExplainer for the loaded model and compute SHAP values for
# every row of X. The explainer is constructed the same way whatever
# model_name is, so no per-model branching is needed.

print("Computing SHAP values for explainability...")
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Later cells index shap_values[idx] as "the row for transaction idx", which
# needs a single (rows x features) matrix.
# NOTE(review): some SHAP / model combinations appear to return one matrix per
# class (a list, or a 3-D array with the class on the last axis); keep the
# last entry, presumably the positive/fraud class -- confirm against the
# installed SHAP version.
if isinstance(shap_values, list):
    shap_values = shap_values[-1]
elif np.ndim(shap_values) == 3:
    shap_values = shap_values[..., -1]


In [None]:
# Global feature importance: bar-style SHAP summary over all rows of X.
plt.figure()  # start a new figure for this plot
shap.summary_plot(shap_values, features=X, plot_type="bar")

In [None]:
# Detailed SHAP summary (default plot type) over all rows of X.
plt.figure()  # start a new figure for this plot
shap.summary_plot(shap_values, features=X)

In [None]:
# Local explainability: show the 5 flagged transactions with the highest fraud score
fraud_idx = np.where(y_pred==1)[0]

# Rank the flagged rows by predicted probability, highest first, so "top 5"
# means the most confident alerts rather than the first five in file order.
# The stable sort keeps the original row order among ties.
score_order = np.argsort(-np.asarray(y_proba)[fraud_idx], kind="stable")
sample_idx = fraud_idx[score_order[:5]]

# force_plot wants a single base value for a single row of SHAP values.
# NOTE(review): expected_value can be array-valued for some explainers; take
# the last entry, presumably the positive class -- confirm for this model.
base_value = explainer.expected_value
if np.ndim(base_value) > 0:
    base_value = np.ravel(base_value)[-1]

for idx in sample_idx:
    # Select the column before the row so the ID keeps its own dtype;
    # df.iloc[idx] builds a whole-row Series first, which can upcast it.
    print(f"\nTransaction ID: {df['TRANSACTION_ID'].iloc[idx]}")
    shap.force_plot(base_value, shap_values[idx], X.iloc[idx], matplotlib=True)