# Credit Card Fraud Detection Demo

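This notebook walks through:
- Downloading the dataset from Kaggle (falling back to a local copy in `../data/` if the download fails)
- Loading the dataset and splitting it into train, validation, and test sets
- Training multiple models
- Evaluating performance with imbalance-aware metrics
- Selecting the best model by validation F1, retraining it on train + validation, and reporting test metrics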



In [3]:
# Install and import dependencies
# %pip install "kagglehub[pandas-datasets]"  # Uncomment if kagglehub is not installed

import os
import numpy as np
import pandas as pd

# Import from src directory
import sys
sys.path.append('../src')

from data_utils import load_creditcard_data
from models import build_models
from evaluation import evaluate_predictions, print_detailed_report

# Location of the local dataset copy, shared by the download step and the loader below.
DATA_DIR = "../data"
DATA_PATH = "../data/creditcard.csv"

# Option 1: Use KaggleHub (if working)
# Tracks whether `df` came from a fresh download, so the local copy is only
# (re)written when there is actually something new to save.
downloaded_from_kaggle = False
try:
    # Imported lazily: kagglehub is optional and the local-file fallback must
    # still work when it is missing or broken.
    import kagglehub
    print("Downloading dataset from Kaggle using kagglehub...")
    # Download the dataset
    path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
    csv_file = os.path.join(path, "creditcard.csv")
    df = pd.read_csv(csv_file)
    downloaded_from_kaggle = True
    print("Dataset loaded successfully from KaggleHub")
except Exception as e:
    # Deliberately broad: any download/import/read failure falls back to a
    # manually downloaded file.
    # NOTE(review): a recorded run failed here with "module 'kagglehub' has no
    # attribute 'dataset_download'" - possibly an outdated or shadowed
    # kagglehub install; confirm the environment.
    print(f"KaggleHub failed: {e}")
    print("Please download creditcard.csv manually from https://www.kaggle.com/mlg-ulb/creditcardfraud")
    print("and place it in the ../data/ directory")
    if not os.path.exists(DATA_PATH):
        # Chain the original failure so the root cause stays visible in the traceback.
        raise FileNotFoundError("Please download the dataset first") from e
    df = pd.read_csv(DATA_PATH)
    print("Using existing dataset file")

# Save to data directory - only when the data was freshly downloaded.
# In the fallback case `df` was just read from DATA_PATH, so writing it back
# would only rewrite the user's existing file for no benefit.
if downloaded_from_kaggle:
    os.makedirs(DATA_DIR, exist_ok=True)
    df.to_csv(DATA_PATH, index=False)
    print(f"Dataset saved to {DATA_PATH}")

# Quick class-balance summary; the fraud count is the sum of the `Class` column.
n_fraud = df["Class"].sum()
print(f"Dataset shape: {df.shape}")
print(f"Fraud cases: {n_fraud}")
print(f"Non-fraud cases: {len(df) - n_fraud}")

# Load and split the data
(X_train,
 X_val,
 X_test,
 y_train,
 y_val,
 y_test,
 scaler,) = load_creditcard_data(DATA_PATH)

# Dict of model name -> estimator, consumed by the training loop in the next cell.
models = build_models()



Downloading dataset from Kaggle using kagglehub...
KaggleHub failed: module 'kagglehub' has no attribute 'dataset_download'
Please download creditcard.csv manually from https://www.kaggle.com/mlg-ulb/creditcardfraud
and place it in the ../data/ directory
Using existing dataset file
Dataset saved to ../data/creditcard.csv
Dataset shape: (284807, 31)
Fraud cases: 492
Non-fraud cases: 284315


In [4]:
# Fit every candidate model on the training split and collect its validation
# metrics, keyed by model name.
# NOTE(review): the recorded outputs of this cell show negative `pr_auc`
# values, which is not a valid area under a precision-recall curve - this
# presumably originates inside `evaluate_predictions`; confirm in the
# evaluation module.
results = {}

for model_name, estimator in models.items():
    print(f"\n=== Model: {model_name} ===")
    estimator.fit(X_train, y_train)
    val_pred = estimator.predict(X_val)

    # Continuous scores for the ranking metrics: positive-class probability
    # when available, otherwise the raw decision function, otherwise nothing.
    if hasattr(estimator, "predict_proba"):
        val_scores = estimator.predict_proba(X_val)[:, 1]
    elif hasattr(estimator, "decision_function"):
        val_scores = estimator.decision_function(X_val)
    else:
        val_scores = None

    val_metrics = evaluate_predictions(y_val, val_pred, val_scores)
    results[model_name] = val_metrics
    print(val_metrics)




=== Model: log_reg ===




{'accuracy': 0.9767213229872547, 'precision': 0.06017191977077364, 'recall': 0.8571428571428571, 'f1': 0.11244979919678715, 'roc_auc': 0.9614342993809791, 'pr_auc': -0.6692184044877432}

=== Model: decision_tree ===
{'accuracy': 0.9989817773252344, 'precision': 0.7272727272727273, 'recall': 0.6530612244897959, 'f1': 0.6881720430107527, 'roc_auc': 0.8263195824193492, 'pr_auc': -0.690465420458348}

=== Model: random_forest ===
{'accuracy': 0.9994382219725431, 'precision': 0.9714285714285714, 'recall': 0.6938775510204082, 'f1': 0.8095238095238095, 'roc_auc': 0.9257074523675537, 'pr_auc': -0.7979883332294069}

=== Model: grad_boost ===
{'accuracy': 0.9989817773252344, 'precision': 0.8125, 'recall': 0.5306122448979592, 'f1': 0.6419753086419753, 'roc_auc': 0.8160749457351877, 'pr_auc': -0.6647934554190484}


In [5]:
# Pick the best model by validation F1 (entries without an "f1" key rank as 0.0).
if not results:
    # max() on an empty dict would raise an opaque "empty sequence" ValueError.
    raise ValueError("No validation results found - run the model comparison cell first")
best_name = max(results, key=lambda k: results[k].get("f1", 0.0))
print("Best model:", best_name, results[best_name])

# Note: this is the same estimator object stored in `models`, so the refit
# below replaces the validation-fitted state of models[best_name].
best_model = models[best_name]

# Retrain on train+val so the final model uses all non-test data.
X_train_val = np.vstack([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])
assert len(X_train_val) == len(y_train_val), "feature/label row counts diverged"

best_model.fit(X_train_val, y_train_val)

# Final evaluation on the held-out test split.
y_test_pred = best_model.predict(X_test)
# Continuous scores for the ranking metrics: positive-class probability when
# available, otherwise the raw decision function, otherwise None.
y_test_proba = None
if hasattr(best_model, "predict_proba"):
    y_test_proba = best_model.predict_proba(X_test)[:, 1]
elif hasattr(best_model, "decision_function"):
    y_test_proba = best_model.decision_function(X_test)

print("\nTest metrics:")
print(evaluate_predictions(y_test, y_test_pred, y_test_proba))
print_detailed_report(y_test, y_test_pred)



Best model: random_forest {'accuracy': 0.9994382219725431, 'precision': 0.9714285714285714, 'recall': 0.6938775510204082, 'f1': 0.8095238095238095, 'roc_auc': 0.9257074523675537, 'pr_auc': -0.7979883332294069}

Test metrics:
{'accuracy': 0.9995435553526912, 'precision': 0.9615384615384616, 'recall': 0.7653061224489796, 'f1': 0.8522727272727273, 'roc_auc': 0.9573048620123344, 'pr_auc': -0.8683508053012461}
Confusion matrix:
[[56861     3]
 [   23    75]]

Classification report:
              precision    recall  f1-score   support

           0     0.9996    0.9999    0.9998     56864
           1     0.9615    0.7653    0.8523        98

    accuracy                         0.9995     56962
   macro avg     0.9806    0.8826    0.9260     56962
weighted avg     0.9995    0.9995    0.9995     56962

