In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_recall_curve, auc


In [2]:
# Read the train/test CSVs from ../data/processed.
# Feature tables are kept as DataFrames; every label file is flattened to a 1-D NumPy array
# so it can be passed directly as `y` to the estimators and metric functions below.

# Fraud Data
fraud_X_train = pd.read_csv("../data/processed/fraud_X_train.csv")
fraud_y_train = pd.read_csv("../data/processed/fraud_y_train.csv").to_numpy().ravel()
fraud_X_test = pd.read_csv("../data/processed/fraud_X_test.csv")
fraud_y_test = pd.read_csv("../data/processed/fraud_y_test.csv").to_numpy().ravel()

# Credit Card Data
credit_X_train = pd.read_csv("../data/processed/credit_X_train.csv")
credit_y_train = pd.read_csv("../data/processed/credit_y_train.csv").to_numpy().ravel()
credit_X_test = pd.read_csv("../data/processed/credit_X_test.csv")
credit_y_test = pd.read_csv("../data/processed/credit_y_test.csv").to_numpy().ravel()


In [3]:

# 2. Baseline Model: Logistic Regression (fraud data)
# -----------------------------
# `fit` returns the estimator itself, so `log_reg` is the fitted model.
log_reg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
).fit(fraud_X_train, fraud_y_train)

# Test-set predictions: hard labels, plus the probability column for class index 1
# (used as the ranking score for the precision-recall curve).
y_pred_lr = log_reg.predict(fraud_X_test)
y_prob_lr = log_reg.predict_proba(fraud_X_test)[:, 1]

# Test-set metrics
cm_lr = confusion_matrix(fraud_y_test, y_pred_lr)
f1_lr = f1_score(fraud_y_test, y_pred_lr)
precision_lr, recall_lr, _ = precision_recall_curve(fraud_y_test, y_prob_lr)
pr_auc_lr = auc(recall_lr, precision_lr)  # area under the precision-recall curve

print("Logistic Regression — FRAUD DATA")
for label, value in (
    ("F1-Score:", f1_lr),
    ("PR-AUC:", pr_auc_lr),
    ("Confusion Matrix:\n", cm_lr),
    ("Classification Report:\n", classification_report(fraud_y_test, y_pred_lr)),
):
    print(label, value)

Logistic Regression — FRAUD DATA
F1-Score: 0.6705911209222467
PR-AUC: 0.5787699929435683
Confusion Matrix:
 [[23120   256]
 [ 1087  1367]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     23376
           1       0.84      0.56      0.67      2454

    accuracy                           0.95     25830
   macro avg       0.90      0.77      0.82     25830
weighted avg       0.94      0.95      0.94     25830



In [5]:
# 3. Random Forest with grid-searched hyperparameters (fraud data)
# -----------------------------
# Hyperparameter grid: 3 * 3 * 2 * 2 = 36 candidates, each scored by 5-fold CV on F1.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# `rf` is only the template; GridSearchCV fits clones of it.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_rf.fit(fraud_X_train, fraud_y_train)

# Best configuration found by the search
best_rf = grid_rf.best_estimator_
print("Best RF Params:", grid_rf.best_params_)

# Test-set predictions
y_pred_rf = best_rf.predict(fraud_X_test)
y_prob_rf = best_rf.predict_proba(fraud_X_test)[:, 1]

# Test-set metrics
f1_rf = f1_score(fraud_y_test, y_pred_rf)
cm_rf = confusion_matrix(fraud_y_test, y_pred_rf)
precision_rf, recall_rf, _ = precision_recall_curve(fraud_y_test, y_prob_rf)
pr_auc_rf = auc(recall_rf, precision_rf)

print("Random Forest — FRAUD DATA")
for label, value in (
    ("F1-Score:", f1_rf),
    ("PR-AUC:", pr_auc_rf),
    ("Confusion Matrix:\n", cm_rf),
):
    print(label, value)
print(classification_report(fraud_y_test, y_pred_rf))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best RF Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Random Forest — FRAUD DATA
F1-Score: 0.6881937436932392
PR-AUC: 0.6429118781361309
Confusion Matrix:
 [[23230   146]
 [ 1090  1364]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     23376
           1       0.90      0.56      0.69      2454

    accuracy                           0.95     25830
   macro avg       0.93      0.77      0.83     25830
weighted avg       0.95      0.95      0.95     25830



In [6]:
# 4. Cross-Validation (Stratified K-Fold)
# -----------------------------
# One shuffled, seeded 5-fold splitter is shared by both models.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): best_rf's hyperparameters were chosen by the grid search on this same
# training set, so its CV score here is not an independent estimate.
lr_cv_f1 = cross_val_score(
    log_reg, fraud_X_train, fraud_y_train, scoring='f1', cv=skf
)
rf_cv_f1 = cross_val_score(
    best_rf, fraud_X_train, fraud_y_train, scoring='f1', cv=skf
)

# Mean ± standard deviation of the per-fold F1 scores
print(f"LR CV F1: {lr_cv_f1.mean():.4f} ± {lr_cv_f1.std():.4f}")
print(f"RF CV F1: {rf_cv_f1.mean():.4f} ± {rf_cv_f1.std():.4f}")

LR CV F1: 0.8551 ± 0.0011
RF CV F1: 0.9579 ± 0.0007


In [55]:
# Logistic Regression on Credit Card Data
#
# Fit a dedicated estimator instead of refitting the shared `log_reg` in place.
# Refitting `log_reg` here would silently replace the model trained on the fraud data,
# so any cell that reads `log_reg` would depend on execution order. The hyperparameters
# are the same ones `log_reg` is constructed with elsewhere in this notebook, so the
# results of this cell are unchanged.
credit_log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
credit_log_reg.fit(credit_X_train, credit_y_train)

# Test-set predictions: hard labels and the probability column for class index 1
y_pred = credit_log_reg.predict(credit_X_test)
y_prob = credit_log_reg.predict_proba(credit_X_test)[:, 1]

# Test-set metrics
credit_lr_f1 = f1_score(credit_y_test, y_pred)
precision, recall, _ = precision_recall_curve(credit_y_test, y_prob)
credit_lr_aucpr = auc(recall, precision)  # area under the precision-recall curve
credit_lr_cm = confusion_matrix(credit_y_test, y_pred)

print("Logistic Regression — CREDIT DATA")
print("F1 Score:", credit_lr_f1)
print("AUC-PR:", credit_lr_aucpr)
print("Confusion Matrix:\n", credit_lr_cm)

Logistic Regression — CREDIT DATA
F1 Score: 0.23275862068965517
AUC-PR: 0.7586498457173735
Confusion Matrix:
 [[56131   520]
 [   14    81]]


In [59]:
# Random Forest on Credit Card Data (untuned configuration)
#
# Build the estimator explicitly instead of calling `rf.fit(...)`.
# `rf` starts as the untuned grid-search template and is rebound further down to the
# grid-searched credit model, so `rf.fit(...)` here would train a different configuration
# depending on which cells ran before this one. The settings below match the untuned
# template, which is what a top-to-bottom run of the notebook uses at this point.
credit_rf_baseline = RandomForestClassifier(class_weight='balanced', random_state=42)
credit_rf_baseline.fit(credit_X_train, credit_y_train)

# Test-set predictions: hard labels and the probability column for class index 1
y_pred = credit_rf_baseline.predict(credit_X_test)
y_prob = credit_rf_baseline.predict_proba(credit_X_test)[:, 1]

# Test-set metrics
credit_rf_f1 = f1_score(credit_y_test, y_pred)
precision, recall, _ = precision_recall_curve(credit_y_test, y_prob)
credit_rf_aucpr = auc(recall, precision)  # area under the precision-recall curve
credit_rf_cm = confusion_matrix(credit_y_test, y_pred)

print("Random Forest — CREDIT DATA")
print("F1 Score:", credit_rf_f1)
print("AUC-PR:", credit_rf_aucpr)
print("Confusion Matrix:\n", credit_rf_cm)

Random Forest — CREDIT DATA
F1 Score: 0.6695652173913044
AUC-PR: 0.7960997753841067
Confusion Matrix:
 [[56593    58]
 [   18    77]]


In [7]:
# -----------------------------
# 5. Feature Importance (Ensemble)
# -----------------------------
# Pair every training column with the importance reported by the tuned forest,
# ordered from most to least important.
importances = (
    pd.DataFrame(
        {
            'feature': fraud_X_train.columns,
            'importance': best_rf.feature_importances_,
        }
    )
    .sort_values('importance', ascending=False)
)

print("Top 10 Features (RF — Fraud Data):")
print(importances.head(10))


Top 10 Features (RF — Fraud Data):
                   feature  importance
2        time_since_signup    0.263552
5            short_account    0.171632
1                      age    0.083440
0           purchase_value    0.075185
9        purchase_velocity    0.074826
3              hour_of_day    0.067402
4              day_of_week    0.037419
187  country_United States    0.027759
52           country_China    0.013834
11              source_SEO    0.011231


In [8]:
# -----------------------------
# 6. Metrics Comparison Table
# -----------------------------
# One row per model: test-set F1 and PR-AUC on the fraud data.
# NOTE: `f1_lr` and `f1_rf` are rebound to credit-card scores by later cells, so this
# table is only correct when the notebook is run top to bottom.
comparison = pd.DataFrame(
    [
        ('Logistic Regression', f1_lr, pr_auc_lr),
        ('Random Forest (Tuned)', f1_rf, pr_auc_rf),
    ],
    columns=['Model', 'F1', 'PR-AUC'],
)

print("\nModel Comparison:\n", comparison)



Model Comparison:
                    Model        F1    PR-AUC
0    Logistic Regression  0.670591  0.578770
1  Random Forest (Tuned)  0.688194  0.642912


## Credit Card Fraud Detection — Model Training

We trained two models on the Credit Card dataset:

1. Logistic Regression — interpretable baseline
2. Random Forest — ensemble model to capture nonlinear patterns

Because fraud datasets are highly imbalanced, we evaluate with F1-score and PR-AUC (area under the precision-recall curve) rather than accuracy. Stratified splits and stratified cross-validation are used to preserve the fraud class distribution.


In [9]:
# Stratified Cross-Validation
# A fresh, unfitted Logistic Regression scored with shuffled, seeded 5-fold stratified CV
# on the credit card training set.
# NOTE(review): the CV F1 in the saved output (~0.98) is far above the held-out test F1
# reported for the same model (~0.23). Presumably the processed training set was rebalanced
# before this step, which would make the CV score optimistic; confirm against preprocessing.
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lr_cv_f1 = cross_val_score(log_reg, credit_X_train, credit_y_train, scoring='f1', cv=skf)

# Mean ± standard deviation of the per-fold F1 scores
print(f"Logistic Regression CV F1: {lr_cv_f1.mean():.4f} ± {lr_cv_f1.std():.4f}")


Logistic Regression CV F1: 0.9810 ± 0.0004


In [10]:
# Train Logistic Regression Baseline (credit card data)
# NOTE: `f1_lr`, `cm_lr`, `y_pred_lr`, etc. are rebound here to credit-card results;
# the fraud-data values computed earlier under the same names are overwritten.
log_reg.fit(credit_X_train, credit_y_train)

# Test-set predictions: hard labels and the probability column for class index 1
y_prob_lr = log_reg.predict_proba(credit_X_test)[:, 1]
y_pred_lr = log_reg.predict(credit_X_test)

# Test-set metrics
cm_lr = confusion_matrix(credit_y_test, y_pred_lr)
f1_lr = f1_score(credit_y_test, y_pred_lr)
precision_lr, recall_lr, _ = precision_recall_curve(credit_y_test, y_prob_lr)
aucpr_lr = auc(recall_lr, precision_lr)  # area under the precision-recall curve

print("Logistic Regression — CREDIT DATA")
for label, value in (
    ("F1 Score:", f1_lr),
    ("PR-AUC:", aucpr_lr),
    ("Confusion Matrix:\n", cm_lr),
    ("Classification Report:\n", classification_report(credit_y_test, y_pred_lr)),
):
    print(label, value)


Logistic Regression — CREDIT DATA
F1 Score: 0.23275862068965517
PR-AUC: 0.7586498457173735
Confusion Matrix:
 [[56131   520]
 [   14    81]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     56651
           1       0.13      0.85      0.23        95

    accuracy                           0.99     56746
   macro avg       0.57      0.92      0.61     56746
weighted avg       1.00      0.99      0.99     56746



In [11]:
# Hyperparameter Tuning for Random Forest (credit card data)
# 2 * 3 * 2 = 12 candidates, each scored by 5-fold CV on F1.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(credit_X_train, credit_y_train)

# From here on `rf` refers to the best model found by the search.
rf = grid_rf.best_estimator_

print("Best RF Parameters:", grid_rf.best_params_)


Best RF Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}


In [13]:
# Evaluate the tuned Random Forest on the credit card test set.
# No training happens in this cell: `rf` is used as-is for prediction.
y_prob_rf = rf.predict_proba(credit_X_test)[:, 1]
y_pred_rf = rf.predict(credit_X_test)

# Test-set metrics
cm_rf = confusion_matrix(credit_y_test, y_pred_rf)
f1_rf = f1_score(credit_y_test, y_pred_rf)
precision_rf, recall_rf, _ = precision_recall_curve(credit_y_test, y_prob_rf)
aucpr_rf = auc(recall_rf, precision_rf)  # area under the precision-recall curve

print("Random Forest — CREDIT DATA")
for label, value in (
    ("F1 Score:", f1_rf),
    ("PR-AUC:", aucpr_rf),
    ("Confusion Matrix:\n", cm_rf),
    ("Classification Report:\n", classification_report(credit_y_test, y_pred_rf)),
):
    print(label, value)


Random Forest — CREDIT DATA
F1 Score: 0.8135593220338984
PR-AUC: 0.8212551396272318
Confusion Matrix:
 [[56641    10]
 [   23    72]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56651
           1       0.88      0.76      0.81        95

    accuracy                           1.00     56746
   macro avg       0.94      0.88      0.91     56746
weighted avg       1.00      1.00      1.00     56746



In [14]:
# Compare Models (credit card data): test-set F1 and PR-AUC, one row per model
comparison = pd.DataFrame(
    [
        ('Logistic Regression', f1_lr, aucpr_lr),
        ('Random Forest', f1_rf, aucpr_rf),
    ],
    columns=['Model', 'F1 Score', 'PR-AUC'],
)

comparison


Unnamed: 0,Model,F1 Score,PR-AUC
0,Logistic Regression,0.232759,0.75865
1,Random Forest,0.813559,0.821255


In [15]:
# Feature Importance (credit card data)
# Pair every training column with the importance reported by the tuned forest,
# ordered from most to least important.
importances = (
    pd.DataFrame(
        {
            'feature': credit_X_train.columns,
            'importance': rf.feature_importances_,
        }
    )
    .sort_values('importance', ascending=False)
)

importances.head(10)


Unnamed: 0,feature,importance
13,V14,0.234198
9,V10,0.156478
16,V17,0.111272
11,V12,0.090547
3,V4,0.08642
2,V3,0.05894
10,V11,0.04781
1,V2,0.047013
15,V16,0.029937
6,V7,0.023518


### Cross-Validation

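We applied stratified 5-fold cross-validation so that each fold preserves the fraud ratio of the training data.
In general this gives a more reliable estimate than a single train-test split, especially under heavy imbalance. Note, however, that the cross-validated F1 reported above for Logistic Regression (0.98) is far higher than its held-out test F1 (0.23). The CV scores in this notebook should therefore be read as optimistic — most likely because the processed training set was rebalanced before cross-validation (to be confirmed against the preprocessing step) — and the test-set metrics are the figures to rely on.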
