In [46]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold, cross_val_score


In [47]:
# Load the pre-split credit and fraud datasets from ../data/processed.
# Feature files stay as DataFrames (column names are needed later);
# label files are flattened into 1-D NumPy arrays.

# --- features ---
credit_X_train = pd.read_csv("../data/processed/credit_X_train.csv")
credit_X_test = pd.read_csv("../data/processed/credit_X_test.csv")
fraud_X_train = pd.read_csv("../data/processed/fraud_X_train.csv")
fraud_X_test = pd.read_csv("../data/processed/fraud_X_test.csv")

# --- labels ---
credit_y_train = pd.read_csv("../data/processed/credit_y_train.csv").to_numpy().ravel()
credit_y_test = pd.read_csv("../data/processed/credit_y_test.csv").to_numpy().ravel()
fraud_y_train = pd.read_csv("../data/processed/fraud_y_train.csv").to_numpy().ravel()
fraud_y_test = pd.read_csv("../data/processed/fraud_y_test.csv").to_numpy().ravel()


In [63]:

# -----------------------------
# 2. Baseline Model: Logistic Regression
# -----------------------------
# Class-weighted logistic regression trained on the fraud training split.
log_reg = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
log_reg.fit(fraud_X_train, fraud_y_train)

# Test-split outputs: positive-class score (column 1 of predict_proba) and hard labels.
y_prob_lr = log_reg.predict_proba(fraud_X_test)[:, 1]
y_pred_lr = log_reg.predict(fraud_X_test)

# Score-based metric: area under the precision-recall curve.
precision_lr, recall_lr, _ = precision_recall_curve(fraud_y_test, y_prob_lr)
pr_auc_lr = auc(recall_lr, precision_lr)

# Label-based metrics.
cm_lr = confusion_matrix(fraud_y_test, y_pred_lr)
f1_lr = f1_score(fraud_y_test, y_pred_lr)

print("Logistic Regression — FRAUD DATA")
print("F1-Score:", f1_lr)
print("PR-AUC:", pr_auc_lr)
print("Confusion Matrix:\n", cm_lr)
print("Classification Report:\n", classification_report(fraud_y_test, y_pred_lr))

Logistic Regression — FRAUD DATA
F1-Score: 0.6705911209222467
PR-AUC: 0.5787699929435683
Confusion Matrix:
 [[23120   256]
 [ 1087  1367]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     23376
           1       0.84      0.56      0.67      2454

    accuracy                           0.95     25830
   macro avg       0.90      0.77      0.82     25830
weighted avg       0.94      0.95      0.94     25830



In [64]:
# -----------------------------
# 3. Ensemble Model: Random Forest
# -----------------------------
# Class-weighted, depth-limited forest trained on the fraud training split.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
rf.fit(fraud_X_train, fraud_y_train)

# Test-split outputs: positive-class score (column 1 of predict_proba) and hard labels.
y_prob_rf = rf.predict_proba(fraud_X_test)[:, 1]
y_pred_rf = rf.predict(fraud_X_test)

# Score-based metric: area under the precision-recall curve.
precision_rf, recall_rf, _ = precision_recall_curve(fraud_y_test, y_prob_rf)
pr_auc_rf = auc(recall_rf, precision_rf)

# Label-based metrics.
cm_rf = confusion_matrix(fraud_y_test, y_pred_rf)
f1_rf = f1_score(fraud_y_test, y_pred_rf)

print("Random Forest F1:", f1_rf)
print("Random Forest PR-AUC:", pr_auc_rf)
print("Confusion Matrix:\n", cm_rf)
print("Classification Report:\n", classification_report(fraud_y_test, y_pred_rf))


Random Forest F1: 0.6979623420170235
Random Forest PR-AUC: 0.6452925602390349
Confusion Matrix:
 [[23306    70]
 [ 1101  1353]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98     23376
           1       0.95      0.55      0.70      2454

    accuracy                           0.95     25830
   macro avg       0.95      0.77      0.84     25830
weighted avg       0.95      0.95      0.95     25830



In [53]:
# -----------------------------
# 4. Cross-Validation (Stratified K-Fold)
# -----------------------------
# A single shuffled, seeded 5-fold splitter is passed to both calls so the two
# models are scored with the same splitting configuration.
# NOTE(review): scoring uses fraud_X_train only; if that split was resampled
# upstream, these fold scores are not comparable to the test-set F1 — confirm.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lr_cv_f1 = cross_val_score(
    log_reg, X=fraud_X_train, y=fraud_y_train, scoring='f1', cv=skf
)
rf_cv_f1 = cross_val_score(
    rf, X=fraud_X_train, y=fraud_y_train, scoring='f1', cv=skf
)

# Report mean ± standard deviation of the per-fold F1 scores.
print("LR CV F1: {:.4f} ± {:.4f}".format(np.mean(lr_cv_f1), np.std(lr_cv_f1)))
print("RF CV F1: {:.4f} ± {:.4f}".format(np.mean(rf_cv_f1), np.std(rf_cv_f1)))

LR CV F1: 0.8551 ± 0.0011
RF CV F1: 0.7234 ± 0.0028


In [55]:
# Logistic Regression on Credit Card Data
# Fit an unfitted copy of log_reg (same hyperparameters) rather than calling
# log_reg.fit(...) here: refitting in place would overwrite the model trained on
# the fraud data, making any later use of log_reg depend on cell execution order.
credit_log_reg = clone(log_reg)
credit_log_reg.fit(credit_X_train, credit_y_train)

# Hard labels and positive-class scores (column 1 of predict_proba) on the credit test split.
y_pred = credit_log_reg.predict(credit_X_test)
y_prob = credit_log_reg.predict_proba(credit_X_test)[:, 1]

# F1 and confusion matrix use the hard labels; AUC-PR uses the scores.
credit_lr_f1 = f1_score(credit_y_test, y_pred)
precision, recall, _ = precision_recall_curve(credit_y_test, y_prob)
credit_lr_aucpr = auc(recall, precision)
credit_lr_cm = confusion_matrix(credit_y_test, y_pred)

print("Logistic Regression — CREDIT DATA")
print("F1 Score:", credit_lr_f1)
print("AUC-PR:", credit_lr_aucpr)
print("Confusion Matrix:\n", credit_lr_cm)

Logistic Regression — CREDIT DATA
F1 Score: 0.23275862068965517
AUC-PR: 0.7586498457173735
Confusion Matrix:
 [[56131   520]
 [   14    81]]


In [59]:
# Random Forest on Credit Card Data
# Fit an unfitted copy of rf (same hyperparameters) rather than calling rf.fit(...)
# here. The feature-importance cell further down reads rf.feature_importances_ and
# labels them with fraud_X_train.columns, so rf must remain the fraud-trained
# forest when the notebook is run top to bottom.
credit_rf = clone(rf)
credit_rf.fit(credit_X_train, credit_y_train)

# Hard labels and positive-class scores (column 1 of predict_proba) on the credit test split.
y_pred = credit_rf.predict(credit_X_test)
y_prob = credit_rf.predict_proba(credit_X_test)[:, 1]

# F1 and confusion matrix use the hard labels; AUC-PR uses the scores.
credit_rf_f1 = f1_score(credit_y_test, y_pred)
precision, recall, _ = precision_recall_curve(credit_y_test, y_prob)
credit_rf_aucpr = auc(recall, precision)
credit_rf_cm = confusion_matrix(credit_y_test, y_pred)

print("Random Forest — CREDIT DATA")
print("F1 Score:", credit_rf_f1)
print("AUC-PR:", credit_rf_aucpr)
print("Confusion Matrix:\n", credit_rf_cm)

Random Forest — CREDIT DATA
F1 Score: 0.6695652173913044
AUC-PR: 0.7960997753841067
Confusion Matrix:
 [[56593    58]
 [   18    77]]


In [66]:
# -----------------------------
# 5. Feature Importance (Ensemble)
# -----------------------------
# rf is a shared notebook variable, and the importances below are labelled with
# the fraud feature columns. Fail early with a clear message if rf was last
# fitted on data with a different number of features (e.g. after an
# out-of-order or top-to-bottom run that refits rf on another dataset).
n_importances = len(rf.feature_importances_)
n_fraud_features = fraud_X_train.shape[1]
if n_importances != n_fraud_features:
    raise ValueError(
        "rf has {} feature importances but fraud_X_train has {} columns; "
        "refit rf on the fraud training data before running this cell.".format(
            n_importances, n_fraud_features
        )
    )

# One row per feature, most important first.
importances = pd.DataFrame({
    'feature': fraud_X_train.columns,
    'importance': rf.feature_importances_
}).sort_values(by='importance', ascending=False)

print("Top 10 Features (RF):")
print(importances.head(10))

# -----------------------------
# 6. Metrics Comparison Table
# -----------------------------
# Side-by-side F1 and PR-AUC for the two fraud models, using the metric
# variables computed in the fraud model cells (f1_lr / f1_rf, pr_auc_lr / pr_auc_rf).
comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'F1': [f1_lr, f1_rf],
    'PR-AUC': [pr_auc_lr, pr_auc_rf]
})

print("\nModel Comparison:\n", comparison)

Top 10 Features (RF):
                   feature  importance
5            short_account    0.426947
2        time_since_signup    0.425213
187  country_United States    0.030788
1                      age    0.011421
16                   sex_M    0.009870
9        purchase_velocity    0.008692
10           source_Direct    0.008456
0           purchase_value    0.008400
11              source_SEO    0.006673
52           country_China    0.006247

Model Comparison:
                  Model        F1    PR-AUC
0  Logistic Regression  0.670591  0.578770
1        Random Forest  0.697962  0.645293
