In [1]:
# ─── Cell 1: SIMULATE TRANSACTION DATA ────────────────────────────────────────
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# 1. Generate synthetic data: 100k transactions, 1% fraud
X, y = make_classification(
    n_samples=100_000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_clusters_per_class=1,
    weights=[0.99, 0.01],
    flip_y=0.01,
    class_sep=1.5,
    random_state=42
)

# 2. Wrap in DataFrame
cols = [f"feat_{i}" for i in range(X.shape[1])]
df_tx = pd.DataFrame(X, columns=cols)
df_tx['is_fraud'] = y

# 3. Peek at class balance & head
print("Class distribution:\n", df_tx['is_fraud'].value_counts(normalize=True))
display(df_tx.head())


Class distribution:
 is_fraud
0    0.98498
1    0.01502
Name: proportion, dtype: float64


Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,is_fraud
0,1.709998,1.924747,0.528911,-0.442117,-5.005879,1.408948,-2.746236,0.943649,-2.179255,1.851989,0
1,-0.965912,0.743583,0.388698,-1.135427,0.91616,-1.992164,1.851897,-0.177008,-0.878729,-0.249517,0
2,1.609364,-1.217612,-0.124207,-0.647378,-4.113871,2.693689,-2.228825,2.851005,-0.549156,0.431975,0
3,-0.195988,-1.305993,-0.949106,1.057753,-3.421402,2.499219,-1.076558,2.925982,-0.702132,-0.312159,0
4,2.06295,0.47707,-0.913289,-0.338946,-4.337922,0.994193,-2.63432,1.067582,-0.860559,2.88629,0


In [2]:
# ─── Cell 2: ISOLATION FOREST ANOMALY DETECTION ───────────────────────────────
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# 1. Features & label
X_tx = df_tx.drop(columns=['is_fraud'])
y_tx = df_tx['is_fraud']

# 2. Train Isolation Forest (using known fraud rate as contamination)
iso = IsolationForest(
    n_estimators=100,
    contamination=y_tx.mean(),  # ≈1.5% fraud
    random_state=42
)
iso.fit(X_tx)

# 3. Predict: -1 = anomaly → fraud, 1 = normal
pred_raw   = iso.predict(X_tx)
y_pred_iso = (pred_raw == -1).astype(int)

# 4. Evaluate
print("Isolation Forest Metrics:")
print(" Precision: ", precision_score(y_tx, y_pred_iso))
print(" Recall:    ", recall_score(y_tx, y_pred_iso))
print(" F1 Score:  ", f1_score(y_tx, y_pred_iso))
print(" ROC-AUC:   ", roc_auc_score(y_tx, iso.decision_function(X_tx)))


Isolation Forest Metrics:
 Precision:  0.11584553928095873
 Recall:     0.11584553928095873
 F1 Score:   0.11584553928095873
 ROC-AUC:    0.23529564525213986


In [3]:
# ─── Cell 3: SUPERVISED RANDOM FOREST FOR FRAUD DETECTION ─────────────────────
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# 1. Split into train/test (80/20 stratified)
X = df_tx.drop(columns=['is_fraud'])
y = df_tx['is_fraud']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 2. Fit Random Forest with balanced class weights
rf_clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf_clf.fit(X_train, y_train)

# 3. Predict & score at default 0.5
y_prob_rf = rf_clf.predict_proba(X_test)[:,1]
y_pred_rf = (y_prob_rf >= 0.5).astype(int)

print("Random Forest (@0.5) Metrics:")
print(" Precision: ", precision_score(y_test, y_pred_rf))
print(" Recall:    ", recall_score(y_test, y_pred_rf))
print(" F1 Score:  ", f1_score(y_test, y_pred_rf))
print(" ROC-AUC:   ", roc_auc_score(y_test, y_prob_rf))


Random Forest (@0.5) Metrics:
 Precision:  0.9675675675675676
 Recall:     0.5966666666666667
 F1 Score:   0.7381443298969073
 ROC-AUC:    0.8511716582064297


In [4]:
# ─── Cell 4: THRESHOLD TUNING FOR RF FRAUD CLASSIFIER ─────────────────────────
import numpy as np
from sklearn.metrics import precision_score, recall_score

thresholds = np.linspace(0.1, 0.9, 17)
print("Thresh  Precision   Recall")
for t in thresholds:
    preds_t = (y_prob_rf >= t).astype(int)
    p = precision_score(y_test, preds_t)
    r = recall_score(y_test, preds_t)
    print(f"{t:>5.2f}   {p:>8.4f}   {r:>8.4f}")


Thresh  Precision   Recall
 0.10     0.7462     0.6467
 0.15     0.8818     0.6467
 0.20     0.9061     0.6433
 0.25     0.9275     0.6400
 0.30     0.9265     0.6300
 0.35     0.9397     0.6233
 0.40     0.9531     0.6100
 0.45     0.9574     0.6000
 0.50     0.9676     0.5967
 0.55     0.9777     0.5833
 0.60     0.9829     0.5733
 0.65     0.9826     0.5633
 0.70     0.9820     0.5467
 0.75     0.9815     0.5300
 0.80     0.9868     0.4967
 0.85     0.9858     0.4633
 0.90     0.9844     0.4200


In [5]:
import joblib
import os

os.makedirs("models", exist_ok=True)

# Dump the fraud RF classifier
joblib.dump(rf_clf, "models/rf_fraud.pkl")

print("Saved RandomForest fraud model to models/rf_fraud.pkl")


Saved RandomForest fraud model to models/rf_fraud.pkl
