In [3]:
# Core
import pandas as pd
import numpy as np
from pathlib import Path

# ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    average_precision_score,
    precision_score,
    recall_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

# Tracking
import mlflow


In [4]:
# Reproducibility: a single seed for NumPy's global RNG, reused wherever
# RANDOM_STATE is passed on.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Local MLflow file store; "../mlruns" is resolved against the current working
# directory and created if it does not exist yet.
mlruns_path = Path("../mlruns").resolve()
mlruns_path.mkdir(parents=True, exist_ok=True)

# Path.as_uri() yields a well-formed file URI on every platform. Formatting the
# path into f"file:///{...}" by hand gives four slashes on POSIX and leaves raw
# backslashes in the URI on Windows.
mlflow.set_tracking_uri(mlruns_path.as_uri())
mlflow.set_experiment("creditcard_fraud_detection")

DATA_PATH = "../data/training/creditcard_prepared.csv"

df = pd.read_csv(DATA_PATH)

print("Dataset shape:", df.shape)
df.head()

  return FileStore(store_uri, store_uri)


Dataset shape: (284807, 31)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,Amount_scaled,Time_scaled
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,1.124303,-1.996583
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-1.114639,-1.996583
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,1.682368,-1.996562
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,1.009339,-1.996562
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,0.670241,-1.996541


In [5]:
# The "Class" column is the target; every other column is used as a feature.
y = df["Class"]
X = df.drop(columns=["Class"])

print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (284807, 30)
Target shape: (284807,)


In [6]:
# Stage 1: set aside 30% of the rows, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
)

# Stage 2: halve the held-out rows (again stratified) into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=RANDOM_STATE
)

print("Train size:", X_train.shape)
print("Validation size:", X_val.shape)
print("Test size:", X_test.shape)

Train size: (199364, 30)
Validation size: (42721, 30)
Test size: (42722, 30)


In [None]:
def evaluate_model(model, name):
    """Fit ``model`` on the training split, score it on the validation split,
    and record the results in an MLflow run.

    The function reads the notebook-level ``X_train``, ``y_train``, ``X_val``
    and ``y_val``; the cell creating those splits must have been run first.

    Parameters
    ----------
    model : estimator
        Object providing ``fit``, ``predict``, ``predict_proba`` and
        ``get_params``. It is fitted in place.
    name : str
        Used as the MLflow run name, logged as the ``model`` param, and used
        as the prefix of the printed metric lines.

    Returns
    -------
    float
        Average precision (AUPRC) on the validation split, so callers can
        compare or collect results instead of parsing the printed output.
    """
    with mlflow.start_run(run_name=name):
        model.fit(X_train, y_train)

        # Column 1 of predict_proba is used as the score for the ranking metric;
        # precision/recall use the estimator's own hard predictions.
        y_proba = model.predict_proba(X_val)[:, 1]
        y_val_pred = model.predict(X_val)

        auprc = average_precision_score(y_val, y_proba)
        precision = precision_score(y_val, y_val_pred)
        recall = recall_score(y_val, y_val_pred)

        print(f"{name} AUPRC:", auprc)
        print(f"{name} precision:", precision)
        print(f"{name} recall:", recall)

        mlflow.log_param("model", name)
        mlflow.log_params(model.get_params())

        mlflow.log_metric("AUPRC_validation", auprc)
        mlflow.log_metric("precision_validation", precision)
        mlflow.log_metric("recall_validation", recall)

    return auprc
    

In [8]:
# Candidate classifiers. Every estimator receives the shared RANDOM_STATE.

# Random forest: 300 trees, no depth limit, "balanced" class weights.
rf = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight="balanced",
    max_depth=None,
    n_estimators=300,
)

# Extra-trees: same tree count and class weighting as the random forest.
et = ExtraTreesClassifier(
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight="balanced",
    n_estimators=300,
)

# Histogram-based gradient boosting; no class weighting is configured here.
hgb = HistGradientBoostingClassifier(
    random_state=RANDOM_STATE,
    learning_rate=0.05,
    max_iter=200,
)

# XGBoost: 300 rounds at depth 6, with 80% row and column subsampling.
xgb = XGBClassifier(
    random_state=RANDOM_STATE,
    n_jobs=-1,
    eval_metric="logloss",
    colsample_bytree=0.8,
    subsample=0.8,
    learning_rate=0.05,
    max_depth=6,
    n_estimators=300,
)

In [9]:
# Train and score each candidate in turn; each call opens its own MLflow run.
# The last line stays a bare call so the cell's displayed value is unchanged.
evaluate_model(model=rf, name="RandomForest")
evaluate_model(model=et, name="ExtraTrees")
evaluate_model(model=hgb, name="HistGradientBoosting")
evaluate_model(model=xgb, name="XGBoost")

RandomForest AUPRC: 0.817332003348725
RandomForest precision: 0.9807692307692307
RandomForest recall: 0.6891891891891891
ExtraTrees AUPRC: 0.8255643754585693
ExtraTrees precision: 0.9818181818181818
ExtraTrees recall: 0.7297297297297297


Exception in thread Thread-50 (_readerthread):
Traceback (most recent call last):
  File [35m"c:\Users\IRVIN\miniconda3\envs\test\Lib\threading.py"[0m, line [35m1043[0m, in [35m_bootstrap_inner[0m
    [31mself.run[0m[1;31m()[0m
    [31m~~~~~~~~[0m[1;31m^^[0m
  File [35m"c:\Users\IRVIN\miniconda3\envs\test\Lib\threading.py"[0m, line [35m994[0m, in [35mrun[0m
    [31mself._target[0m[1;31m(*self._args, **self._kwargs)[0m
    [31m~~~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"c:\Users\IRVIN\miniconda3\envs\test\Lib\subprocess.py"[0m, line [35m1615[0m, in [35m_readerthread[0m
    buffer.append([31mfh.read[0m[1;31m()[0m)
                  [31m~~~~~~~[0m[1;31m^^[0m
  File [35m"<frozen codecs>"[0m, line [35m325[0m, in [35mdecode[0m
[1;35mUnicodeDecodeError[0m: [35m'utf-8' codec can't decode byte 0xa2 in position 116: invalid start byte[0m


HistGradientBoosting AUPRC: 0.5349324459643102
HistGradientBoosting precision: 0.6153846153846154
HistGradientBoosting recall: 0.6486486486486487
XGBoost AUPRC: 0.8308643142436096
XGBoost precision: 0.9655172413793104
XGBoost recall: 0.7567567567567568


0.8308643142436096