# ML model to compute risk of fraud

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, f1_score

from catboost import CatBoostClassifier, Pool

### Get data

In [5]:
fraud_data = pd.read_csv('../../data_synthesizer/original_data/creditcard.csv')
fraud_data = fraud_data.drop(columns=['Time']).copy()
fraud_data['Class'] = fraud_data['Class'].astype(int)

In [6]:
fraud_data

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


### Select features and target

In [7]:
FEATURES = [
    'V1','V2','V3','V4','V5','V6','V7','V8','V9','V10',
    'V11','V12','V13','V14','V15','V16','V17','V18','V19','V20',
    'V21','V22','V23','V24','V25','V26','V27','V28','Amount'
]
TARGET = 'Class'   # binary

In [8]:
X = fraud_data[FEATURES].copy()
y = fraud_data[TARGET].astype(int).copy()

### Prepare data for modeling

In [9]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

train_pool = Pool(X_train, y_train)
val_pool   = Pool(X_val,   y_val)

### Build and train model

In [10]:
model = CatBoostClassifier(
    loss_function='Logloss',        # good for probabilities
    eval_metric='AUC',              # monitor ROC-AUC
    iterations=3000,                # enough for early stopping to kick in
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3.0,
    random_seed=42,
    early_stopping_rounds=200,
    auto_class_weights='Balanced',  # or 'SqrtBalanced'  / OR use class_weights=[w0, w1]
    verbose=200
)

In [11]:
model.fit(train_pool, eval_set=val_pool, use_best_model=True)

0:	test: 0.9433339	best: 0.9433339 (0)	total: 70.5ms	remaining: 3m 31s
200:	test: 0.9725157	best: 0.9735497 (108)	total: 1.55s	remaining: 21.5s
400:	test: 0.9738102	best: 0.9747774 (271)	total: 2.91s	remaining: 18.9s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9747774497
bestIteration = 271

Shrink model to first 272 iterations.


<catboost.core.CatBoostClassifier at 0x16b7725d0>

### Some tests

In [12]:
proba_val = model.predict_proba(X_val)[:, 1]
roc = roc_auc_score(y_val, proba_val)
ap  = average_precision_score(y_val, proba_val)

In [13]:
# Choose a threshold that maximizes F1 on the validation set (optional)
prec, rec, th = precision_recall_curve(y_val, proba_val)
# thresholds returned by precision_recall_curve align with points except the first
ths = np.r_[0.0, th]  # include 0.0
f1s = 2 * prec * rec / (prec + rec + 1e-12)
best_idx = np.nanargmax(f1s)
best_threshold = float(ths[best_idx])
best_f1 = float(f1s[best_idx])


In [14]:
print(f"Validation ROC-AUC: {roc:.4f}")
print(f"Validation PR-AUC : {ap:.4f}")
print(f"Best F1={best_f1:.4f} at threshold={best_threshold:.4f}")

Validation ROC-AUC: 0.9748
Validation PR-AUC : 0.8219
Best F1=0.8400 at threshold=0.9518


In [15]:
fraud_data.loc[X_val.index, 'fraud_prob'] = proba_val
fraud_data.loc[X_val.index, 'fraud_pred'] = (proba_val >= best_threshold).astype(int)

In [16]:
fi = pd.Series(model.get_feature_importance(train_pool), index=FEATURES).sort_values(ascending=False)
print("\nTop feature importances:")
print(fi.head(10))


Top feature importances:
V14       9.300353
V4        9.181803
V12       6.263768
V1        4.317962
V3        4.317426
Amount    4.128062
V11       3.793592
V7        3.754683
V10       3.662438
V17       3.600915
dtype: float64


### Save model as an artifact

In [18]:
model.save_model("catboost_fraud.cbm")
model.save_model("../../fraud_prevention_pipeline/ml_artifacts/catboost_fraud.cbm") # Save within pipeline folder