In [7]:
pip install lightgbm scikit-learn pandas numpy joblib

Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [9]:
### 1. Imports & Config
import pandas as pd, numpy as np, joblib, json, pathlib
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier

RANDOM_STATE = 42
DATA_PATH = "dataset.csv"          # adjust if needed
MODEL_OUT = "financeai_lgbm.pkl"
ENCODER_OUT = "financeai_cat_maps.json"  # mapping for categorical codes
THRESHOLD_OUT = "financeai_threshold.json"


In [10]:
### 2. Load & Clean
df = pd.read_csv(DATA_PATH)
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].str.strip("'")
df["amount"] = df["amount"].astype(float)
df["step"] = df["step"].astype(int)


In [11]:
### 3. Feature Engineering
df["log_amount"]   = np.log1p(df["amount"])
df["hour"]         = df["step"] % 24
df["merchant_freq"]  = df["merchant"].map(df["merchant"].value_counts())
df["category_freq"]  = df["category"].map(df["category"].value_counts())


In [12]:
### 4. Encode Categoricals as Integer Codes (LightGBM-friendly)
cat_cols = ["merchant", "category"]
cat_maps = {}
for c in cat_cols:
    df[c] = df[c].astype("category")
    cat_maps[c] = dict(enumerate(df[c].cat.categories))
    df[c] = df[c].cat.codes         # int32 codes starting at 0

FEATURES = ["log_amount", "hour", "merchant_freq", "category_freq"] + cat_cols
X = df[FEATURES]
y = df["fraud"]


In [13]:
### 5. Train / Validation Split (stratified)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)


In [14]:
### 6. LightGBM Model (handle imbalance via class_weight)
fraud_ratio = y_train.mean()
weight = {0:1, 1:int(1/fraud_ratio)}   # inverse ratio
lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    class_weight=weight,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1
)
lgbm.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 5760, number of negative: 469954
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005934 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 408
[LightGBM] [Info] Number of data points in the train set: 475714, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501255 -> initscore=0.005022
[LightGBM] [Info] Start training from score 0.005022


In [15]:
### 7. Evaluate & Choose Threshold for 95 %+ Precision
val_probs = lgbm.predict_proba(X_val)[:,1]
precision, recall, thresholds = precision_recall_curve(y_val, val_probs)

# find highest recall where precision ≥ 0.95
target_prec = 0.95
best_idx = next(i for i,p in enumerate(precision) if p >= target_prec)
best_threshold = thresholds[best_idx]

val_pred = (val_probs >= best_threshold).astype(int)
print("Chosen threshold:", best_threshold)
print("Precision:", precision_score(y_val, val_pred))
print("Recall   :", recall_score(y_val, val_pred))
print("F1       :", f1_score(y_val, val_pred))
print("ROC-AUC  :", roc_auc_score(y_val, val_probs))


Chosen threshold: 0.9942571149426819
Precision: 0.95
Recall   : 0.6597222222222222
F1       : 0.7786885245901639
ROC-AUC  : 0.9973655276002198


In [16]:
### 8. Save Model, Encoder Maps, Threshold
joblib.dump(lgbm, MODEL_OUT)
with open(ENCODER_OUT, "w") as f:
    json.dump(cat_maps, f)
with open(THRESHOLD_OUT, "w") as f:
    json.dump({"threshold": float(best_threshold)}, f)

print("✅ Saved", MODEL_OUT, ENCODER_OUT, THRESHOLD_OUT)


✅ Saved financeai_lgbm.pkl financeai_cat_maps.json financeai_threshold.json
