In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import lightgbm as lgb

import pickle
import os

In [2]:
# Adjust the file path to your environment
df = pd.read_csv("transactions_with_risk.csv")

df.head()    # Quick preview

Unnamed: 0,TransactionID,Sender,Receiver,Amount,Currency,GasFee,Timestamp,Purpose,Region,AML_KYC_Verified,Geolocation_Receiver,Geolocation_Sender,Risk_Score,Risk_Category
0,0x55dbfe07923962d56fde48bf1c6eaa4b69d4f4dde05b...,0xd718e9100228b8f3f2ef527c93e699386c7c92c8,0x13aebec2778846670ad21a5845bd80c2db4d6ae0,3165.3,ETH,0.0033,2024-03-05 13:16:33,savings,EU,True,Dubai,Singapore,7,Low
1,0xfa2b1f7724288f8766c71b4775f17e72f55dcdbe4432...,0x01dc8fdb4b587a0b2567eec4c70c39f1a646c2e7,0x29e86518df3e1edf028d53b1aad4f9e636e2fc80,8755.73,ETH,0.0092,2024-12-11 13:16:33,investment,APAC,True,Dubai,United States,9,Low
2,0xae6a574327e43ca0e4e127ae263cd75833975886e4e3...,0xb03fb21072772b3781654f9a069c5399b04ae46d,0x4d054afd46c559d170a1b220935a377703b8c7be,188309.64,BTC,0.0084,2024-09-28 13:16:33,unregistered_trade,APAC,False,Japan,China,19,High
3,0x4f7ee8bd33bac89b489963725986259070ea5a5340a4...,0x54e4e717b8f10ddba2133a1993f587328d5e3d53,0xc6875c3432e2cd5ea803f5711c7ed998d4874a3d,5352.98,ETH,0.0074,2024-09-06 13:16:33,savings,APAC,True,Dubai,Germany,7,Low
4,0x83bb0f09c360dcc98e6a69fbc5e4084e4b530d2be73f...,0x4088d7dc6f8d7200aa1857cf33806b56e316cd9d,0xa4d3edbd0585ac2c5aa62f3bd5b0a65933f18b72,198.89,ETH,0.0026,2024-09-02 13:16:33,trade,APAC,True,Dubai,Dubai,8,Low


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20030 entries, 0 to 20029
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionID         20030 non-null  object 
 1   Sender                20030 non-null  object 
 2   Receiver              20030 non-null  object 
 3   Amount                20030 non-null  float64
 4   Currency              20030 non-null  object 
 5   GasFee                20030 non-null  float64
 6   Timestamp             20030 non-null  object 
 7   Purpose               20030 non-null  object 
 8   Region                20030 non-null  object 
 9   AML_KYC_Verified      20030 non-null  object 
 10  Geolocation_Receiver  20030 non-null  object 
 11  Geolocation_Sender    20030 non-null  object 
 12  Risk_Score            20030 non-null  int64  
 13  Risk_Category         20030 non-null  object 
dtypes: float64(2), int64(1), object(11)
memory usage: 2.1+ MB


In [4]:
df["Risk_Category"].value_counts()

Risk_Category
Low       16388
High       3630
Medium       12
Name: count, dtype: int64

In [5]:
# Keep only relevant columns
df_model = df[[
    "Amount",
    "GasFee",
    "Currency",
    "Purpose",
    "Region",
    "AML_KYC_Verified",
    "Geolocation_Receiver",
    "Geolocation_Sender",
    "Risk_Category"
]].copy()

In [6]:
risk_mapping = {"Low": 0, "Medium": 1, "High": 2}
df_model["Risk_Category"] = df_model["Risk_Category"].map(risk_mapping)

In [7]:
cat_cols = ["Currency", "Purpose", "Region", "Geolocation_Receiver", "Geolocation_Sender"]
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col].astype(str))
    label_encoders[col] = le


In [8]:
df_model["AML_KYC_Verified"].value_counts()

AML_KYC_Verified
False    11828
True      8172
No          17
Yes         13
Name: count, dtype: int64

In [9]:
# Convert 'AML_KYC_Verified' to boolean first, then to integer
df_model["AML_KYC_Verified"] = df_model["AML_KYC_Verified"].map({"True": True, "False": False, "Yes": True, "No": False})
# df_model["AML_KYC_Verified"] = df_model["AML_KYC_Verified"].astype(int)

In [12]:
df_model["AML_KYC_Verified"].value_counts()

AML_KYC_Verified
False    11845
True      8185
Name: count, dtype: int64

In [11]:
# convert AML_KYC_Verified to boolean 
df_model["AML_KYC_Verified"] = df_model["AML_KYC_Verified"].astype(bool)

In [13]:
df_model["AML_KYC_Verified"] = df_model["AML_KYC_Verified"].astype(int)

In [14]:
df_model["AML_KYC_Verified"].value_counts()

AML_KYC_Verified
0    11845
1     8185
Name: count, dtype: int64

In [15]:
X = df_model.drop("Risk_Category", axis=1)
y = df_model["Risk_Category"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2,     # 20% test
    random_state=42,   # for reproducibility
    stratify=y         # keep class distribution
)

In [32]:
# 7) Train LightGBM
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

In [33]:
params = {
        "objective": "multiclass",
        "num_class": 4,  # Number of target classes
        "metric": "multi_logloss",
        "boosting_type": "gbdt",
        "learning_rate": 0.1,
        "num_leaves": 31,
        "max_depth": -1,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "seed": 42,
    }

In [36]:
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    num_boost_round=100
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 16024, number of used features: 8
[LightGBM] [Info] Start training from score -0.200712
[LightGBM] [Info] Start training from score -7.379258
[LightGBM] [Info] Start training from score -1.707999
[LightGBM] [Info] Start training from score -34.538776


In [37]:
# 8) Predict
y_pred = np.argmax(model.predict(X_test), axis=1)
acc = accuracy_score(y_test, y_pred)

In [38]:
print(f"Test Accuracy: {acc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Test Accuracy: 0.9985

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3278
           1       0.25      1.00      0.40         2
           2       1.00      0.99      1.00       726

    accuracy                           1.00      4006
   macro avg       0.75      1.00      0.80      4006
weighted avg       1.00      1.00      1.00      4006

Confusion Matrix:
 [[3277    1    0]
 [   0    2    0]
 [   0    5  721]]


In [39]:
# 9) If accuracy is above threshold, pickle the model
threshold = 0.85  # example threshold
if acc >= threshold:
    with open("lgb_risk_model.pkl", "wb") as f:
        pickle.dump(model, f)
    print(f"Model accuracy ({acc:.2f}) >= {threshold}. Model saved as lgb_risk_model.pkl.")
else:
    print(f"Model accuracy ({acc:.2f}) < {threshold}. Not saving the model.")

Model accuracy (1.00) >= 0.85. Model saved as lgb_risk_model.pkl.
