In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report, confusion_matrix
import xgboost as xgb
import lightgbm as lgb

In [9]:
path = 'HI-Small_Trans.csv'
df = pd.read_csv(path)

#Rename column 'Account' to 'From Account' and 'Account.1' to 'To Account'.
df = df.rename(columns={'Account': 'From Account'})
df = df.rename(columns={'Account.1': 'To Account'})
df.head()

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0


In [10]:
#drop 'Timestamp' and 'Account' columns
df.drop(columns=['Timestamp', 'From Account', 'To Account'], inplace=True)

categorical_columns = ['Receiving Currency', 'Payment Currency', 'Payment Format']
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(df[categorical_columns])
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_columns))

df = pd.concat([df, encoded_df], axis=1)
df.drop(columns=categorical_columns, inplace=True)

X = df.drop(columns=['Is Laundering'])
y = df['Is Laundering']

#80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#80% of 80% = 64% for training and 16% for validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [6]:
#########################################
############## XGBoost ##################
#########################################

#XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

#hyperparameter grid for tuning
xgb_param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2]
}

#hyperparameter tuning using GridSearchCV with F1 score as the metric
xgb_grid = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=3, scoring='f1', verbose=1, n_jobs=-1)

xgb_grid.fit(X_train, y_train)

print("Best parameters found by XGBoost: ", xgb_grid.best_params_)
print("Best score achieved by XGBoost: ", xgb_grid.best_score_)

xgb_best = xgb_grid.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)

#classification metrics
accuracy = accuracy_score(y_test, y_pred_xgb)
f1 = f1_score(y_test, y_pred_xgb)
precision = precision_score(y_test, y_pred_xgb)
recall = recall_score(y_test, y_pred_xgb)
roc_auc = roc_auc_score(y_test, y_pred_xgb)


print(f"XGBoost Accuracy: {accuracy}")
print(f"XGBoost F1 Score: {f1}")
print(f"XGBoost Precision: {precision}")
print(f"XGBoost Recall: {recall}")
print(f"XGBoost ROC-AUC: {roc_auc}")

print("\nXGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))

print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters found by XGBoost:  {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}
Best score achieved by XGBoost:  0.11199776217332397
XGBoost Accuracy: 0.9989740752154491
XGBoost F1 Score: 0.11394557823129252
XGBoost Precision: 0.7528089887640449
XGBoost Recall: 0.061637534498620056
XGBoost ROC-AUC: 0.5308079253459448

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1014582
           1       0.75      0.06      0.11      1087

    accuracy                           1.00   1015669
   macro avg       0.88      0.53      0.56   1015669
weighted avg       1.00      1.00      1.00   1015669

XGBoost Confusion Matrix:
 [[1014560      22]
 [   1020      67]]


In [7]:
#########################################
############## LightGBM #################
#########################################

#LightGBM classifier
lgb_model = lgb.LGBMClassifier(objective='binary', metric='binary_logloss')

#hyperparameter grid for tuning
lgb_param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2]
}

lgb_grid = GridSearchCV(estimator=lgb_model, param_grid=lgb_param_grid, cv=3, scoring='f1', verbose=1, n_jobs=-1)

lgb_grid.fit(X_train, y_train)

print("Best parameters found by LightGBM: ", lgb_grid.best_params_)
print("Best score achieved by LightGBM: ", lgb_grid.best_score_)

lgb_best = lgb_grid.best_estimator_
y_pred_lgb = lgb_best.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lgb)
f1 = f1_score(y_test, y_pred_lgb)
precision = precision_score(y_test, y_pred_lgb)
recall = recall_score(y_test, y_pred_lgb)
roc_auc = roc_auc_score(y_test, y_pred_lgb)

print(f"LightGBM Accuracy: {accuracy}")
print(f"LightGBM F1 Score: {f1}")
print(f"LightGBM Precision: {precision}")
print(f"LightGBM Recall: {recall}")
print(f"LightGBM ROC-AUC: {roc_auc}")

print("\nLightGBM Classification Report:\n", classification_report(y_test, y_pred_lgb))

#confusion matrix
print("LightGBM Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lgb))

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[LightGBM] [Info] Number of positive: 2151, number of negative: 2164609
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1094
[LightGBM] [Info] Number of data points in the train set: 2166760, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000993 -> initscore=-6.914062
[LightGBM] [Info] Start training from score -6.914062
[LightGBM] [Info] Number of positive: 2150, number of negative: 2164610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1094
[LightGBM] [Info] Number of data points in the train set: 2166760, number of used features: 41
[LightGBM] [Inf

In [11]:
#########################################
########## SMOTE for Oversampling #######
#########################################

#apply SMOTE to balance the classes in the training data
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [12]:
#########################################
############## XGBoost with SMOTE #######
#########################################

xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

xgb_param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2]
}

xgb_grid = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=3, scoring='f1', verbose=1, n_jobs=-1)

xgb_grid.fit(X_train_res, y_train_res)

print("Best parameters found by XGBoost: ", xgb_grid.best_params_)
print("Best score achieved by XGBoost: ", xgb_grid.best_score_)

xgb_best = xgb_grid.best_estimator_

y_pred_xgb = xgb_best.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_xgb)
f1 = f1_score(y_test, y_pred_xgb)
precision = precision_score(y_test, y_pred_xgb)
recall = recall_score(y_test, y_pred_xgb)
roc_auc = roc_auc_score(y_test, y_pred_xgb)

print(f"XGBoost Accuracy: {accuracy}")
print(f"XGBoost F1 Score: {f1}")
print(f"XGBoost Precision: {precision}")
print(f"XGBoost Recall: {recall}")
print(f"XGBoost ROC-AUC: {roc_auc}")

print("\nXGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))

print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters found by XGBoost:  {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}
Best score achieved by XGBoost:  0.9866491719606572
XGBoost Accuracy: 0.9787036918523653
XGBoost F1 Score: 0.056858812243830124
XGBoost Precision: 0.02984391449626951
XGBoost Recall: 0.5998160073597056
XGBoost ROC-AUC: 0.789462815415129

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99   1014582
           1       0.03      0.60      0.06      1087

    accuracy                           0.98   1015669
   macro avg       0.51      0.79      0.52   1015669
weighted avg       1.00      0.98      0.99   1015669

XGBoost Confusion Matrix:
 [[993387  21195]
 [   435    652]]


In [13]:
#########################################
############## LightGBM with SMOTE ######
#########################################

lgb_model = lgb.LGBMClassifier(objective='binary', metric='binary_logloss')

lgb_param_grid = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2]
}

lgb_grid = GridSearchCV(estimator=lgb_model, param_grid=lgb_param_grid, cv=3, scoring='f1', verbose=1, n_jobs=-1)

lgb_grid.fit(X_train_res, y_train_res)

print("Best parameters found by LightGBM: ", lgb_grid.best_params_)
print("Best score achieved by LightGBM: ", lgb_grid.best_score_)

lgb_best = lgb_grid.best_estimator_

y_pred_lgb = lgb_best.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lgb)
f1 = f1_score(y_test, y_pred_lgb)
precision = precision_score(y_test, y_pred_lgb)
recall = recall_score(y_test, y_pred_lgb)
roc_auc = roc_auc_score(y_test, y_pred_lgb)

print(f"LightGBM Accuracy: {accuracy}")
print(f"LightGBM F1 Score: {f1}")
print(f"LightGBM Precision: {precision}")
print(f"LightGBM Recall: {recall}")
print(f"LightGBM ROC-AUC: {roc_auc}")

print("\nLightGBM Classification Report:\n", classification_report(y_test, y_pred_lgb))

print("LightGBM Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lgb))

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[LightGBM] [Info] Number of positive: 2164610, number of negative: 2164609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.983176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9949
[LightGBM] [Info] Number of data points in the train set: 4329219, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 2164610, number of negative: 2164609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.722785 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9949
[LightGBM] [Info] Number of data points in