In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc

# -------------------------------------------------------------------------------------
# Credit Card Fraud Detection using IMEML Methodology with SMOTE-ENN
#
# This script aims to replicate the methodology presented in the research paper:
# "IMEML: An Intelligent Meta Ensemble Machine Learning Model for Fraud Detection"
# Authors: Md. Alamin Talukder, Md. Ashraf Uddin, et al.
# Published in the Journal of Big Data.
#
# Reference Link: https://journalofbigdata.springeropen.com/articles/10.1186/s40537-024-00996-5
#
# The dataset used comes from Kaggle:
# https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
# -------------------------------------------------------------------------------------

# 1. Load the dataset
file_path = "creditcard.csv"
df = pd.read_csv(file_path)

# Reduce dataset size to improve execution time (50% sample)
df = df.sample(frac=0.5, random_state=42)

# Display dataset information
display(df.head())
display(df.info())
display(df.describe())

# 2. Check class distribution
fraud_cases = df[df['Class'] == 1]
non_fraud_cases = df[df['Class'] == 0]

print(f"Total transactions: {len(df)}")
print(f"Legitimate transactions: {len(non_fraud_cases)} ({len(non_fraud_cases) / len(df) * 100:.4f}%)")
print(f"Fraudulent transactions: {len(fraud_cases)} ({len(fraud_cases) / len(df) * 100:.4f}%)")

# 3. Data Preprocessing
# Normalize features using MinMaxScaler
scaler = MinMaxScaler()
df.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1])

# Separate features and labels
X = df.drop(columns=['Class'])
y = df['Class']

# 4. Apply SMOTE-ENN for resampling
start_time = time.time()
smote_enn = SMOTEENN(n_jobs=-1, random_state=42)
X_bal, y_bal = smote_enn.fit_resample(X, y)
smote_enn_time = time.time() - start_time
print(f"SMOTE-ENN executed in {smote_enn_time:.2f} seconds.")

print("Data preprocessed and balanced successfully.")

# Save balancing times to a file
results_output = "results_summary.txt"
with open(results_output, "w") as f:
    f.write("Execution times for balancing techniques:\n")
    f.write(f"SMOTE-ENN: {smote_enn_time:.2f} seconds\n")

# 5. Define stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 6. Hyperparameter optimization for LGBMClassifier (meta-model for stacking)
param_grid_lgbm = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 6, 9]
}
gs_lgbm = GridSearchCV(LGBMClassifier(random_state=42), param_grid_lgbm, cv=cv, scoring='accuracy', n_jobs=-1)
gs_lgbm.fit(X_bal, y_bal)
best_lgbm = gs_lgbm.best_estimator_

# 7. Hyperparameter optimization for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2]
}
gs_xgb = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid_xgb,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)
gs_xgb.fit(X_bal, y_bal)
best_xgb = gs_xgb.best_estimator_

# 8. Define optimized base models
models = [
    ('rf', best_lgbm),
    ('xgbc', best_xgb)
]

# 9. Voting Classifier with optimized models
voting_clf = VotingClassifier(estimators=models, voting='soft')
voting_clf.fit(X_bal, y_bal)
y_pred_voting = voting_clf.predict(X_bal)

# 10. Stacking Classifier with XGBoost as meta-model
stacking_clf = StackingClassifier(estimators=models, final_estimator=best_xgb)
stacking_clf.fit(X_bal, y_bal)
y_pred_stacking = stacking_clf.predict(X_bal)

# 11. Save the ensemble models' results to a file
with open(results_output, "a") as f:
    f.write("\nResults of ensemble models:\n")
    f.write("\nVoting Classifier:\n")
    f.write(f"Accuracy: {accuracy_score(y_bal, y_pred_voting):.4f}\n")
    f.write(f"Precision: {precision_score(y_bal, y_pred_voting):.4f}\n")
    f.write(f"Recall: {recall_score(y_bal, y_pred_voting):.4f}\n")
    f.write(f"F1-Score: {f1_score(y_bal, y_pred_voting):.4f}\n")
    f.write(f"AUC-ROC: {roc_auc_score(y_bal, y_pred_voting):.4f}\n")
    f.write("\nStacking Classifier:\n")
    f.write(f"Accuracy: {accuracy_score(y_bal, y_pred_stacking):.4f}\n")
    f.write(f"Precision: {precision_score(y_bal, y_pred_stacking):.4f}\n")
    f.write(f"Recall: {recall_score(y_bal, y_pred_stacking):.4f}\n")
    f.write(f"F1-Score: {f1_score(y_bal, y_pred_stacking):.4f}\n")
    f.write(f"AUC-ROC: {roc_auc_score(y_bal, y_pred_stacking):.4f}\n")

print(f"Summary of results saved in {results_output}")


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
43428,41505.0,-16.526507,8.584972,-18.649853,9.505594,-13.793819,-2.832404,-16.701694,7.517344,-8.507059,...,1.190739,-1.12767,-2.358579,0.673461,-1.4137,-0.462762,-2.018575,-1.042804,364.19,1
49906,44261.0,0.339812,-2.743745,-0.13407,-1.385729,-1.451413,1.015887,-0.524379,0.22406,0.899746,...,-0.213436,-0.942525,-0.526819,-1.156992,0.311211,-0.746647,0.040996,0.102038,520.12,0
29474,35484.0,1.39959,-0.590701,0.168619,-1.02995,-0.539806,0.040444,-0.712567,0.002299,-0.971747,...,0.102398,0.168269,-0.166639,-0.81025,0.505083,-0.23234,0.011409,0.004634,31.0,0
276481,167123.0,-0.432071,1.647895,-1.669361,-0.349504,0.785785,-0.630647,0.27699,0.586025,-0.484715,...,0.358932,0.873663,-0.178642,-0.017171,-0.207392,-0.157756,-0.237386,0.001934,1.5,0
278846,168473.0,2.01416,-0.137394,-1.015839,0.327269,-0.182179,-0.956571,0.043241,-0.160746,0.363241,...,-0.238644,-0.6164,0.347045,0.061561,-0.360196,0.17473,-0.078043,-0.070571,0.89,0


<class 'pandas.core.frame.DataFrame'>
Index: 142404 entries, 43428 to 195585
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    142404 non-null  float64
 1   V1      142404 non-null  float64
 2   V2      142404 non-null  float64
 3   V3      142404 non-null  float64
 4   V4      142404 non-null  float64
 5   V5      142404 non-null  float64
 6   V6      142404 non-null  float64
 7   V7      142404 non-null  float64
 8   V8      142404 non-null  float64
 9   V9      142404 non-null  float64
 10  V10     142404 non-null  float64
 11  V11     142404 non-null  float64
 12  V12     142404 non-null  float64
 13  V13     142404 non-null  float64
 14  V14     142404 non-null  float64
 15  V15     142404 non-null  float64
 16  V16     142404 non-null  float64
 17  V17     142404 non-null  float64
 18  V18     142404 non-null  float64
 19  V19     142404 non-null  float64
 20  V20     142404 non-null  float64
 21  V21     142

None

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,142404.0,142404.0,142404.0,142404.0,142404.0,142404.0,142404.0,142404.0,142404.0,142404.0,...,142404.0,142404.0,142404.0,142404.0,142404.0,142404.0,142404.0,142404.0,142404.0,142404.0
mean,94992.859112,0.006113,0.007551,-0.00314,0.000531,-0.004297,-0.000479,-0.000656,-0.001513,-0.001295,...,0.000685,0.002334,-0.000212,-0.000925,0.001505,-2.7e-05,0.000922,0.000588,87.720768,0.001727
std,47470.50184,1.947233,1.613946,1.523743,1.41451,1.360273,1.320591,1.226461,1.208729,1.0985,...,0.740061,0.725989,0.601197,0.605998,0.518452,0.481806,0.395969,0.323451,234.71154,0.041527
min,0.0,-46.855047,-48.060856,-33.680984,-5.683171,-40.427726,-21.929312,-41.506796,-50.42009,-13.434066,...,-22.889347,-8.887017,-32.828995,-2.824849,-8.696627,-2.068561,-22.565679,-11.710896,0.0,0.0
25%,54328.0,-0.915898,-0.595677,-0.892224,-0.846352,-0.691713,-0.770062,-0.555478,-0.210201,-0.644734,...,-0.227869,-0.540598,-0.162442,-0.35491,-0.316082,-0.326891,-0.070461,-0.052419,5.6,0.0
50%,84988.0,0.026747,0.066339,0.176889,-0.01737,-0.051841,-0.275989,0.03886,0.021725,-0.053516,...,-0.029237,0.009183,-0.011796,0.039499,0.01892,-0.05253,0.001611,0.011264,22.0,0.0
75%,139440.75,1.31747,0.804661,1.025914,0.745571,0.61174,0.402026,0.571016,0.326625,0.595109,...,0.186899,0.531707,0.14711,0.437894,0.351642,0.240168,0.091105,0.078255,76.9,0.0
max,172792.0,2.451888,21.467203,9.382558,13.129143,32.911462,23.917837,44.054461,19.168327,15.594995,...,27.202839,8.361985,22.083545,4.022866,6.07085,3.517346,12.152401,16.129609,10199.44,1.0


Total transactions: 142404
Legitimate transactions: 142158 (99.8273%)
Fraudulent transactions: 246 (0.1727%)
SMOTE-ENN executed in 65.60 seconds.
Data preprocessed and balanced successfully.
[LightGBM] [Info] Number of positive: 142158, number of negative: 141941
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031730 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 284099, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500382 -> initscore=0.001528
[LightGBM] [Info] Start training from score 0.001528


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 142158, number of negative: 141941
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023030 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 284099, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500382 -> initscore=0.001528
[LightGBM] [Info] Start training from score 0.001528


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 142158, number of negative: 141941
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022956 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 284099, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500382 -> initscore=0.001528
[LightGBM] [Info] Start training from score 0.001528


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 113727, number of negative: 113552
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016738 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 227279, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500385 -> initscore=0.001540
[LightGBM] [Info] Start training from score 0.001540
[LightGBM] [Info] Number of positive: 113726, number of negative: 113553
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 227279, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500381 -> initscore=0.001522
[LightGBM] [Info] Start training from score 0.001522
[LightGBM]

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Summary of results saved in results_summary.txt
