In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

from BalanceDataset import *
from DeepLearningModels import *


In [2]:
# Load the raw transactions; the 'Time' column is discarded when the file has one.
df = pd.read_csv("creditcard.csv").drop(columns=["Time"], errors="ignore")

# Remove outliers (Isolation Forest): the detector is fitted on the feature columns
# only (the label is excluded), and rows it labels -1 are filtered out.
# NOTE(review): this filter runs on the whole dataset before the train/test split, so
# held-out rows are filtered as well, and rare-class rows may be among those removed.
# Confirm this is intended.
feature_frame = df.drop(columns="Class")
clf = IsolationForest(contamination=0.01, random_state=42)
outlier_labels = clf.fit_predict(feature_frame)
mask = outlier_labels != -1
df_cleaned = df.loc[mask]


In [3]:
# Pairwise correlations over every column of the cleaned frame.
corr_matrix = df_cleaned.corr()

# A column is treated as redundant when its absolute correlation with an earlier
# column exceeds this value.
CORR_THRESHOLD = 0.95

# Only feature-to-feature correlations drive the filter. The target is excluded so
# that a feature strongly correlated with 'Class' can never put 'Class' itself on the
# drop list (the following cells index df_selected["Class"]).
feature_corr = corr_matrix.drop(index="Class", columns="Class", errors="ignore")

# Keep the strict upper triangle so each pair is examined once; of two correlated
# columns, the one that appears later in the frame is the one dropped.
upper = feature_corr.where(np.triu(np.ones(feature_corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col].abs() > CORR_THRESHOLD).any()]
df_selected = df_cleaned.drop(columns=to_drop)


In [6]:
# Check for NaNs in the 'Class' column and drop rows with NaNs.
# dropna returns a new frame and the name is rebound, instead of mutating the existing
# object in place, so the frame produced by the earlier cell is never altered behind
# the reader's back.
if df_selected['Class'].isnull().any():
    print("Warning: NaNs found in 'Class' column. Dropping rows with NaNs.")
    df_selected = df_selected.dropna(subset=['Class'])

# Separate the feature matrix from the label column.
X = df_selected.drop(columns="Class")
y = df_selected["Class"]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# The scaler is fitted on the training partition only; the test partition is
# transformed with those same fitted parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Balance using SMOTE (project helper). Only the training partition is passed in;
# the test partition is not resampled.
X_train_bal, y_train_bal = balanceWithSMOTE(X_train_scaled, y_train)

Non-Frauds: 34547 / 50.0 % of the dataset
Frauds: 34547 / 50.0 % of the dataset


In [7]:
# One result row per model, appended in the order the models are run.
results = []

# ANN, CNN, RNN, LSTM: each helper receives the balanced training data and the scaled
# test partition with its original labels.
# NOTE(review): the captured training log shows val_auc of 0 together with
# val_precision / val_recall of 1. Worth checking how the model helpers carve out
# their validation data.
for _run_model in (ANN_model, CNN_model, RNN_model, LSTM_model):
    results.append(_run_model(X_train_bal, X_test_scaled, y_train_bal, y_test))

# Autoencoder: uses original class imbalance (scaled, but not resampled, training data)
results.append(autoencoders(X_train_scaled, X_test_scaled, y_train, y_test))


Epoch 1/10
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - auc: 0.9612 - loss: 0.2106 - precision: 0.8788 - recall: 0.9001 - val_auc: 0.0000e+00 - val_loss: 0.0089 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 2/10
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - auc: 1.0000 - loss: 0.0056 - precision: 0.9974 - recall: 0.9999 - val_auc: 0.0000e+00 - val_loss: 0.0024 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 3/10
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - auc: 1.0000 - loss: 0.0021 - precision: 0.9990 - recall: 1.0000 - val_auc: 0.0000e+00 - val_loss: 0.0013 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 4/10
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - auc: 1.0000 - loss: 0.0012 - precision: 0.9992 - recall: 1.0000 - val_auc: 0.0000e+00 - val_loss: 5.7835e-04 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 5/10
[1m432/432[0m [32m━━━━━━━━━━━━

In [8]:
# Collect the per-model rows into a table and display it ranked by F1, best first,
# with a fresh 0..n-1 index.
metric_columns = ["Model", "Precision", "Recall", "F1", "AUC"]
df_results = pd.DataFrame(results, columns=metric_columns)
df_results.sort_values(by="F1", ascending=False, ignore_index=True)


Unnamed: 0,Model,Precision,Recall,F1,AUC
0,ANN,1.0,0.75,0.857143,0.874812
1,CNN,1.0,0.625,0.769231,0.874855
2,RNN,0.666667,0.75,0.705882,0.874493
3,LSTM,0.6,0.75,0.666667,0.87445
4,Autoencoder,0.018767,0.875,0.036745,0.940843


In [10]:
import joblib

In [11]:
# Persist the preprocessing artefacts for the DL workflow: the feature-selected
# dataset as CSV (index column omitted) and the fitted scaler via joblib.
df_selected.to_csv(path_or_buf="cleaned_dataset_dl.csv", index=False)
joblib.dump(value=scaler, filename="scaler_dl.pkl")


['scaler_dl.pkl']