In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import MinMaxScaler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# -------------------------------------------------------------------------------------
# Credit Card Fraud Detection using SMOTEENN
# -------------------------------------------------------------------------------------

# 1. Load the dataset
file_path = "creditcard.csv"
df = pd.read_csv(file_path)

# Reduce dataset size to improve execution time (50% sample).
# The fixed random_state makes the sample reproducible across runs.
df = df.sample(frac=0.5, random_state=42)

# Display dataset information
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
df.info()
print(df.describe())

# 2. Check class distribution
# Class == 1 rows are reported as fraudulent, Class == 0 rows as legitimate.
total_transactions = len(df)
non_fraud_cases = df[df['Class'] == 0]
fraud_cases = df[df['Class'] == 1]

# Share of each class, expressed as a percentage of all sampled rows.
legit_pct = len(non_fraud_cases) / total_transactions * 100
fraud_pct = len(fraud_cases) / total_transactions * 100

print(f"Total transactions: {total_transactions}")
print(f"Legitimate transactions: {len(non_fraud_cases)} ({legit_pct:.4f}%)")
print(f"Fraudulent transactions: {len(fraud_cases)} ({fraud_pct:.4f}%)")

# 3. Data Preprocessing
# Separate features and labels (selected by column name, so the position of
# 'Class' in the file does not matter).
X = df.drop(columns=['Class'])
y = df['Class']

# 4. Split dataset into training and test sets **before scaling and balancing**
# The split is stratified on y so both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalize features using MinMaxScaler.
# The scaler is fit on the training partition only; fitting it on the full
# dataset would leak the test set's min/max into training. The test partition
# is transformed with the training statistics, so its values may fall slightly
# outside [0, 1].
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# 5. Apply SMOTEENN to the training data only
# The test partition is left untouched so evaluation reflects the original
# class distribution.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
smote_enn = SMOTEENN(random_state=42, n_jobs=-1)
X_train_bal, y_train_bal = smote_enn.fit_resample(X_train, y_train)
smoteenn_time = time.perf_counter() - start_time
print(f"SMOTEENN executed in {smoteenn_time:.2f} seconds.")

# 6. Define stratified cross-validation
# Five shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 7. Hyperparameter optimization for LGBMClassifier (using the same hyperparameters as the paper)
# NOTE(review): the search runs on the already-resampled training set, so
# resampled rows can sit on both sides of a fold split and the CV scores are
# likely optimistic -- confirm whether resampling should happen inside each fold.
param_grid_lgbm = dict(
    n_estimators=[100, 300, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[-1, 6, 9],
    reg_alpha=[0, 0.1, 1],
    reg_lambda=[0, 1, 10],
)
lgbm_base = LGBMClassifier(random_state=42)
gs_lgbm = GridSearchCV(
    estimator=lgbm_base,
    param_grid=param_grid_lgbm,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
gs_lgbm.fit(X_train_bal, y_train_bal)

# Keep the estimator selected by the search for the ensembles built later.
best_lgbm = gs_lgbm.best_estimator_

# 8. Hyperparameter optimization for XGBoost (using the same hyperparameters as the paper)
# NOTE(review): like any search on X_train_bal, this scores folds drawn from
# resampled data, so CV accuracy is likely optimistic -- confirm.
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.1, 0.2, 0.3],
    max_depth=[3, 6, 9],
    reg_alpha=[0, 0.1, 1],
    reg_lambda=[1, 5, 10],
)
xgb_base = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
gs_xgb = GridSearchCV(estimator=xgb_base, param_grid=param_grid_xgb, scoring='accuracy', cv=cv, n_jobs=-1)
gs_xgb.fit(X_train_bal, y_train_bal)

# Keep the estimator selected by the search for the ensembles built later.
best_xgb = gs_xgb.best_estimator_

# 9. Define optimized base models
models = [
    ('lgbm', best_lgbm),
    ('xgbc', best_xgb)
]


def _positive_class_scores(clf, features):
    """Return clf's predicted probability of the fraud class (label 1) per row."""
    # Look the column up through classes_ instead of assuming its position.
    fraud_col = list(clf.classes_).index(1)
    return clf.predict_proba(features)[:, fraud_col]


def _write_classifier_report(out, title, y_true, y_pred, y_score):
    """Write one classifier's metrics and confusion matrix to an open text file.

    Args:
        out: writable text file object.
        title: section heading, e.g. "Voting Classifier".
        y_true: ground-truth labels.
        y_pred: hard class predictions (used for every metric except AUC).
        y_score: positive-class probabilities (used for AUC-ROC only).

    Returns:
        The confusion matrix computed from y_true and y_pred.
    """
    out.write(f"\n{title}:\n")
    out.write(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}\n")
    out.write(f"Precision: {precision_score(y_true, y_pred):.4f}\n")
    out.write(f"Recall: {recall_score(y_true, y_pred):.4f}\n")
    out.write(f"F1-Score: {f1_score(y_true, y_pred):.4f}\n")
    # AUC-ROC needs continuous scores; feeding it 0/1 predictions collapses
    # the ROC curve to a single operating point.
    out.write(f"AUC-ROC: {roc_auc_score(y_true, y_score):.4f}\n")

    cm = confusion_matrix(y_true, y_pred)
    out.write(f"\nConfusion Matrix ({title}):\n")
    out.write(np.array2string(cm) + "\n")
    return cm


# 10. Voting Classifier with optimized models
# Soft voting averages the base models' class probabilities.
voting_clf = VotingClassifier(estimators=models, voting='soft')
voting_clf.fit(X_train_bal, y_train_bal)
y_pred_voting = voting_clf.predict(X_test)
y_score_voting = _positive_class_scores(voting_clf, X_test)

# 11. Stacking Classifier with XGBoost as meta-model
stacking_clf = StackingClassifier(estimators=models, final_estimator=best_xgb)
stacking_clf.fit(X_train_bal, y_train_bal)
y_pred_stacking = stacking_clf.predict(X_test)
y_score_stacking = _positive_class_scores(stacking_clf, X_test)

# 12. Save the ensemble models' results
results_output = "results_summary_smoteenn.txt"
with open(results_output, "w", encoding="utf-8") as f:
    f.write("Execution time for SMOTEENN:\n")
    f.write(f"SMOTEENN executed in {smoteenn_time:.2f} seconds.\n")

    # Voting Classifier results and confusion matrix
    cm_voting = _write_classifier_report(
        f, "Voting Classifier", y_test, y_pred_voting, y_score_voting
    )

    # Stacking Classifier results and confusion matrix
    cm_stacking = _write_classifier_report(
        f, "Stacking Classifier", y_test, y_pred_stacking, y_score_stacking
    )

print(f"Summary of results saved in {results_output}")

# Persist the best hyperparameters found by both grid searches.
results_hyperparams = "results_summary_hyperparams.txt"

# The text written below contains non-ASCII characters, so the encoding is set
# explicitly; the platform default depends on the locale and may fail to
# encode them or produce a file that reads back garbled elsewhere.
with open(results_hyperparams, "w", encoding="utf-8") as f:
    f.write("Mejores hiperparámetros encontrados:\n\n")

    # Save the best hyperparameters for LGBMClassifier
    f.write("LGBMClassifier:\n")
    f.write(str(gs_lgbm.best_params_) + "\n\n")

    # Save the best hyperparameters for XGBoostClassifier
    f.write("XGBoostClassifier:\n")
    f.write(str(gs_xgb.best_params_) + "\n")

print(f"Mejores hiperparámetros guardados en {results_hyperparams}")

            Time         V1        V2         V3        V4         V5  \
43428    41505.0 -16.526507  8.584972 -18.649853  9.505594 -13.793819   
49906    44261.0   0.339812 -2.743745  -0.134070 -1.385729  -1.451413   
29474    35484.0   1.399590 -0.590701   0.168619 -1.029950  -0.539806   
276481  167123.0  -0.432071  1.647895  -1.669361 -0.349504   0.785785   
278846  168473.0   2.014160 -0.137394  -1.015839  0.327269  -0.182179   

              V6         V7        V8        V9  ...       V21       V22  \
43428  -2.832404 -16.701694  7.517344 -8.507059  ...  1.190739 -1.127670   
49906   1.015887  -0.524379  0.224060  0.899746  ... -0.213436 -0.942525   
29474   0.040444  -0.712567  0.002299 -0.971747  ...  0.102398  0.168269   
276481 -0.630647   0.276990  0.586025 -0.484715  ...  0.358932  0.873663   
278846 -0.956571   0.043241 -0.160746  0.363241  ... -0.238644 -0.616400   

             V23       V24       V25       V26       V27       V28  Amount  \
43428  -2.358579  0.673461