In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import statsmodels.api as sm

from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
from scipy import stats

In [2]:
INPUT_ZIP = "../data/in/Fraud_Detction_Dataset.zip"  # Directorio del zip
OUTPUT_FOLDER = "../data/out/"  # Directorio de destino
TRAIN_FILENAME = "creditcard.csv"  # Nombre del fichero de entrenamiento

def fetch_data(input_path=INPUT_ZIP, output_dir=OUTPUT_FOLDER):
    
    # Comprobación de que el directorio de destino existe
    os.makedirs(output_dir, exist_ok=True)

    # Descomprime el archivo ZIP en caso de que no haya ningún csv en la carpeta
    if(len([file for file in os.listdir(output_dir) if file.endswith('.csv')]) == 0):
        with zipfile.ZipFile(input_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)


def load_data(directory=OUTPUT_FOLDER, filename=TRAIN_FILENAME):

    # Construir la ruta completa al archivo CSV
    file_path = os.path.join(directory, filename)

    # Verificar si el archivo existe
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"El archivo {filename} no se encuentra en el directorio {directory}")

    # Leer el archivo CSV en un DataFrame
    return pd.read_csv(file_path)

fetch_data()
df = load_data()

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve


In [7]:
X = df.drop('Class', axis=1)
y = df['Class']

In [8]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [11]:
from imblearn.over_sampling import SMOTE


In [13]:
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

In [14]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_train_sm_scaled = scaler.fit_transform(X_train_sm)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [16]:
from sklearn.ensemble import RandomForestClassifier
tf_base = RandomForestClassifier(n_estimators=100,max_depth=None,class_weight='balanced',random_state=42)

In [17]:
tf_base.fit(X_train_sm_scaled, y_train_sm)

In [19]:
from sklearn.metrics import roc_auc_score, classification_report, precision_recall_curve
y_train_proba = tf_base.predict_proba(X_train_sm_scaled)[:, 1]
y_train_pred  = tf_base.predict(X_train_sm_scaled)
print("--- TRAIN BASE ---")
print(f"AUC   : {roc_auc_score(y_train_sm, y_train_proba):.4f}")
print(classification_report(y_train_sm, y_train_pred, digits=4))

--- TRAIN BASE ---
AUC   : 1.0000
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000    170589
           1     1.0000    1.0000    1.0000    170589

    accuracy                         1.0000    341178
   macro avg     1.0000    1.0000    1.0000    341178
weighted avg     1.0000    1.0000    1.0000    341178



In [21]:
y_val_proba = tf_base.predict_proba(X_val_scaled)[:, 1]
y_val_pred  = tf_base.predict(X_val_scaled)
print("--- VALIDATION BASE ---")
print(f"AUC   : {roc_auc_score(y_val, y_val_proba):.4f}")
print(classification_report(y_val, y_val_pred, digits=4))

--- VALIDATION BASE ---
AUC   : 0.9632
              precision    recall  f1-score   support

           0     0.9996    0.9997    0.9997     56863
           1     0.8352    0.7755    0.8042        98

    accuracy                         0.9994     56961
   macro avg     0.9174    0.8876    0.9020     56961
weighted avg     0.9993    0.9994    0.9993     56961



Observamos que existe overfitting por lo que vamos a ajustar los hiperparametros

In [23]:
tf_base2 = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,       
    class_weight='balanced',
    random_state=42
)
tf_base2.fit(X_train_sm_scaled, y_train_sm)

In [24]:
y_train_proba2 = tf_base2.predict_proba(X_train_sm_scaled)[:, 1]
y_train_pred2  = tf_base2.predict(X_train_sm_scaled)
print("--- TRAIN BASE ---")
print(f"AUC   : {roc_auc_score(y_train_sm, y_train_proba2):.4f}")
print(classification_report(y_train_sm, y_train_pred2, digits=4))

--- TRAIN BASE ---
AUC   : 0.9997
              precision    recall  f1-score   support

           0     0.9856    0.9988    0.9921    170589
           1     0.9987    0.9854    0.9920    170589

    accuracy                         0.9921    341178
   macro avg     0.9922    0.9921    0.9921    341178
weighted avg     0.9922    0.9921    0.9921    341178



In [28]:
#importar confusion_matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_train_sm, y_train_pred2)
print("Matriz de confusión:")
print(cm)

Matriz de confusión:
[[170378    211]
 [  2491 168098]]


In [25]:
y_val_proba2 = tf_base2.predict_proba(X_val_scaled)[:, 1]
y_val_pred2  = tf_base2.predict(X_val_scaled)
print("--- VALIDATION BASE ---")
print(f"AUC   : {roc_auc_score(y_val, y_val_proba2):.4f}")
print(classification_report(y_val, y_val_pred2, digits=4))

--- VALIDATION BASE ---
AUC   : 0.9824
              precision    recall  f1-score   support

           0     0.9997    0.9982    0.9990     56863
           1     0.4420    0.8163    0.5735        98

    accuracy                         0.9979     56961
   macro avg     0.7208    0.9073    0.7862     56961
weighted avg     0.9987    0.9979    0.9982     56961



In [31]:
from sklearn.metrics import confusion_matrix
cm2= confusion_matrix(y_val_sm, y_val_pred2)
print("Matriz de confusión:")
print(cm2)

NameError: name 'y_val_sm' is not defined