In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
INPUT_ZIP = "../data/in/Fraud_Detction_Dataset.zip"  # Directorio del zip
OUTPUT_FOLDER = "../data/out/"  # Directorio de destino
TRAIN_FILENAME = "creditcard.csv"  # Nombre del fichero de entrenamiento

def fetch_data(input_path=INPUT_ZIP, output_dir=OUTPUT_FOLDER):
    
    # Comprobación de que el directorio de destino existe
    os.makedirs(output_dir, exist_ok=True)

    # Descomprime el archivo ZIP en caso de que no haya ningún csv en la carpeta
    if(len([file for file in os.listdir(output_dir) if file.endswith('.csv')]) == 0):
        with zipfile.ZipFile(input_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)


def load_data(directory=OUTPUT_FOLDER, filename=TRAIN_FILENAME):

    # Construir la ruta completa al archivo CSV
    file_path = os.path.join(directory, filename)

    # Verificar si el archivo existe
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"El archivo {filename} no se encuentra en el directorio {directory}")

    # Leer el archivo CSV en un DataFrame
    return pd.read_csv(file_path)

fetch_data()
df = load_data()

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

In [None]:
def crear_datasets(datos, target_col, porc_entrena, porc_prueba, porc_valida):
    y = datos[target_col]

    tamaño_prueba = (porc_prueba + porc_valida) / (porc_entrena + porc_prueba + porc_valida)
    data_entrena, data_test = train_test_split(
        datos, test_size=tamaño_prueba, random_state=42, shuffle=True, stratify=y
    )
    
    y_test = data_test[target_col]
    tamaño_prueba = porc_valida / (porc_prueba + porc_valida)
    data_prueba, data_valida = train_test_split(
        data_test, test_size=tamaño_prueba, random_state=42, shuffle=True, stratify=y_test
    )

    return data_entrena, data_prueba, data_valida


In [7]:
# Llamada a la función
df_train, df_test, df_val = crear_datasets(df, target_col='Class', porc_entrena=0.7, porc_prueba=0.15, porc_valida=0.15)


In [8]:
df_train['Amount'] = np.log1p(df_train['Amount'])
df_test['Amount'] = np.log1p(df_test['Amount'])
df_val['Amount'] = np.log1p(df_val['Amount'])


scaler_amount = StandardScaler()
scaler_time = StandardScaler()

df_train['ScaledAmount'] = scaler_amount.fit_transform(df_train[['Amount']])
df_train['ScaledTime'] = scaler_time.fit_transform(df_train[['Time']])

df_test['ScaledAmount'] = scaler_amount.transform(df_test[['Amount']])
df_test['ScaledTime'] = scaler_time.transform(df_test[['Time']])

df_val['ScaledAmount'] = scaler_amount.transform(df_val[['Amount']])
df_val['ScaledTime'] = scaler_time.transform(df_val[['Time']])

df_train.drop(columns=['Time', 'Amount'], inplace=True)
df_test.drop(columns=['Time', 'Amount'], inplace=True)
df_val.drop(columns=['Time', 'Amount'], inplace=True)

In [9]:
X_train= df_train.drop("Class",axis=1) 
y_train=df_train["Class"].copy() 
X_test= df_test.drop("Class",axis=1) 
y_test=df_test["Class"].copy() 
X_val= df_val.drop("Class",axis=1) 
y_val=df_val["Class"].copy()

In [10]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

((199364, 30), (199364,), (42721, 30), (42721,), (42722, 30), (42722,))

### XGBoostClassifier


In [11]:
scale_pos_weight = (y_train.value_counts()[0] / y_train.value_counts()[1]) - 1
print(f"scale_pos_weight: {scale_pos_weight}")

scale_pos_weight: 577.546511627907


In [12]:
from xgboost import XGBClassifier
model = XGBClassifier(
  scale_pos_weight=scale_pos_weight,
    n_estimators=100,        
    learning_rate=0.1,        
    max_depth=4,              
    subsample=0.8,            
    colsample_bytree=0.8,    
    reg_alpha=0.1,            
    reg_lambda=1,             
    random_state=42,
    use_label_encoder=False,
    eval_metric='auc'
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [13]:
# 2. Evaluar en train
train_preds = model.predict(X_train)
print("Train", classification_report(y_train, train_preds))

# 3. Evaluar en val
val_preds = model.predict(X_val)
print("Validation:", classification_report(y_val, val_preds))

confusion_matrix_train = confusion_matrix(y_train, train_preds)
confusion_matrix_val = confusion_matrix(y_val, val_preds)
print("Confusion Matrix Train:")
print(confusion_matrix_train)
print("Confusion Matrix Validation:")
print(confusion_matrix_val)
y_proba = model.predict_proba(X_val)[:, 1]
print("\nROC AUC Score:", roc_auc_score(y_val, y_proba))

Train               precision    recall  f1-score   support

           0       1.00      1.00      1.00    199020
           1       0.53      1.00      0.70       344

    accuracy                           1.00    199364
   macro avg       0.77      1.00      0.85    199364
weighted avg       1.00      1.00      1.00    199364

Validation:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     42648
           1       0.48      0.85      0.61        74

    accuracy                           1.00     42722
   macro avg       0.74      0.92      0.81     42722
weighted avg       1.00      1.00      1.00     42722

Confusion Matrix Train:
[[198721    299]
 [     0    344]]
Confusion Matrix Validation:
[[42579    69]
 [   11    63]]

ROC AUC Score: 0.9616375660973298


In [14]:
from imblearn.over_sampling import SMOTE
#aplicando metrica de oversampling
smote = SMOTE(random_state=42)

In [15]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [16]:
y_train_resampled.value_counts()

Class
0    199020
1    199020
Name: count, dtype: int64

In [17]:
model2 = XGBClassifier(
  scale_pos_weight=scale_pos_weight,
    n_estimators=100,        
    learning_rate=0.1,        
    max_depth=4,              
    subsample=0.8,            
    colsample_bytree=0.8,    
    reg_alpha=0.1,            
    reg_lambda=1,             
    random_state=42,
    use_label_encoder=False,
    eval_metric='auc'
)

model2.fit(X_train_resampled, y_train_resampled)

Parameters: { "use_label_encoder" } are not used.



In [18]:
# 2. Evaluar en train
train_preds2 = model2.predict(X_train_resampled)
print("Train", classification_report(y_train_resampled, train_preds2))

# 3. Evaluar en val
val_preds2 = model.predict(X_val)
print("Validation:", classification_report(y_val, val_preds2))

confusion_matrix_train2 = confusion_matrix(y_train_resampled, train_preds2)
confusion_matrix_val2 = confusion_matrix(y_val, val_preds2)
print("Confusion Matrix Train:")
print(confusion_matrix_train2)
print("Confusion Matrix Validation:")
print(confusion_matrix_val2)
y_proba2 = model.predict_proba(X_val)[:, 1]
print("\nROC AUC Score:", roc_auc_score(y_val, y_proba2))

Train               precision    recall  f1-score   support

           0       1.00      0.92      0.96    199020
           1       0.92      1.00      0.96    199020

    accuracy                           0.96    398040
   macro avg       0.96      0.96      0.96    398040
weighted avg       0.96      0.96      0.96    398040

Validation:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     42648
           1       0.48      0.85      0.61        74

    accuracy                           1.00     42722
   macro avg       0.74      0.92      0.81     42722
weighted avg       1.00      1.00      1.00     42722

Confusion Matrix Train:
[[182801  16219]
 [     0 199020]]
Confusion Matrix Validation:
[[42579    69]
 [   11    63]]

ROC AUC Score: 0.9616375660973298


### Optimizacion de hiperparametros

### Randomized Search

In [19]:
from scipy.stats import uniform, randint
param_dist = {
    'n_estimators': randint(100, 200),
    'max_depth': randint(3, 7),
    'learning_rate': uniform(0.01, 0.2) 
}
random_search = RandomizedSearchCV(
    estimator=model2,
    param_distributions=param_dist,
    n_iter=30,              
    scoring='roc_auc',      
    cv=3,                   
    verbose=2,
    n_jobs=-1,              
    random_state=42
)
random_search.fit(X_train_resampled, y_train_resampled)
print("Mejores parámetros encontrados:")
print(random_search.best_params_)
print("Mejor puntuación de validación cruzada:")
print(random_search.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


Parameters: { "use_label_encoder" } are not used.



Mejores parámetros encontrados:
{'learning_rate': np.float64(0.19437484700462337), 'max_depth': 6, 'n_estimators': 177}
Mejor puntuación de validación cruzada:
0.999989567928675


In [20]:
param_grid = {
    'learning_rate': [0.18, 0.19, 0.20],
    'max_depth': [5, 6, 7],
    'n_estimators': [175, 180, 185]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Mejor combinación encontrada:")
print(grid_search.best_params_)
print("Mejor puntuación de validación cruzada:")
print(grid_search.best_score_)


Fitting 3 folds for each of 27 candidates, totalling 81 fits


Parameters: { "use_label_encoder" } are not used.



Mejor combinación encontrada:
{'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 180}
Mejor puntuación de validación cruzada:
0.9999910330517031


Finalmente evaluamos en el dataset de test

In [22]:
# Evaluar en el conjunto de test sin balancear
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Matriz de confusión
cm = confusion_matrix(y_test, y_pred)
print("Matriz de confusión:")
print(cm)

print(classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc:.4f}")


Matriz de confusión:
[[42593    54]
 [   13    61]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     42647
           1       0.53      0.82      0.65        74

    accuracy                           1.00     42721
   macro avg       0.77      0.91      0.82     42721
weighted avg       1.00      1.00      1.00     42721

ROC AUC Score: 0.9688


### Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

model3=LogisticRegression(random_state=42)
model3.fit(X_train, y_train)

In [25]:
y_pred3 = model3.predict(X_test)
y_proba3 = model3.predict_proba(X_test)[:, 1]  
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred3))
print("\nClassification Report:\n", classification_report(y_test, y_pred3))
print("AUC:", roc_auc_score(y_test, y_proba3))

Confusion Matrix:
 [[42636    11]
 [   30    44]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     42647
           1       0.80      0.59      0.68        74

    accuracy                           1.00     42721
   macro avg       0.90      0.80      0.84     42721
weighted avg       1.00      1.00      1.00     42721

AUC: 0.9565879923114898


In [26]:
model4=LogisticRegression(random_state=42)
model4.fit(X_train_resampled, y_train_resampled)

In [27]:
y_pred4 = model4.predict(X_test)
y_proba4 = model4.predict_proba(X_test)[:, 1] 
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred4))
print("\nClassification Report:\n", classification_report(y_test, y_pred4))
print("AUC:", roc_auc_score(y_test, y_proba4))

Confusion Matrix:
 [[41688   959]
 [   10    64]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     42647
           1       0.06      0.86      0.12        74

    accuracy                           0.98     42721
   macro avg       0.53      0.92      0.55     42721
weighted avg       1.00      0.98      0.99     42721

AUC: 0.9658865139907183


In [28]:
param_dist_lr = {
    'C': uniform(0.01, 10),  
    'penalty': ['l2'],      
    'solver': ['lbfgs', 'saga'],  
    'class_weight': [None, 'balanced'],  
    'max_iter': [100, 200]  
    
}

random_search_lr = RandomizedSearchCV(
    estimator=model3,
    param_distributions=param_dist_lr,
    n_iter=30,               
    scoring='roc_auc',       
    cv=3,                    
    verbose=2,
    n_jobs=-1,               
    random_state=42
)

random_search_lr.fit(X_train, y_train)

print("Mejores parámetros encontrados:")
print(random_search_lr.best_params_)
print("Mejor puntuación de validación cruzada:")
print(random_search_lr.best_score_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Mejores parámetros encontrados:
{'C': np.float64(0.15079822715084457), 'class_weight': None, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
Mejor puntuación de validación cruzada:
0.9849149528341045




In [1]:

# 1. Usar el mejor modelo encontrado
best_model_lr = random_search_lr.best_estimator_

# 2. Predecir en TEST
y_pred5 = best_model_lr.predict(X_val)
y_pred_prob5 = best_model_lr.predict_proba(X_val)[:, 1]

# 3. Matriz de confusión
print("Matriz de confusión:")
print(confusion_matrix(y_val, y_pred5))

# 4. Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred5))

# 5. AUC
print("AUC:", roc_auc_score(y_val, y_pred_prob5))


NameError: name 'random_search_lr' is not defined

## Conclusiones

XGBoost es más adecuado cuando el objetivo principal es detectar el mayor número posible de fraudes, aunque esto implique un mayor número de falsos positivos. Por otro lado, Logistic Regression ofrece un enfoque más conservador, priorizando una menor tasa de falsos positivos a costa de detectar menos fraudes. Además, Logistic Regression presenta una mejor puntuación en la curva ROC AUC, lo que indica una mejor capacidad de separación entre las clases.
