In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
INPUT_ZIP = "../data/in/Fraud_Detction_Dataset.zip"  # Directorio del zip
OUTPUT_FOLDER = "../data/out/"  # Directorio de destino
TRAIN_FILENAME = "creditcard.csv"  # Nombre del fichero de entrenamiento

def fetch_data(input_path=INPUT_ZIP, output_dir=OUTPUT_FOLDER):
    
    # Comprobación de que el directorio de destino existe
    os.makedirs(output_dir, exist_ok=True)

    # Descomprime el archivo ZIP en caso de que no haya ningún csv en la carpeta
    if(len([file for file in os.listdir(output_dir) if file.endswith('.csv')]) == 0):
        with zipfile.ZipFile(input_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)


def load_data(directory=OUTPUT_FOLDER, filename=TRAIN_FILENAME):

    # Construir la ruta completa al archivo CSV
    file_path = os.path.join(directory, filename)

    # Verificar si el archivo existe
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"El archivo {filename} no se encuentra en el directorio {directory}")

    # Leer el archivo CSV en un DataFrame
    return pd.read_csv(file_path)

fetch_data()
df = load_data()

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

In [32]:
#creamos nueva variable para que nos de una información más clara de la hora
df['Hour'] = (df['Time'] // 3600) % 24

KeyError: 'Time'

In [4]:
df['Amount'] = np.log1p(df['Amount'])
scaler = StandardScaler()
df['ScaledAmount'] = scaler.fit_transform(df[['Amount']])

In [6]:
df['Hour_Scaled'] = scaler.fit_transform(df[['Hour']])

In [7]:
df.drop(columns=['Amount','Hour','Time'],inplace=True)
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,ScaledAmount,Hour_Scaled
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,1.124303,-2.40693
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,0,-1.114639,-2.40693
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,1.682368,-2.40693
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,1.009339,-2.40693
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,0,0.670241,-2.40693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0,-1.558093,1.53423
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,0,0.059034,1.53423
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,0,0.652027,1.53423
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,0,-0.455313,1.53423


In [8]:
X=df.drop(columns=['Class'],axis=1)
y=df['Class']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
scale_pos_weight = (y_train.value_counts()[0] / y_train.value_counts()[1]) - 1
print(f"scale_pos_weight: {scale_pos_weight}")

scale_pos_weight: 576.2868020304569


In [23]:
from xgboost import XGBClassifier
model = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
    use_label_encoder=False,
    eval_metric='auc'
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [24]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, y_proba))


Confusion Matrix:
 [[56707   157]
 [   11    87]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.36      0.89      0.51        98

    accuracy                           1.00     56962
   macro avg       0.68      0.94      0.75     56962
weighted avg       1.00      1.00      1.00     56962


ROC AUC Score: 0.9813859491461189


In [17]:

from imblearn.over_sampling import SMOTE
#aplicando metrica de oversampling
smote = SMOTE(random_state=42)

In [18]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [19]:
y_train_resampled.value_counts()

Class
0    227451
1    227451
Name: count, dtype: int64

In [20]:
model2 = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
    use_label_encoder=False,
    eval_metric='auc'
)
model2.fit(X_train_resampled, y_train_resampled)

Parameters: { "use_label_encoder" } are not used.



In [25]:
y_pred2 = model2.predict(X_test)
y_proba2 = model2.predict_proba(X_test)[:, 1] 
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred2))
print("\nClassification Report:\n", classification_report(y_test, y_pred2))
print("\nROC AUC Score:", roc_auc_score(y_test, y_proba))

Confusion Matrix:
 [[56517   347]
 [   10    88]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     56864
           1       0.20      0.90      0.33        98

    accuracy                           0.99     56962
   macro avg       0.60      0.95      0.66     56962
weighted avg       1.00      0.99      1.00     56962


ROC AUC Score: 0.9813859491461189


Seleccionamos el modelo sin smote ya que obtenemos mejores resultados y optimizaremos los hiperparametros para ver si mejoran las metricas

### Optimizacion de hiperparametros

### Randomized Search

In [28]:
from scipy.stats import uniform, randint
param_dist = {
    'n_estimators': randint(100, 200),
    'max_depth': randint(3, 7),
    'learning_rate': uniform(0.01, 0.2) 
}
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=30,              
    scoring='roc_auc',      
    cv=3,                   
    verbose=2,
    n_jobs=-1,              
    random_state=42
)
random_search.fit(X_train, y_train)
print("Mejores parámetros encontrados:")
print(random_search.best_params_)
print("Mejor puntuación de validación cruzada:")
print(random_search.best_score_)


Fitting 3 folds for each of 30 candidates, totalling 90 fits


Parameters: { "use_label_encoder" } are not used.



Mejores parámetros encontrados:
{'learning_rate': np.float64(0.11495128632644756), 'max_depth': 6, 'n_estimators': 188}
Mejor puntuación de validación cruzada:
0.9790107626854848


### Grid Search


In [30]:
param_grid = {
    'learning_rate': [ 0.10, 0.12, 0.14],
    'max_depth': [5, 6, 7],
    'n_estimators': [170, 180, 190, 200]
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print(" Mejor combinación encontrada:")
print(grid_search.best_params_)
print("Mejor puntuación de validación cruzada:")
print(grid_search.best_score_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


Parameters: { "use_label_encoder" } are not used.



 Mejor combinación encontrada:
{'learning_rate': 0.14, 'max_depth': 7, 'n_estimators': 190}
Mejor puntuación de validación cruzada:
0.9810218143514181


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt


best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Matriz de confusión
cm = confusion_matrix(y_test, y_pred)
print("Matriz de confusion")
print(cm)

print(classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc:.4f}")

Confusion Matrix:
[[56861     3]
 [   18    80]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.96      0.82      0.88        98

    accuracy                           1.00     56962
   macro avg       0.98      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962

ROC AUC Score: 0.9828


In [34]:

from sklearn.linear_model import LogisticRegression

model3=LogisticRegression(random_state=42)
model3.fit(X_train, y_train)

In [40]:
y_pred3 = model3.predict(X_test)
y_proba3 = model3.predict_proba(X_test)[:, 1]  
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred3))
print("\nClassification Report:\n", classification_report(y_test, y_pred3))


Confusion Matrix:
 [[56854    10]
 [   40    58]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.85      0.59      0.70        98

    accuracy                           1.00     56962
   macro avg       0.93      0.80      0.85     56962
weighted avg       1.00      1.00      1.00     56962



In [37]:
model4=LogisticRegression(random_state=42)
model4.fit(X_train_resampled, y_train_resampled)

In [41]:
y_pred4 = model4.predict(X_test)
y_proba4 = model4.predict_proba(X_test)[:, 1] 
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred4))
print("\nClassification Report:\n", classification_report(y_test, y_pred4))


Confusion Matrix:
 [[55394  1470]
 [    8    90]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962



Observamos que el modelo entrenado sin oversampling tiene mejores resultados