In [None]:
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import statsmodels.api as sm
from lightgbm import LGBMClassifier
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
from scipy import stats

In [2]:
INPUT_ZIP = "../data/in/Fraud_Detction_Dataset.zip"  # Path to the source ZIP archive
OUTPUT_FOLDER = "../data/out/"  # Destination directory for the extracted files
TRAIN_FILENAME = "creditcard.csv"  # Name of the training CSV file

def fetch_data(input_path=INPUT_ZIP, output_dir=OUTPUT_FOLDER):
    """Extract the ZIP archive into ``output_dir`` unless a CSV is already there.

    Args:
        input_path: Path of the ZIP archive to extract.
        output_dir: Directory that receives the extracted files; it is
            created when missing.

    Returns:
        None.
    """
    # Make sure the destination directory exists before listing it.
    os.makedirs(output_dir, exist_ok=True)

    # Any file ending in ".csv" counts as "already extracted".
    csv_present = any(entry.endswith('.csv') for entry in os.listdir(output_dir))
    if csv_present:
        return

    with zipfile.ZipFile(input_path, 'r') as archive:
        archive.extractall(output_dir)


def load_data(directory=OUTPUT_FOLDER, filename=TRAIN_FILENAME):
    """Read ``filename`` from ``directory`` into a pandas DataFrame.

    Args:
        directory: Folder that contains the CSV file.
        filename: Name of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.

    Raises:
        FileNotFoundError: If the joined path does not exist.
    """
    csv_path = os.path.join(directory, filename)

    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)

    raise FileNotFoundError(f"El archivo {filename} no se encuentra en el directorio {directory}")

# Extract the archive if no CSV is present yet, then load the training CSV into `df`.
fetch_data()
df = load_data()

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve



In [4]:
def crear_datasets(datos, porc_entrena, porc_prueba, porc_valida,
                   random_state=42, stratify_col=None):
    """Split ``datos`` into training, test and validation subsets.

    The split is done in two steps: first the training part is separated
    from a hold-out part, then the hold-out part is divided into test and
    validation. The three percentages are relative weights; they are
    normalised by their sum, so they do not need to add up to 100.

    Args:
        datos: Data to split (anything accepted by ``train_test_split``).
        porc_entrena: Relative weight of the training subset.
        porc_prueba: Relative weight of the test subset.
        porc_valida: Relative weight of the validation subset.
        random_state: Seed forwarded to both ``train_test_split`` calls.
            Defaults to 42, the value that was previously hard-coded.
        stratify_col: Optional column name of ``datos`` used to stratify
            both splits (useful for heavily imbalanced targets). When
            ``None`` (default) no stratification is applied.

    Returns:
        Tuple ``(data_entrena, data_prueba, data_valida)``.

    Raises:
        ValueError: If any of the three percentages is not strictly positive.
    """
    # Fail early with a clear message instead of a ZeroDivisionError or an
    # opaque error raised from inside train_test_split.
    if min(porc_entrena, porc_prueba, porc_valida) <= 0:
        raise ValueError(
            "Los porcentajes de entrenamiento, prueba y validación deben ser mayores que cero"
        )

    total = porc_entrena + porc_prueba + porc_valida

    # Step 1: training vs. hold-out (test + validation).
    fraccion_reserva = (porc_prueba + porc_valida) / total
    data_entrena, data_reserva = train_test_split(
        datos,
        test_size=fraccion_reserva,
        random_state=random_state,
        shuffle=True,
        stratify=None if stratify_col is None else datos[stratify_col],
    )

    # Step 2: hold-out is divided into test and validation.
    fraccion_valida = porc_valida / (porc_prueba + porc_valida)
    data_prueba, data_valida = train_test_split(
        data_reserva,
        test_size=fraccion_valida,
        random_state=random_state,
        shuffle=True,
        stratify=None if stratify_col is None else data_reserva[stratify_col],
    )
    return data_entrena, data_prueba, data_valida

In [7]:
# Split `df` with weights 70/15/15 into training, test and validation partitions
# and report the shape of each one.
df_entrena,df_pru,df_valida=crear_datasets(df,70,15,15) 
print(f"Dimensiones del dataset de entrenamiento: {df_entrena.shape}") 
print(f"Dimensiones del dataset de prueba: {df_pru.shape}") 
print(f"Dimensiones del dataset de validación: {df_valida.shape}") 

Dimensiones del dataset de entrenamiento: (199364, 31)
Dimensiones del dataset de prueba: (42721, 31)
Dimensiones del dataset de validación: (42722, 31)


In [12]:
# Per-column quartiles of the training partition (computed over every column
# of df_entrena) and the resulting inter-quartile range.
Q1, Q3 = (df_entrena.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences placed 3 * IQR beyond the quartiles.
lower_fence = Q1 - 3 * IQR
upper_fence = Q3 + 3 * IQR

# A row is an outlier when at least one of its values falls outside the fences.
is_outlier_row = ((df_entrena < lower_fence) | (df_entrena > upper_fence)).any(axis=1)

# Rows with Class == 1 are always kept, whether or not they are outliers.
is_class_one = df_entrena['Class'] == 1

df_cleaned_entrena = df_entrena[is_class_one | ~is_outlier_row]

# Display the filtered DataFrame.
df_cleaned_entrena

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
2557,2105.0,-2.289565,-0.480260,0.818685,-1.706423,0.822102,-1.660326,0.944047,-0.541765,1.323156,...,-0.210837,0.914737,0.867888,0.422969,0.310584,-0.781488,0.392241,-0.147757,1.00,0
152342,97283.0,-1.809763,-0.567439,2.265186,-0.960318,-1.212537,1.516493,-1.417176,0.903421,1.961027,...,-0.509915,-0.424978,-0.268621,0.010121,0.466862,0.835540,-0.062385,0.088079,75.00,0
103385,68628.0,1.192319,0.178575,0.141491,0.459628,-0.049959,-0.112122,-0.163883,0.155740,-0.067566,...,-0.240464,-0.739862,0.116799,-0.373837,0.125470,0.130126,-0.016956,0.011937,1.98,0
8771,11951.0,-0.963451,0.700311,1.097333,-1.547626,0.669966,0.513533,0.333683,0.270900,1.381880,...,-0.279519,-0.470181,-0.124037,-1.388839,-0.237453,0.785347,0.349708,0.216207,37.31,0
277115,167472.0,-0.428006,0.635064,-2.273366,-1.377586,2.615307,3.294754,-0.085830,1.225613,-0.043560,...,0.106080,0.207328,0.326171,0.622640,-0.845222,0.316149,-0.237558,0.047550,45.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207892,136875.0,1.952840,-0.965820,-1.325992,-0.979280,-0.026764,0.205408,-0.411692,0.012614,-0.598558,...,0.072499,0.180845,0.321661,-1.031229,-0.441599,-0.337488,0.011392,-0.055575,77.00,0
110268,71750.0,0.960530,-1.006608,0.799673,-1.216255,-1.541568,-0.670999,-0.610605,0.058303,1.696968,...,0.254714,0.653521,-0.204047,0.621861,0.370590,0.070796,0.015218,0.037579,135.00,0
119879,75618.0,1.173488,0.100792,0.490512,0.461596,-0.296377,-0.213165,-0.165254,0.119221,-0.114199,...,-0.186027,-0.574283,0.161405,-0.006140,0.091444,0.109235,-0.020922,0.003967,1.98,0
131932,79795.0,-0.146609,0.992946,1.524591,0.485774,0.349308,-0.815198,1.076640,-0.395316,-0.491303,...,0.052649,0.354089,-0.291198,0.402849,0.237383,-0.398467,-0.121139,-0.196195,3.94,0


In [14]:
# Shape of the training partition before the outlier filter.
df_entrena.shape

(199364, 31)

In [13]:
# Shape of the training partition after the outlier filter.
df_cleaned_entrena.shape

(162361, 31)

In [15]:
# Number of rows removed by the outlier filter (row count before minus row count after).
print(f"Número de filas eliminadas: {df_entrena.shape[0] - df_cleaned_entrena.shape[0]}")

Número de filas eliminadas: 37003


In [16]:
# Separate the predictors from the "Class" target for each partition.
# Training uses the outlier-filtered frame; test and validation use the
# partitions exactly as they came out of the split.
X_entrena, y_entrena = df_cleaned_entrena.drop(columns="Class"), df_cleaned_entrena["Class"].copy()
X_pru, y_pru = df_pru.drop(columns="Class"), df_pru["Class"].copy()
X_valida, y_valida = df_valida.drop(columns="Class"), df_valida["Class"].copy()

In [17]:
from sklearn.preprocessing import RobustScaler

# Columns replaced by a scaled version; every other feature is left untouched.
COLUMNAS_ESCALADAS = ["Time", "Amount"]

# The scaler is fitted on the (outlier-filtered) training rows only and then
# applied unchanged to every partition.
scaler = RobustScaler()
scaler.fit(df_cleaned_entrena[COLUMNAS_ESCALADAS])


def _escalar_particion(df_origen):
    """Build the feature matrix of one partition with scaled Time/Amount.

    The matrix is rebuilt from the untouched source partition instead of
    mutating an existing ``X_*`` frame in place, so this cell can be
    re-run safely (the in-place version raised ``KeyError`` on a second
    run because "Time" and "Amount" had already been dropped).

    Args:
        df_origen: Partition that still contains "Class", "Time" and "Amount".

    Returns:
        New DataFrame without "Class", "Time" and "Amount", with
        "Time_scaled" and "Amount_scaled" appended as the last two columns.
    """
    X = df_origen.drop(columns=["Class"] + COLUMNAS_ESCALADAS)
    X[["Time_scaled", "Amount_scaled"]] = scaler.transform(df_origen[COLUMNAS_ESCALADAS])
    return X


X_entrena = _escalar_particion(df_cleaned_entrena)
X_valida = _escalar_particion(df_valida)
X_pru = _escalar_particion(df_pru)

### XGBClassifier

In [18]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
import xgboost as xgb
from xgboost import XGBClassifier

# Baseline XGBoost model: only the objective and the seed are set explicitly;
# every other hyper-parameter keeps the library default.
parametros_base = {
    "objective": 'binary:logistic',
    "random_state": 42,
}
xgb_model = XGBClassifier(**parametros_base)

# Fit on the training features and labels.
xgb_model.fit(X_entrena, y_entrena)


In [19]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Score the model on the same data it was trained on: probability of the
# positive class (second column of predict_proba) and hard labels.
y_train_pred_proba = xgb_model.predict_proba(X_entrena)[:, 1]
y_train_pred = xgb_model.predict(X_entrena)

# ROC AUC on the training data, computed from the positive-class probabilities.
auc_train = roc_auc_score(y_entrena, y_train_pred_proba)
print(f"AUC en entrenamiento: {auc_train}")

# Classification report on the training data, computed from the hard labels.
reporte_train = classification_report(y_entrena, y_train_pred)
print("Reporte de clasificación en entrenamiento:")
print(reporte_train)

AUC en entrenamiento: 1.0
Reporte de clasificación en entrenamiento:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    162005
           1       1.00      1.00      1.00       356

    accuracy                           1.00    162361
   macro avg       1.00      1.00      1.00    162361
weighted avg       1.00      1.00      1.00    162361



In [23]:
# Score the validation partition: probability of the positive class
# (second column of predict_proba) and hard labels.
y_proba_valida = xgb_model.predict_proba(X_valida)[:, 1]
y_pred_valida = xgb_model.predict(X_valida)

# ROC AUC on validation, computed from the positive-class probabilities.
auc_valida = roc_auc_score(y_valida, y_proba_valida)
print(f"AUC en validación: {auc_valida}")

# Classification report (precision, recall, F1, ...) from the hard labels.
reporte_valida = classification_report(y_valida, y_pred_valida)
print("Reporte de clasificación en validación:")
print(reporte_valida)

AUC en validación: 0.99030517850382
Reporte de clasificación en validación:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     42663
           1       0.24      0.92      0.38        59

    accuracy                           1.00     42722
   macro avg       0.62      0.96      0.69     42722
weighted avg       1.00      1.00      1.00     42722



In [None]:
# Confusion matrix on the validation partition (rows: true class,
# columns: predicted class).
cm_validation = confusion_matrix(y_valida, y_pred_valida)
print("Matriz de confusión:")
# Print the matrix computed above; the previous `cm` name is never defined
# in this notebook and raises NameError on a fresh kernel.
print(cm_validation)

Matriz de confusión:
[[42492   171]
 [    5    54]]
