In [2]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)


TRAIN_PATH = '../data/train_clean.csv'
TEST_PATH = '../data/test_clean.csv'

# Cargar datasets
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Comprobaciones 
print("-Archivos cargados-")
print(f"Train shape: {train.shape}")
print(f"Test  shape: {test.shape}")

print("\n--- Primeras 5 filas de train ---")
display(train.head())

print("\n--- Tipos de columnas (train) ---")
print(train.dtypes.value_counts())

# Distribucion de la variable objetivo en train
if 'label' in train.columns:
    vc = train['label'].value_counts(dropna=False).sort_index()
    pct = train['label'].value_counts(normalize=True, dropna=False).sort_index() * 100
    dist_df = pd.DataFrame({'count': vc, 'pct': pct.round(4)})
    print("\n--- Distribución 'label' (train) ---")
    print(dist_df)
else:
    print("La columna 'label' NO se encontró en train.")


-Archivos cargados-
Train shape: (7030723, 16)
Test  shape: (7027943, 16)

--- Primeras 5 filas de train ---


Unnamed: 0,user_id,age_range,gender,merchant_id,label,activity_len,actions_0,actions_2,actions_3,unique_items,unique_categories,unique_brands,date_min,date_max,day_span,has_1111
0,34176,6,0,944,-1,1,1,0,0,1,1,1,2014-11-07,2014-11-07,0,0
1,34176,6,0,412,-1,8,8,0,0,7,4,1,2014-08-18,2014-10-31,74,0
2,34176,6,0,1945,-1,7,7,0,0,3,2,1,2014-08-18,2014-08-20,2,0
3,34176,6,0,4752,-1,1,1,0,0,1,1,1,2014-10-27,2014-10-27,0,0
4,34176,6,0,643,-1,1,0,0,1,1,1,1,2014-10-24,2014-10-24,0,0



--- Tipos de columnas (train) ---
int64     14
object     2
Name: count, dtype: int64

--- Distribución 'label' (train) ---
         count      pct
label                  
-1     6769859  96.2897
 0      244912   3.4835
 1       15952   0.2269


## XGBoost

In [None]:

import xgboost as xgb

# Convertir a DMatrix (estructura de datos interna optimizada de XGBoost para almacenar features y etiquetas.)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val,   label=y_val)

# Parametros
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.05,            # learning_rate
    'max_depth': 6,
    'scale_pos_weight': scale_pos_weight,
    'verbosity': 1,
    'tree_method': 'hist'   # más eficiente; cambia a 'auto' si prefieres
}

watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Entrenar con early stopping (num_boost_round = n_estimators original)
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10
)

# Predicciones de probabilidad en validacion
y_val_proba = bst.predict(dval)
y_val_pred = (y_val_proba >= 0.5).astype(int)

# Metricas
print("\n--- Métricas en validacion ---")
print("AUC (val):", roc_auc_score(y_val, y_val_proba))
print("\nClassification report (val):")
print(classification_report(y_val, y_val_pred, digits=4))
print("\nConfusion matrix (val):")
print(confusion_matrix(y_val, y_val_pred))


fi = bst.get_score(importance_type='gain')
fi_sorted = sorted(fi.items(), key=lambda x: x[1], reverse=True)

print("\nTop 20 features (gain):")
for f, v in fi_sorted[:20]:
    print(f"{f:20s} -> gain: {v:.6f}")


[0]	train-auc:0.62887	eval-auc:0.61302
[10]	train-auc:0.64052	eval-auc:0.62505
[20]	train-auc:0.64482	eval-auc:0.62693
[30]	train-auc:0.64959	eval-auc:0.62843
[40]	train-auc:0.65441	eval-auc:0.63060
[50]	train-auc:0.65942	eval-auc:0.63220
[60]	train-auc:0.66324	eval-auc:0.63318
[70]	train-auc:0.66870	eval-auc:0.63517
[80]	train-auc:0.67376	eval-auc:0.63780
[90]	train-auc:0.67773	eval-auc:0.63952
[100]	train-auc:0.68035	eval-auc:0.64001
[110]	train-auc:0.68331	eval-auc:0.64067
[120]	train-auc:0.68562	eval-auc:0.64093
[130]	train-auc:0.68866	eval-auc:0.64181
[140]	train-auc:0.69117	eval-auc:0.64157
[150]	train-auc:0.69443	eval-auc:0.64231
[160]	train-auc:0.69689	eval-auc:0.64290
[170]	train-auc:0.70028	eval-auc:0.64351
[180]	train-auc:0.70285	eval-auc:0.64372
[190]	train-auc:0.70442	eval-auc:0.64355
[200]	train-auc:0.70611	eval-auc:0.64305
[210]	train-auc:0.70904	eval-auc:0.64291
[220]	train-auc:0.71125	eval-auc:0.64261
[225]	train-auc:0.71203	eval-auc:0.64252

--- Métricas en validacion

In [7]:
from sklearn.metrics import precision_recall_curve, average_precision_score, f1_score, classification_report, confusion_matrix

# 1) PR-AUC (average precision)
ap = average_precision_score(y_val, y_val_proba)
print("Average Precision (PR-AUC):", round(ap, 6))

# 2) Encontrar umbral que maximice F1 en validacion
precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_proba)
f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-12)
best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print(f"Mejor threshold por F1: {best_threshold:.4f}  -> F1: {best_f1:.4f}")
print(f"Precision@best: {precisions[:-1][best_idx]:.4f}, Recall@best: {recalls[:-1][best_idx]:.4f}")

# 3) Evaluar con ese umbral
y_val_pred_best = (y_val_proba >= best_threshold).astype(int)
print("\nClassification report (con threshold optimo):")
print(classification_report(y_val, y_val_pred_best, digits=4))
print("\nConfusion matrix (con threshold optimo):")
print(confusion_matrix(y_val, y_val_pred_best))

Average Precision (PR-AUC): 0.118822
Mejor threshold por F1: 0.5827  -> F1: 0.1832
Precision@best: 0.1319, Recall@best: 0.2997

Classification report (con threshold optimo):
              precision    recall  f1-score   support

           0     0.9503    0.8715    0.9092     48983
           1     0.1319    0.2997    0.1832      3190

    accuracy                         0.8366     52173
   macro avg     0.5411    0.5856    0.5462     52173
weighted avg     0.9002    0.8366    0.8648     52173


Confusion matrix (con threshold optimo):
[[42690  6293]
 [ 2234   956]]


## LightGBM

In [12]:

import lightgbm as lgb


df_model = train[train['label'].isin([0,1])].copy()  


merchant_counts = df_model['merchant_id'].value_counts()
df_model['merchant_freq'] = df_model['merchant_id'].map(merchant_counts)


features_lgb = [
    'activity_len','actions_0','actions_2','actions_3',
    'unique_items','unique_categories','unique_brands',
    'day_span','has_1111','age_range','gender','merchant_freq'
]

X = df_model[features_lgb]
y = df_model['label'].astype(int)


X_train_lgb, X_val_lgb, y_train_lgb, y_val_lgb = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)


scale_pos_weight_lgb = (y_train_lgb==0).sum() / (y_train_lgb==1).sum()
print("Shapes -> X_train_lgb:", X_train_lgb.shape, "X_val_lgb:", X_val_lgb.shape)
print("scale_pos_weight (LGB):", scale_pos_weight_lgb)

# ---------- Dataset LightGBM ----------
lgb_train = lgb.Dataset(X_train_lgb, label=y_train_lgb)
lgb_val   = lgb.Dataset(X_val_lgb, label=y_val_lgb, reference=lgb_train)

# ---------- Parametros  ----------
lgb_params = {
    'objective': 'binary',
    'metric': 'average_precision',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': 6,
    'verbosity': -1,
    'is_unbalance': False,          
    'scale_pos_weight': scale_pos_weight_lgb
}


gbm = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train','valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=10)
    ]
)

# Predicciones y evaluacion 
y_val_proba_lgb = gbm.predict(X_val_lgb)
from sklearn.metrics import average_precision_score, classification_report, confusion_matrix
ap_lgb = average_precision_score(y_val_lgb, y_val_proba_lgb)
print("\nLGB PR-AUC (Average Precision):", round(ap_lgb, 6))

# Buscar threshold que maximice F1
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_val_lgb, y_val_proba_lgb)
f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-12)
best_idx = f1_scores.argmax()
best_threshold_lgb = thresholds[best_idx]
print(f"Mejor threshold LGB por F1: {best_threshold_lgb:.4f} -> F1: {f1_scores[best_idx]:.4f}")
y_val_pred_lgb = (y_val_proba_lgb >= best_threshold_lgb).astype(int)

print("\nClassification report (LGB, threshold optimo):")
print(classification_report(y_val_lgb, y_val_pred_lgb, digits=4))
print("\nConfusion matrix (LGB, threshold optimo):")
print(confusion_matrix(y_val_lgb, y_val_pred_lgb))

# Top features (gain/importance)
imp = gbm.feature_importance(importance_type='gain')
names = gbm.feature_name()
feat_imp = sorted(zip(names, imp), key=lambda x: x[1], reverse=True)
print("\nTop 20 features (gain) LGB:")
for n, v in feat_imp[:20]:
    print(f"{n:20s} -> gain: {v:.6f}")


Shapes -> X_train_lgb: (208691, 12) X_val_lgb: (52173, 12)
scale_pos_weight (LGB): 15.352530951261558
Training until validation scores don't improve for 50 rounds
[10]	train's average_precision: 0.117801	valid's average_precision: 0.11549
[20]	train's average_precision: 0.120762	valid's average_precision: 0.117246
[30]	train's average_precision: 0.122879	valid's average_precision: 0.117603
[40]	train's average_precision: 0.125651	valid's average_precision: 0.118473
[50]	train's average_precision: 0.128153	valid's average_precision: 0.119987
[60]	train's average_precision: 0.131758	valid's average_precision: 0.119949
[70]	train's average_precision: 0.134135	valid's average_precision: 0.119607
[80]	train's average_precision: 0.138021	valid's average_precision: 0.121288
[90]	train's average_precision: 0.141099	valid's average_precision: 0.122117
[100]	train's average_precision: 0.144531	valid's average_precision: 0.122797
[110]	train's average_precision: 0.14735	valid's average_precision: