In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent.parent))

from config.config import settings
import pandas as pd
import joblib
import json
import toml

import warnings
warnings.simplefilter('ignore')

%load_ext autoreload
%autoreload 2


In [2]:
# Lectura del dataset
data_path = settings.DATA_DIR / 'processed/data_processed.parquet'
data = pd.read_parquet(path=str(data_path))

# Setear los ids como índices
data: pd.DataFrame = data.set_index('product_id')
data.sample(5, random_state=10)

Unnamed: 0_level_0,condition,state,city,local_pickup,free_shipping,shipping_mode,listing_type,buying_mode,attribute_group_id,attribute_group,...,status,accepts_mercadopago,currency,automatic_relist,title,stock_quantity,available_quantity,total_amount,date_difference_hr,time_difference_hr
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mla5501620002,used,capital federal,nuñez,True,False,not_specified,bronze,buy_it_now,,,...,active,True,ars,False,timbre inahalambrico,1,1,0.0,0.000833,1440.0
mla2357269269,used,buenos aires,avellaneda,True,False,not_specified,bronze,buy_it_now,dflt,otros,...,active,True,ars,False,lote de 2 cinturones. 1 nuevo con etiqueta.mic...,8,8,0.0,695.485278,1440.0
mla4505955642,used,buenos aires,acassuso,True,False,me2,bronze,buy_it_now,,,...,active,True,ars,False,revista instituto de historia del derecho rica...,3,3,0.0,0.000833,1440.0
mla7853937105,used,capital federal,retiro,True,False,not_specified,free,buy_it_now,,,...,active,True,ars,False,susan sontag - la enfermedad y sus metaforas -...,1,1,0.0,0.000833,1440.0
mla7813601724,new,capital federal,almagro,True,False,not_specified,silver,buy_it_now,,,...,active,True,ars,False,vendas cambric marca vendsur de 10cm x 3mt en ...,7,7,2010.0,0.000556,1440.0


In [3]:


# Separación de los conjuntos de datos
# ===================================================================================================================
from sklearn.model_selection import train_test_split

# Inicializar una semilla
SEED = 25

# Separamos los features y el target
X = data.loc[:, data.columns != 'condition'] # type: ignore
y = data.loc[:, data.columns == 'condition'].squeeze() # type: ignore

# Verificar que los índices coinciden
assert (X.index == y.index).all(), 'Los índices de X e y no coinciden'

# Dividir el conjunto original en 70% entrenamiento y 30% para pruebas y validación
X_train, X_temp, y_train, y_temp = train_test_split(
    X, 
    y, 
    test_size=0.4, 
    random_state=SEED, 
    stratify=y
)

# Luego, dividir el 30% restante en 20% para validación y 10% para pruebas
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=1/2, 
    random_state=SEED, 
    stratify=y_temp
)

# Categóricas
X_train['local_pickup'] = X_train['local_pickup'].astype('category')
X_val['local_pickup'] = X_val['local_pickup'].astype('category')
X_test['local_pickup'] = X_test['local_pickup'].astype('category')

In [4]:
from sklearn.preprocessing import LabelEncoder

# Codificación del target
le = LabelEncoder()
y_train = pd.Series(le.fit_transform(y_train), index=y_train.index)
y_val = pd.Series(le.transform(y_val), index=y_val.index)
y_test = pd.Series(le.transform(y_test), index=y_test.index)

In [5]:
from sklearn.pipeline import Pipeline

# Pipeline de procesamiento
pipe_feature_selection = joblib.load(f'{settings.ART_DIR}/pipelines/pipeline_feature_selection.pkl')
pipe_feature_engineering = joblib.load(f'{settings.ART_DIR}/pipelines/pipeline_feature_engineering.pkl')
pipe = Pipeline([
    ('feature_selection', pipe_feature_selection),
    ('feature_engineering', pipe_feature_engineering)
])

print(pipe.get_params())

{'memory': None, 'steps': [('feature_selection', Pipeline(steps=[('drop-features',
                 DropFeatures(features_to_drop=['attribute_group_id',
                                                'attribute_group',
                                                'attribute_id', 'title',
                                                'city'])),
                ('constant-features',
                 DropConstantFeatures(missing_values='ignore', tol=0.95,
                                      variables=['stock_quantity',
                                                 'available_quantity',
                                                 'total_amount',
                                                 'date_difference_hr',
                                                 'time_difference_hr', 'state',
                                                 'local_pickup',
                                                 'free_ship...
                                                 'listi

In [6]:
# Ejecución
pipe.fit(X_train, y_train)
X_train = pipe.transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(X_test)

In [7]:
import optuna
import mlflow
import mlflow.xgboost
import xgboost as xgb
import numpy as np
import plotly.graph_objects as go
from optuna.integration.mlflow import MLflowCallback

In [8]:
# --- Función para evaluar las métricas de xgb.cv ---
def eval_performance_metrics(cv_results: pd.DataFrame) -> tuple[int, float, float]:
    """
    Retorna:
      - best_iteration: número de iteración óptima según 'validation-auc'.
      - best_roc_auc: mejor AUC obtenido en validación.
      - best_logloss: log-loss en esa iteración.
    """
    best_iteration = int(cv_results['validation-auc-mean'].idxmax())
    best_roc_auc = float(cv_results.loc[best_iteration, 'validation-auc-mean'])
    best_logloss = float(cv_results.loc[best_iteration, 'validation-logloss-mean'])
    return best_iteration, best_roc_auc, best_logloss

In [9]:
# ---------------------------
# Función para ejecutar xgb.cv usando la API nativa (con GPU)
# ---------------------------
def run_experiment(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    params: dict,
    num_boost_round: int,
    nfold: int,
    early_stopping_rounds: int = None
) -> pd.DataFrame:
    
    # Vector de pesos basado en la frecuencia de cada clase
    weight_dict = y_train.value_counts(normalize=True).to_dict()
    sample_weight = np.array([weight_dict[val] for val in y_train])
    
    dtrain_cv = xgb.DMatrix(
        data=X_train,
        label=y_train,
        weight=sample_weight,
        device='cuda'  # Usa 'gpu' para activar GPU (en XGBoost 2.1.4)
    )
    
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain_cv,
        num_boost_round=num_boost_round,
        nfold=nfold,
        stratified=True,
        early_stopping_rounds=early_stopping_rounds,
        metrics=['auc', 'logloss'],
        seed=params.get('seed', SEED),
        verbose_eval=False
    )
    return cv_results

In [10]:
# Definición de parámetros fijos para el entrenamiento
num_boost_round = 1000
early_stopping_rounds = 25
default_params = toml.load(str(settings.CONFIG_DIR / 'config.toml'))['xgb-default-params']
SEED = 25

In [11]:
from xgboost.callback import LearningRateScheduler

def lr_scheduler(current_round: int, booster: xgb.Booster = None) -> float:
    base_lr = params.get('learning_rate', 0.01)
    if current_round < 100:
        return base_lr
    elif current_round < 300:
        return base_lr * 0.5
    else:
        return base_lr * 0.1

In [None]:
# --- Función objetivo para Optuna ---
def objective(trial: optuna.Trial) -> float:
    # Sugerir hiperparámetros a tunear
    global params
    params = default_params.copy()
    params['max_depth'] = trial.suggest_int('max_depth', 3, 10)
    params['learning_rate'] = trial.suggest_loguniform('learning_rate', 0.001, 0.3)
    params['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 10)
    params['subsample'] = trial.suggest_uniform('subsample', 0.5, 1.0)
    params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.5, 1.0)
    params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-8, 10.0)
    params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-8, 10.0)
    
    # Crear DMatrix para entrenamiento y validación
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    evals = [(dtrain, 'train'), (dval, 'validation')]
    callbacks = [LearningRateScheduler(lr_scheduler)]
    
    evals_result = {}
    # Entrenar modelo con early stopping
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=evals,
        early_stopping_rounds=early_stopping_rounds,
        callbacks=callbacks,
        evals_result=evals_result,
        verbose_eval=False
    )
    # La métrica que se optimiza es la AUC de validación
    best_iteration, best_roc_auc, best_logloss = eval_performance_metrics(
        pd.DataFrame({
            'train-auc-mean': evals_result['train']['auc'],
            'train-logloss-mean': evals_result['train']['logloss'],
            'validation-auc-mean': evals_result['validation']['auc'],
            'validation-logloss-mean': evals_result['validation']['logloss']
        })
    )
    trial.set_user_attr('best_iteration', best_iteration)
    trial.set_user_attr('best_roc_auc', best_roc_auc)
    
    return best_roc_auc


In [14]:
# Configuración de mlflow + optuna
mlflow.set_tracking_uri(settings.MLFLOW_TRACKING_URI)
mlflow.set_experiment('xgb_hyperparam_tuning')

mlflow_callback = MLflowCallback(
    tracking_uri=settings.MLFLOW_TRACKING_URI,
    create_experiment=False
)
    
study = optuna.create_study(
    direction='maximize'
)

study.optimize(
    objective, 
    n_trials=50, 
    callbacks=[mlflow_callback],
    n_jobs=-1,
    show_progress_bar=True
)

best_trial = study.best_trial
print(f'Mejores hiperparámetros: {best_trial.params}')
print(f'Mejor AUC en validación: {best_trial.value}')

[I 2025-02-13 21:19:46,520] A new study created in memory with name: no-name-8ba5ff3f-3a4d-4a5e-9f14-a1008e6fa296


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-02-13 21:19:55,891] Trial 8 finished with value: 0.8720666747663489 and parameters: {'max_depth': 6, 'learning_rate': 0.09945811829216437, 'min_child_weight': 3, 'subsample': 0.9603898382520749, 'colsample_bytree': 0.7536836580146604, 'reg_alpha': 0.09884773595219716, 'reg_lambda': 0.0065655223184726995}. Best is trial 8 with value: 0.8720666747663489.
[I 2025-02-13 21:19:55,915] Trial 7 finished with value: 0.8720666747663489 and parameters: {'max_depth': 6, 'learning_rate': 0.054582465297348864, 'min_child_weight': 2, 'subsample': 0.9895587339068495, 'colsample_bytree': 0.8258376219310615, 'reg_alpha': 3.083037765163854, 'reg_lambda': 6.310367657809426e-06}. Best is trial 8 with value: 0.8720666747663489.
[I 2025-02-13 21:19:55,977] Trial 10 finished with value: 0.8720666747663489 and parameters: {'max_depth': 6, 'learning_rate': 0.0018568972936553099, 'min_child_weight': 7, 'subsample': 0.6421332041539702, 'colsample_bytree': 0.984626284882188, 'reg_alpha': 8.665981356894395

In [24]:
# ---------------------------
# Entrenamiento final con los mejores hiperparámetros
# ---------------------------
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

final_params = default_params.copy()
final_params.update(best_trial.params)

evals_result = {}
final_model = xgb.train(
    final_params,
    dtrain,
    num_boost_round=best_trial.user_attrs['best_iteration'],
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=None,
    evals_result=evals_result,
    verbose_eval=False,
)

In [25]:
# ---------------------------
# Curvas de aprendizaje con Plotly
# ---------------------------
train_auc_curve = evals_result['train']['auc']
val_auc_curve = evals_result['val']['auc']
train_logloss_curve = evals_result['train']['logloss']
val_logloss_curve = evals_result['val']['logloss']

# Número de boosting rounds (asumiendo que todas las listas tienen la misma longitud)
rounds = list(range(1, len(train_auc_curve) + 1))

# --- Gráfico para AUC ---
fig_auc = go.Figure()
fig_auc.add_trace(go.Scatter(
    x=rounds,
    y=train_auc_curve,
    mode='lines+markers',
    name='Train AUC',
    line=dict(color='#da1322')
))
fig_auc.add_trace(go.Scatter(
    x=rounds,
    y=val_auc_curve,
    mode='lines+markers',
    name='Val AUC',
    line=dict(color='#19a042')
))
fig_auc.update_layout(
    title='Curva de Aprendizaje / AUC',
    xaxis_title='Boosting Rounds',
    yaxis_title='AUC',
    template='plotly_dark'
)
fig_auc.show()

# --- Gráfico para Log-loss ---
fig_logloss = go.Figure()
fig_logloss.add_trace(go.Scatter(
    x=rounds,
    y=train_logloss_curve,
    mode='lines+markers',
    name='Train Log-loss',
    line=dict(color='#da1322')
))
fig_logloss.add_trace(go.Scatter(
    x=rounds,
    y=val_logloss_curve,
    mode='lines+markers',
    name='val Log-loss',
    line=dict(color='#19a042')
))
fig_logloss.update_layout(
    title='Curva de Aprendizaje / Log-Loss',
    xaxis_title='Boosting Rounds',
    yaxis_title='Log-loss',
    template='plotly_dark'
)
fig_logloss.show()


In [None]:
# Model parameters

# Ruta al archivo de configuración usando pathlib
config_file_path = Path(settings.CONFIG_DIR) / 'config.toml'

# Cargar el archivo de configuración existente, o inicializar un diccionario vacío si no existe
if config_file_path.exists():
    with config_file_path.open('r') as file:
        config = toml.load(file)
else:
    config = {}
    
# Actualizamos (o agregamos) la sección 'pipeline-feature-selection'
final_params.pop('eta', None)
final_params['num_boosted_rounds'] = final_model.num_boosted_rounds()
config['xgb-tuned-params'] = final_params

# Escribimos los cambios en el archivo de configuración
with config_file_path.open('w') as file:
    toml.dump(config, file)

In [28]:
# Unimos los datos de entrenamiento y validación
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)

# Creamos el DMatrix final
dtrain_val = xgb.DMatrix(X_train_val, label=y_train_val)
hyper_params = toml.load((str(settings.CONFIG_DIR / 'config.toml')))['xgb-tuned-params']

final_model = xgb.train(
    hyper_params,
    dtrain_val,
    num_boost_round=best_trial.user_attrs['best_iteration'],
    early_stopping_rounds=None,
)

In [29]:
from sklearn.metrics import roc_auc_score, log_loss

# Crear DMatrix para train y test
dtrain_eval = xgb.DMatrix(X_train, label=y_train)
dval_eval = xgb.DMatrix(X_val, label=y_val)
dtest_eval  = xgb.DMatrix(X_test, label=y_test)

# Obtener las probabilidades predichas para cada conjunto
train_preds = final_model.predict(dtrain_eval)  # Para objetivo binario, retorna la probabilidad de la clase 1
val_preds  = final_model.predict(dval_eval)
test_preds  = final_model.predict(dtest_eval)

# Calcular las métricas
train_auc = roc_auc_score(y_train, train_preds)
train_logloss = log_loss(y_train, train_preds)

val_auc = roc_auc_score(y_val, val_preds)
val_logloss = log_loss(y_val, val_preds)

test_auc = roc_auc_score(y_test, test_preds)
test_logloss = log_loss(y_test, test_preds)

print(f'Train AUC: {train_auc:.4f}, Train Log-loss: {train_logloss:.4f}')
print(f'Val AUC: {val_auc:.4f}, Val Log-loss: {val_logloss:.4f}')
print(f'Test AUC: {test_auc:.4f}, Test Log-loss: {test_logloss:.4f}')

Train AUC: 0.8769, Train Log-loss: 0.4240
Val AUC: 0.8739, Val Log-loss: 0.4269
Test AUC: 0.8725, Test Log-loss: 0.4313


In [30]:
# ---------------------------
# Registrar el modelo final y la curva en MLflow
# ---------------------------
artifacts_dir = Path(settings.ART_DIR)
artifacts_dir.mkdir(parents=True, exist_ok=True)
final_model_file = artifacts_dir / 'final_model.json'
final_model.save_model(str(final_model_file))

with mlflow.start_run() as run:
    current_run = mlflow.active_run()
    print(f'Active run id is: {current_run.info.run_id}')
    print(f'Active run name is: {current_run.info.run_name}')
    
    mlflow.log_params(best_trial.params)
    mlflow.log_metric('best_val_auc', best_trial.value)
    mlflow.log_metric('test_auc', test_auc)
    mlflow.log_artifact(str(final_model_file), artifact_path='models')
    mlflow.end_run()

Active run id is: 865a4e3d92d848cf8fd3859016d49f3a
Active run name is: amusing-shrike-367


In [None]:
!mlflow ui --backend-store-uri "{settings.MLFLOW_TRACKING_URI}"