In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent.parent))

from config.config import settings
import pandas as pd
import joblib
import json
import toml

import warnings
warnings.simplefilter('ignore')

%load_ext autoreload
%autoreload 2

In [2]:
# Lectura del dataset
data_path = settings.DATA_DIR / 'processed/data_processed.parquet'
data = pd.read_parquet(path=str(data_path))

# Setear los ids como índices
data: pd.DataFrame = data.set_index('product_id')
data.sample(5, random_state=10)

Unnamed: 0_level_0,condition,state,city,local_pickup,free_shipping,shipping_mode,listing_type,buying_mode,attribute_group_id,attribute_group,...,status,accepts_mercadopago,currency,automatic_relist,title,stock_quantity,available_quantity,total_amount,date_difference_hr,time_difference_hr
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mla5501620002,used,capital federal,nuñez,True,False,not_specified,bronze,buy_it_now,,,...,active,True,ars,False,timbre inahalambrico,1,1,0.0,0.000833,1440.0
mla2357269269,used,buenos aires,avellaneda,True,False,not_specified,bronze,buy_it_now,dflt,otros,...,active,True,ars,False,lote de 2 cinturones. 1 nuevo con etiqueta.mic...,8,8,0.0,695.485278,1440.0
mla4505955642,used,buenos aires,acassuso,True,False,me2,bronze,buy_it_now,,,...,active,True,ars,False,revista instituto de historia del derecho rica...,3,3,0.0,0.000833,1440.0
mla7853937105,used,capital federal,retiro,True,False,not_specified,free,buy_it_now,,,...,active,True,ars,False,susan sontag - la enfermedad y sus metaforas -...,1,1,0.0,0.000833,1440.0
mla7813601724,new,capital federal,almagro,True,False,not_specified,silver,buy_it_now,,,...,active,True,ars,False,vendas cambric marca vendsur de 10cm x 3mt en ...,7,7,2010.0,0.000556,1440.0


In [3]:
# Separación de los conjuntos de datos
# ===================================================================================================================
from sklearn.model_selection import train_test_split

# Inicializar una semilla
SEED = 25

# Separamos los features y el target
X = data.loc[:, data.columns != 'condition'] # type: ignore
y = data.loc[:, data.columns == 'condition'].squeeze() # type: ignore

# Verificar que los índices coinciden
assert (X.index == y.index).all(), 'Los índices de X e y no coinciden'

# Dividir el conjunto original en 70% entrenamiento y 30% para pruebas y validación
X_train, X_temp, y_train, y_temp = train_test_split(
    X, 
    y, 
    test_size=0.4, 
    random_state=SEED, 
    stratify=y
)

# Luego, dividir el 30% restante en 20% para validación y 10% para pruebas
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=1/2, 
    random_state=SEED, 
    stratify=y_temp
)

# Categóricas
X_train['local_pickup'] = X_train['local_pickup'].astype('category')
X_val['local_pickup'] = X_val['local_pickup'].astype('category')
X_test['local_pickup'] = X_test['local_pickup'].astype('category')

In [4]:
from sklearn.preprocessing import LabelEncoder

# Codificación del target
le = LabelEncoder()
y_train = pd.Series(le.fit_transform(y_train), index=y_train.index)
y_val = pd.Series(le.transform(y_val), index=y_val.index)
y_test = pd.Series(le.transform(y_test), index=y_test.index)

In [5]:
from sklearn.pipeline import Pipeline

# Pipeline de procesamiento
pipe_feature_selection = joblib.load(f'{settings.SRC_DIR}/pipelines/pipeline_feature_selection.pkl')
pipe_feature_engineering = joblib.load(f'{settings.SRC_DIR}/pipelines/pipeline_feature_engineering.pkl')
pipe = Pipeline([
    ('feature_selection', pipe_feature_selection),
    ('feature_engineering', pipe_feature_engineering)
])

print(pipe.get_params())

{'memory': None, 'steps': [('feature_selection', Pipeline(steps=[('drop-features',
                 DropFeatures(features_to_drop=['attribute_group_id',
                                                'attribute_group',
                                                'attribute_id', 'title',
                                                'city'])),
                ('constant-features',
                 DropConstantFeatures(missing_values='ignore', tol=0.95,
                                      variables=['stock_quantity',
                                                 'available_quantity',
                                                 'total_amount',
                                                 'date_difference_hr',
                                                 'time_difference_hr', 'state',
                                                 'local_pickup',
                                                 'free_ship...
                                                 'listi

In [6]:
# Ejecución
pipe.fit(X_train, y_train)
X_train = pipe.transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(X_test)

In [7]:
X_train

Unnamed: 0_level_0,state,local_pickup,shipping_mode,listing_type,available_quantity,total_amount,date_difference_hr,time_difference_hr
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mla1354899716,2,0,1,2,7,7,0,2
mla2226746858,2,0,2,0,7,7,7,2
mla1838150894,0,1,2,2,4,7,7,2
mla4645501325,1,0,2,3,7,7,3,2
mla7355660646,1,0,1,2,7,5,7,2
...,...,...,...,...,...,...,...,...
mla4269481325,2,0,1,2,7,7,7,2
mla1905258844,1,0,2,3,7,7,2,2
mla7968471061,1,0,2,2,7,7,7,2
mla1578202997,1,0,1,2,5,7,7,2


In [7]:
import optuna
import mlflow
import mlflow.xgboost
import xgboost as xgb
import numpy as np
import plotly.graph_objects as go
from optuna.integration.mlflow import MLflowCallback

In [8]:
# --- Función para evaluar las métricas de xgb.cv ---
def eval_performance_metrics(cv_results: pd.DataFrame) -> tuple[int, float, float]:
    """
    Retorna:
      - best_iteration: número de iteración óptima según 'validation-auc'.
      - best_roc_auc: mejor AUC obtenido en validación.
      - best_logloss: log-loss en esa iteración.
    """
    best_iteration = int(cv_results['validation-auc-mean'].idxmax())
    best_roc_auc = float(cv_results.loc[best_iteration, 'validation-auc-mean'])
    best_logloss = float(cv_results.loc[best_iteration, 'validation-logloss-mean'])
    return best_iteration, best_roc_auc, best_logloss


In [24]:
# ---------------------------
# Función para ejecutar xgb.cv usando la API nativa (con GPU)
# ---------------------------
def run_experiment(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_val: pd.DataFrame,
    y_val: pd.Series,
    params: dict,
    num_boost_round: int,
    nfold: int,
    early_stopping_rounds: int = None
) -> pd.DataFrame:
    # Combinar training y validación
    X_cv = pd.concat([X_train, X_val], axis=0)
    y_cv = pd.concat([y_train, y_val], axis=0)
    
    # Vector de pesos basado en la frecuencia de cada clase
    weight_dict = y_cv.value_counts(normalize=True).to_dict()
    sample_weight = np.array([weight_dict[val] for val in y_cv])
    
    dtrain_cv = xgb.DMatrix(
        data=X_cv,
        label=y_cv,
        weight=sample_weight,
        # device='cuda'  # Usa 'gpu' para activar GPU (en XGBoost 2.1.4)
    )
    
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain_cv,
        num_boost_round=num_boost_round,
        nfold=nfold,
        stratified=True,
        early_stopping_rounds=early_stopping_rounds,
        metrics=['auc', 'logloss'],
        seed=params.get('seed', SEED),
        verbose_eval=False,
        device='cuda',
        n_jobs=-1
    )
    return cv_results

In [10]:
# Definición de parámetros fijos para el entrenamiento
num_boost_round = 1000
early_stopping_rounds = 50
default_params = toml.load(str(settings.CONFIG_DIR / 'config.toml'))['xgb-default-params']
SEED = 25

In [20]:
from xgboost.callback import LearningRateScheduler

def lr_scheduler(current_round: int, booster: xgb.Booster = None) -> float:
    base_lr = params.get('learning_rate', 0.1)
    if current_round < 200:
        return base_lr
    elif current_round < 400:
        return base_lr * 0.5
    else:
        return base_lr * 0.1

In [11]:
from xgboost.callback import TrainingCallback

class ReduceLROnPlateauCallback(TrainingCallback):
    def __init__(self, patience: int = 50, factor: float = 0.5, min_lr: float = 1e-5, verbose: bool = True, metric: str = "logloss", dataset: str = "validation"):
        """
        patience: Número de rondas sin mejora para reducir la tasa.
        factor: Factor por el cual se reduce la tasa (por ejemplo, 0.5 la reduce a la mitad).
        min_lr: Tasa de aprendizaje mínima permitida.
        verbose: Si True, imprime información en cada reducción.
        metric: Métrica a monitorear (por ejemplo, "logloss" o "auc").
        dataset: Nombre del conjunto en evals (por ejemplo, "validation").
        """
        self.patience = patience
        self.factor = factor
        self.min_lr = min_lr
        self.verbose = verbose
        self.metric = metric
        self.dataset = dataset
        
        self.best_score = None
        self.wait = 0

    def after_iteration(self, model: xgb.Booster, epoch: int, evals_log: dict) -> bool:
        # Verificamos que el evals_log contenga la métrica en el dataset deseado
        if self.dataset not in evals_log or self.metric not in evals_log[self.dataset]:
            return False  # no hacemos nada si no se encuentra la métrica
        
        # Obtenemos la métrica actual (asumimos que se monitorea al final de la iteración)
        current_score = evals_log[self.dataset][self.metric][-1]
        
        # Si se trata de log-loss, se quiere minimizar; si se tratara de AUC, se querría maximizar.
        # Aquí suponemos que se trata de log-loss, por lo que menor es mejor.
        if self.best_score is None or current_score < self.best_score:
            self.best_score = current_score
            self.wait = 0
        else:
            self.wait += 1

        if self.wait >= self.patience:
            # Obtener el learning rate actual; la API de Booster no tiene un getter directo, 
            # pero se puede obtener del diccionario de parámetros de modelo usando attributes() si se ha registrado.
            current_lr = float(model.attributes().get('learning_rate', 0.1))
            new_lr = max(current_lr * self.factor, self.min_lr)
            model.set_param('learning_rate', new_lr)
            if self.verbose:
                print(f"Round {epoch}: No improvement for {self.patience} rounds. Reducing learning rate from {current_lr} to {new_lr}.")
            self.wait = 0  # reiniciar el contador
        return False


In [18]:
# --- Función objetivo para Optuna ---
def objective(trial: optuna.Trial) -> float:
    # Sugerir hiperparámetros a tunear
    global params
    params = default_params.copy()
    params['max_depth'] = trial.suggest_int('max_depth', 3, 10)
    params['learning_rate'] = trial.suggest_loguniform('learning_rate', 0.001, 0.3)
    params['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 10)
    params['subsample'] = trial.suggest_uniform('subsample', 0.5, 1.0)
    params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0.5, 1.0)
    params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-8, 10.0)
    params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-8, 10.0)
        
    booster_type = trial.suggest_categorical('booster', ['gbtree', 'dart'])
    params['booster'] = booster_type
    
    # Crear DMatrix para entrenamiento y validación
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    evals = [(dtrain, 'train'), (dval, 'validation')]
    
    # callbacks = [LearningRateScheduler(lr_scheduler)]
    # callbacks = [ReduceLROnPlateauCallback(patience=50, factor=0.5, min_lr=1e-5, verbose=False, metric='logloss', dataset='validation')]
    callbacks = [LearningRateScheduler(lr_scheduler)]

    
    evals_result = {}
    # Entrenar modelo con early stopping
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=evals,
        early_stopping_rounds=early_stopping_rounds,
        callbacks=callbacks,
        evals_result=evals_result,
        verbose_eval=False
    )
    # La métrica que se optimiza es la AUC de validación
    best_iteration, best_roc_auc, best_logloss = eval_performance_metrics(
        pd.DataFrame({
            'train-auc-mean': evals_result['train']['auc'],
            'train-logloss-mean': evals_result['train']['logloss'],
            'validation-auc-mean': evals_result['validation']['auc'],
            'validation-logloss-mean': evals_result['validation']['logloss']
        })
    )
    trial.set_user_attr('best_iteration', best_iteration)
    trial.set_user_attr('best_logloss', best_logloss)
    
    return best_logloss

In [21]:
# Configuración de mlflow + optuna
mlflow.set_tracking_uri(settings.MLFLOW_TRACKING_URI)
mlflow.set_experiment('xgb_hyperparam_tuning')

mlflow_callback = MLflowCallback(
    tracking_uri=settings.MLFLOW_TRACKING_URI,
    create_experiment=False
)
    
study = optuna.create_study(
    direction='minimize'
)

study.optimize(
    objective, 
    n_trials=30, 
    callbacks=[mlflow_callback],
    n_jobs=-1,
    show_progress_bar=True
)

best_trial = study.best_trial
print(f'Mejores hiperparámetros: {best_trial.params}')
print(f'Mejor AUC en validación: {best_trial.value}')

[I 2025-02-11 21:19:54,230] A new study created in memory with name: no-name-6cdc4c0c-2524-4812-bb6b-565437fb8f13


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-02-11 21:20:33,447] Trial 0 finished with value: 0.43093716007609617 and parameters: {'max_depth': 9, 'learning_rate': 0.00844574278943192, 'min_child_weight': 1, 'subsample': 0.7765434881933713, 'colsample_bytree': 0.8766669800505946, 'reg_alpha': 4.614013104328108e-08, 'reg_lambda': 1.523166196768946e-07, 'booster': 'dart'}. Best is trial 0 with value: 0.43093716007609617.
[I 2025-02-11 21:20:33,612] Trial 1 finished with value: 0.43093716007609617 and parameters: {'max_depth': 10, 'learning_rate': 0.0011101855758650276, 'min_child_weight': 9, 'subsample': 0.6818721814737251, 'colsample_bytree': 0.7023170739347447, 'reg_alpha': 0.007532325617002501, 'reg_lambda': 0.0022998101025096342, 'booster': 'gbtree'}. Best is trial 0 with value: 0.43093716007609617.
[I 2025-02-11 21:20:33,660] Trial 6 finished with value: 0.43093716007609617 and parameters: {'max_depth': 4, 'learning_rate': 0.10404422734840903, 'min_child_weight': 5, 'subsample': 0.8229858376366215, 'colsample_bytree': 

In [25]:
# ---------------------------
# Entrenamiento final con los mejores hiperparámetros
# ---------------------------
# Combinar train y validación para entrenar el modelo final
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)
dtrain_val = xgb.DMatrix(X_train_val, label=y_train_val)
dtest = xgb.DMatrix(X_test, label=y_test)

final_params = default_params.copy()
final_params.update(best_trial.params)

evals_result = {}
final_model = xgb.train(
    final_params,
    dtrain_val,
    num_boost_round=num_boost_round,
    evals=[(dtrain_val, 'train'), (dtest, 'test')],
    early_stopping_rounds=early_stopping_rounds,
    evals_result=evals_result,
    verbose_eval=False,
)

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import roc_auc_score, log_loss

# Crear DMatrix para train y test
dtrain_eval = xgb.DMatrix(X_train, label=y_train)
dval_eval = xgb.DMatrix(X_val, label=y_val)
dtest_eval  = xgb.DMatrix(X_test, label=y_test)

# Obtener las probabilidades predichas para cada conjunto
train_preds = final_model.predict(dtrain_eval)  # Para objetivo binario, retorna la probabilidad de la clase 1
val_preds  = final_model.predict(dval_eval)
test_preds  = final_model.predict(dtest_eval)

# Calcular las métricas
train_auc = roc_auc_score(y_train, train_preds)
train_logloss = log_loss(y_train, train_preds)

val_auc = roc_auc_score(y_val, val_preds)
val_logloss = log_loss(y_val, val_preds)

test_auc = roc_auc_score(y_test, test_preds)
test_logloss = log_loss(y_test, test_preds)

print(f'Train AUC: {train_auc:.4f}, Train Log-loss: {train_logloss:.4f}')
print(f'Val AUC: {val_auc:.4f}, Val Log-loss: {val_logloss:.4f}')
print(f'Test AUC: {test_auc:.4f}, Test Log-loss: {test_logloss:.4f}')

In [None]:
# Model parameters

# Ruta al archivo de configuración usando pathlib
config_file_path = Path(settings.CONFIG_DIR) / 'config.toml'

# Cargar el archivo de configuración existente, o inicializar un diccionario vacío si no existe
if config_file_path.exists():
    with config_file_path.open('r') as file:
        config = toml.load(file)
else:
    config = {}
    
# Actualizamos (o agregamos) la sección 'pipeline-feature-selection'
config['xgb-tunned-params'] = study.best_params

# Escribimos los cambios en el archivo de configuración
with config_file_path.open('w') as file:
    toml.dump(config, file)

In [None]:
# ---------------------------
# Curvas de aprendizaje con Plotly
# ---------------------------
train_auc_curve = evals_result['train']['auc']
test_auc_curve = evals_result['test']['auc']
train_logloss_curve = evals_result['train']['logloss']
test_logloss_curve = evals_result['test']['logloss']

# Número de boosting rounds (asumiendo que todas las listas tienen la misma longitud)
rounds = list(range(1, len(train_auc_curve) + 1))

# --- Gráfico para AUC ---
fig_auc = go.Figure()
fig_auc.add_trace(go.Scatter(
    x=rounds,
    y=train_auc_curve,
    mode='lines+markers',
    name='Train AUC',
    line=dict(color='#da1322')
))
fig_auc.add_trace(go.Scatter(
    x=rounds,
    y=test_auc_curve,
    mode='lines+markers',
    name='Test AUC',
    line=dict(color='#19a042')
))
fig_auc.update_layout(
    title='Curva de Aprendizaje / AUC',
    xaxis_title='Boosting Rounds',
    yaxis_title='AUC',
    template='plotly_dark'
)
fig_auc.show()

# --- Gráfico para Log-loss ---
fig_logloss = go.Figure()
fig_logloss.add_trace(go.Scatter(
    x=rounds,
    y=train_logloss_curve,
    mode='lines+markers',
    name='Train Log-loss',
    line=dict(color='#da1322')
))
fig_logloss.add_trace(go.Scatter(
    x=rounds,
    y=test_logloss_curve,
    mode='lines+markers',
    name='Test Log-loss',
    line=dict(color='#19a042')
))
fig_logloss.update_layout(
    title='Curva de Aprendizaje / Log-Loss',
    xaxis_title='Boosting Rounds',
    yaxis_title='Log-loss',
    template='plotly_dark'
)
fig_logloss.show()

In [None]:
# ---------------------------
# Registrar el modelo final y la curva en MLflow
# ---------------------------
artifacts_dir = Path(settings.MODELS_DIR) / 'artifacts'
artifacts_dir.mkdir(parents=True, exist_ok=True)
final_model_file = artifacts_dir / 'final_model.json'
final_model.save_model(str(final_model_file))

with mlflow.start_run() as run:
    current_run = mlflow.active_run()
    print(f'Active run id is: {current_run.info.run_id}')
    print(f'Active run name is: {current_run.info.run_name}')
    
    mlflow.log_params(best_trial.params)
    mlflow.log_metric('best_val_auc', best_trial.value)
    mlflow.log_metric('test_auc', test_auc)
    mlflow.log_artifact(str(final_model_file), artifact_path='models')
    mlflow.end_run()

In [None]:
!mlflow ui --backend-store-uri "{settings.MLFLOW_TRACKING_URI}"