In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent.parent))

from src.utils.utils_fn import capture_variables, gather_variable_info

import warnings
warnings.simplefilter('ignore')

%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.pipeline import Pipeline
from feature_engine.imputation import RandomSampleImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.discretisation import GeometricWidthDiscretiser
import optuna
from optuna.integration.mlflow import MLflowCallback
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import joblib
import toml

from config.config import settings

In [3]:
# Project root: two levels above the notebook's working directory.
# (It is an ancestor of the cwd, so it always exists and needs no mkdir.)
root_path = Path.cwd().resolve().parent.parent

# Load the processed dataset
data = pd.read_parquet(
    path=str(root_path / 'data' / 'processed' / 'data_processed.parquet'),
)

# Use the product ids as the index
data: pd.DataFrame = data.set_index('product_id')
data.sample(5, random_state=10)

Unnamed: 0_level_0,condition,state,city,local_pickup,free_shipping,shipping_mode,listing_type,buying_mode,attribute_group_id,attribute_group,...,status,accepts_mercadopago,currency,automatic_relist,title,stock_quantity,available_quantity,total_amount,date_difference_hr,time_difference_hr
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mla5501620002,used,capital federal,nuñez,True,False,not_specified,bronze,buy_it_now,,,...,active,True,ars,False,timbre inahalambrico,1,1,0.0,0.000833,1440.0
mla2357269269,used,buenos aires,avellaneda,True,False,not_specified,bronze,buy_it_now,dflt,otros,...,active,True,ars,False,lote de 2 cinturones. 1 nuevo con etiqueta.mic...,8,8,0.0,695.485278,1440.0
mla4505955642,used,buenos aires,acassuso,True,False,me2,bronze,buy_it_now,,,...,active,True,ars,False,revista instituto de historia del derecho rica...,3,3,0.0,0.000833,1440.0
mla7853937105,used,capital federal,retiro,True,False,not_specified,free,buy_it_now,,,...,active,True,ars,False,susan sontag - la enfermedad y sus metaforas -...,1,1,0.0,0.000833,1440.0
mla7813601724,new,capital federal,almagro,True,False,not_specified,silver,buy_it_now,,,...,active,True,ars,False,vendas cambric marca vendsur de 10cm x 3mt en ...,7,7,2010.0,0.000556,1440.0


In [4]:
# Train / validation / test split
# ===================================================================================================================
from sklearn.model_selection import train_test_split

# Seed reused by the stochastic steps of the notebook
SEED = 25

# Separate the features from the target column
X = data.drop(columns='condition')
y = data['condition']

# Sanity check: features and target must stay aligned row by row
assert (X.index == y.index).all(), 'Los índices de X e y no coinciden'

# Hold out 40% of the rows; the remaining 60% becomes the training set
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=SEED,
    stratify=y
)

# Split the held-out 40% in half: 20% for validation and 20% for testing
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=1/2,
    random_state=SEED,
    stratify=y_temp
)

| Subconjunto   | Proporción | Descripción                                                 |
|---------------|------------|-------------------------------------------------------------|
| Entrenamiento | 60%        | Se usa para entrenar el modelo.                             |
| Validación    | 20%        | Se usa para afinar hiperparámetros y evaluar durante el ajuste. |
| Prueba        | 20%        | Se usa para evaluar el rendimiento final del modelo.        |

In [5]:
from sklearn.preprocessing import LabelEncoder

# Target encoding: learn the label mapping from the training target only,
# then apply that same mapping to every split while keeping the original index.
le = LabelEncoder().fit(y_train)
y_train, y_val, y_test = (
    pd.Series(le.transform(target_split), index=target_split.index)
    for target_split in (y_train, y_val, y_test)
)

In [6]:
# Group the training columns by variable type. The project helper prints a summary
# and returns four collections: continuous, categorical, discrete and temporal variables.
continuous, categoricals, discretes, temporaries = capture_variables(
    data=X_train
)

		Tipos de variables
Hay 5 variables continuas
Hay 0 variables discretas
Hay 0 variables temporales
Hay 15 variables categóricas


In [7]:
# Collect per-variable information for feature selection. The helper returns a dict of
# variable-name lists keyed by group (e.g. 'categoricals_more_than_threshold').
# NOTE(review): presumably missing_threshold is a missing-value fraction and
# cardinality_threshold a count of distinct values — confirm in src.utils.utils_fn.
features_selection = gather_variable_info(
    X=X_train,
    continuous=continuous,
    categoricals=categoricals,
    discretes=discretes,
    missing_threshold=0.05,       
    cardinality_threshold=5 
)

{
    "continuous_more_than_threshold": [],
    "continuous_less_than_threshold": [],
    "categoricals_more_than_threshold": [
        "attribute_group_id",
        "attribute_group",
        "attribute_id"
    ],
    "categoricals_less_than_threshold": [
        "state",
        "city"
    ],
    "categoricals_high_cardinality": [
        "state",
        "city",
        "listing_type",
        "attribute_group_id",
        "attribute_group",
        "attribute_id",
        "title"
    ],
    "categoricals_low_cardinality": [
        "local_pickup",
        "free_shipping",
        "shipping_mode",
        "buying_mode",
        "status",
        "accepts_mercadopago",
        "currency",
        "automatic_relist"
    ],
    "discretes_more_than_threshold": [],
    "discretes_less_than_threshold": [],
    "discretes_high_cardinality": [],
    "discretes_low_cardinality": []
}


In [8]:
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropCorrelatedFeatures
from feature_engine.preprocessing import MatchVariables, MatchCategories

high_cardinality_from_eda = ['title', 'city']
categoricals_more_than_threshold = features_selection['categoricals_more_than_threshold']

# Columns removed up front, and the columns that remain for the quasi-constant check
features_to_drop = categoricals_more_than_threshold + high_cardinality_from_eda
remaining_features = [var for var in continuous + categoricals if var not in features_to_drop]

# Feature-selection pipeline: each step is named so it can be inspected after fitting
selection_steps = [
    ('drop-features', DropFeatures(features_to_drop=features_to_drop)),
    ('constant-features', DropConstantFeatures(
        variables=remaining_features,
        missing_values='ignore',
        tol=0.95,
    )),
    ('match-features', MatchVariables(missing_values='ignore')),
    ('correlated-features', DropCorrelatedFeatures(
        variables=continuous,
        missing_values='ignore',
        method='pearson',
        threshold=0.8,
    )),
    ('match-categories', MatchCategories(missing_values='ignore')),
]
pipe = Pipeline(selection_steps)

# Fit on the training split only, then apply the fitted pipeline to every split
pipe.fit(X_train)
X_train, X_val, X_test = (
    pipe.transform(feature_split) for feature_split in (X_train, X_val, X_test)
)

In [9]:
# Export the fitted feature-selection pipeline
path = Path(settings.SRC_DIR) / 'pipelines'
# The target folder must exist before dumping; create it on a fresh checkout
path.mkdir(parents=True, exist_ok=True)
joblib.dump(pipe, str(path / 'pipeline_feature_selection.pkl'))

['/home/lynn/Documentos/Development/Scripts_and_Notebooks/Proyectos Profesionales/Pruebas técnicas/MELI/src/pipelines/pipeline_feature_selection.pkl']

In [10]:
# Feature Selection: persist the selected variable groups into config.toml

# Path to the configuration file
config_file_path = Path(settings.CONFIG_DIR) / 'config.toml'

# Keep only the non-empty groups and record the EDA-driven drops alongside them
features_selection = {'high_cardinality_from_eda': high_cardinality_from_eda} | {
    key: value for key, value in features_selection.items() if value
}

# Load the existing configuration, or start from an empty dict if the file is missing.
# TOML documents are UTF-8, so the encoding is explicit instead of locale-dependent.
if config_file_path.exists():
    with config_file_path.open('r', encoding='utf-8') as file:
        config = toml.load(file)
else:
    config = {}

# Update (or add) the 'pipeline-feature-selection' section
config['pipeline-feature-selection'] = features_selection

# Write the changes back to the configuration file
with config_file_path.open('w', encoding='utf-8') as file:
    toml.dump(config, file)

In [11]:
# Cast 'local_pickup' to a categorical dtype in every split.
# NOTE(review): presumably needed so the downstream categorical encoders accept it — confirm.
for feature_split in (X_train, X_val, X_test):
    feature_split['local_pickup'] = feature_split['local_pickup'].astype('category')

In [12]:
# Re-group the variables by type, now on the training split produced by feature selection.
# These lists (continuous, categoricals) are read later by build_pipeline.
continuous, categoricals, discretes, temporaries = capture_variables(
    data=X_train
)

		Tipos de variables
Hay 4 variables continuas
Hay 0 variables discretas
Hay 0 variables temporales
Hay 4 variables categóricas


In [13]:
# Collect per-variable information for feature engineering, using the same thresholds
# as in the feature-selection step. The resulting dict is read later by build_pipeline.
features_engineering = gather_variable_info(
    X=X_train,
    continuous=continuous,
    categoricals=categoricals,
    discretes=discretes,
    missing_threshold=0.05,       
    cardinality_threshold=5 
)

{
    "continuous_more_than_threshold": [],
    "continuous_less_than_threshold": [],
    "categoricals_more_than_threshold": [],
    "categoricals_less_than_threshold": [
        "state"
    ],
    "categoricals_high_cardinality": [
        "state",
        "listing_type"
    ],
    "categoricals_low_cardinality": [
        "local_pickup",
        "shipping_mode"
    ],
    "discretes_more_than_threshold": [],
    "discretes_less_than_threshold": [],
    "discretes_high_cardinality": [],
    "discretes_low_cardinality": []
}


In [14]:
# Feature Engineering: persist the variable groups used by the engineering pipeline

# Path to the configuration file
config_file_path = Path(settings.CONFIG_DIR) / 'config.toml'

# Keep only the non-empty groups.
# NOTE(review): build_pipeline indexes this dict by key, so a group that is empty here
# would raise KeyError there — confirm the required groups are always populated.
features_engineering = {key: value for key, value in features_engineering.items() if value}

# Load the existing configuration, or start from an empty dict if the file is missing.
# TOML documents are UTF-8, so the encoding is explicit instead of locale-dependent.
if config_file_path.exists():
    with config_file_path.open('r', encoding='utf-8') as file:
        config = toml.load(file)
else:
    config = {}

# Update (or add) the 'pipeline-feature-engineering' section
config['pipeline-feature-engineering'] = features_engineering

# Write the changes back to the configuration file
with config_file_path.open('w', encoding='utf-8') as file:
    toml.dump(config, file)

In [15]:
# Inspect the training features that remain after feature selection
X_train

Unnamed: 0_level_0,state,local_pickup,shipping_mode,listing_type,available_quantity,total_amount,date_difference_hr,time_difference_hr
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mla1354899716,buenos aires,True,me2,bronze,4,0.0,720.154444,1440.000000
mla2226746858,buenos aires,True,not_specified,gold,6,0.0,0.008611,1440.006944
mla1838150894,córdoba,False,not_specified,bronze,60,0.0,0.001111,1440.000000
mla4645501325,capital federal,True,not_specified,free,6,0.0,84.331389,1440.000000
mla7355660646,capital federal,True,me2,bronze,1,1350.0,0.000278,1440.000000
...,...,...,...,...,...,...,...,...
mla4269481325,buenos aires,True,me2,bronze,5,0.0,0.018611,1440.000000
mla1905258844,capital federal,True,not_specified,free,3,0.0,3.771389,1440.000000
mla7968471061,capital federal,True,not_specified,bronze,2,0.0,0.000000,1440.000000
mla1578202997,capital federal,True,me2,bronze,20,0.0,0.008889,1440.006944


---

In [16]:
# -----------------------------
# Funciones necesarias
# -----------------------------

def eval_performance_metrics(cv_results: pd.DataFrame) -> tuple[int, float, float]:
    """
    Summarise an xgb.cv result table by its best round.

    The best round is the index label of the row with the highest 'test-auc-mean'
    (the first such row on ties).

    Returns a tuple with:
      - best_iteration: index label of the best row, as an int.
      - best_roc_auc: 'test-auc-mean' at that row.
      - best_logloss: 'test-logloss-mean' at that same row.
    """
    auc_by_round = cv_results['test-auc-mean']
    logloss_by_round = cv_results['test-logloss-mean']

    best_round = int(auc_by_round.idxmax())
    return (
        best_round,
        float(auc_by_round.loc[best_round]),
        float(logloss_by_round.loc[best_round]),
    )

In [17]:
def run_experiment(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_val: pd.DataFrame,
    y_val: pd.Series,
    params: dict,
    num_boost_round: int,
    nfold: int,
    early_stopping_rounds: int = None
) -> pd.DataFrame:
    """
    Run stratified k-fold cross-validation with the native XGBoost API (xgb.cv).

    The training and validation splits are concatenated and cross-validated together.
    Every row is weighted by the relative frequency of its class.

    Parameters:
      - X_train, y_train: training features and target.
      - X_val, y_val: validation features and target, appended to the training rows.
      - params: XGBoost parameters, passed through unchanged (the device, if any, is
        whatever the caller put there). Its 'seed' entry, when present, seeds the fold
        split; otherwise the notebook-level SEED is used.
      - num_boost_round: maximum number of boosting rounds.
      - nfold: number of cross-validation folds.
      - early_stopping_rounds: forwarded to xgb.cv; None (the default) passes no value.

    Returns:
      The xgb.cv result table, evaluated with the 'auc' and 'logloss' metrics.
    """
    X_cv: pd.DataFrame = pd.concat([X_train, X_val], axis=0)
    y_cv: pd.Series = pd.concat([y_train, y_val], axis=0)

    # NOTE(review): each row is weighted by its class *frequency*, which gives the majority
    # class more weight. If the goal is class balancing, the inverse frequency is the usual
    # choice — confirm the intent before changing it, since it alters every recorded metric.
    weight_dict = y_cv.value_counts(normalize=True).to_dict()
    # Vectorised lookup instead of a per-row Python loop; same float weights, same order
    sample_weight: np.ndarray = y_cv.map(weight_dict).to_numpy()

    dtrain_cv: xgb.DMatrix = xgb.DMatrix(
        data=X_cv,
        label=y_cv,
        weight=sample_weight
    )

    cv_results: pd.DataFrame = xgb.cv( # type: ignore
        params=params,
        dtrain=dtrain_cv,
        num_boost_round=num_boost_round,
        nfold=nfold,
        stratified=True,
        early_stopping_rounds=early_stopping_rounds,
        metrics=['auc', 'logloss'],
        seed=params.get('seed', SEED),
        verbose_eval=False
    )
    return cv_results

In [18]:
def build_pipeline(n_cat_high: int, n_cat_low: int, bins: int) -> Pipeline:
    """
    Assemble the (unfitted) feature-engineering pipeline for one configuration.

    Steps, in order:
      1. 'imputer': RandomSampleImputer on features_engineering['categoricals_less_than_threshold'].
      2. 'rare_high': RareLabelEncoder (tol=0.05, n_categories=n_cat_high) on
         features_engineering['categoricals_high_cardinality'].
      3. 'rare_low': RareLabelEncoder (tol=0.05, n_categories=n_cat_low) on the fixed
         variable 'shipping_mode' only.
      4. 'discretiser': GeometricWidthDiscretiser with `bins` bins on the continuous
         variables, returning object-typed bins.
      5. 'encoder': OrdinalEncoder ('ordered') on continuous + categoricals.

    The variable lists (features_engineering, continuous, categoricals) and SEED are read
    from the notebook's global state.
    """
    steps = [
        ('imputer', RandomSampleImputer(
            variables=features_engineering['categoricals_less_than_threshold'],
            random_state=SEED,
        )),
        ('rare_high', RareLabelEncoder(
            tol=0.05,
            n_categories=n_cat_high,
            variables=features_engineering['categoricals_high_cardinality'],
        )),
        # Low-cardinality grouping is applied to one fixed variable
        ('rare_low', RareLabelEncoder(
            tol=0.05,
            n_categories=n_cat_low,
            variables=['shipping_mode'],
        )),
        ('discretiser', GeometricWidthDiscretiser(
            variables=continuous,
            bins=bins,
            return_object=True,
        )),
        ('encoder', OrdinalEncoder(
            variables=continuous + categoricals,
            encoding_method='ordered',
        )),
    ]
    return Pipeline(steps)

In [19]:
def run_pipeline_experiment(
    pipeline_params: tuple[int, int, int],
    params: dict,
    num_boost_round: int,
    nfold: int,
    early_stopping_rounds: int
) -> tuple[int, float, float]:
    """
    Evaluate one feature-engineering configuration with cross-validated XGBoost.

    Builds the pipeline for (n_cat_high, n_cat_low, bins), fits it on the notebook-level
    X_train / y_train, transforms X_train and X_val, and scores the result with
    run_experiment.

    Parameters:
      - pipeline_params: (n_cat_high, n_cat_low, bins) forwarded to build_pipeline.
      - params, num_boost_round, nfold, early_stopping_rounds: forwarded to run_experiment.

    Returns:
      (best_iteration, best_roc_auc, best_logloss) as computed by eval_performance_metrics.
    """
    n_cat_high, n_cat_low, bins = pipeline_params
    pipe = build_pipeline(n_cat_high, n_cat_low, bins)

    # NOTE(review): the pipeline is fitted with y_train before cross-validation, so any
    # target-dependent step has already seen the labels of rows that later land in the
    # CV test folds — confirm this is acceptable for comparing configurations.
    pipe.fit(X_train, y_train)
    X_train_transformed: pd.DataFrame = pipe.transform(X_train)
    X_val_transformed: pd.DataFrame = pipe.transform(X_val)

    cv_results: pd.DataFrame = run_experiment(
        X_train=X_train_transformed,
        y_train=y_train,
        X_val=X_val_transformed,
        y_val=y_val,
        params=params,
        num_boost_round=num_boost_round,
        nfold=nfold,
        early_stopping_rounds=early_stopping_rounds
    )
    best_iteration, best_roc_auc, best_logloss = eval_performance_metrics(cv_results)
    return best_iteration, best_roc_auc, best_logloss


In [20]:
# -----------------------------
# Función objetivo para Optuna (integrada con MLflow a través de callback)
# -----------------------------
def objective(trial: optuna.Trial) -> float:
    """
    Optuna objective: score one feature-engineering configuration by its ROC-AUC.

    Samples n_cat_high and n_cat_low in [2, 5] and bins in [5, 10], evaluates the
    configuration with run_pipeline_experiment, stores the best iteration and log-loss
    as trial user attributes, and returns the ROC-AUC to be maximised.

    The XGBoost settings (params, num_boost_round, nfold, early_stopping_rounds) are read
    from module-level globals.
    """
    # (name, low, high) for each integer hyperparameter, sampled in this order
    search_space = (
        ('n_cat_high', 2, 5),
        ('n_cat_low', 2, 5),
        ('bins', 5, 10),
    )
    pipeline_config = tuple(
        trial.suggest_int(name, low, high) for name, low, high in search_space
    )

    iteration, roc_auc, logloss = run_pipeline_experiment( # type: ignore
        pipeline_params=pipeline_config,
        params=params,
        num_boost_round=num_boost_round,
        nfold=nfold,
        early_stopping_rounds=early_stopping_rounds
    )

    # Extra information kept on the trial for later inspection
    for attr_name, attr_value in (('best_iteration', iteration), ('best_logloss', logloss)):
        trial.set_user_attr(attr_name, attr_value)

    return roc_auc

In [22]:
# -----------------------------
# Función principal: optimización bayesiana y registro en MLflow
# -----------------------------
def main() -> None:
    """
    Tune the feature-engineering pipeline with Optuna and record the outcome in MLflow.

    Steps:
      1. Create the output folders and point MLflow at the tracking URI and experiment.
      2. Publish the fixed XGBoost settings as module-level globals; `objective` reads
         them, and `params` is also exported to config.toml by a later cell.
      3. Run 50 Optuna trials that maximise the cross-validated ROC-AUC, with each trial
         reported through MLflowCallback.
      4. Refit the best configuration on X_train / y_train and dump it to
         <SRC_DIR>/pipelines/pipeline_feature_engineering.pkl.
      5. Log the best parameters, the best ROC-AUC and the pickled pipeline in a final run.
    """
    # Create the required folders (wrapped in Path, like the other settings paths)
    Path(settings.EXPERIMENTS_DIR).mkdir(parents=True, exist_ok=True)
    pipeline_dir: Path = Path(settings.SRC_DIR) / 'pipelines'
    pipeline_dir.mkdir(parents=True, exist_ok=True)

    mlflow.set_tracking_uri(settings.MLFLOW_TRACKING_URI)
    mlflow.set_experiment('feature_engineering_experiment')

    # Fixed XGBoost settings, shared with `objective` through module-level globals
    global params, num_boost_round, nfold, early_stopping_rounds
    params = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'eval_metric': ['auc', 'logloss'],
        'seed': SEED,
        'device': 'cuda'  # Run on the GPU through the native API
    }
    num_boost_round = 300
    nfold = 10
    early_stopping_rounds = 10

    # MLflow callback for Optuna; it reuses the experiment selected above
    mlflow_callback = MLflowCallback(
        tracking_uri=settings.MLFLOW_TRACKING_URI,
        create_experiment=False
    )

    # Seed the sampler so the search uses the same seed as the rest of the notebook.
    # NOTE(review): trials run in parallel (n_jobs=-1), so the seed alone does not make
    # the study fully reproducible; use n_jobs=1 if exact reproducibility is required.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )

    study.optimize(
        func=objective,
        n_trials=50,
        callbacks=[mlflow_callback],
        n_jobs=-1,
        show_progress_bar=True
    )

    best_trial = study.best_trial
    print(f'La mejor configuración es: {best_trial.params} con ROC-AUC = {best_trial.value}')

    # Rebuild the pipeline with the best configuration and fit it on X_train
    best_pipeline = build_pipeline(
        best_trial.params['n_cat_high'],
        best_trial.params['n_cat_low'],
        best_trial.params['bins']
    )
    best_pipeline.fit(X_train, y_train)

    # Save the fitted pipeline as a .pkl file under <SRC_DIR>/pipelines
    best_pipeline_file: Path = pipeline_dir / 'pipeline_feature_engineering.pkl'
    joblib.dump(best_pipeline, str(best_pipeline_file))

    # Record the best configuration in a final MLflow run. The context manager ends the
    # run on exit, so no explicit mlflow.end_run() is needed.
    with mlflow.start_run() as final_run:
        print(f'Active run id is: {final_run.info.run_id}')
        print(f'Active run name is: {final_run.info.run_name}')
        mlflow.log_params(best_trial.params)
        mlflow.log_metric('best_roc_auc', best_trial.value)
        mlflow.log_artifact(str(best_pipeline_file), artifact_path='artifacts')

if __name__ == '__main__':
    main()

2025/02/10 21:56:50 INFO mlflow.tracking.fluent: Experiment with name 'feature_engineering_experiment' does not exist. Creating a new experiment.
[I 2025-02-10 21:56:50,798] A new study created in memory with name: no-name-1da3b0d1-520c-4be0-9417-f198500b9ef9


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-02-10 21:57:47,896] Trial 8 finished with value: 0.8605756741345433 and parameters: {'n_cat_high': 3, 'n_cat_low': 3, 'bins': 5}. Best is trial 8 with value: 0.8605756741345433.
[I 2025-02-10 21:57:52,639] Trial 16 finished with value: 0.8735026345906084 and parameters: {'n_cat_high': 3, 'n_cat_low': 4, 'bins': 9}. Best is trial 16 with value: 0.8735026345906084.
[I 2025-02-10 21:57:52,755] Trial 6 finished with value: 0.8735026345906084 and parameters: {'n_cat_high': 3, 'n_cat_low': 5, 'bins': 9}. Best is trial 16 with value: 0.8735026345906084.
[I 2025-02-10 21:57:52,772] Trial 1 finished with value: 0.8735026345906084 and parameters: {'n_cat_high': 4, 'n_cat_low': 4, 'bins': 9}. Best is trial 16 with value: 0.8735026345906084.
[I 2025-02-10 21:57:52,955] Trial 20 finished with value: 0.8735026345906084 and parameters: {'n_cat_high': 5, 'n_cat_low': 5, 'bins': 9}. Best is trial 16 with value: 0.8735026345906084.
[I 2025-02-10 21:57:53,102] Trial 21 finished with value: 0.8735

In [23]:
# Model parameters: persist the fixed XGBoost parameters into config.toml.
# `params` is published as a global by main(), so main() must have run before this cell.

# Path to the configuration file
config_file_path = Path(settings.CONFIG_DIR) / 'config.toml'

# Load the existing configuration, or start from an empty dict if the file is missing.
# TOML documents are UTF-8, so the encoding is explicit instead of locale-dependent.
if config_file_path.exists():
    with config_file_path.open('r', encoding='utf-8') as file:
        config = toml.load(file)
else:
    config = {}

# Update (or add) the 'xgb-params' section
config['xgb-params'] = params

# Write the changes back to the configuration file
with config_file_path.open('w', encoding='utf-8') as file:
    toml.dump(config, file)

In [24]:
!mlflow ui --backend-store-uri "{settings.MLFLOW_TRACKING_URI}"

[2025-02-10 21:59:28 -0500] [230788] [INFO] Starting gunicorn 23.0.0
[2025-02-10 21:59:28 -0500] [230788] [INFO] Listening at: http://127.0.0.1:5000 (230788)
[2025-02-10 21:59:28 -0500] [230788] [INFO] Using worker: sync
[2025-02-10 21:59:28 -0500] [230789] [INFO] Booting worker with pid: 230789
[2025-02-10 21:59:28 -0500] [230790] [INFO] Booting worker with pid: 230790
[2025-02-10 21:59:28 -0500] [230791] [INFO] Booting worker with pid: 230791
[2025-02-10 21:59:28 -0500] [230792] [INFO] Booting worker with pid: 230792
[2025-02-10 22:00:12 -0500] [230788] [INFO] Handling signal: int
^C

Aborted!
[2025-02-10 22:00:13 -0500] [230790] [INFO] Worker exiting (pid: 230790)
[2025-02-10 22:00:13 -0500] [230789] [INFO] Worker exiting (pid: 230789)
[2025-02-10 22:00:13 -0500] [230791] [INFO] Worker exiting (pid: 230791)
[2025-02-10 22:00:13 -0500] [230792] [INFO] Worker exiting (pid: 230792)
[2025-02-10 22:00:13 -0500] [230788] [ERROR] Worker (pid:230791) was sent SIGHUP!
