In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent.parent))

import polars as pl
import pandas as pd
import numpy as np
from src.utils.utils_fn import capture_variables, gather_variable_info

import warnings
warnings.simplefilter('ignore')

%load_ext autoreload
%autoreload 2

In [2]:
# Definir la ruta absoluta para la carpeta de pipelines
root_path = Path.cwd().resolve().parent.parent

# Crear el directorio si no existe
root_path.mkdir(parents=True, exist_ok=True)

# Lectura del dataset
data = pd.read_parquet(
    path=str(root_path / 'data/processed/data_processed.parquet'), 
)

# Setear los ids como índices
data: pd.DataFrame = data.set_index('product_id')
data.sample(5, random_state=10)

Unnamed: 0_level_0,condition,state,city,local_pickup,free_shipping,shipping_mode,listing_type,buying_mode,attribute_group_id,attribute_group,...,status,accepts_mercadopago,currency,automatic_relist,title,stock_quantity,available_quantity,total_amount,date_difference_hr,time_difference_hr
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mla5501620002,used,capital federal,nuñez,True,False,not_specified,bronze,buy_it_now,,,...,active,True,ars,False,timbre inahalambrico,1,1,0.0,0.000833,1440.0
mla2357269269,used,buenos aires,avellaneda,True,False,not_specified,bronze,buy_it_now,dflt,otros,...,active,True,ars,False,lote de 2 cinturones. 1 nuevo con etiqueta.mic...,8,8,0.0,695.485278,1440.0
mla4505955642,used,buenos aires,acassuso,True,False,me2,bronze,buy_it_now,,,...,active,True,ars,False,revista instituto de historia del derecho rica...,3,3,0.0,0.000833,1440.0
mla7853937105,used,capital federal,retiro,True,False,not_specified,free,buy_it_now,,,...,active,True,ars,False,susan sontag - la enfermedad y sus metaforas -...,1,1,0.0,0.000833,1440.0
mla7813601724,new,capital federal,almagro,True,False,not_specified,silver,buy_it_now,,,...,active,True,ars,False,vendas cambric marca vendsur de 10cm x 3mt en ...,7,7,2010.0,0.000556,1440.0


In [3]:
# Separación de los conjuntos de datos
# ===================================================================================================================
from sklearn.model_selection import train_test_split

# Inicializar una semilla
SEED = 25

# Separamos los features y el target
X = data.loc[:, data.columns != 'condition'] # type: ignore
y = data.loc[:, data.columns == 'condition'].squeeze() # type: ignore

# Verificar que los índices coinciden
assert (X.index == y.index).all(), 'Los índices de X e y no coinciden'

# Dividir el conjunto original en 70% entrenamiento y 30% para pruebas y validación
X_train, X_temp, y_train, y_temp = train_test_split(
    X, 
    y, 
    test_size=0.4, 
    random_state=SEED, 
    stratify=y
)

# Luego, dividir el 30% restante en 20% para validación y 10% para pruebas
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=1/2, 
    random_state=SEED, 
    stratify=y_temp
)

| Subconjunto   | Proporción | Descripción                                                 |
|---------------|------------|-------------------------------------------------------------|
| Entrenamiento | 60%        | Se usa para entrenar el modelo.                             |
| Validación    | 20%        | Se usa para afinar hiperparámetros y evaluar durante el ajuste. |
| Prueba        | 20%        | Se usa para evaluar el rendimiento final del modelo.        |

In [4]:
from sklearn.preprocessing import LabelEncoder

# Codificación del target
le = LabelEncoder()
y_train = pd.Series(le.fit_transform(y_train), index=y_train.index)
y_val = pd.Series(le.fit_transform(y_val), index=y_val.index)
y_test = pd.Series(le.fit_transform(y_test), index=y_test.index)

In [5]:
# Función para capturar los tipos de variables
continuous, categoricals, discretes, temporaries = capture_variables(
    data=X_train
)

		Tipos de variables
Hay 5 variables continuas
Hay 0 variables discretas
Hay 0 variables temporales
Hay 15 variables categóricas


In [6]:
# Capturar información de las variables
info = gather_variable_info(
    X=X_train,
    continuous=continuous,
    categoricals=categoricals,
    discretes=discretes,
    missing_threshold=0.05,       
    cardinality_threshold=5 
)

{
    "continuous_more_than_threshold": [],
    "continuous_less_than_threshold": [],
    "categoricals_more_than_threshold": [
        "attribute_group_id",
        "attribute_group",
        "attribute_id"
    ],
    "categoricals_less_than_threshold": [
        "state",
        "city"
    ],
    "categoricals_high_cardinality": [
        "state",
        "city",
        "listing_type",
        "attribute_group_id",
        "attribute_group",
        "attribute_id",
        "title"
    ],
    "categoricals_low_cardinality": [
        "local_pickup",
        "free_shipping",
        "shipping_mode",
        "buying_mode",
        "status",
        "accepts_mercadopago",
        "currency",
        "automatic_relist"
    ],
    "discretes_more_than_threshold": [],
    "discretes_less_than_threshold": [],
    "discretes_high_cardinality": [],
    "discretes_low_cardinality": []
}


In [7]:
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropFeatures
from feature_engine.selection import DropConstantFeatures
from feature_engine.selection import DropCorrelatedFeatures
from feature_engine.preprocessing import MatchVariables, MatchCategories

high_cardinality = ['title', 'city']
categoricals_more_than_threshold = info['categoricals_more_than_threshold']

# Pipeline de procesadores
pipe = Pipeline([
    ('drop-features', DropFeatures(features_to_drop=categoricals_more_than_threshold + high_cardinality)),
    ('constant-features', DropConstantFeatures(variables=[var for var in continuous + categoricals if var not in categoricals_more_than_threshold + high_cardinality], 
                                               missing_values='ignore',
                                               tol=0.95)),
    ('match-features', MatchVariables(missing_values='ignore')),
    ('correlated-features', DropCorrelatedFeatures(variables=continuous,
                                                   missing_values='ignore',
                                                   method='pearson',
                                                   threshold=0.8)),
    ('match-categories', MatchCategories(missing_values='ignore')),
])

pipe.fit(X_train)
X_train = pipe.transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(X_test)

In [8]:
X_train['local_pickup'] = X_train['local_pickup'].astype('category')
X_val['local_pickup'] = X_val['local_pickup'].astype('category')
X_test['local_pickup'] = X_test['local_pickup'].astype('category')

In [9]:
# Función para capturar los tipos de variables
continuous, categoricals, discretes, temporaries = capture_variables(
    data=X_train
)

		Tipos de variables
Hay 4 variables continuas
Hay 0 variables discretas
Hay 0 variables temporales
Hay 4 variables categóricas


In [10]:
# Capturar información de las variables
info = gather_variable_info(
    X=X_train,
    continuous=continuous,
    categoricals=categoricals,
    discretes=discretes,
    missing_threshold=0.05,       
    cardinality_threshold=5 
)

{
    "continuous_more_than_threshold": [],
    "continuous_less_than_threshold": [],
    "categoricals_more_than_threshold": [],
    "categoricals_less_than_threshold": [
        "state"
    ],
    "categoricals_high_cardinality": [
        "state",
        "listing_type"
    ],
    "categoricals_low_cardinality": [
        "local_pickup",
        "shipping_mode"
    ],
    "discretes_more_than_threshold": [],
    "discretes_less_than_threshold": [],
    "discretes_high_cardinality": [],
    "discretes_low_cardinality": []
}


In [11]:
from feature_engine.imputation import RandomSampleImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder
from feature_engine.discretisation import GeometricWidthDiscretiser
from sklearn.preprocessing import KBinsDiscretizer
from feature_engine.encoding import OrdinalEncoder

rare_labels_high_cardinality = info['categoricals_high_cardinality']
rare_labels_low_cardinality = ['shipping_mode']

# Pipeline de procesadores
pipe = Pipeline([
    ('random-imputer', RandomSampleImputer(
        variables=info['categoricals_less_than_threshold'],
        random_state=SEED
        )),
    ('rare-labels-high-cardinality', RareLabelEncoder(
        tol=0.05, 
        n_categories=3, # Probar entre 2, 3, 4 o 5
        variables=rare_labels_high_cardinality
        )),
    ('rare-labels-low-cardinality', RareLabelEncoder(
        tol=0.05, 
        n_categories=2, # Probar entre 2, 3, 4 o 5
        variables=rare_labels_low_cardinality
        )),
    ('discretiser', GeometricWidthDiscretiser(
        variables=continuous,
        bins=10, # Probar 5 - 10
        return_object=True
        )),
    ('encoder', OrdinalEncoder(
        variables=continuous+categoricals,
        encoding_method='ordered'
        )),
])

pipe.fit(X_train, y_train)
X_train = pipe.transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(X_test)

In [14]:
import mlflow
import mlflow.xgboost
import xgboost as xgb
from config.config import settings


def eval_performance_metrics(cv_results: pd.DataFrame):
    """
    Evalúa los resultados de la validación cruzada y devuelve:
      - La iteración óptima (según el mejor test-auc-mean).
      - El valor de ROC-AUC en esa iteración.
      - El valor de log-loss en esa iteración.

    Parámetros:
        cv_results (pd.DataFrame): Resultados devueltos por xgb.cv, que incluye columnas como
                                   'test-auc-mean' y 'test-logloss-mean'.

    Retorna:
        best_iteration (int): Número de iteración óptima.
        best_roc_auc (float): Valor de ROC-AUC en la iteración óptima.
        best_logloss (float): Valor de log-loss en la iteración óptima.
    """
    # Se asume que se desea maximizar el test-auc-mean.
    best_iteration = cv_results['test-auc-mean'].idxmax()
    best_roc_auc = cv_results.loc[best_iteration, 'test-auc-mean']
    best_logloss = cv_results.loc[best_iteration, 'test-logloss-mean']
    return best_iteration, best_roc_auc, best_logloss

In [15]:
# Función para hacer una ejecución
def run_experiment(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_val: pd.DataFrame,
    y_val: pd.Series,
    params: dict, 
    num_boost_round: int, 
    nfold: int, 
    early_stopping_rounds: int = None
):
    """
    Ejecuta la validación cruzada usando xgb.cv.

    Parámetros:
        data (dict): Diccionario con las llaves 'X' e 'y' conteniendo los datos de entrada y etiquetas.
        params (dict): Parámetros para el modelo XGBoost.
        num_boost_round (int): Número máximo de iteraciones (boosting rounds).
        nfold (int): Número de folds para la validación cruzada.
        early_stopping_rounds (int, opcional): Número de rondas sin mejora para detener anticipadamente.

    Retorna:
        cv_results (pd.DataFrame): DataFrame con las métricas agregadas en cada iteración.
    """
    
    # Combinar train y val para la validación cruzada
    X_cv = pd.concat([X_train, X_val], axis=0)
    y_cv = pd.concat([y_train, y_val], axis=0)
    dtrain_cv = xgb.DMatrix(data=X_cv, label=y_cv)
    
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain_cv,
        num_boost_round=num_boost_round,
        nfold=nfold,
        stratified=True,
        early_stopping_rounds=early_stopping_rounds,
        metrics=['auc', 'logloss'],
        seed=SEED,
        verbose_eval=False
    )
    return cv_results

In [None]:
def main():
    # Definir parámetros para XGBoost
    params = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'eval_metric': ['auc', 'logloss'], # Se pueden especificar múltiples métricas; la primera se usará para early stopping
        'seed': SEED
    }
    
    num_boost_round = 400
    nfold = 10
    early_stopping_rounds = 10
    experiment_name = 'feature_engineering_experiment'
    
    # Luego, recrea la carpeta
    settings.EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
    
    # Iniciar experimento en mlflow
    mlflow.set_tracking_uri(uri=settings.MLFLOW_TRACKING_URI)
    mlflow.set_experiment(experiment_name)
    with mlflow.start_run():
        
        current_run = mlflow.active_run()
        print(f'Active run ID: {current_run.info.run_id}') # type: ignore
        print(f'Active run name: {current_run.info.run_name}') # type: ignore
        
        # Registrar parámetros en mlflow (convirtiendo la lista a string para mayor claridad)
        mlflow.log_params({
            'eval_metric': str(params['eval_metric']),
            'num_boost_round': num_boost_round,
            'nfold': nfold,
            'early_stopping_rounds': early_stopping_rounds
        })

        # Ejecutar la validación cruzada
        cv_results = run_experiment(
            X_train=X_train, 
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            params=params,
            num_boost_round=num_boost_round,
            nfold=nfold,
            early_stopping_rounds=early_stopping_rounds
        )
        print('Resultados de CV:')
        print(cv_results)

        # Evaluar las métricas obtenidas
        best_iteration, best_roc_auc, best_logloss = eval_performance_metrics(cv_results)
        print(f'Mejor iteración: {best_iteration}')
        print(f'Mejor ROC-AUC: {best_roc_auc}')
        print(f'Mejor Log-loss: {best_logloss}')

        # Registrar las métricas en mlflow
        metrics = {
            'best_iteration': best_iteration,
            'best_roc_auc': best_roc_auc,
            'best_logloss': best_logloss
        }
        
        # Registrar métricas en mlflow
        mlflow.log_metrics(metrics)

        # Opcional: Entrenar un modelo final con el número óptimo de iteraciones y registrarlo en mlflow
        dtrain = xgb.DMatrix(data=X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        final_model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=best_iteration,
            evals=[(dtrain, 'train'), (dval, 'val')]
        )
        
        mlflow.xgboost.log_model(final_model, artifact_path='artifacts/best_model')
        mlflow.end_run()
        
if __name__ == '__main__':
    main()


--- 
### DIOS Y SE PUEDA

In [12]:
from __future__ import annotations  # Para compatibilidad con anotaciones futuras (opcional)
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from feature_engine.imputation import RandomSampleImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.discretisation import GeometricWidthDiscretiser
from sklearn.preprocessing import KBinsDiscretizer
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import concurrent.futures

# --- 1. Custom wrapper para KBinsDiscretizer ---
class CustomKBinsDiscretiser(BaseEstimator, TransformerMixin):
    def __init__(self, variables: list[str], n_bins: int = 5, strategy: str = 'uniform') -> None:
        self.variables = variables
        self.n_bins = n_bins
        self.strategy = strategy
        self.kbins = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy)
    
    def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> CustomKBinsDiscretiser:
        self.kbins.fit(X[self.variables])
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X = X.copy()
        X[self.variables] = self.kbins.transform(X[self.variables])
        return X

In [13]:
# --- 2. Función para construir la pipeline de forma dinámica ---
def build_pipeline(
    dynamic_params: dict[str, dict[str, any]],
    info: dict[str, any],
    continuous: list[str],
    categoricals: list[str],
    rare_labels_high_cardinality: list[str],
    rare_labels_low_cardinality: list[str],
    seed: int
    ) -> Pipeline:
    """
    Construye una pipeline de preprocesamiento con pasos:
      1. Imputación (RandomSampleImputer o CategoricalImputer)
      2. RareLabelEncoder para variables de alta y baja cardinalidad
      3. Discretización (CustomKBinsDiscretiser o GeometricWidthDiscretiser)
      4. Codificación (OrdinalEncoder)
    Los parámetros de cada transformador se pasan de forma dinámica a través de `dynamic_params`.
    """
    # Paso 1: Imputación
    imputer_info: dict[str, any] = dynamic_params.get('imputer', {})
    imputer_type: str = imputer_info.get('type', 'random')
    imputer_params: dict[str, any] = imputer_info.get('params', {})
    imputer_params.setdefault('variables', info['categoricals_less_than_threshold'])
    if imputer_type == 'random':
        imputer: BaseEstimator = RandomSampleImputer(**imputer_params, random_state=seed)
    elif imputer_type == "mode":
        # Para el imputador categórico:
        # 1. Se usa imputation_method="frequent" (valor aceptado por la librería)
        # 2. Se fuerza a que las variables sean de tipo object, con un transformer que también imprime los dtypes para debug.
        from sklearn.preprocessing import FunctionTransformer
        from sklearn.pipeline import make_pipeline

        # Eliminar cualquier parámetro conflictivo que se haya podido pasar
        imputer_params.pop("imputation_method", None)

        def cast_columns(X: pd.DataFrame) -> pd.DataFrame:
            X = X.copy()
            for col in imputer_params.get("variables", []):
                # Imprimir los tipos antes y después para ayudar a depurar
                print(f"Columna {col} antes del cast: {X[col].dtype}")
                X[col] = X[col].astype("object")
                print(f"Columna {col} después del cast: {X[col].dtype}")
            return X

        cast_transformer = FunctionTransformer(cast_columns)
        imputer = make_pipeline(
            cast_transformer,
            CategoricalImputer(**imputer_params, imputation_method="frequent")
        )
    else:
        raise ValueError(f"Tipo de imputador desconocido: {imputer_type}")
    
    # Paso 2: Rare Label Encoding
    rare_high_info: dict[str, any] = dynamic_params.get('rare_high', {})
    rare_high_info.setdefault('tol', 0.05)
    rare_high_info.setdefault('variables', rare_labels_high_cardinality)
    rare_high_info.setdefault('n_categories', 3)
    rare_high: BaseEstimator = RareLabelEncoder(**rare_high_info)
    
    rare_low_info: dict[str, any] = dynamic_params.get('rare_low', {})
    rare_low_info.setdefault('tol', 0.05)
    rare_low_info.setdefault('variables', rare_labels_low_cardinality)
    rare_low_info.setdefault('n_categories', 2)
    rare_low: BaseEstimator = RareLabelEncoder(**rare_low_info)
    
    # Paso 3: Discretización
    discretiser_info: dict[str, any] = dynamic_params.get('discretiser', {})
    discretiser_type: str = discretiser_info.get('type', 'kbins')
    discretiser_params: dict[str, any] = discretiser_info.get('params', {})
    if discretiser_type == 'kbins':
        discretiser_params.setdefault('variables', continuous)
        discretiser_params.setdefault('n_bins', 5)
        discretiser_params.setdefault('strategy', 'uniform')
        discretiser: BaseEstimator = CustomKBinsDiscretiser(**discretiser_params)
    elif discretiser_type == 'geometric':
        discretiser_params.setdefault('variables', continuous)
        discretiser_params.setdefault('bins', 5)
        discretiser_params.setdefault('return_object', True)
        discretiser = GeometricWidthDiscretiser(**discretiser_params)
    else:
        raise ValueError(f"Tipo de discretizador desconocido: {discretiser_type}")
    
    # Paso 4: Codificación
    encoder_info: dict[str, any] = dynamic_params.get('encoder', {}).get('params', {})
    encoder_info.setdefault('variables', continuous + categoricals)
    encoder_info.setdefault('encoding_method', 'ordered')
    encoder: BaseEstimator = OrdinalEncoder(**encoder_info)
    
    pipe: Pipeline = Pipeline([
        ('imputer', imputer),
        ('rare_high', rare_high),
        ('rare_low', rare_low),
        ('discretiser', discretiser),
        ('encoder', encoder)
    ])
    
    return pipe

In [14]:
# --- 3. Función run_experiment (para XGBoost cv) con sample_weights ---
def run_experiment(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_val: pd.DataFrame,
    y_val: pd.Series,
    xgb_params: dict[str, any],
    num_boost_round: int,
    nfold: int,
    early_stopping_rounds: int,
    sample_weights: np.ndarray | None = None
    ) -> pd.DataFrame:
    """
    Ejecuta validación cruzada con XGBoost usando la unión de (X_train, X_val) y (y_train, y_val).
    Permite pasar sample_weights (o None).
    """
    X_cv: pd.DataFrame = pd.concat([X_train, X_val], axis=0)
    y_cv: pd.Series = pd.concat([y_train, y_val], axis=0)
    dtrain_cv: xgb.DMatrix = xgb.DMatrix(data=X_cv, label=y_cv, weight=sample_weights)
    
    cv_results: pd.DataFrame = xgb.cv( # type: ignore
        params=xgb_params,
        dtrain=dtrain_cv,
        num_boost_round=num_boost_round,
        nfold=nfold,
        stratified=True,
        early_stopping_rounds=early_stopping_rounds,
        metrics=['auc', 'logloss'],
        seed=xgb_params.get('seed', 42),
        verbose_eval=False
    )
    return cv_results

In [31]:
# --- Helper: función para registrar parámetros con prefijo y "aplanarlos"
def prefixed_log_params(prefix: str, params: dict[str, any]) -> None:
    """
    Registra en MLflow cada par clave-valor del diccionario, añadiendo un prefijo para que
    las claves resultantes sean únicas.
    
    Si algún valor es un diccionario, se aplana añadiendo el subprefijo correspondiente.
    """
    flattened: dict[str, any] = {}
    for key, value in params.items():
        if isinstance(value, dict):
            for subkey, subvalue in value.items():
                flattened[f'{prefix}_{key}_{subkey}'] = subvalue
        else:
            flattened[f'{prefix}_{key}'] = value
    mlflow.log_params(flattened)


# --- 4. Función para ejecutar un experimento de pipeline con MLflow ---
def run_pipeline_experiment(
    dynamic_params: dict[str, dict[str, any]],
    xgb_params: dict[str, any],
    num_boost_round: int,
    nfold: int,
    early_stopping_rounds: int,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_val: pd.DataFrame,
    y_val: pd.DataFrame,
    sample_weights: np.ndarray | None,
    info: dict[str, any],
    continuous: list[str],
    categoricals: list[str],
    rare_labels_high_cardinality: list[str],
    rare_labels_low_cardinality: list[str],
    seed: int
    ) -> tuple[float, pd.DataFrame]:
    """
    Construye y ajusta la pipeline de feature engineering (según parámetros dinámicos), transforma los datos,
    y evalúa el rendimiento mediante XGBoost cv.
    Se registra el run en MLflow.
    
    Devuelve una métrica (por ejemplo, best ROC-AUC) y el DataFrame de cv_results.
    """
    with mlflow.start_run() as run:
        
        # Registrar parámetros dinámicos con prefijos para evitar conflictos en las claves
        prefixed_log_params('imputer', dynamic_params.get('imputer', {}))
        prefixed_log_params('rare_high', dynamic_params.get('rare_high', {}))
        prefixed_log_params('rare_low', dynamic_params.get('rare_low', {}))
        prefixed_log_params('discretiser', dynamic_params.get('discretiser', {}))
        prefixed_log_params('encoder', dynamic_params.get('encoder', {}))
        
        mlflow.log_params(xgb_params)
        mlflow.log_param('num_boost_round', num_boost_round)
        mlflow.log_param('nfold', nfold)
        mlflow.log_param('early_stopping_rounds', early_stopping_rounds)
        
        # Construir la pipeline
        pipe: Pipeline = build_pipeline(
            dynamic_params, 
            info, 
            continuous, 
            categoricals,
            rare_labels_high_cardinality, 
            rare_labels_low_cardinality, 
            seed
        )
        
        pipe.fit(X_train, y_train)
        X_train_transformed: pd.DataFrame = pipe.transform(X_train)
        X_val_transformed: pd.DataFrame = pipe.transform(X_val)
        
        # Ejecutar XGBoost cv sobre los datos transformados
        cv_results: pd.DataFrame = run_experiment(
            X_train_transformed, y_train,
            X_val_transformed, y_val,
            xgb_params, 
            num_boost_round, 
            nfold, 
            early_stopping_rounds,
            sample_weights
        )
        
        # Evaluar las métricas
        best_iteration, best_roc_auc, best_logloss = eval_performance_metrics(cv_results)
        mlflow.log_metric('best_iteration', best_iteration)
        mlflow.log_metric('best_roc_auc', best_roc_auc)
        mlflow.log_metric('best_logloss', best_logloss)
        
        # Usamos best_roc_auc como métrica representativa
        dummy_metric: float = best_roc_auc
        
        return dummy_metric, cv_results

In [16]:
# --- 5. Función de evaluación de métricas ---
def eval_performance_metrics(cv_results: pd.DataFrame) -> tuple[int, float, float]:
    """
    Dado un DataFrame con los resultados de xgb.cv, retorna:
      - best_iteration: índice de la mejor iteración basado en 'test-auc-mean'
      - best_roc_auc: valor de 'test-auc-mean' en la mejor iteración
      - best_logloss: valor de 'test-logloss-mean' en la mejor iteración
    """
    best_iteration: int = int(cv_results['test-auc-mean'].idxmax())
    best_roc_auc: float = float(cv_results.loc[best_iteration, 'test-auc-mean'])
    best_logloss: float = float(cv_results.loc[best_iteration, 'test-logloss-mean'])
    return best_iteration, best_roc_auc, best_logloss

In [17]:
# --- 6. Función para ejecutar grid search en paralelo ---
def run_grid_search(
    grid: list[dict[str, dict[str, any]]],
    xgb_params: dict[str, any],
    num_boost_round: int,
    nfold: int,
    early_stopping_rounds: int,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_val: pd.DataFrame,
    y_val: pd.Series,
    sample_weights: np.ndarray | None,
    info: dict[str, any],
    continuous: list[str],
    categoricals: list[str],
    rare_labels_high_cardinality: list[str],
    rare_labels_low_cardinality: list[str],
    seed: int
    ) -> list[tuple[float, pd.DataFrame]]:
    """
    Ejecuta múltiples experimentos en paralelo usando combinaciones de parámetros dinámicos.
    Retorna una lista de tuplas (dummy_metric, cv_results).
    """
    results: list[tuple[float, pd.DataFrame]] = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(
                run_pipeline_experiment,
                dynamic_params, 
                xgb_params, 
                num_boost_round, 
                nfold, 
                early_stopping_rounds,
                X_train, 
                y_train, 
                X_val, 
                y_val, 
                sample_weights,
                info, 
                continuous, 
                categoricals,
                rare_labels_high_cardinality, 
                rare_labels_low_cardinality, 
                seed
            )
            for dynamic_params in grid
        ]
        for future in concurrent.futures.as_completed(futures):
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                print('Error in grid search run:', e)
    return results


In [42]:
def main() -> None:
    # Suponemos que ya tienes estas variables definidas en otro lugar:
    # X_train, y_train, X_val, y_val, info, continuous, categoricals,
    # rare_labels_high_cardinality, rare_labels_low_cardinality, xgb_params

    rare_labels_high_cardinality = info['categoricals_high_cardinality']
    rare_labels_low_cardinality = ['shipping_mode']

    # Configuración de MLflow (asegúrate de que la carpeta mlruns exista y la Tracking URI esté configurada)
    from config.config import settings
    settings.EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
    mlflow.set_tracking_uri(settings.MLFLOW_TRACKING_URI)
    mlflow.set_experiment("feature_engineering_experiment")
    
    # Definir el grid de parámetros dinámicos para los transformadores  
    # (Aquí puedes generar la cuadrícula de combinaciones que desees probar; este es un ejemplo sencillo)
    grid = [
        {
            'imputer': {'type': 'random', 'params': {}},
            'rare_high': {'n_categories': 3},
            'rare_low': {'n_categories': 2},
            'discretiser': {'type': 'kbins', 'params': {'n_bins': 5}},
            'encoder': {'params': {}}
        },
        {
            'imputer': {'type': 'mode', 'params': {}},
            'rare_high': {'n_categories': 4},
            'rare_low': {'n_categories': 3},
            'discretiser': {'type': 'geometric', 'params': {'bins': 10}},
            'encoder': {'params': {}}
        }
        # Puedes agregar más combinaciones según tus necesidades
    ]
    
    # Parámetros para XGBoost
    xgb_params: dict[str, any] = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'eval_metric': ['auc', 'logloss'],
        'seed': 42
    }
    num_boost_round = 400
    nfold = 10
    early_stopping_rounds = 10
    sample_weights = None  # O un array de pesos si deseas probar esa opción

    # Ejecutar la búsqueda de cuadrícula en paralelo
    results = run_grid_search(
        grid,
        xgb_params,
        num_boost_round,
        nfold,
        early_stopping_rounds,
        X_train,
        y_train,
        X_val,
        y_val,
        sample_weights,
        info,
        continuous,
        categoricals,
        rare_labels_high_cardinality,
        rare_labels_low_cardinality,
        seed=42
    )
    
    # Mostrar los resultados obtenidos (por ejemplo, la mejor ROC-AUC de cada run)
    print("Resultados del grid search:")
    for best_roc_auc, cv_res in results:
        print(f"Best ROC-AUC: {best_roc_auc}")

if __name__ == '__main__':
    main()


Error in grid search run: imputation_method takes only values 'missing' or 'frequent'
Error in grid search run: Some of the variables are not categorical. Please cast them as object or categorical before using this transformer.
Resultados del grid search:


In [41]:
X_val.dtypes

state                 category
local_pickup          category
shipping_mode         category
listing_type          category
available_quantity       int64
total_amount           float64
date_difference_hr     float64
time_difference_hr     float64
dtype: object

In [34]:
X_train.isnull().mean()

state                 0.000033
local_pickup          0.000000
shipping_mode         0.000000
listing_type          0.000000
available_quantity    0.000000
total_amount          0.000000
date_difference_hr    0.000000
time_difference_hr    0.000000
dtype: float64