In [8]:
import sys
from pathlib import Path

# Añade src al path para importar los módulos
sys.path.append(str(Path().resolve().parent / 'src'))

In [10]:
# Preparación de datos con el pipeline completo desde model.py
from src.model import preparar_datos_pipeline, train_evaluate_xgboost

# Obtén los datos listos para modelar
X_train, y_train, X_test, y_test = preparar_datos_pipeline(
    parquet_file='ts_df_bolleria_20250803.parquet',
    split_date='2025-03-03',
    target='base_imponible'
)

In [11]:
# Función objetivo Optuna con parámetros recomendados para XGBoost (regresión)
# Como hay pocas filas incluimos set de validación fijo (20% del train) en lugar de incluir splits

import optuna


def objective(trial):
    # Hiperparámetros a optimizar
    params = {
        'objective': 'reg:squarederror',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'random_state': 42
    }
    # Split interno para validación (20% del train)
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)
    result = train_evaluate_xgboost(X_tr, y_tr, X_val, y_val, params)
    return result['mae']

# Lanzar el estudio de optimización
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print('Mejores hiperparámetros Optuna (validación interna):', study.best_params)
print('Mejor MAE Optuna (validación interna):', study.best_value)

[I 2025-08-05 20:47:27,089] A new study created in memory with name: no-name-a50eeedf-47d6-4ae7-b2a2-681c5e8528b8
[I 2025-08-05 20:47:27,412] Trial 0 finished with value: 84.61112955729168 and parameters: {'booster': 'dart', 'lambda': 0.00011385484587041947, 'alpha': 6.115999639819006e-06, 'subsample': 0.6296700514721693, 'colsample_bytree': 0.5333837151542318, 'n_estimators': 263, 'max_depth': 3, 'learning_rate': 0.2577472928263704}. Best is trial 0 with value: 84.61112955729168.
[I 2025-08-05 20:47:27,412] Trial 0 finished with value: 84.61112955729168 and parameters: {'booster': 'dart', 'lambda': 0.00011385484587041947, 'alpha': 6.115999639819006e-06, 'subsample': 0.6296700514721693, 'colsample_bytree': 0.5333837151542318, 'n_estimators': 263, 'max_depth': 3, 'learning_rate': 0.2577472928263704}. Best is trial 0 with value: 84.61112955729168.
Parameters: { "colsample_bytree", "max_depth", "subsample" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-05 20:47:27

Mejores hiperparámetros Optuna (validación interna): {'booster': 'gbtree', 'lambda': 1.5217227415395248e-07, 'alpha': 3.805804544409099e-06, 'subsample': 0.2068525521593345, 'colsample_bytree': 0.8857508899692184, 'n_estimators': 51, 'max_depth': 7, 'learning_rate': 0.08962161965694515}
Mejor MAE Optuna (validación interna): 58.911400349934894


In [12]:
import joblib
from paths import MODELS_DIR
import os

best_model = train_evaluate_xgboost(X_train, y_train, X_test, y_test, study.best_params)['model']
joblib.dump(best_model, MODELS_DIR / 'xgboost_optimized.pkl')

['C:\\Workspace\\mlops_fleca_project\\models\\xgboost_optimized.pkl']