In [1]:
import sys
from pathlib import Path

# Make the project's modules importable by adding `<cwd>/../src` to sys.path.
# The location is derived from the current working directory, so it depends on
# where the kernel was started.
SRC_DIR = str(Path().resolve().parent / 'src')
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


In [None]:
import pandas as pd
from paths import PROCESSED_DIR

# Load the processed DataFrame with the selected lags. According to the
# original author, lags 4, 12, 24 and 52 were excluded because they worsened
# performance.
# NOTE(review): the saved output of this cell still lists a
# `base_imponible_lag52` column — confirm whether the comment or the file is stale.
dataset_path = PROCESSED_DIR / 'ts_df_bolleria_20250803.parquet'
df = pd.read_parquet(dataset_path)
df

Unnamed: 0,base_imponible,is_summer_peak,is_easter,week_start,base_imponible_lag1,base_imponible_lag2,base_imponible_lag3,base_imponible_lag52,base_imponible_next1
0,597.65,0,0,2024-01-15,572.51,534.79,563.18,825.11,680.30
1,680.30,0,0,2024-01-22,597.65,572.51,534.79,658.40,603.99
2,603.99,0,0,2024-01-29,680.30,597.65,572.51,741.40,600.14
3,600.14,0,0,2024-02-05,603.99,680.30,597.65,653.64,689.32
4,689.32,0,0,2024-02-12,600.14,603.99,680.30,680.46,627.76
...,...,...,...,...,...,...,...,...,...
68,756.42,0,0,2025-05-19,810.97,842.62,891.38,828.30,802.16
69,802.16,0,0,2025-05-26,756.42,810.97,842.62,854.34,881.72
70,881.72,0,0,2025-06-02,802.16,756.42,810.97,782.38,1015.97
71,1015.97,0,0,2025-06-09,881.72,802.16,756.42,708.74,1014.76


In [3]:
from data_split import train_test_split
from datetime import datetime

# Split the data with the project's `train_test_split` helper (from
# `data_split`), passing the cut-off date and the target column.
# NOTE(review): the saved table output for `df` shows a `base_imponible_next1`
# column. If the helper keeps every non-target column as a feature, next week's
# value would leak into X — confirm it is dropped or is the intended target.
cutoff_date = datetime(2025, 3, 3)  # adjust the date if necessary
X_train, y_train, X_test, y_test = train_test_split(
    df, split_date=cutoff_date, target='base_imponible'
)

In [4]:
from catboost import CatBoostRegressor

# CatBoost baseline; hyperparameters are gathered in one dict for readability.
catboost_params = {
    'verbose': 0,
    'iterations': 100,
    'learning_rate': 0.1,
    'depth': 6,
}
cat_model = CatBoostRegressor(**catboost_params)

# Remove the "week_start" column from both feature sets when it is present;
# errors='ignore' turns the drop into a no-op if the column is missing.
X_train = X_train.drop(columns=['week_start'], errors='ignore')
X_test = X_test.drop(columns=['week_start'], errors='ignore')

# Fit the model on the training data (the fitted estimator is the cell output).
cat_model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x27899209cf0>

In [5]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, mean_absolute_percentage_error

# Predict on the test features with the fitted CatBoost model.
y_pred_cat = cat_model.predict(X_test)

# Score the predictions against the true test targets.
mae = mean_absolute_error(y_test, y_pred_cat)
# RMSE is obtained as the square root of the mean squared error.
rmse = mean_squared_error(y_test, y_pred_cat) ** 0.5
# NOTE(review): the saved output prints MAPE as 0.10, i.e. it looks like a
# fraction rather than a percentage — confirm against the scikit-learn docs.
mape = mean_absolute_percentage_error(y_test, y_pred_cat)
r2 = r2_score(y_test, y_pred_cat)

# Every metric uses the lowercase `f` presentation type; the MAPE field used
# to be `.2F`, which only differs by upper-casing nan/inf.
print(f"MAE CatBoost: {mae:.2f}, RMSE: {rmse:.2f}, MAPE: {mape:.2f}, R2: {r2:.2f}")

MAE CatBoost: 80.11, RMSE: 123.96, MAPE: 0.10, R2: 0.70


In [6]:
import joblib
from paths import MODELS_DIR
import os

# Persist the CatBoost model with joblib under MODELS_DIR; the call's return
# value is displayed as the cell output.
model_path = MODELS_DIR / 'catboost.pkl'
joblib.dump(cat_model, model_path)

['C:\\Workspace\\mlops_fleca_project\\models\\catboost.pkl']