In [33]:
import sys
import os
from pathlib import Path

# Resolve the project root as the parent of the kernel's working directory
# and make the project's `src` directory importable.
# NOTE(review): assumes the kernel is started inside the notebooks directory — confirm.
notebook_path = os.getcwd()
project_root = Path(notebook_path).parent  # Parent of the notebooks directory
src_dir = project_root / 'src'

# Guard the append so re-running this cell in a long-lived kernel does not
# accumulate duplicate sys.path entries.
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))

print(f"Directorio raíz del proyecto: {project_root}")
print(f"Directorio src añadido al path: {src_dir}")

Directorio raíz del proyecto: c:\Workspace\mlops_fleca_project
Directorio src añadido al path: c:\Workspace\mlops_fleca_project\src


In [34]:
import pandas as pd
from paths import PROCESSED_DIR # type: ignore

# Read the processed parquet file into `df`; the bare `df` expression on the
# last line renders the frame as the cell output.
ts_parquet_path = PROCESSED_DIR / 'ts_df_bolleria_20250801.parquet'
df = pd.read_parquet(ts_parquet_path)
df

Unnamed: 0,base_imponible,is_summer_peak,is_easter,week_start,base_imponible_lag1,base_imponible_lag2,base_imponible_lag3,base_imponible_lag4,base_imponible_lag12,base_imponible_lag24,base_imponible_lag52,base_imponible_next1
0,597.65,0,0,2024-01-15,572.51,534.79,563.18,806.54,912.13,1745.97,825.11,680.30
1,680.30,0,0,2024-01-22,597.65,572.51,534.79,563.18,750.99,1681.41,658.40,603.99
2,603.99,0,0,2024-01-29,680.30,597.65,572.51,534.79,821.84,1753.02,741.40,600.14
3,600.14,0,0,2024-02-05,603.99,680.30,597.65,572.51,607.02,1835.18,653.64,689.32
4,689.32,0,0,2024-02-12,600.14,603.99,680.30,597.65,304.73,2127.71,680.46,627.76
...,...,...,...,...,...,...,...,...,...,...,...,...
68,756.42,0,0,2025-05-19,810.97,842.62,891.38,894.20,725.72,756.58,828.30,802.16
69,802.16,0,0,2025-05-26,756.42,810.97,842.62,891.38,635.99,604.75,854.34,881.72
70,881.72,0,0,2025-06-02,802.16,756.42,810.97,842.62,673.60,636.84,782.38,1015.97
71,1015.97,0,0,2025-06-09,881.72,802.16,756.42,810.97,546.93,571.22,708.74,1014.76


In [35]:
# -----------------------------
# Utility to compute the train/test cut-off date (split_date)
# -----------------------------
def calcular_split_date(df: pd.DataFrame, train_ratio: float = 0.8) -> pd.Timestamp:
    """
    Return the last 'week_start' value that falls in the training portion.

    The distinct, non-null 'week_start' values are sorted ascending; the first
    ``int(n_weeks * train_ratio)`` of them (never fewer than one) are treated
    as training weeks and the latest of those is returned.

    Parameters:
    - df: pd.DataFrame - must contain a 'week_start' column.
    - train_ratio: float - fraction of distinct weeks assigned to train,
      in the interval (0, 1]. Defaults to 0.8.

    Returns:
    - split_date: pd.Timestamp - the latest week_start included in train.

    Raises:
    - ValueError: if train_ratio is outside (0, 1] or 'week_start' has no
      non-null values.
    """
    if not 0 < train_ratio <= 1:
        raise ValueError(f"train_ratio must be in (0, 1], got {train_ratio!r}")

    # Drop missing dates so a NaT is never counted as (or returned as) a week.
    weeks = df['week_start'].dropna().sort_values().unique()
    if len(weeks) == 0:
        raise ValueError("df['week_start'] has no non-null values to split on")

    # Clamp to at least one training week: with n_train == 0 the index below
    # would be -1 and silently wrap around to the LAST week of the data.
    n_train = max(1, int(len(weeks) * train_ratio))

    return pd.to_datetime(weeks[n_train - 1])

# Single source of truth for the train fraction, so the value passed to
# calcular_split_date and the percentage shown in the message cannot drift apart.
TRAIN_RATIO = 0.8

print("\nCalculando split_date óptimo...")
split_date = calcular_split_date(df, train_ratio=TRAIN_RATIO)
# The `.0%` format renders 0.8 as "80%".
print(f"split_date calculado para {TRAIN_RATIO:.0%} train: {split_date}")


Calculando split_date óptimo...
split_date calculado para 80% train: 2025-03-03 00:00:00


In [36]:
import pandas as pd 
from data_split import train_test_split
from datetime import datetime


# Split at the cut-off computed by calcular_split_date instead of repeating the
# date as a literal, so the split follows the data if the parquet file changes.
# `to_pydatetime()` hands train_test_split a plain `datetime`, the same type
# the hard-coded literal used to provide.
# NOTE(review): train_test_split is the project's own helper (data_split);
# which side of the split receives the split_date row is not visible here — confirm.
X_train, y_train, X_test, y_test = train_test_split(
    df,
    split_date=split_date.to_pydatetime(),
    target='base_imponible'
)

print(f'{X_train.shape=}, {y_train.shape=}, {X_test.shape=}, {y_test.shape=}')

X_train.shape=(58, 11), y_train.shape=(58,), X_test.shape=(15, 11), y_test.shape=(15,)


# Entrenamos Baseline I

Este baseline va a predecir con datos de la semana anterior

In [52]:
import numpy as np

class BaselineModelPreviousWeek:
    """
    Baseline that predicts each row's target with that row's
    'base_imponible_lag1' value (the previous week, per the notebook text).
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op: nothing is learned. Present only to offer a fit/predict interface."""

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the 'base_imponible_lag1' column of X_test as the predictions.

        The result is the pandas Series itself (index and name preserved), not
        a NumPy array. Raises KeyError if the column is missing.
        """
        return X_test['base_imponible_lag1']


In [53]:
# Baseline I: instantiate the previous-week baseline (fit is not called in this cell).
model_bs1 = BaselineModelPreviousWeek()

# Predict on the test features; the bare name on the last line displays the result.
predictions_bs1 = model_bs1.predict(X_test)
predictions_bs1

0      635.99
1      673.60
2      546.93
3      612.87
4      714.39
5      620.41
6     1515.87
7      894.20
8      891.38
9      842.62
10     810.97
11     756.42
12     802.16
13     881.72
14    1015.97
Name: base_imponible_lag1, dtype: float64

In [None]:
# Evaluación del modelo

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Score Baseline I on the test split: y_test is the ground truth,
# predictions_bs1 the model output.
test_mae_bs1 = mean_absolute_error(y_true=y_test, y_pred=predictions_bs1)
test_mse_bs1 = mean_squared_error(y_true=y_test, y_pred=predictions_bs1)

# Report both metrics with four decimals.
print(f"MAE Baseline Previous Week: {test_mae_bs1:.4f}")
print(f"MSE Baseline Previous Week: {test_mse_bs1:.4f}")


MAE Baseline Previous Week: 156.0927
MSE Baseline Previous Week: 84138.5346


# Entrenamos Baseline II

Este baseline va a predecir con la media de las últimas 4 semanas

In [67]:
import numpy as np

class BaselineModelMeanLast4Weeks:
    """
    Baseline that predicts each row's target as the row-wise mean of the
    columns 'base_imponible_lag1' .. 'base_imponible_lag4'.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op: nothing is learned. Present only to offer a fit/predict interface."""

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """
        Return one prediction per row of X_test as a 1-D NumPy array.

        Raises KeyError if any of the four lag columns is missing.
        """
        lag_columns = [f'base_imponible_lag{k}' for k in range(1, 5)]
        # axis=1 averages across the lag columns within each row.
        return X_test[lag_columns].mean(axis=1).to_numpy()


In [68]:
# Baseline II: instantiate the mean-of-last-4-weeks baseline (fit is not called in this cell).
model_bs2 = BaselineModelMeanLast4Weeks()
# Predict using the mean of the 4 most recent weekly lags

# The bare name on the last line displays the predictions as the cell output.
predictions_bs2 = model_bs2.predict(X_test)
predictions_bs2

array([ 662.5975,  652.27  ,  645.56  ,  617.3475,  636.9475,  623.65  ,
        865.885 ,  936.2175,  980.465 , 1036.0175,  859.7925,  825.3475,
        803.0425,  812.8175,  864.0675])

In [69]:
# Evaluación del modelo
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Score Baseline II on the test split: y_test is the ground truth,
# predictions_bs2 the model output.
test_mae_bs2 = mean_absolute_error(y_true=y_test, y_pred=predictions_bs2)
test_mse_bs2 = mean_squared_error(y_true=y_test, y_pred=predictions_bs2)

# Report both metrics with four decimals.
print(f"MAE Baseline Mean 4 Previous Weeks: {test_mae_bs2:.4f}")
print(f"MSE Baseline Mean 4 Previous Week: {test_mse_bs2:.4f}")

MAE Baseline Mean 4 Previous Weeks: 143.3307
MSE Baseline Mean 4 Previous Week: 64792.4056


# Entrenamos Baseline III

Este baseline va a predecir con los datos de la misma semana del año anterior 

In [70]:
import numpy as np

class BaselineModelSameWeekLastYear:
    """
    Baseline that predicts each row's target with that row's
    'base_imponible_lag52' value (same week of the previous year, per the
    notebook text).
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op: nothing is learned. Present only to offer a fit/predict interface."""

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """
        Return the 'base_imponible_lag52' column as a 1-D float NumPy array.

        Raises KeyError if the column is missing.
        """
        # The column is read directly: a row-wise mean over a single column is
        # just that column. dtype=float keeps the float result the mean used
        # to produce, and copy=True returns an array independent of X_test.
        return X_test['base_imponible_lag52'].to_numpy(dtype=float, copy=True)

In [71]:
# Baseline III: instantiate the same-week-last-year baseline (fit is not called in this cell).
model_bs3 = BaselineModelSameWeekLastYear()
# Predict using the same week of the previous year

# The bare name on the last line displays the predictions as the cell output.
predictions_bs3 = model_bs3.predict(X_test)
predictions_bs3

array([ 561.28,  569.69,  639.09,  706.42, 1460.7 ,  823.62,  679.21,
        744.58,  677.46,  915.18,  828.3 ,  854.34,  782.38,  708.74,
        737.77])

In [72]:
# Evaluación del modelo
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Score Baseline III on the test split: y_test is the ground truth,
# predictions_bs3 the model output.
test_mae_bs3 = mean_absolute_error(y_true=y_test, y_pred=predictions_bs3)
test_mse_bs3 = mean_squared_error(y_true=y_test, y_pred=predictions_bs3)

# Report both metrics with four decimals.
print(f"MAE Baseline same week last year: {test_mae_bs3:.4f}")
print(f"MSE Baseline same week last year: {test_mse_bs3:.4f}")

MAE Baseline same week last year: 209.3727
MSE Baseline same week last year: 99597.4867


# Entrenamos Baseline IV

Este baseline va a predecir con el valor de hace 12 semanas (columna `base_imponible_lag12`)

In [102]:
import numpy as np

class BaselineModelMeanLast24Weeks:
    """
    Baseline that predicts each row's target with the value of a single lag column.

    NOTE(review): the class name says "mean of the last 24 weeks", but the
    implementation reads exactly one column, 'base_imponible_lag12', and
    averages nothing. That column stays the default so existing results do not
    change; pass another column (e.g. 'base_imponible_lag24') via `lag_column`
    to use a different lag.
    """

    def __init__(self, lag_column: str = 'base_imponible_lag12') -> None:
        # Name of the X_test column whose values are returned as predictions.
        self.lag_column = lag_column

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op: nothing is learned. Present only to offer a fit/predict interface."""

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """
        Return the configured lag column as a 1-D float NumPy array.

        Raises KeyError if the column is missing from X_test.
        """
        # The column is read directly: a row-wise mean over a single column is
        # just that column. dtype=float keeps the float result the mean used
        # to produce, and copy=True returns an array independent of X_test.
        return X_test[self.lag_column].to_numpy(dtype=float, copy=True)

In [103]:
# Baseline IV: instantiate the baseline (fit is not called in this cell).
model_bs4 = BaselineModelMeanLast24Weeks()
# Predict from the 'base_imponible_lag12' column (the class name says 24 weeks, but the class reads lag12)

# The bare name on the last line displays the predictions as the cell output.
predictions_bs4 = model_bs4.predict(X_test)
predictions_bs4

array([636.84, 571.22, 534.36, 564.57, 541.98, 649.69, 691.44, 667.37,
       714.91, 573.77, 725.72, 635.99, 673.6 , 546.93, 612.87])

In [104]:
# Evaluación del modelo
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Score Baseline IV on the test split: y_test is the ground truth,
# predictions_bs4 the model output.
test_mae_bs4 = mean_absolute_error(y_true=y_test, y_pred=predictions_bs4)
test_mse_bs4 = mean_squared_error(y_true=y_test, y_pred=predictions_bs4)

# Report both metrics with four decimals.
print(f"MAE Baseline Mean last 12 weeks: {test_mae_bs4:.4f}")
print(f"MSE Baseline Mean last 12 weeks: {test_mse_bs4:.4f}")

MAE Baseline Mean last 12 weeks: 220.1060
MSE Baseline Mean last 12 weeks: 93614.5987


# Entrenamos Baseline V

Este modelo va a predecir con la media de los lags de las últimas 4 semanas y de los lags 12, 24 y 52

In [99]:
import numpy as np

class BaselineModelMeanLags:
    """
    Baseline that predicts each row's target as the unweighted row-wise mean
    of seven lag columns: lag1, lag2, lag3, lag4, lag12, lag24 and lag52
    (all prefixed 'base_imponible_').
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """No-op: nothing is learned. Present only to offer a fit/predict interface."""

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """
        Return one prediction per row of X_test as a 1-D NumPy array.

        Raises KeyError if any of the seven lag columns is missing.
        """
        lag_columns = [f'base_imponible_lag{k}' for k in (1, 2, 3, 4, 12, 24, 52)]
        # axis=1 averages across the lag columns within each row; every lag
        # carries the same weight.
        return X_test[lag_columns].mean(axis=1).to_numpy()

In [100]:
# Baseline V: instantiate the mean-of-lags baseline (fit is not called in this cell).
model_bs5 = BaselineModelMeanLags()


# Predict on the test features; the bare name on the last line displays the result.
predictions_bs5 = model_bs5.predict(X_test)
predictions_bs5

array([699.72571429, 647.73      , 657.46142857, 650.04428571,
       765.52285714, 682.71142857, 792.29571429, 842.41142857,
       855.91857143, 914.73571429, 821.39571429, 770.92428571,
       757.85571429, 725.45142857, 763.03857143])

In [101]:
# Evaluación del modelo
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Score Baseline V on the test split: y_test is the ground truth,
# predictions_bs5 the model output.
test_mae_bs5 = mean_absolute_error(y_true=y_test, y_pred=predictions_bs5)
test_mse_bs5 = mean_squared_error(y_true=y_test, y_pred=predictions_bs5)

# Report both metrics with four decimals.
print(f"MAE Baseline Mean Lags: {test_mae_bs5:.4f}")
print(f"MSE Baseline Mean Lags: {test_mse_bs5:.4f}")

MAE Baseline Mean Lags: 149.6258
MSE Baseline Mean Lags: 61614.2719
