##### Usar la librería statsforecast, que es más rápida

In [2]:
import pandas as pd
import numpy as np
from pmdarima import auto_arima
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA,  ETS, Theta
from joblib import Parallel, delayed  # Para paralelizar (opcional)
import warnings
warnings.filterwarnings('ignore')  # Ignorar warnings

In [23]:
# Tab-separated list of product ids to forecast (per the file name).
productos_ok = pd.read_table(
    "https://storage.googleapis.com/open-courses/austral2025-af91/labo3v/product_id_apredecir201912.txt"
)

# Raw sell-in records; the file is tab-separated despite its .csv extension.
df = pd.read_table("../../data/raw/sell-in.csv")
df.shape

(2945818, 7)

In [4]:
# Total 'tn' per (periodo, product_id).
ts = df.groupby(["periodo", "product_id"]).agg({"tn": "sum"}).reset_index()

# Keep only the products listed in productos_ok. The explicit copy makes `ts`
# an independent frame, so adding columns to it later is not a write onto a
# filtered slice (pandas would emit SettingWithCopyWarning, which the global
# warnings filter in the import cell silences).
ts = ts[ts['product_id'].isin(productos_ok['product_id'])].copy()

In [5]:
# Parse 'periodo' (yyyymm) into a datetime column, then order the rows by
# product and date.
ts = (
    ts.assign(periodo_dt=pd.to_datetime(ts['periodo'].astype(str), format='%Y%m'))
      .sort_values(by=['product_id', 'periodo_dt'])
)
ts

Unnamed: 0,periodo,product_id,tn,periodo_dt
0,201701,20001,934.77222,2017-01-01
785,201702,20001,798.01620,2017-02-01
1566,201703,20001,1303.35771,2017-03-01
2352,201704,20001,1069.96130,2017-04-01
3136,201705,20001,1502.20132,2017-05-01
...,...,...,...,...
27494,201908,21276,0.01265,2019-08-01
28424,201909,21276,0.01856,2019-09-01
29376,201910,21276,0.02079,2019-10-01
30315,201911,21276,0.03341,2019-11-01


In [6]:
# Switch to the column names StatsForecast expects: unique_id / ds / y.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution no longer raises KeyError because
# 'periodo' has already been dropped.
ts = (
    ts.drop(columns=['periodo'], errors='ignore')  # raw yyyymm column is no longer needed
      .rename(columns={'tn': 'y', 'periodo_dt': 'ds', 'product_id': 'unique_id'})
)

In [None]:
# ts['crisis'] = ((ts['ds'].dt.year == 2019) & (ts['ds'].dt.month == 8)).astype(int)  # Ejemplo dummy

In [None]:
# # 3. Configurar y entrenar AutoARIMA con StatsForecast
# tsc = ts.copy()
# tsc = ts[ts['unique_id'] == 20001]  # Filtrar por un producto específico para el ejemplo
# tsc

Unnamed: 0,unique_id,y,ds
0,20001,934.77222,2017-01-01
785,20001,798.0162,2017-02-01
1566,20001,1303.35771,2017-03-01
2352,20001,1069.9613,2017-04-01
3136,20001,1502.20132,2017-05-01
3942,20001,1520.06539,2017-06-01
4765,20001,1030.67391,2017-07-01
5591,20001,1267.39462,2017-08-01
6438,20001,1316.94604,2017-09-01
7267,20001,1439.75563,2017-10-01


In [None]:
# models = [
#     AutoARIMA(season_length=12),
#     ETS(season_length=12),
#     Theta(season_length=12)
# ]


# sf = StatsForecast(
#     models=models,
#     freq='MS',  # Frecuencia mensual
#     n_jobs=-1   # Paralelizar en todos los núcleos
# )

# # 4. Predecir 2 meses adelante
# forecasts = sf.forecast(h=2, df=tsc)
# print(forecasts)

                  ds    AutoARIMA          ETS        Theta
unique_id                                                  
20001     2020-01-01  1488.118128  1498.447866  1528.403687
20001     2020-02-01  1488.118128  1498.614501  1534.384277


In [None]:
# pred = forecasts.iloc[[-1]]
# pred = pred[['AutoARIMA', 'ETS', 'Theta']].mean(axis=1)
# pred

unique_id
20001    1507.038969
dtype: float64

In [8]:
# Display the long-format table (unique_id / y / ds) that feeds the models.
ts

Unnamed: 0,unique_id,y,ds
0,20001,934.77222,2017-01-01
785,20001,798.01620,2017-02-01
1566,20001,1303.35771,2017-03-01
2352,20001,1069.96130,2017-04-01
3136,20001,1502.20132,2017-05-01
...,...,...,...
27494,21276,0.01265,2019-08-01
28424,21276,0.01856,2019-09-01
29376,21276,0.02079,2019-10-01
30315,21276,0.03341,2019-11-01


In [9]:
# Forecast configuration, kept in one place instead of scattered literals.
SEASON_LENGTH = 12        # season length passed to every model (monthly data)
HORIZON = 2               # steps ahead to forecast; the last step is the one kept
MIN_OBSERVATIONS = 12     # products with fewer rows are skipped and left as NaN
MODEL_COLUMNS = ['AutoARIMA', 'ETS', 'Theta']  # forecast columns averaged together

models = [
    AutoARIMA(season_length=SEASON_LENGTH),
    ETS(season_length=SEASON_LENGTH),
    Theta(season_length=SEASON_LENGTH),
]

# n_jobs parallelises across series, and every call below receives exactly one
# series, so a single worker avoids per-call process start-up cost. The
# forecaster is loop-invariant, so it is built once and reused.
sf = StatsForecast(models=models, freq='MS', n_jobs=1)

# Split the long table once instead of re-filtering all of `ts` per product.
series_by_product = {
    uid: frame[['unique_id', 'ds', 'y']].sort_values('ds')
    for uid, frame in ts.groupby('unique_id')
}

predictions = {}

for product_id in productos_ok['product_id'].unique():
    df_product = series_by_product.get(product_id)
    n_obs = 0 if df_product is None else len(df_product)

    # Too little history: record a missing value (np.nan keeps the resulting
    # column float even if every product ends up without a forecast).
    if n_obs < MIN_OBSERVATIONS:
        print(f"Producto {product_id}: Insuficientes datos ({n_obs} observaciones)")
        predictions[product_id] = np.nan
        continue

    try:
        pred = sf.forecast(h=HORIZON, df=df_product)

        # Keep the furthest forecast step.
        # NOTE(review): the horizon is counted from each product's own last
        # observed month, so a series that stops before the latest period (or
        # has missing months) yields a forecast for an earlier calendar month
        # than the others. Confirm whether series should be padded or the
        # horizon adjusted per product.
        last_step = pred.sort_values('ds').iloc[[-1]]

        # Simple ensemble: unweighted mean of the three model forecasts.
        predictions[product_id] = last_step[MODEL_COLUMNS].mean(axis=1).iloc[0]

        print(f"Producto {product_id}: Modelo ajustado")

    except Exception as e:
        # Best effort: report the failure and leave the product without a
        # forecast so one bad series does not abort the whole run.
        print(f"Error en producto {product_id}: {str(e)}")
        predictions[product_id] = np.nan

# One row per requested product; NaN marks products without a forecast.
df_predictions = pd.DataFrame({
    'product_id': list(predictions.keys()),
    'prediccion_mes+2': list(predictions.values()),
})

Producto 20001: Modelo ajustado
Producto 20002: Modelo ajustado
Producto 20003: Modelo ajustado
Producto 20004: Modelo ajustado
Producto 20005: Modelo ajustado
Producto 20006: Modelo ajustado
Producto 20007: Modelo ajustado
Producto 20008: Modelo ajustado
Producto 20009: Modelo ajustado
Producto 20010: Modelo ajustado
Producto 20011: Modelo ajustado
Producto 20012: Modelo ajustado
Producto 20013: Modelo ajustado
Producto 20014: Modelo ajustado
Producto 20015: Modelo ajustado
Producto 20016: Modelo ajustado
Producto 20017: Modelo ajustado
Producto 20018: Modelo ajustado
Producto 20019: Modelo ajustado
Producto 20020: Modelo ajustado
Producto 20021: Modelo ajustado
Producto 20022: Modelo ajustado
Producto 20023: Modelo ajustado
Producto 20024: Modelo ajustado
Producto 20025: Modelo ajustado
Producto 20026: Modelo ajustado
Producto 20027: Modelo ajustado
Producto 20028: Modelo ajustado
Producto 20029: Modelo ajustado
Producto 20030: Modelo ajustado
Producto 20031: Modelo ajustado
Producto

In [11]:
# Display the per-product ensemble forecasts (NaN where no model was fitted).
df_predictions

Unnamed: 0,product_id,prediccion_mes+2
0,20001,1507.038969
1,20002,1314.490897
2,20003,770.206630
3,20004,572.858422
4,20005,568.824719
...,...,...
775,21263,0.006694
776,21265,
777,21266,
778,21267,


In [12]:
# Name the forecast column 'tn', matching the column name used in the raw data.
df_predictions = df_predictions.rename(columns={'prediccion_mes+2': 'tn'})

In [14]:
# Replace negative forecasts with 0 (this runs on the model output); missing
# values are left untouched.
df_predictions['tn'] = df_predictions['tn'].mask(df_predictions['tn'] < 0, 0)


In [25]:
# Actual 'tn' per product for December 2019 (periodo == 201912).
# Boolean indexing already returns a new frame, so the previous copy of the
# whole raw table before filtering was unnecessary work and memory.
df_copy = (
    df[df['periodo'] == 201912]
    .groupby(["periodo", "product_id"])
    .agg({"tn": "sum"})
    .reset_index()
)
df_copy

Unnamed: 0,periodo,product_id,tn
0,201912,20001,1504.68856
1,201912,20002,1087.30855
2,201912,20003,892.50129
3,201912,20004,637.90002
4,201912,20005,593.24443
...,...,...,...
922,201912,21265,0.05007
923,201912,21266,0.05121
924,201912,21267,0.01569
925,201912,21271,0.00298


In [27]:
# Display the predictions table again, now with the forecast column named 'tn'.
df_predictions

Unnamed: 0,product_id,tn
0,20001,1507.038969
1,20002,1314.490897
2,20003,770.206630
3,20004,572.858422
4,20005,568.824719
...,...,...
775,21263,0.006694
776,21265,
777,21266,
778,21267,


In [26]:
# Attach the December 2019 actuals next to each forecast. The left join keeps
# every forecast row; the actual value lands in 'tn_original'.
df_final = pd.merge(
    df_predictions,
    df_copy[['product_id', 'tn']],
    how='left',
    on='product_id',
    suffixes=('', '_original'),
)
df_final

Unnamed: 0,product_id,tn,tn_original
0,20001,1507.038969,1504.68856
1,20002,1314.490897,1087.30855
2,20003,770.206630,892.50129
3,20004,572.858422,637.90002
4,20005,568.824719,593.24443
...,...,...,...
775,21263,0.006694,0.01270
776,21265,,0.05007
777,21266,,0.05121
778,21267,,0.01569


In [28]:
# Where no forecast exists, fall back to the December 2019 actual value.
has_forecast = df_final['tn'].notna()
df_final['tn'] = df_final['tn'].where(has_forecast, df_final['tn_original'])

# Count the missing values left in each column.
df_final.isna().sum()

product_id     0
tn             0
tn_original    0
dtype: int64

In [32]:
# The helper column is no longer needed once the fallback has been applied.
# errors='ignore' plus rebinding keeps the cell re-runnable (no KeyError when
# 'tn_original' has already been removed).
df_final = df_final.drop(columns=['tn_original'], errors='ignore')

In [33]:
# Write the result as a comma-separated file, without the index column.
df_final.to_csv(
    "../../outputs/autorima_exp2.csv",
    sep=",",
    index=False,
)