# Experimento 7

Feature Engineering con AutoGluon: solo Crisis (PASO 201908)

In [1]:
import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from statsmodels.tsa.seasonal import seasonal_decompose

In [2]:
# Load the preprocessed base table and preview its first rows.
df = pd.read_csv('../../data/preprocessed/base.csv')
df.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,cat1,cat2,cat3,brand,sku_size,stock_final
0,201701,10234,20524,0,2,0.053,0.053,HC,VAJILLA,Cristalino,Importado,500.0,
1,201701,10032,20524,0,1,0.13628,0.13628,HC,VAJILLA,Cristalino,Importado,500.0,
2,201701,10217,20524,0,1,0.03028,0.03028,HC,VAJILLA,Cristalino,Importado,500.0,
3,201701,10125,20524,0,1,0.02271,0.02271,HC,VAJILLA,Cristalino,Importado,500.0,
4,201701,10012,20524,0,11,1.54452,1.54452,HC,VAJILLA,Cristalino,Importado,500.0,


In [3]:
# Total tons per (period, product); every other column of the base table is
# collapsed by the aggregation.
dfg = df.groupby(['periodo', 'product_id'], as_index=False)[['tn']].sum()
dfg.head()

Unnamed: 0,periodo,product_id,tn
0,201701,20001,934.77222
1,201701,20002,550.15707
2,201701,20003,1063.45835
3,201701,20004,555.91614
4,201701,20005,494.27011


In [4]:
# Parse the YYYYMM period into a timestamp and rename the columns to the
# item_id / timestamp / target names used when building the time-series
# dataset below. The raw `periodo` column is discarded afterwards.
dfg = (
    dfg
    .assign(timestamp=lambda frame: pd.to_datetime(frame['periodo'].astype(str), format='%Y%m'))
    .rename(columns={'tn': 'target', 'product_id': 'item_id'})
    .drop(columns=['periodo'])
)


In [5]:
# Append placeholder rows for the two months to forecast (2020-01 and 2020-02):
# one row per item and month, built with a cross join.
future_periods = pd.DataFrame(
    {'timestamp': pd.to_datetime(['2020-01-01', '2020-02-01'])}
)

df_future = dfg[['item_id']].drop_duplicates().merge(future_periods, how='cross')
df_future['target'] = 0  # placeholder value; the real tonnage is unknown

dfg_completo = pd.concat([dfg, df_future], ignore_index=True)

dfg = dfg_completo.copy()

##### Productos

In [6]:
# Load the tab-separated product catalogue, keep the first row per product and
# expose the key as `item_id` so it matches the time-series frame.
productos_df = (
    pd.read_csv('../../data/raw/tb_productos.csv', sep='\t')
    .drop_duplicates(subset=['product_id'], keep='first')
    .rename(columns={'product_id': 'item_id'})
)

##### Crisis

In [7]:
# Binary covariate: 1 for rows dated August 2019, 0 everywhere else.
dfg['crisis'] = (
    dfg['timestamp'].dt.year.eq(2019) & dfg['timestamp'].dt.month.eq(8)
).astype(int)

##### Levantamos productos a predecir

In [8]:
# Load the list of product ids that must be forecast and preview it.
productos_ok = pd.read_csv('../../data/raw/product_id_apredecir201912.csv')
productos_ok.head()

Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005


##### Filtramos productos a predecir

In [9]:
# Keep only the rows whose item appears in the to-predict product list.
dfg = dfg.loc[dfg['item_id'].isin(productos_ok['product_id'])]

##### Sacamos dataset de entrenamiento y dataset futuro para la predicción

In [10]:
# Separate the placeholder rows dated 2020-01-01 or later (`futuro`) from the
# history used for training (`dfg`).
futuro = dfg.loc[dfg['timestamp'].ge('2020-01-01')].copy()
dfg = dfg.loc[dfg['timestamp'].lt('2020-01-01')].copy()

##### Dataset para Autogluon 

In [11]:
# Wrap the training history as an AutoGluon time-series dataset, attaching the
# product catalogue as per-item static features.
data = TimeSeriesDataFrame.from_data_frame(
    dfg,
    id_column="item_id",
    timestamp_column="timestamp",
    static_features_df=productos_df,
)
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,target,crisis
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1
20001,2017-01-01,934.77222,0
20002,2017-01-01,550.15707,0
20003,2017-01-01,1063.45835,0
20004,2017-01-01,555.91614,0
20005,2017-01-01,494.27011,0


In [12]:
# Preview the per-item static features attached to the time-series dataset.
data.static_features.head()

Unnamed: 0_level_0,cat1,cat2,cat3,brand,sku_size
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20001,HC,ROPA LAVADO,Liquido,ARIEL,3000
20002,HC,ROPA LAVADO,Liquido,LIMPIEX,3000
20003,FOODS,ADEREZOS,Mayonesa,NATURA,475
20004,FOODS,ADEREZOS,Mayonesa,NATURA,240
20005,FOODS,ADEREZOS,Mayonesa,NATURA,120


##### Entrenamiento

El entrenamiento tardó 98 minutos

In [13]:
# Every non-target column of the dataset is declared as a known covariate
# (here that is only `crisis`, per the training log).
covariates = data.columns.drop("target")

# Two-step-ahead forecaster. The "M" alias is normalised to month-end ("ME") by
# the library, as the training log reports. The evaluation metric is left at
# the library default (an MSE override was considered but not enabled).
predictor = TimeSeriesPredictor(
    target='target',
    prediction_length=2,
    freq="M",
    known_covariates_names=covariates,
)

# Fit with two validation windows; `fit` returns the predictor itself.
predictor = predictor.fit(data, num_val_windows=2)

  std_freq = pd.tseries.frequencies.to_offset(self.freq).freqstr
Frequency 'M' stored as 'ME'
Beginning AutoGluon training...
AutoGluon will save models to 'c:\Users\Usuario\Documents\Universidad\austral\2025\Lab3\Lab3-MCD\notebooks\model_autogluon\AutogluonModels\ag-20250607_151827'
AutoGluon Version:  1.2
Python Version:     3.11.4
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          4
GPU Count:          0
Memory Avail:       4.33 GB / 15.89 GB (27.3%)
Disk Space Avail:   411.65 GB / 893.49 GB (46.1%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'ME',
 'hyperparameters': 'default',
 'known_covariates_names': ['crisis'],
 'num_val_windows': 2,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'target',
 'verbosity': 2}

train_data with frequen

##### Leaderboard

In [36]:
# List the fitted models together with their validation scores and timings.
predictor.leaderboard()


Unnamed: 0,model,score_val,pred_time_val,fit_time_marginal,fit_order
0,WeightedEnsemble,-0.034212,15.090804,3.053643,12
1,DirectTabular,-0.034212,15.090804,2805.955686,3
2,RecursiveTabular,-0.176927,1.874368,325.170048,2
3,ChronosZeroShot[bolt_base],-0.190284,10.255714,40.619236,7
4,AutoETS,-0.19555,18.249639,16.638718,6
5,DynamicOptimizedTheta,-0.20199,2.465895,64.111537,5
6,TiDE,-0.231938,1.301187,616.581481,11
7,SeasonalNaive,-0.237611,0.88486,3.809673,1
8,NPTS,-0.288207,1.968401,6.763232,4
9,TemporalFusionTransformer,-0.311265,1.119807,1174.841228,8


##### Feature Importance

El cálculo de la importancia de las variables tarda 22 minutos.

In [18]:
# Estimate the importance of each static feature and known covariate for the
# fitted predictor. Slow: about 22 minutes in the recorded run.
predictor.feature_importance()

Computing feature importance


Unnamed: 0,importance,stdev,n,p99_low,p99_high
cat1,0.000136,0.000711,5.0,-0.001329,0.0016
cat2,0.000329,0.00049,5.0,-0.000679,0.001337
cat3,-1.7e-05,0.000521,5.0,-0.00109,0.001055
brand,-8e-05,0.00057,5.0,-0.001254,0.001095
sku_size,-0.000175,0.001384,5.0,-0.003026,0.002675
crisis,0.003308,0.010835,5.0,-0.019002,0.025619


##### Preparamos las covariables conocidas para las fechas futuras

In [14]:
# Build the known-covariate frame for the forecast horizon.
# The placeholder target is dropped so that only covariate columns remain, and
# timestamps are moved from month-start to month-end so they line up with the
# predictor's month-end ("ME") frequency.
futuro_c = futuro.drop(columns=['target'])
# Keep the column as datetime: from_data_frame parses the timestamp column
# itself, so formatting it to strings first was a redundant round-trip.
futuro_c['timestamp'] = futuro_c['timestamp'] + pd.offsets.MonthEnd(0)

# Sanity check of the (unshifted) future date range.
print(futuro.timestamp.min(), futuro.timestamp.max())

known_covariates_future = TimeSeriesDataFrame.from_data_frame(
    futuro_c,
    id_column="item_id",
    timestamp_column="timestamp",
)
known_covariates_future.head()

2020-01-01 00:00:00 2020-02-01 00:00:00


Unnamed: 0_level_0,Unnamed: 1_level_0,crisis
item_id,timestamp,Unnamed: 2_level_1
20001,2020-01-31,0
20001,2020-02-29,0
20002,2020-01-31,0
20002,2020-02-29,0
20003,2020-01-31,0


In [15]:
# Forecast the horizon for every item, supplying the future values of the
# known covariates. No model is named, so the best validation scorer is used.
predictions = predictor.predict(data, known_covariates=known_covariates_future)

data with frequency 'None' has been resampled to frequency 'ME'.
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [16]:
# Shape the forecasts for submission: keep only the mean forecast of the
# second step (timestamp 2020-02-29) and expose it as product_id / tn.
predictions_v1 = (
    predictions
    .reset_index()
    .loc[lambda frame: frame["timestamp"] == "2020-02-29", ["item_id", "mean"]]
    .rename(columns={"item_id": "product_id", "mean": "tn"})
)
predictions_v1.head(5)

Unnamed: 0,product_id,tn
1,20001,1285.106555
3,20002,1062.697731
5,20003,664.6289
7,20004,516.781357
9,20005,499.909334


In [17]:
# Write the submission table to disk without the index column.
predictions_v1.to_csv("../../outputs/autogluon_FE_con_crisis.csv", index=False)