# Experimento 7

Feature Engineering con Autogluon

In [4]:
import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from statsmodels.tsa.seasonal import seasonal_decompose

In [5]:
df = pd.read_csv('../../data/preprocessed/base.csv', sep=',')
df.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,cat1,cat2,cat3,brand,sku_size,stock_final
0,201701,10234,20524,0,2,0.053,0.053,HC,VAJILLA,Cristalino,Importado,500.0,
1,201701,10032,20524,0,1,0.13628,0.13628,HC,VAJILLA,Cristalino,Importado,500.0,
2,201701,10217,20524,0,1,0.03028,0.03028,HC,VAJILLA,Cristalino,Importado,500.0,
3,201701,10125,20524,0,1,0.02271,0.02271,HC,VAJILLA,Cristalino,Importado,500.0,
4,201701,10012,20524,0,11,1.54452,1.54452,HC,VAJILLA,Cristalino,Importado,500.0,


In [6]:
dfg = df.groupby(['periodo', 'product_id']).agg({'tn': 'sum'}).reset_index()
dfg.head()

Unnamed: 0,periodo,product_id,tn
0,201701,20001,934.77222
1,201701,20002,550.15707
2,201701,20003,1063.45835
3,201701,20004,555.91614
4,201701,20005,494.27011


In [7]:
dfg['periodo_dt'] = pd.to_datetime(dfg['periodo'].astype(str), format='%Y%m')
dfg.rename(columns={'tn': 'target', 'product_id':'item_id', 'periodo_dt': 'timestamp'}, inplace=True)
dfg.drop(columns=['periodo'], inplace=True)


In [13]:
# Versión más concisa usando merge cruzado
future_periods = pd.DataFrame({
    'timestamp': pd.to_datetime(['2020-01-01', '2020-02-01'])
})

df_future = (dfg[['item_id']].drop_duplicates()
             .merge(future_periods, how='cross')
             .assign(target=0))  # Asignar 0 a tn

dfg_completo = pd.concat([dfg, df_future], ignore_index=True)

dfg = dfg_completo.copy()

In [14]:
productos_df = pd.read_csv('../../data/raw/tb_productos.csv', sep='\t')
productos_df = productos_df.drop_duplicates(subset=['product_id'], keep='first')
productos_df.rename(columns={'product_id': 'item_id'}, inplace=True)

##### Extracción de componentes temporales


In [15]:
dfg['year'] = dfg['timestamp'].dt.year
dfg['month'] = dfg['timestamp'].dt.month
dfg['quarter'] = dfg['timestamp'].dt.quarter
# Variables dummy estacionales
dfg['semester'] = np.where(dfg['month'] <= 6, 1, 2)
dfg['quarter'] = dfg['timestamp'].dt.quarter

# Efectos de fin de año
dfg['year_end'] = np.where(dfg['month'].isin([11, 12]), 1, 0)
dfg['year_start'] = np.where(dfg['month'].isin([1, 2]), 1, 0)
# Indicadores estacionales
dfg['season'] = dfg['month'] % 12 // 3 + 1  # 1:Invierno, 2:Primavera, etc.
# Variables cíclicas (para capturar patrones estacionales)
dfg['month_sin'] = np.sin(2 * np.pi * dfg['month']/12)
dfg['month_cos'] = np.cos(2 * np.pi * dfg['month']/12)

#####  Lags, diferencias, medias móviles y otras yerbas

In [16]:
# Ordenamos por fecha para asegurar consistencia
dfg = dfg.sort_values('timestamp')

## 1. Lags (rezagos) de 1 a 12 meses
for i in range(1, 13):
    dfg[f'lag_{i}'] = dfg['target'].shift(i)

## 2. Diferencias (deltas) - cambio respecto al mes anterior
for i in range(1, 13):
    dfg[f'delta_{i}'] = dfg['target'].diff(i)

## 3. Diferencias porcentuales
for i in range(1, 13):
    dfg[f'pct_change_{i}'] = dfg['target'].pct_change(i)

## 4. Medias móviles (promedios móviles)
windows = [2, 3, 6, 9, 12]  # También puedes incluir [2,4,5,7] según necesidad
for w in windows:
    dfg[f'rolling_mean_{w}'] = dfg['target'].rolling(window=w, min_periods=1).mean()
    dfg[f'rolling_std_{w}'] = dfg['target'].rolling(window=w, min_periods=1).std()
    dfg[f'rolling_min_{w}'] = dfg['target'].rolling(window=w, min_periods=1).min()
    dfg[f'rolling_max_{w}'] = dfg['target'].rolling(window=w, min_periods=1).max()
    dfg[f'rolling_median_{w}'] = dfg['target'].rolling(window=w, min_periods=1).median()

## 5. Características de tendencia y estacionalidad
dfg['expanding_mean'] = dfg['target'].expanding().mean()
dfg['cumulative_sum'] = dfg['target'].cumsum()

## 6. Características de diferencia estacional (12 meses para datos mensuales)
dfg['seasonal_diff_12'] = dfg['target'].diff(12)

## 7. Estadísticas anuales comparativas
dfg['vs_prev_year'] = dfg['target'] / dfg['lag_12'] - 1  # Crecimiento interanual

## 8. Componentes de descomposición (simplificada)
# Tendencia (usando media móvil de 12 meses)
dfg['trend'] = dfg['target'].rolling(window=12, min_periods=1).mean()
# Estacionalidad (diferencia entre valor real y tendencia)
dfg['seasonality'] = dfg['target'] - dfg['trend']

## 9. Variables booleanas para eventos especiales
dfg['new_high'] = (dfg['target'] == dfg['rolling_max_12']).astype(int)
dfg['new_low'] = (dfg['target'] == dfg['rolling_min_12']).astype(int)

## 10. Características de aceleración/deceleración
dfg['acceleration'] = dfg['delta_1'].diff(1)  # Cambio en la tasa de cambio

##### Estadísticas de Ventana Dinámica

In [17]:
# Medias móviles exponenciales
dfg['ewm_alpha_0.3'] = dfg['target'].ewm(alpha=0.3, adjust=False).mean()
dfg['ewm_alpha_0.5'] = dfg['target'].ewm(alpha=0.5, adjust=False).mean()

# Medias móviles centradas
dfg['rolling_center_mean_3'] = dfg['target'].rolling(window=3, center=True).mean()

# Sumas acumuladas por año
dfg['ytd_sum'] = dfg.groupby(dfg['timestamp'].dt.year)['target'].cumsum()

##### Crisis

In [18]:
# Opción 1: Usando condiciones booleanas
dfg['crisis'] = ((dfg['timestamp'].dt.year == 2019) & (dfg['timestamp'].dt.month == 8)).astype(int)

##### Levantamos productos a predecir

In [19]:
productos_ok = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')
productos_ok.head()

Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005


##### Filtramos productos a predecir

In [20]:
dfg = dfg[dfg['item_id'].isin(productos_ok['product_id'].unique())]

##### Sacamos dataset de entrenamiento y dataset futuro para la predicción

In [21]:
futuro = dfg[dfg['timestamp'] >= '2020-01-01'].copy()
dfg = dfg[dfg['timestamp'] < '2020-01-01'].copy()

##### Dataset para Autogluon 

In [29]:
data = TimeSeriesDataFrame.from_data_frame(dfg,
                                           id_column="item_id",
                                           timestamp_column="timestamp",
                                           static_features_df=productos_df
                                           )
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,target,year,month,quarter,semester,year_end,year_start,season,month_sin,month_cos,...,trend,seasonality,new_high,new_low,acceleration,ewm_alpha_0.3,ewm_alpha_0.5,rolling_center_mean_3,ytd_sum,crisis
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
20001,2017-01-01,934.77222,2017,1,1,1,0,1,1,0.5,0.866025,...,934.77222,0.0,1,1,,934.77222,934.77222,,934.77222,0
20702,2017-01-01,3.52501,2017,1,1,1,0,1,1,0.5,0.866025,...,469.148615,-465.623605,0,1,,655.398057,469.148615,315.976613,938.29723,0
20705,2017-01-01,3.30497,2017,1,1,1,0,1,1,0.5,0.866025,...,237.808702,-234.503732,0,1,-12.43524,324.159387,121.347791,5.89495,951.23481,0
20706,2017-01-01,4.74727,2017,1,1,1,0,1,1,0.5,0.866025,...,191.196416,-186.449146,0,0,7.76994,228.335752,63.047531,3.69992,955.98208,0
20708,2017-01-01,3.04752,2017,1,1,1,0,1,1,0.5,0.866025,...,159.838267,-156.790747,0,1,-3.14205,160.749282,33.047525,3.66905,959.0296,0


In [30]:
data.static_features.head()

Unnamed: 0_level_0,cat1,cat2,cat3,brand,sku_size
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20001,HC,ROPA LAVADO,Liquido,ARIEL,3000
20702,PC,PIEL2,LIQUIDOS,ESPADOL,220
20705,HC,ROPA ACONDICIONADOR,Concentrado,VIVERE,450
20706,PC,PIEL2,Jabon Regular,DEOS1,220
20708,PC,PIEL1,CREMA,LANCOME,530


##### Entrenamiento

El entrenamiento tardó 98 minutos

In [31]:
covariates = data.drop(columns = "target").columns

predictor = TimeSeriesPredictor(target='target',
                                prediction_length=2, 
                                freq="M",
                                #eval_metric = "MSE",
                                known_covariates_names = covariates).fit(data, 
                                                                         num_val_windows=2,
                                                                         val_step_size=1)

  std_freq = pd.tseries.frequencies.to_offset(self.freq).freqstr
Frequency 'M' stored as 'ME'
Beginning AutoGluon training...
AutoGluon will save models to 'c:\Users\Usuario\Documents\Universidad\austral\2025\Lab3\Lab3-MCD\notebooks\model_autogluon\AutogluonModels\ag-20250606_221001'
AutoGluon Version:  1.2
Python Version:     3.11.4
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          4
GPU Count:          0
Memory Avail:       5.39 GB / 15.89 GB (33.9%)
Disk Space Avail:   413.69 GB / 893.49 GB (46.3%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'ME',
 'hyperparameters': 'default',
 'known_covariates_names': ['year',
                            'month',
                            'quarter',
                            'semester',
                            'year_end',
                            'year_start',
                            'season',
                            'month_sin',
       

##### Leaderboard

In [36]:
predictor.leaderboard()


Unnamed: 0,model,score_val,pred_time_val,fit_time_marginal,fit_order
0,WeightedEnsemble,-0.034212,15.090804,3.053643,12
1,DirectTabular,-0.034212,15.090804,2805.955686,3
2,RecursiveTabular,-0.176927,1.874368,325.170048,2
3,ChronosZeroShot[bolt_base],-0.190284,10.255714,40.619236,7
4,AutoETS,-0.19555,18.249639,16.638718,6
5,DynamicOptimizedTheta,-0.20199,2.465895,64.111537,5
6,TiDE,-0.231938,1.301187,616.581481,11
7,SeasonalNaive,-0.237611,0.88486,3.809673,1
8,NPTS,-0.288207,1.968401,6.763232,4
9,TemporalFusionTransformer,-0.311265,1.119807,1174.841228,8


##### Feature Importance

TARDA 22 MINUTOS

In [37]:
predictor.feature_importance()

Computing feature importance


Unnamed: 0,importance,stdev,n,p99_low,p99_high
cat1,0.000000,0.000000,5.0,0.000000,0.000000
cat2,0.000000,0.000000,5.0,0.000000,0.000000
cat3,0.000000,0.000000,5.0,0.000000,0.000000
brand,0.000000,0.000000,5.0,0.000000,0.000000
sku_size,0.000539,0.000714,5.0,-0.000931,0.002009
...,...,...,...,...,...
ewm_alpha_0.3,0.000192,0.000385,5.0,-0.000600,0.000984
ewm_alpha_0.5,0.061388,0.008321,5.0,0.044256,0.078520
rolling_center_mean_3,0.000006,0.000074,5.0,-0.000145,0.000158
ytd_sum,0.000079,0.000108,5.0,-0.000143,0.000302


##### Filtramos fechas futuras

In [None]:
futuro_c = futuro.copy()
futuro_c.drop(columns=['target'], inplace=True)
futuro_c['timestamp'] = futuro_c['timestamp'] + pd.offsets.MonthEnd(0)
futuro_c['timestamp'] = futuro_c['timestamp'].dt.strftime('%Y-%m-%d')

print(futuro.timestamp.min(), futuro.timestamp.max())

known_covariates_future = TimeSeriesDataFrame.from_data_frame(futuro_c,
                                                              id_column="item_id",
                                                              timestamp_column="timestamp")
known_covariates_future.head()

2020-01-31 2020-02-29


Unnamed: 0_level_0,Unnamed: 1_level_0,target,year,month,quarter,semester,year_end,year_start,season,month_sin,month_cos,...,trend,seasonality,new_high,new_low,acceleration,ewm_alpha_0.3,ewm_alpha_0.5,rolling_center_mean_3,ytd_sum,crisis
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
21032,2020-01-31,0.0,2020,1,1,1,0,1,1,0.5,0.866025,...,8.982973,-8.982973,0,1,0.0,4.544852,1.785648,0.0,0.0,0
20994,2020-01-31,0.0,2020,1,1,1,0,1,1,0.5,0.866025,...,7.650407,-7.650407,0,1,0.0,3.181397,0.892824,0.0,0.0,0
21038,2020-01-31,0.0,2020,1,1,1,0,1,1,0.5,0.866025,...,7.556882,-7.556882,0,1,0.0,2.226978,0.446412,0.0,0.0,0
21028,2020-01-31,0.0,2020,1,1,1,0,1,1,0.5,0.866025,...,5.756963,-5.756963,0,1,0.0,1.091219,0.111603,0.0,0.0,0
21024,2020-01-31,0.0,2020,1,1,1,0,1,1,0.5,0.866025,...,2.15101,-2.15101,0,1,0.0,0.374288,0.01395,0.0,0.0,0


In [39]:
predictions = predictor.predict(data,
                                known_covariates = known_covariates_future)

data with frequency 'None' has been resampled to frequency 'ME'.
Trying to fill missing values in an unsorted dataframe. It is highly recommended to call `ts_df.sort_index()` before calling `ts_df.fill_missing_values()`
Model not specified in predict, will default to the model with the best validation score: DirectTabular


In [None]:
predictions_v1 = predictions.copy()
predictions_v1 = predictions_v1.reset_index()
predictions_v1 = predictions_v1[["item_id", "timestamp", "mean"]]
predictions_v1 = predictions_v1[predictions_v1.timestamp == "2020-02-29"]
predictions_v1 = predictions_v1.drop(columns = {"timestamp"})
predictions_v1 = predictions_v1.rename(columns = {"item_id":"product_id", "mean":"tn"})
predictions_v1.head(5)

Unnamed: 0,product_id,tn
1,20001,326.319824
3,20702,0.96707
5,20705,0.884208
7,20706,0.795946
9,20708,0.892387


In [47]:
predictions_v1.to_csv("../../outputs/autogluon_FE_01.csv", sep = ",", index = False)