# Experimento 7

Feature Engineering con Autogluon

In [None]:
import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from statsmodels.tsa.seasonal import seasonal_decompose

In [3]:
df = pd.read_csv('../../data/preprocessed/base.csv', sep=',')
df.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,cat1,cat2,cat3,brand,sku_size,stock_final
0,201701,10234,20524,0,2,0.053,0.053,HC,VAJILLA,Cristalino,Importado,500.0,
1,201701,10032,20524,0,1,0.13628,0.13628,HC,VAJILLA,Cristalino,Importado,500.0,
2,201701,10217,20524,0,1,0.03028,0.03028,HC,VAJILLA,Cristalino,Importado,500.0,
3,201701,10125,20524,0,1,0.02271,0.02271,HC,VAJILLA,Cristalino,Importado,500.0,
4,201701,10012,20524,0,11,1.54452,1.54452,HC,VAJILLA,Cristalino,Importado,500.0,


In [4]:
dfg = df.groupby(['periodo', 'product_id']).agg({'tn': 'sum'}).reset_index()
dfg.head()

Unnamed: 0,periodo,product_id,tn
0,201701,20001,934.77222
1,201701,20002,550.15707
2,201701,20003,1063.45835
3,201701,20004,555.91614
4,201701,20005,494.27011


In [5]:
dfg['periodo_dt'] = pd.to_datetime(dfg['periodo'].astype(str), format='%Y%m')
dfg.rename(columns={'tn': 'target', 'product_id':'item_id', 'periodo_dt': 'timestamp'}, inplace=True)
dfg.drop(columns=['periodo'], inplace=True)

In [None]:
# Versión más concisa usando merge cruzado
future_periods = pd.DataFrame({
    'timestamp': pd.to_datetime(['2020-01-01', '2020-02-01'])
})

df_future = (dfg[['item_id']].drop_duplicates()
             .merge(future_periods, how='cross')
             .assign(target=0))  # Asignar 0 a tn

dfg_completo = pd.concat([dfg, df_future], ignore_index=True)

dfg = dfg_completo.copy()

In [None]:
productos_df = pd.read_csv('../../data/raw/tb_productos.csv', sep='\t')
productos_df = productos_df.drop_duplicates(subset=['product_id'], keep='first')
productos_df.rename(columns={'product_id': 'item_id'}, inplace=True)

Unnamed: 0,item_id,target,timestamp,cat1,cat2,cat3,brand,sku_size,product_id
0,20001,934.77222,2017-01-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,20001.0
1,20002,550.15707,2017-01-01,HC,ROPA LAVADO,Liquido,LIMPIEX,3000.0,20002.0
2,20003,1063.45835,2017-01-01,FOODS,ADEREZOS,Mayonesa,NATURA,475.0,20003.0
3,20004,555.91614,2017-01-01,FOODS,ADEREZOS,Mayonesa,NATURA,240.0,20004.0
4,20005,494.27011,2017-01-01,FOODS,ADEREZOS,Mayonesa,NATURA,120.0,20005.0
...,...,...,...,...,...,...,...,...,...
31238,21265,0.05007,2019-12-01,PC,PIEL1,CUIDADO ESPECIAL,LANCOME,32.0,21265.0
31239,21266,0.05121,2019-12-01,PC,PIEL1,CUIDADO ESPECIAL,LANCOME,32.0,21266.0
31240,21267,0.01569,2019-12-01,PC,PIEL1,Cara,NIVEA,250.0,21267.0
31241,21271,0.00298,2019-12-01,REF,TE,Frutas,TWININGS,20.0,21271.0


##### Extracción de componentes temporales


In [None]:
dfg['year'] = dfg['timestamp'].dt.year
dfg['month'] = dfg['timestamp'].dt.month
dfg['quarter'] = dfg['timestamp'].dt.quarter
# Variables dummy estacionales
dfg['semester'] = np.where(dfg['month'] <= 6, 1, 2)
dfg['quarter'] = dfg['timestamp'].dt.quarter

# Efectos de fin de año
dfg['year_end'] = np.where(dfg['month'].isin([11, 12]), 1, 0)
dfg['year_start'] = np.where(dfg['month'].isin([1, 2]), 1, 0)
# Indicadores estacionales
dfg['season'] = dfg['month'] % 12 // 3 + 1  # 1:Invierno, 2:Primavera, etc.
# Variables cíclicas (para capturar patrones estacionales)
dfg['month_sin'] = np.sin(2 * np.pi * dfg['month']/12)
dfg['month_cos'] = np.cos(2 * np.pi * dfg['month']/12)

#####  Lags, diferencias, medias móviles y otras yerbas

In [None]:
# Ordenamos por fecha para asegurar consistencia
dfg = dfg.sort_values('timestamp')

## 1. Lags (rezagos) de 1 a 12 meses
for i in range(1, 13):
    dfg[f'lag_{i}'] = dfg['target'].shift(i)

## 2. Diferencias (deltas) - cambio respecto al mes anterior
for i in range(1, 13):
    dfg[f'delta_{i}'] = dfg['target'].diff(i)

## 3. Diferencias porcentuales
for i in range(1, 13):
    dfg[f'pct_change_{i}'] = dfg['target'].pct_change(i)

## 4. Medias móviles (promedios móviles)
windows = [2, 3, 6, 9, 12]  # También puedes incluir [2,4,5,7] según necesidad
for w in windows:
    dfg[f'rolling_mean_{w}'] = dfg['target'].rolling(window=w, min_periods=1).mean()
    dfg[f'rolling_std_{w}'] = dfg['target'].rolling(window=w, min_periods=1).std()
    dfg[f'rolling_min_{w}'] = dfg['target'].rolling(window=w, min_periods=1).min()
    dfg[f'rolling_max_{w}'] = dfg['target'].rolling(window=w, min_periods=1).max()
    dfg[f'rolling_median_{w}'] = dfg['target'].rolling(window=w, min_periods=1).median()

## 5. Características de tendencia y estacionalidad
dfg['expanding_mean'] = dfg['target'].expanding().mean()
dfg['cumulative_sum'] = dfg['target'].cumsum()

## 6. Características de diferencia estacional (12 meses para datos mensuales)
dfg['seasonal_diff_12'] = dfg['target'].diff(12)

## 7. Estadísticas anuales comparativas
dfg['vs_prev_year'] = dfg['target'] / dfg['lag_12'] - 1  # Crecimiento interanual

## 8. Componentes de descomposición (simplificada)
# Tendencia (usando media móvil de 12 meses)
dfg['trend'] = dfg['target'].rolling(window=12, min_periods=1).mean()
# Estacionalidad (diferencia entre valor real y tendencia)
dfg['seasonality'] = dfg['target'] - dfg['trend']

## 9. Variables booleanas para eventos especiales
dfg['new_high'] = (dfg['target'] == dfg['rolling_max_12']).astype(int)
dfg['new_low'] = (dfg['target'] == dfg['rolling_min_12']).astype(int)

## 10. Características de aceleración/deceleración
dfg['acceleration'] = dfg['delta_1'].diff(1)  # Cambio en la tasa de cambio

##### Estadísticas de Ventana Dinámica

In [None]:
# Medias móviles exponenciales
dfg['ewm_alpha_0.3'] = dfg['target'].ewm(alpha=0.3, adjust=False).mean()
dfg['ewm_alpha_0.5'] = dfg['target'].ewm(alpha=0.5, adjust=False).mean()

# Medias móviles centradas
dfg['rolling_center_mean_3'] = dfg['target'].rolling(window=3, center=True).mean()

# Sumas acumuladas por año
dfg['ytd_sum'] = dfg.groupby(dfg['timestamp'].dt.year)['target'].cumsum()

##### Características de Tendencia y Ciclo

In [None]:
# Modelado de tendencia polinomial
dfg['time_index'] = range(len(dfg))
dfg['trend_linear'] = np.poly1d(np.polyfit(dfg['time_index'], dfg['target'], 1))(dfg['time_index'])
dfg['trend_quadratic'] = np.poly1d(np.polyfit(dfg['time_index'], dfg['target'], 2))(dfg['time_index'])

# Residuales de tendencia
dfg['residual_trend'] = dfg['target'] - dfg['trend_linear']

##### Características de Cambio de Régimen

In [None]:
# Z-Score respecto a ventana móvil
dfg['zscore_6'] = (dfg['target'] - dfg['rolling_mean_6']) / dfg['rolling_std_6']

# Detección de outliers
dfg['is_outlier_3sigma'] = np.where(np.abs(dfg['zscore_6']) > 3, 1, 0)

# Cambios bruscos (spikes)
dfg['spike_up'] = np.where(dfg['delta_1'] > dfg['rolling_std_3'], 1, 0)
dfg['spike_down'] = np.where(dfg['delta_1'] < -dfg['rolling_std_3'], 1, 0)

##### Características de Patrones Temporales

In [None]:
# Autocorrelaciones parciales
from statsmodels.tsa.stattools import pacf
pacf_values = pacf(dfg['target'].dropna(), nlags=12)
for i in range(1, 6):
    dfg[f'pacf_{i}'] = dfg['target'].shift(i) * pacf_values[i]

# Estacionalidad múltiple (si hay patrones semestrales)
dfg['semester_mean'] = dfg.groupby(['year', 'semester'])['target'].transform('mean')

##### Características de Forecast Ingenieriles

In [None]:
# Método ingenuo (último valor)
dfg['naive_forecast'] = dfg['target'].shift(1)

# Seasonal naive (valor del mismo período año anterior)
dfg['seasonal_naive'] = dfg['target'].shift(12)

# Promedio móvil como forecast
dfg['ma_forecast_3'] = dfg['rolling_mean_3'].shift(1)

##### Características de Decomposición Temporal

In [None]:
# Descomposición clásica (additiva o multiplicativa)
result = seasonal_decompose(dfg['target'].dropna(), model='additive', period=12)
dfg['trend_decomposed'] = result.trend
dfg['seasonal_decomposed'] = result.seasonal
dfg['residual_decomposed'] = result.resid

##### Características de Ventanas Asimétricas

In [None]:
# Mejor mes histórico
dfg['best_month_rank'] = dfg.groupby('month')['target'].rank(ascending=False)

# Comparación con mismo mes año anterior
dfg['vs_last_year_same_month'] = dfg['target'] / dfg['lag_12'] - 1

# Acumulado últimos 3 vs mismos 3 meses año anterior
dfg['last3_vs_ly3'] = (dfg['target'] + dfg['lag_1'] + dfg['lag_2']) / (dfg['lag_12'] + dfg['lag_13'] + dfg['lag_14']) - 1

##### Transformaciones Matemáticas

In [None]:
# Transformaciones para estabilizar varianza
dfg['log_target'] = np.log1p(dfg['target'])
dfg['sqrt_target'] = np.sqrt(dfg['target'])
dfg['boxcox_target'], _ = stats.boxcox(dfg['target'] + 1)  # Requiere from scipy import stats

# Diferenciación para estacionariedad
dfg['diff1_log'] = np.log1p(dfg['target']).diff(1)

##### Características de Interacción

In [None]:
# Interacción entre tendencia y estacionalidad
dfg['trend_season_interaction'] = dfg['trend'] * dfg['seasonal_decomposed']

# Interacción lags con estacionalidad
for i in [1, 2, 3, 12]:
    dfg[f'lag_{i}_season_adj'] = dfg[f'lag_{i}'] / dfg['seasonal_decomposed']

##### Levantamos productos a predecir

In [5]:
productos_ok = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')
productos_ok.head()

Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005


##### Filtramos productos a predecir

In [6]:
dfg = dfg[dfg['item_id'].isin(productos_ok['product_id'].unique())]

##### Sacamos dataset de entrenamiento y dataset futuro para la predicción

In [None]:
futuro = dfg[dfg['timestamp'] >= '2020-01-01'].copy()
dfg = dfg[dfg['timestamp'] < '2020-01-01'].copy()

##### Dataset para Autogluon 

In [None]:
data = TimeSeriesDataFrame.from_data_frame(dfg,
                                           id_column="item_id",
                                           timestamp_column="timestamp",
                                           static_features_df=productos_df
                                           )
data.head()

In [None]:
data.static_features.head()

##### Entrenamiento

In [None]:
covariates = data.drop(columns = "target").columns

predictor = TimeSeriesPredictor(target='target',
                                prediction_length=2, 
                                freq="M",
                                #eval_metric = "MSE",
                                known_covariates_names = covariates).fit(data, 
                                                                         num_val_windows=2,
                                                                         val_step_size=1)

  std_freq = pd.tseries.frequencies.to_offset(self.freq).freqstr
Frequency 'M' stored as 'ME'
Beginning AutoGluon training...
AutoGluon will save models to 'c:\Users\Usuario\Documents\Universidad\austral\2025\Lab3\Lab3-MCD\notebooks\model_autogluon\AutogluonModels\ag-20250605_173243'
AutoGluon Version:  1.2
Python Version:     3.11.4
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          4
GPU Count:          0
Memory Avail:       5.15 GB / 15.89 GB (32.4%)
Disk Space Avail:   415.90 GB / 893.49 GB (46.5%)
Setting presets to: high_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MSE,
 'freq': 'ME',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 2,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': True,
 'skip_model_selection': False,
 'target': 'target',
 'val_step_size':

##### Leaderboard

In [None]:
predictor.leaderboard()

##### Feature Importance

In [None]:
predictor.feature_importance()

##### Filtramos fechas futuras

In [None]:
futuro_c = futuro.copy()
futuro_c.drop(columns=['target'], inplace=True)
futuro_c['timestamp'] = futuro_c['timestamp'] + pd.offsets.MonthEnd(0)
futuro_c['timestamp'] = futuro_c['timestamp'].dt.strftime('%Y-%m-%d')

print(futuro.timestamp.min(), futuro.timestamp.max())

known_covariates_future = TimeSeriesDataFrame.from_data_frame(futuro_c,
                                                              id_column="item_id",
                                                              timestamp_column="timestamp")
known_covariates_future.head()

In [9]:
predictions_v1 = predictions.copy()
predictions_v1 = predictions_v1.reset_index()
predictions_v1 = predictions_v1[["item_id", "timestamp", "mean"]]
predictions_v1 = predictions_v1[predictions_v1.timestamp == "2020-02-29"]
predictions_v1 = predictions_v1.drop(columns = {"timestamp"})
predictions_v1 = predictions_v1.rename(columns = {"item_id":"product_id", "mean":"tn"})
predictions_v1.head(5)

Unnamed: 0,product_id,tn
1,20001,1335.368101
3,20002,1049.383669
5,20003,776.909453
7,20004,528.254723
9,20005,502.187073


In [None]:
predictions_v1.to_csv("../../outputs/prediccion_autogluon_hiperparametros_highquality.csv", sep = ",", index = False)