#  Modelo: AUTOGLUON

- Hiperparámetro: num_val_windows = 2
- Sin feature engineering (FE) ni variables exógenas (dólar, IPC)


In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset switch read by the data-loading cell: True selects the CSV whose name
# ends in "_201912", False selects the same file name without that suffix.
ESTOY_EN_KAGGLE = True

In [4]:
# Read the product-by-period dataset once; only the file name depends on
# ESTOY_EN_KAGGLE (the "_201912" variant is used when the flag is truthy).
df = pd.read_csv(
    (
        "../entregable/datasets/periodo_x_producto_con_target_transformado_con_feature_engineering_201912.csv"
        if ESTOY_EN_KAGGLE
        else "../entregable/datasets/periodo_x_producto_con_target_transformado_con_feature_engineering.csv"
    ),
    sep=',',
    encoding='utf-8',
)
df

Unnamed: 0,product_id,periodo,nacimiento_producto,muerte_producto,mes_n,total_meses,producto_nuevo,ciclo_de_vida_inicial,sku_size,stock_final,...,tn_lag_12_season_adj_add,tn_lag_12_season_adj_mul,tn_lag_12_season_adj_add_norm,tn_lag_12_season_adj_mul_norm,dtw_cluster,dist_to_centroid,simil_to_top,corr_tn_dolar,corr_tn_ipc,corr_tn_dolar_x_prod
0,20001,201701,201701,201912,1,36,0,0,3000.0,0.0,...,0.0,0.0,0.0,0.0,41,3.489215,0.000000,,,0.343657
1,20001,201702,201701,201912,2,36,0,0,3000.0,0.0,...,0.0,0.0,0.0,0.0,41,3.489215,0.000000,,,0.343657
2,20001,201703,201701,201912,3,36,0,0,3000.0,0.0,...,0.0,0.0,0.0,0.0,41,3.489215,0.000000,,,0.343657
3,20001,201704,201701,201912,4,36,0,0,3000.0,0.0,...,0.0,0.0,0.0,0.0,41,3.489215,0.000000,,,0.343657
4,20001,201705,201701,201912,5,36,0,0,3000.0,0.0,...,0.0,0.0,0.0,0.0,41,3.489215,0.000000,,,0.343657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31357,21281,201704,201702,201708,3,7,1,1,0.0,0.0,...,0.0,0.0,0.0,0.0,37,2.317218,2258.537821,-0.125809,-0.067496,0.204382
31358,21281,201705,201702,201708,4,7,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,37,2.317218,2258.537821,-0.098866,-0.047311,0.204382
31359,21281,201706,201702,201708,5,7,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,37,2.317218,2258.537821,-0.377332,-0.420269,0.204382
31360,21281,201707,201702,201708,6,7,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,37,2.317218,2258.537821,-0.337101,-0.376122,0.204382


In [5]:
# Collapse the dataset to one row per (periodo, product_id) with the summed `tn`.
dfg = (
    df
    .groupby(['periodo', 'product_id'])['tn']
    .sum()
    .reset_index()
)
dfg

Unnamed: 0,periodo,product_id,tn
0,201701,20001,934.77222
1,201701,20002,550.15707
2,201701,20003,1063.45835
3,201701,20004,555.91614
4,201701,20005,494.27011
...,...,...,...
31357,201912,21265,0.05007
31358,201912,21266,0.05121
31359,201912,21267,0.01569
31360,201912,21271,0.00298


In [None]:
# Optional filter (disabled): uncommenting the next line keeps only rows whose
# `periodo` is earlier than 201911.
# dfg = dfg[dfg['periodo'] < 201911]

In [6]:
# Parse the `periodo` values (formatted as %Y%m) into a datetime `timestamp`
# column, drop the raw `periodo`, and rename the id/value columns to
# `item_id` / `target`.
dfg = (
    dfg
    .assign(timestamp=pd.to_datetime(dfg['periodo'].astype(str), format='%Y%m'))
    .drop(columns=['periodo'])
    .rename(columns={'tn': 'target', 'product_id': 'item_id'})
)

In [7]:
# Load the CSV of product ids; its `product_id` column is later used to keep
# only those products in the series frame.
productos_ok = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')
productos_ok.head()

Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005


In [8]:
# Keep only the series whose item_id appears in productos_ok.
dfg = dfg.loc[dfg['item_id'].isin(productos_ok['product_id'])]

In [9]:
# Wrap the long-format frame (item_id, timestamp, target columns) in AutoGluon's
# TimeSeriesDataFrame; `data` is what gets passed to the predictors below.
data = TimeSeriesDataFrame(dfg)
data

Unnamed: 0_level_0,Unnamed: 1_level_0,target
item_id,timestamp,Unnamed: 2_level_1
20001,2017-01-01,934.77222
20002,2017-01-01,550.15707
20003,2017-01-01,1063.45835
20004,2017-01-01,555.91614
20005,2017-01-01,494.27011
...,...,...
21263,2019-12-01,0.01270
21265,2019-12-01,0.05007
21266,2019-12-01,0.05121
21267,2019-12-01,0.01569


In [10]:
def entrenar_con_semillerio(
    data,
    semillas=(42, 101, 202, 303, 404),
    prediction_length=2,
    num_val_windows=2,
    presets="medium_quality",
):
    """
    Train one AutoGluon predictor per random seed (a seed ensemble).

    Args:
        data: TimeSeriesDataFrame with the training data.
        semillas: Iterable of random seeds; one predictor is fitted per seed.
            The default is an immutable tuple so it cannot be modified
            between calls.
        prediction_length: Forecast horizon, in time steps.
        num_val_windows: Number of validation windows forwarded to ``fit``.
        presets: AutoGluon preset name forwarded to ``fit``.

    Returns:
        List of fitted predictors, in the same order as ``semillas``.
    """
    predictors = []

    for seed in semillas:
        print(f"\nEntrenando con semilla {seed}")

        # freq="M" is kept as-is: later cells select forecasts by a month-end
        # timestamp, so changing the frequency would shift those timestamps.
        predictor = TimeSeriesPredictor(
            target='target',
            prediction_length=prediction_length,
            freq="M",
        )

        predictor.fit(
            data,
            num_val_windows=num_val_windows,
            presets=presets,
            random_seed=seed,  # the only setting that varies between ensemble members
            # time_limit=3600  # optional per-predictor time budget (tune as needed)
        )

        predictors.append(predictor)

    return predictors

In [11]:
def predecir_con_semillerio(predictors, data, future_covariates=None):
    """
    Forecast with every predictor and average the results column by column.

    Args:
        predictors: Non-empty collection of trained predictors.
        data: Historical data passed to each predictor's ``predict``.
        future_covariates: Optional future exogenous variables, forwarded to
            ``predict`` as ``known_covariates`` (``None`` forwards nothing).

    Returns:
        A copy of the first predictor's forecast frame in which every column
        holds the mean of that column across all predictors.

    Raises:
        ValueError: If ``predictors`` is empty.
    """
    predictors = list(predictors)
    if not predictors:
        raise ValueError("predictors must contain at least one trained predictor")

    # Previously `future_covariates` was accepted but never used; it is now
    # handed to each predictor so exogenous variables actually take effect.
    all_predictions = [
        predictor.predict(data, known_covariates=future_covariates)
        for predictor in predictors
    ]

    # Average each forecast column across predictors. The mean is positional:
    # NOTE(review): assumes every predictor returns rows in the same order.
    combined = all_predictions[0].copy()
    for col in combined.columns:
        if col not in ["item_id", "timestamp"]:
            combined[col] = np.mean([p[col] for p in all_predictions], axis=0)

    return combined

In [12]:
# Ensemble settings: forecast horizon and the seeds to train with.
prediction_length = 2
semillas = [42, 101, 202, 303, 404]

# Step 1 - fit one predictor per seed.
predictors = entrenar_con_semillerio(
    data,
    semillas=semillas,
    prediction_length=prediction_length,
)

# Step 2 (disabled) - template for building future exogenous variables;
# adapt it to your own data before enabling.
# future_covariates = TimeSeriesDataFrame.from_data_frame(
#     pd.DataFrame({
#         'timestamp': pd.date_range(start=data.index.get_level_values('timestamp').max() + pd.DateOffset(months=1),
#         periods=prediction_length,
#         freq="M"),
#         'dolar': [valores_reales_o_proyectados_dolar],
#         'ipc': [valores_reales_o_proyectados_ipc]
#     }),
#     id_column="item_id",
#     timestamp_column="timestamp"
# )

# Step 3 - average the forecasts of all predictors.
predictions = predecir_con_semillerio(predictors=predictors, data=data)

# Step 4 - persist the mean forecast per item and timestamp.
(
    predictions
    .reset_index()
    .loc[:, ["item_id", "timestamp", "mean"]]
    .to_csv("predicciones_semillerio_201912.csv", index=False)
)


Entrenando con semilla 42


  offset = pd.tseries.frequencies.to_offset(self.freq)
Frequency 'M' stored as 'ME'
Beginning AutoGluon training...
AutoGluon will save models to 'c:\Users\Usuario\Documents\Universidad\austral\2025\Lab3\Lab3-MCD\notebooks\entregable\AutogluonModels\ag-20250622_154705'
AutoGluon Version:  1.3.1
Python Version:     3.11.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          4
GPU Count:          0
Memory Avail:       2.73 GB / 15.89 GB (17.2%)
Disk Space Avail:   417.18 GB / 893.49 GB (46.7%)
Setting presets to: medium_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'ME',
 'hyperparameters': 'light',
 'known_covariates_names': [],
 'num_val_windows': 2,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 42,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'target',
 'verbosity': 2}

train_data 


Entrenando con semilla 101


train_data with frequency 'MS' has been resampled to frequency 'ME'.
Provided train_data has 22375 rows, 780 time series. Median time series length is 36 (min=4, max=36). 
	Removing 75 short time series from train_data. Only series with length >= 9 will be used for training.
	After filtering, train_data has 21916 rows, 705 time series. Median time series length is 36 (min=9, max=36). 

Provided data contains following columns:
	target: 'target'

AutoGluon will gauge predictive performance using evaluation metric: 'WQL'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.

Starting training. Start time is 2025-06-22 13:32:00
Models that will be trained: ['Naive', 'SeasonalNaive', 'RecursiveTabular', 'DirectTabular', 'ETS', 'Theta', 'Chronos[bolt_small]', 'TemporalFusionTransformer']
Training timeseries model Naive. 
	-0.2807       = Validation score (-WQL)
	32.42   s     = Training runtime
	2.04    s 


Entrenando con semilla 202


train_data with frequency 'MS' has been resampled to frequency 'ME'.
Provided train_data has 22375 rows, 780 time series. Median time series length is 36 (min=4, max=36). 
	Removing 75 short time series from train_data. Only series with length >= 9 will be used for training.
	After filtering, train_data has 21916 rows, 705 time series. Median time series length is 36 (min=9, max=36). 

Provided data contains following columns:
	target: 'target'

AutoGluon will gauge predictive performance using evaluation metric: 'WQL'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.

Starting training. Start time is 2025-06-22 13:47:00
Models that will be trained: ['Naive', 'SeasonalNaive', 'RecursiveTabular', 'DirectTabular', 'ETS', 'Theta', 'Chronos[bolt_small]', 'TemporalFusionTransformer']
Training timeseries model Naive. 
	-0.2807       = Validation score (-WQL)
	5.84    s     = Training runtime
	1.58    s 


Entrenando con semilla 303


train_data with frequency 'MS' has been resampled to frequency 'ME'.
Provided train_data has 22375 rows, 780 time series. Median time series length is 36 (min=4, max=36). 
	Removing 75 short time series from train_data. Only series with length >= 9 will be used for training.
	After filtering, train_data has 21916 rows, 705 time series. Median time series length is 36 (min=9, max=36). 

Provided data contains following columns:
	target: 'target'

AutoGluon will gauge predictive performance using evaluation metric: 'WQL'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.

Starting training. Start time is 2025-06-22 14:04:34
Models that will be trained: ['Naive', 'SeasonalNaive', 'RecursiveTabular', 'DirectTabular', 'ETS', 'Theta', 'Chronos[bolt_small]', 'TemporalFusionTransformer']
Training timeseries model Naive. 
	-0.2807       = Validation score (-WQL)
	6.60    s     = Training runtime
	1.70    s 


Entrenando con semilla 404


train_data with frequency 'MS' has been resampled to frequency 'ME'.
Provided train_data has 22375 rows, 780 time series. Median time series length is 36 (min=4, max=36). 
	Removing 75 short time series from train_data. Only series with length >= 9 will be used for training.
	After filtering, train_data has 21916 rows, 705 time series. Median time series length is 36 (min=9, max=36). 

Provided data contains following columns:
	target: 'target'

AutoGluon will gauge predictive performance using evaluation metric: 'WQL'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.

Starting training. Start time is 2025-06-22 14:26:04
Models that will be trained: ['Naive', 'SeasonalNaive', 'RecursiveTabular', 'DirectTabular', 'ETS', 'Theta', 'Chronos[bolt_small]', 'TemporalFusionTransformer']
Training timeseries model Naive. 
	-0.2807       = Validation score (-WQL)
	40.53   s     = Training runtime
	1.60    s 

In [19]:
# Single-predictor variant (no seed ensemble), kept commented out for reference:
# predictor = TimeSeriesPredictor(target='target', prediction_length=2, freq="M").fit(data, num_val_windows = 2)
# predictions = predictor.predict(data)
# Display the averaged forecasts returned by predecir_con_semillerio.
predictions

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20001,2020-01-31,1330.899967,996.651531,1108.340478,1194.311209,1263.548535,1330.899967,1404.420033,1474.849767,1564.038924,1690.019717
20001,2020-02-29,1361.457900,1009.830324,1128.475070,1219.867749,1291.453237,1361.457900,1435.047109,1511.922569,1605.398951,1736.272157
20002,2020-01-31,1159.430708,802.645661,921.565195,1012.629181,1087.699214,1159.430708,1236.578443,1315.962936,1406.000354,1541.250777
20002,2020-02-29,1152.941188,779.230040,904.376898,1002.061629,1077.813726,1152.941188,1227.843522,1311.321840,1406.688577,1544.303367
20003,2020-01-31,790.578299,581.978112,650.669783,705.944974,749.348587,790.578299,836.686400,885.122322,942.216194,1023.774383
...,...,...,...,...,...,...,...,...,...,...,...
20995,2020-02-29,2.039325,0.133690,0.766843,1.258305,1.664651,2.039325,2.434302,2.892243,3.434241,4.199217
21087,2020-01-31,0.950191,0.421081,0.594287,0.735396,0.841673,0.950191,1.060152,1.182520,1.330456,1.545424
21087,2020-02-29,1.011751,0.317745,0.546222,0.729323,0.871241,1.011751,1.152634,1.307915,1.493995,1.765561
21214,2020-01-31,0.302589,-0.148960,0.000251,0.115823,0.212070,0.302589,0.396931,0.498496,0.623433,0.801283


In [20]:
# Build the submission frame: keep the "2020-02-29" forecast step only and
# expose the mean forecast as `tn` per `product_id`.
predictions_v1 = predictions.reset_index()
predictions_v1 = predictions_v1.loc[
    predictions_v1["timestamp"] == "2020-02-29",
    ["item_id", "mean"],
]
predictions_v1 = predictions_v1.rename(columns={"item_id": "product_id", "mean": "tn"})
predictions_v1.head(5)

Unnamed: 0,product_id,tn
1,20001,1361.4579
3,20002,1152.941188
5,20003,713.264305
7,20004,548.175147
9,20005,540.477765


In [21]:
# Sanity check: (rows, columns) of the submission frame.
predictions_v1.shape

(780, 2)

In [23]:
# Write the submission CSV. `to_csv` does not create missing parent
# directories, so make sure ./outputs exists first (no-op when it already does).
ruta_salida = Path("./outputs/prediccion_autogluon_sinFE_conSEMILLLERO_KAGGLE.csv")
ruta_salida.parent.mkdir(parents=True, exist_ok=True)
predictions_v1.to_csv(ruta_salida, sep=",", index=False)