In [1]:
# 📦 1. Importar librerías
import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame
from sklearn.preprocessing import StandardScaler

# 💬 Instalar AutoGluon si es necesario
# %pip install autogluon.timeseries

# 📄 2. Cargar datasets
df_sellin = pd.read_csv("../data/sell-in.txt", sep="\t")
df_productos = pd.read_csv("../data/tb_productos.txt", sep="\t")

# 📄 Leer lista de productos a predecir
with open("../data/product_id_apredecir201912.txt", "r") as f:
    product_ids = [int(line.strip()) for line in f if line.strip().isdigit()]

# 🧹 3. Preprocesamiento
# Convertir periodo a datetime
df_sellin['timestamp'] = pd.to_datetime(df_sellin['periodo'], format='%Y%m')

# Filtrar hasta dic 2019 y productos requeridos
df_filtered = df_sellin[
    (df_sellin['timestamp'] <= '2019-12-01') &
    (df_sellin['product_id'].isin(product_ids))
]

# Agregar tn por periodo, cliente y producto
df_grouped = df_filtered.groupby(['timestamp', 'customer_id', 'product_id'], as_index=False)['tn'].sum()

# Agregar tn total por periodo y producto
df_monthly_product = df_grouped.groupby(['timestamp', 'product_id'], as_index=False)['tn'].sum()

# Agregar columna 'item_id' para AutoGluon
df_monthly_product['item_id'] = df_monthly_product['product_id']

# 🆕 3.1. Feature Engineering
# Crear características temporales
df_monthly_product['month'] = df_monthly_product['timestamp'].dt.month
df_monthly_product['year'] = df_monthly_product['timestamp'].dt.year

# Crear características de lag y rolling
for lag in [1, 3, 6, 12]:
    df_monthly_product[f'tn_lag_{lag}'] = df_monthly_product.groupby('product_id')['tn'].shift(lag)

df_monthly_product['tn_rolling_mean_3'] = df_monthly_product.groupby('product_id')['tn'].shift(1).rolling(window=3).mean()
df_monthly_product['tn_rolling_std_3'] = df_monthly_product.groupby('product_id')['tn'].shift(1).rolling(window=3).std()

# Incorporar características estáticas de tb_productos
df_monthly_product = df_monthly_product.merge(df_productos, on='product_id', how='left')

# Normalizar características numéricas
scaler = StandardScaler()
numeric_cols = ['tn_lag_1', 'tn_lag_3', 'tn_lag_6', 'tn_lag_12', 'tn_rolling_mean_3', 'tn_rolling_std_3']
df_monthly_product[numeric_cols] = scaler.fit_transform(df_monthly_product[numeric_cols].fillna(0))



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_monthly_product

Unnamed: 0,timestamp,product_id,tn,item_id,month,year,tn_lag_1,tn_lag_3,tn_lag_6,tn_lag_12,tn_rolling_mean_3,tn_rolling_std_3,cat1,cat2,cat3,brand,sku_size
0,2017-01-01,20001,934.77222,20001,1,2017,-0.404361,-0.391227,-0.369555,-0.322445,-0.433534,-0.303098,HC,ROPA LAVADO,Liquido,ARIEL,3000
1,2017-01-01,20002,550.15707,20002,1,2017,-0.404361,-0.391227,-0.369555,-0.322445,-0.433534,-0.303098,HC,ROPA LAVADO,Liquido,LIMPIEX,3000
2,2017-01-01,20003,1063.45835,20003,1,2017,-0.404361,-0.391227,-0.369555,-0.322445,-0.433534,-0.303098,FOODS,ADEREZOS,Mayonesa,NATURA,475
3,2017-01-01,20004,555.91614,20004,1,2017,-0.404361,-0.391227,-0.369555,-0.322445,-0.433534,-0.303098,FOODS,ADEREZOS,Mayonesa,NATURA,240
4,2017-01-01,20005,494.27011,20005,1,2017,-0.404361,-0.391227,-0.369555,-0.322445,-0.433534,-0.303098,FOODS,ADEREZOS,Mayonesa,NATURA,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22677,2019-12-01,21263,0.01270,21263,12,2019,-0.404105,-0.391136,-0.369472,-0.321898,-0.433281,-0.303044,PC,CABELLO,SHAMPOO,VICHY,250
22678,2019-12-01,21265,0.05007,21265,12,2019,-0.403841,-0.391089,-0.369126,-0.322445,-0.433178,-0.302692,PC,PIEL1,CUIDADO ESPECIAL,LANCOME,32
22679,2019-12-01,21266,0.05121,21266,12,2019,-0.403832,-0.390996,-0.369116,-0.322445,-0.433064,-0.302720,PC,PIEL1,CUIDADO ESPECIAL,LANCOME,32
22680,2019-12-01,21267,0.01569,21267,12,2019,-0.404042,-0.391079,-0.369062,-0.322445,-0.433041,-0.302809,PC,PIEL1,Cara,NIVEA,250


In [3]:
# # ⏰ 4. Crear TimeSeriesDataFrame
# # ts_data = TimeSeriesDataFrame.from_data_frame(
# #     df_monthly_product,
# #     id_column='item_id',
# #     timestamp_column='timestamp',
# #     static_features=df_productos.set_index('product_id')
# # )
# # 1) Renombrar product_id → item_id
# df_prod_static = df_productos.rename(columns={'product_id': 'item_id'})

# # 2) Quitar duplicados (si existieran)
# df_prod_static = df_prod_static.drop_duplicates(subset=['item_id'])

# # 4) Crear TimeSeriesDataFrame CORREGIDO:
# ts_data = TimeSeriesDataFrame.from_data_frame(
#     df_monthly_product,
#     id_column='item_id',
#     timestamp_column='timestamp',
#     static_features_df=df_prod_static   # aquí 'item_id' debe estar en df_prod_static.columns
# )

# # Imputar valores faltantes
# ts_data = ts_data.fill_missing_values(method='ffill', value=0)

# # # ⚙ 5. Definir y entrenar predictor
# # predictor = TimeSeriesPredictor(
# #     prediction_length=1,  # Solo predecir febrero 2020
# #     target='tn',
# #     freq='MS',  # Frecuencia mensual
# #     eval_metric='WQL',
# #     known_covariates_names=['month', 'year'] + numeric_cols,
# #     hyperparameters={
# #         'TemporalFusionTransformer': {
# #             'hidden_size': 64,
# #             'dropout_rate': 0.1,
# #             'max_epochs': 50
# #         },
# #         'DeepAR': {
# #             'epochs': 50,
# #             'num_layers': 3,
# #             'hidden_size': 40
# #         },
# #         'PatchTST': {
# #             'patch_len': 16,
# #             'stride': 8,
# #             'num_layers': 3
# #         }
# #     }
# # )

# # # Entrenar con más ventanas de validación
# # predictor.fit(ts_data, num_val_windows=3, time_limit=7200)  # 2 horas

# predictor = TimeSeriesPredictor(
#     prediction_length=1,       # Solo predecir febrero 2020
#     target='tn',
#     freq='MS',                 # Frecuencia mensual
#     eval_metric='WQL',
#     known_covariates_names=['month', 'year'] + numeric_cols,
# )

# # 6) Entrenar pasando hyperparameters a .fit()
# predictor.fit(
#     train_data=ts_data,
#     hyperparameters={
#         'TemporalFusionTransformer': {
#             'hidden_size': 64,
#             'dropout_rate': 0.1,
#             'max_epochs': 50
#         },
#         'DeepAR': {
#             'epochs': 50,
#             'num_layers': 3,
#             'hidden_size': 40
#         },
#         'PatchTST': {
#             'patch_len': 16,
#             'stride': 8,
#             'num_layers': 3
#         }
#     },
#     num_val_windows=3,
#     time_limit=60*60  # 2 horas
# )

# # 🔮 6. Generar predicción
# # Crear datos para febrero 2020 con covariables
# future_timestamps = pd.date_range(start='2020-02-01', periods=1, freq='MS')
# future_data = []
# for item_id in product_ids:
#     last_row = df_monthly_product[df_monthly_product['item_id'] == item_id].iloc[-1]
#     future_data.append({
#         'item_id': item_id,
#         'timestamp': future_timestamps[0],
#         'month': future_timestamps[0].month,
#         'year': future_timestamps[0].year,
#         'tn_lag_1': last_row['tn'],
#         'tn_lag_3': last_row['tn_lag_3'],
#         'tn_lag_6': last_row['tn_lag_6'],
#         'tn_lag_12': last_row['tn_lag_12'],
#         'tn_rolling_mean_3': last_row['tn_rolling_mean_3'],
#         'tn_rolling_std_3': last_row['tn_rolling_std_3']
#     })

# future_ts_data = TimeSeriesDataFrame.from_data_frame(pd.DataFrame(future_data))

# # Predecir
# forecast = predictor.predict(ts_data, known_covariates=future_ts_data)

# # 🧪 7. Evaluar modelos adicionales
# leaderboard = predictor.leaderboard(ts_data, metrics=['WQL', 'MAE', 'RMSE'])
# print(leaderboard)

# # 📊 8. Visualizar resultados
# predictor.plot(ts_data, forecast, quantile_levels=[0.1, 0.9], max_history_length=200, max_num_item_ids=4)

# # 💾 9. Guardar predicciones
# forecast_mean = forecast['mean'].reset_index()
# resultado = forecast_mean[forecast_mean['timestamp'] == '2020-02-01'][['item_id', 'mean']]
# resultado.columns = ['product_id', 'tn']
# resultado.to_csv("../data/autogluon_202002_vleo1.csv", index=False)
# print(resultado.head())

In [4]:
# —————————————
# 0) SETUP e imports
# —————————————
import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame

# (asume que ya definiste numeric_cols y cargaste df_monthly_product, df_productos, product_ids)

# —————————————
# 1) Crear ts_data con características estáticas
# —————————————
df_prod_static = (
    df_productos
    .rename(columns={'product_id': 'item_id'})
    .drop_duplicates(subset=['item_id'])
)  # dejamos item_id como columna

ts_data = TimeSeriesDataFrame.from_data_frame(
    df_monthly_product,
    id_column='item_id',
    timestamp_column='timestamp',
    static_features_df=df_prod_static
).fill_missing_values(method='ffill', value=0)

# —————————————
# 2) Definir y entrenar predictor
# —————————————
predictor = TimeSeriesPredictor(
    prediction_length=1,
    target='tn',
    freq='MS',
    eval_metric='WQL',
    known_covariates_names=['month', 'year'] + numeric_cols
)

predictor.fit(
    train_data=ts_data,
    num_val_windows=3,
    time_limit=60*60
)

# —————————————
# 3) Construir known_covariates
# —————————————
# 3.1) Sólo data como argumento
future_template = predictor.make_future_data_frame(ts_data)

# 3.2) Covariables “a mano”
future_rows = []
for item_id in product_ids:
    last = df_monthly_product[df_monthly_product['item_id']==item_id].iloc[-1]
    future_rows.append({
        'item_id': item_id,
        'timestamp': pd.Timestamp('2020-02-01'),
        'month':  2,
        'year':   2020,
        'tn_lag_1':          last['tn_lag_1'],
        'tn_lag_3':          last['tn_lag_3'],
        'tn_lag_6':          last['tn_lag_6'],
        'tn_lag_12':         last['tn_lag_12'],
        'tn_rolling_mean_3': last['tn_rolling_mean_3'],
        'tn_rolling_std_3':  last['tn_rolling_std_3'],
    })
df_future_covars = pd.DataFrame(future_rows)

# 3.3) Merge plantilla + covariables
future_covars = future_template.merge(
    df_future_covars,
    on=['item_id','timestamp'],
    how='left'
)

# 3.4) A TimeSeriesDataFrame
ts_future_covars = TimeSeriesDataFrame.from_data_frame(
    future_covars,
    id_column='item_id',
    timestamp_column='timestamp'
).fill_missing_values(method='ffill', value=0)

# 4) Predecir
forecast = predictor.predict(
    data=ts_data,
    known_covariates=ts_future_covars
)



Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to 'c:\Users\leona\source\repos\lab3\notebooks\AutogluonModels\ag-20250713_140507'
AutoGluon Version:  1.3.1
Python Version:     3.10.17
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          12
GPU Count:          0
Memory Avail:       11.45 GB / 31.71 GB (36.1%)
Disk Space Avail:   207.07 GB / 952.62 GB (21.7%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': 'default',
 'known_covariates_names': ['month',
                            'year',
                            'tn_lag_1',
                            'tn_lag_3',
                            'tn_lag_6',
                            'tn_lag_12',
                            'tn_rolling_mean_3',
                            'tn_rolling_std_3'],
 'num_val_windows': 3,
 'prediction_length': 1,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8

In [37]:
forecast

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20001,2020-01-01,1380.203417,954.091084,1119.000703,1211.317959,1298.338216,1380.203412,1471.899781,1560.925172,1681.345954,1846.482721
20002,2020-01-01,1112.240539,751.491362,894.353535,966.017518,1035.757895,1112.240537,1193.296046,1272.297077,1372.522987,1515.056100
20003,2020-01-01,793.266090,600.793400,679.869490,711.970182,749.585536,793.266092,842.770633,889.102493,945.000229,1028.689185
20004,2020-01-01,526.547966,341.625394,413.438953,446.777253,484.697745,526.547966,567.599010,615.462877,667.210901,750.755988
20005,2020-01-01,512.681072,309.269295,388.103745,425.793522,467.814345,512.681073,560.595073,612.529673,672.138342,765.920392
...,...,...,...,...,...,...,...,...,...,...,...
20962,2020-01-01,2.600382,0.915253,1.519046,1.911180,2.270532,2.600382,2.954823,3.397775,3.948959,4.852421
20975,2020-01-01,2.016750,0.299211,0.928910,1.309665,1.663697,2.016750,2.406884,2.886400,3.476987,4.358749
20995,2020-01-01,1.887807,0.315863,0.886999,1.238733,1.561348,1.887807,2.244007,2.665186,3.196131,3.984515
21087,2020-01-01,0.971638,0.427035,0.630531,0.759022,0.864076,0.971638,1.078465,1.187325,1.329387,1.564718


In [41]:
resultado = forecast['0.6'].reset_index()[['item_id', '0.6']]
resultado.columns = ['product_id', 'tn']

In [39]:
resultado

Unnamed: 0,product_id,tn
0,20001,1471.899781
1,20002,1193.296046
2,20003,842.770633
3,20004,567.599010
4,20005,560.595073
...,...,...
775,20962,2.954823
776,20975,2.406884
777,20995,2.244007
778,21087,1.078465


In [42]:
# # Filtrar solo febrero 2020
resultado = forecast['0.6'].reset_index()


In [43]:
resultado

Unnamed: 0,item_id,timestamp,0.6
0,20001,2020-01-01,1471.899781
1,20002,2020-01-01,1193.296046
2,20003,2020-01-01,842.770633
3,20004,2020-01-01,567.599010
4,20005,2020-01-01,560.595073
...,...,...,...
775,20962,2020-01-01,2.954823
776,20975,2020-01-01,2.406884
777,20995,2020-01-01,2.244007
778,21087,2020-01-01,1.078465


In [46]:
# convert time_stamp to date only
resultado['timestamp'] = resultado['timestamp'].dt.date


In [47]:
resultado

Unnamed: 0,item_id,timestamp,0.6
0,20001,2020-01-01,1471.899781
1,20002,2020-01-01,1193.296046
2,20003,2020-01-01,842.770633
3,20004,2020-01-01,567.599010
4,20005,2020-01-01,560.595073
...,...,...,...
775,20962,2020-01-01,2.954823
776,20975,2020-01-01,2.406884
777,20995,2020-01-01,2.244007
778,21087,2020-01-01,1.078465


In [32]:
resultado = resultado[resultado['timestamp'] == '2020-02-01']

In [None]:
resultado

# # Renombrar columnas
resultado = resultado[['item_id', '0.6']]
resultado.columns = ['product_id', 'tn']

Unnamed: 0,item_id,timestamp,0.6


In [22]:
resultado.to_csv("../data/autogluon_202002_lhv5.csv", index=False)
resultado.head()

Unnamed: 0,product_id,tn
0,20001,1681.345954
1,20002,1372.522987
2,20003,945.000229
3,20004,667.210901
4,20005,672.138342
