AutoGluon - Predicci√≥n de ventas (tn) por producto para febrero 2020

In [1]:
# üì¶ 1. Importar librer√≠as
import pandas as pd

In [2]:
# üí¨ Instalar AutoGluon si es necesario
# %pip install autogluon.timeseries

from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# üìÑ 2. Cargar datasets
df_sellin = pd.read_csv("../../data/raw/sell-in.csv", sep="\t")
df_productos = pd.read_csv("../../data/raw/tb_productos.csv", sep="\t")

In [5]:
product_ids = pd.read_csv("../../data/raw/product_id_apredecir201912.csv", sep="\t")['product_id'].unique()

In [6]:
# üßπ 3. Preprocesamiento
# Convertir periodo a datetime
df_sellin['timestamp'] = pd.to_datetime(df_sellin['periodo'], format='%Y%m')

In [7]:
# Filtrar hasta dic 2019 y productos requeridos
df_filtered = df_sellin[
    (df_sellin['timestamp'] <= '2019-12-01') &
    (df_sellin['product_id'].isin(product_ids))
]

In [8]:
# Agregar tn por periodo, cliente y producto
df_grouped = df_filtered.groupby(['timestamp', 'customer_id', 'product_id'], as_index=False)['tn'].sum()

In [9]:
# Agregar tn total por periodo y producto
df_monthly_product = df_grouped.groupby(['timestamp', 'product_id'], as_index=False)['tn'].sum()

In [10]:
# Agregar columna 'item_id' para AutoGluon
df_monthly_product['item_id'] = df_monthly_product['product_id']

In [11]:
# ‚è∞ 4. Crear TimeSeriesDataFrame
ts_data = TimeSeriesDataFrame.from_data_frame(
    df_monthly_product,
    id_column='item_id',
    timestamp_column='timestamp'
)

In [12]:
# Completar valores faltantes
ts_data = ts_data.fill_missing_values()

In [13]:
# ‚öôÔ∏è 5. Definir y entrenar predictor
predictor = TimeSeriesPredictor(
    prediction_length=2,
    target='tn',
    freq='MS'  # Frecuencia mensual (Month Start), 
)

predictor.fit(ts_data, num_val_windows=2, time_limit=60*60)

Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to 'c:\Users\Usuario\Documents\Universidad\austral\2025\Lab3\Lab3-MCD\notebooks\model_autogluon\AutogluonModels\ag-20250704_005246'
AutoGluon Version:  1.3.1
Python Version:     3.11.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          4
GPU Count:          0
Memory Avail:       2.23 GB / 15.89 GB (14.0%)
Disk Space Avail:   409.62 GB / 893.49 GB (45.8%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 2,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'tn',
 'time_limit': 3600,
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampled to frequency 'MS'.
Provided train

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x2ee44d10a10>

In [14]:
# üîÆ 6. Generar predicci√≥n
forecast = predictor.predict(ts_data)

data with frequency 'IRREG' has been resampled to frequency 'MS'.
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [15]:
# Extraer predicci√≥n media y filtrar febrero 2020
forecast_mean = forecast['mean'].reset_index()
print(forecast_mean.columns)

Index(['item_id', 'timestamp', 'mean'], dtype='object')


In [16]:
# Tomar solo item_id y la predicci√≥n 'mean'
resultado = forecast['mean'].reset_index()[['item_id', 'mean']]
resultado.columns = ['product_id', 'tn']

# Filtrar solo febrero 2020
resultado = forecast['mean'].reset_index()
resultado = resultado[resultado['timestamp'] == '2020-02-01']

# Renombrar columnas
resultado = resultado[['item_id', 'mean']]
resultado.columns = ['product_id', 'tn']


In [17]:
# üíæ 7. Guardar archivo
resultado.to_csv("./dataset/predicciones_febrero2020_fecha_01_07.csv", index=False)
resultado.head()

Unnamed: 0,product_id,tn
1,20001,1290.183234
3,20002,1059.023193
5,20003,715.545347
7,20004,543.455412
9,20005,519.751946


Ensemble archivos

In [19]:
reg_lineal = pd.read_csv("./dataset/predicciones_regresion_lineal_v1.csv", sep=",")
reg_lineal.rename(columns={'tn': 'tn_rl'}, inplace=True)
resultado.rename(columns={'tn': 'tn_ag'}, inplace=True)


In [None]:
resultado = resultado.merge(reg_lineal, on='product_id', how='left')

In [24]:
resultado['tn'] = (resultado['tn_ag'] + resultado['tn_rl']) / 2
resultado[['product_id','tn']].to_csv("./dataset/predicciones_febrero2020_fecha_01_07_ensemble.csv", index=False)

In [26]:
reg_ag = pd.read_csv("./dataset/prediccion_autogluon_2ventanas.csv", sep=",")
reg_ag.rename(columns={'tn': 'tn_ag'}, inplace=True)


In [30]:
resultado = resultado.merge(reg_ag, on='product_id', how='left')
resultado['tn'] = (resultado['tn_ag1'] + resultado['tn_ag']) / 2
resultado[['product_id','tn']].to_csv("./dataset/predicciones_febrero2020_fecha_01_07_ensemble_v2.csv", index=False)