In [1]:
# !pip install autogluon
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [2]:
sellin = pd.read_csv("datasets/sell-in.csv", sep='\t')
productos = pd.read_csv("datasets/tb_productos.csv", sep='\t')
stocks = pd.read_csv("datasets/tb_stocks.csv", sep='\t')

In [3]:
# Verificación inicial
print(f"Sell-In: {sellin.shape[0]} filas y {sellin.shape[1]} columnas")
print(f"Productos: {productos.shape[0]} filas y {productos.shape[1]} columnas")
print(f"Stocks: {stocks.shape[0]} filas y {stocks.shape[1]} columnas")

Sell-In: 2945818 filas y 7 columnas
Productos: 1262 filas y 6 columnas
Stocks: 13691 filas y 3 columnas


In [4]:
# 3. MERGE INICIAL
df = sellin.merge(productos, on="product_id", how="left")
df = df.merge(stocks, on=["product_id", "periodo"], how="left")
print(f"Ventas-Productos-Stocks: {df.shape[0]} filas y {df.shape[1]} columnas")

Ventas-Productos-Stocks: 2988650 filas y 13 columnas


In [5]:
productos_clean = productos.drop_duplicates(subset=['product_id'], keep='first')
print(productos_clean.shape)

(1251, 6)


In [6]:
df = sellin.merge(productos_clean, on="product_id", how="left")
df = df.merge(stocks, on=["product_id", "periodo"], how="left")
print(sellin.shape)
print(df.shape)

(2945818, 7)
(2945818, 13)


In [7]:
# Supongamos que df ya contiene las columnas: periodo, customer_id, product_id, tn
df["periodo_dt"] = pd.to_datetime(df["periodo"].astype(str), format="%Y%m")

# Paso 1: Rango total de periodos
todos_los_periodos = pd.date_range(start=df["periodo_dt"].min(), end=df["periodo_dt"].max(), freq="MS")

# Paso 2: Todos los clientes únicos
todos_los_clientes = df["customer_id"].unique()

# Paso 3: Determinar vida útil de cada producto
vida_producto = df.groupby("product_id")["periodo_dt"].agg(["min", "max"]).reset_index()

# Paso 4: Generar combinaciones (periodo, producto) considerando restricciones
combinaciones_producto_periodo = []
fecha_limite_nuevos = pd.to_datetime("2017-03", format="%Y-%m")

# Los productos de 35 y 36 meeses de vida son considerados "vitales"
productos_vitales = df.groupby("product_id")["periodo_dt"].agg(["min", "max"]).reset_index()
mask = (productos_vitales['min'] == '2017-01-01') & ((productos_vitales['max'] == '2019-12-01'))
productos_vitales = productos_vitales[mask]['product_id'].unique()  

for _, row in vida_producto.iterrows():
    producto = row["product_id"]
    min_fecha = row["min"]
    max_fecha = row["max"]
    periodos_validos = pd.date_range(start=min_fecha, end=max_fecha, freq="MS")
    es_nuevo = min_fecha >= fecha_limite_nuevos  # solo si el producto es nuevo a partir de 2017-02
    
    for p in periodos_validos:
        if producto in productos_vitales:
            combinaciones_producto_periodo.append((p, producto))
            continue
        # Excluir primeros 3 meses si es nuevo (a partir de 2017-02)
        if es_nuevo and (p < min_fecha + pd.DateOffset(months=3)):
            continue
        # Excluir últimos 3 meses del producto
        if p > max_fecha - pd.DateOffset(months=3):
            continue
        combinaciones_producto_periodo.append((p, producto))

df_producto_periodo = pd.DataFrame(combinaciones_producto_periodo, columns=["periodo_dt", "product_id"])

# Paso 5: Generar combinaciones de todos los clientes con (periodo, producto)
combinaciones = []
for _, row in df_producto_periodo.iterrows():
    periodo = row["periodo_dt"]
    producto = row["product_id"]
    for cliente in todos_los_clientes:
        # if producto in df[df["customer_id"] == cliente]["product_id"].unique(): ###### <------ ESTO TARDA 3 AÑOS
        combinaciones.append((periodo, producto, cliente)) 

df_completo = pd.DataFrame(combinaciones, columns=["periodo_dt", "product_id", "customer_id"])

# Paso 6: Unir con toneladas efectivas
df_merge = df_completo.merge(df[["periodo_dt", "product_id", "customer_id", "tn"]],
                             on=["periodo_dt", "product_id", "customer_id"],
                             how="left")
df_merge["tn"] = df_merge["tn"].fillna(0)

# Paso 7: Recuperar periodo AAAAMM si lo necesitás
df_merge["periodo"] = df_merge["periodo_dt"].dt.strftime("%Y%m").astype(int)

# Resultado final
df_final = df_merge[["periodo", "product_id", "customer_id", "tn"]]

# Vista previa
print(df_final.head())

   periodo  product_id  customer_id        tn
0   201701       20001        10234   0.33579
1   201701       20001        10032  12.31230
2   201701       20001        10217   0.00000
3   201701       20001        10125   0.08954
4   201701       20001        10012   6.97324


In [8]:
df_final['item_id'] = df_final['customer_id'].astype(str) + '_' + df_final['product_id'].astype(str)

In [9]:
df_final["periodo_dt"] = pd.to_datetime(df_final["periodo"].astype(str), format="%Y%m")



In [10]:
df_final.head()

Unnamed: 0,periodo,product_id,customer_id,tn,item_id,periodo_dt
0,201701,20001,10234,0.33579,10234_20001,2017-01-01
1,201701,20001,10032,12.3123,10032_20001,2017-01-01
2,201701,20001,10217,0.0,10217_20001,2017-01-01
3,201701,20001,10125,0.08954,10125_20001,2017-01-01
4,201701,20001,10012,6.97324,10012_20001,2017-01-01


In [11]:
df_final.drop(columns=["customer_id", "product_id", "periodo"], inplace=True)

In [12]:
df_final.head()

Unnamed: 0,tn,item_id,periodo_dt
0,0.33579,10234_20001,2017-01-01
1,12.3123,10032_20001,2017-01-01
2,0.0,10217_20001,2017-01-01
3,0.08954,10125_20001,2017-01-01
4,6.97324,10012_20001,2017-01-01


In [13]:
df_final.rename(columns={'periodo_dt': 'timestamp', 'tn': 'target'}, inplace=True)

In [14]:
data = TimeSeriesDataFrame(df_final)
data

Unnamed: 0_level_0,Unnamed: 1_level_0,target
item_id,timestamp,Unnamed: 2_level_1
10234_20001,2017-01-01,0.33579
10032_20001,2017-01-01,12.31230
10217_20001,2017-01-01,0.00000
10125_20001,2017-01-01,0.08954
10012_20001,2017-01-01,6.97324
...,...,...
10591_21281,2017-05-01,0.00000
10559_21281,2017-05-01,0.00000
10560_21281,2017-05-01,0.00000
10582_21281,2017-05-01,0.00000


In [16]:
# 🔍 3️⃣ Validar datos antes de fit
print(f"🔎 Número de series: {data.num_items}")
print(f"🔎 Longitudes por serie:\n{data.num_timesteps_per_item().describe()}")
print(f"🔎 Target - estadísticas globales:\n{data['target'].describe()}")
print(f"🔎 NaN en target: {data['target'].isna().sum()}")


🔎 Número de series: 662670
🔎 Longitudes por serie:
count    662670.000000
mean         25.618018
std          12.336436
min           1.000000
25%          15.000000
50%          36.000000
75%          36.000000
max          36.000000
dtype: float64
🔎 Target - estadísticas globales:
count    1.697629e+07
mean     7.560725e-02
std      1.297295e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      5.478785e+02
Name: target, dtype: float64
🔎 NaN en target: 0


In [20]:
valid_series = data.num_timesteps_per_item() >= 3
valid_item_ids = valid_series[valid_series].index
data = data[data.index.get_level_values('item_id').isin(valid_item_ids)]

In [15]:
# Filtrar solo productos a predecir
productos_ids = productos_a_predecir['product_id'].unique()

In [21]:
predictor = TimeSeriesPredictor(target='target', prediction_length=2, freq="M").fit(data,time_limit=60,presets='fast_training')
predictions = predictor.predict(data)

Frequency 'M' stored as 'ME'


Beginning AutoGluon training... Time limit = 60s
AutoGluon will save models to 'c:\Users\iparra\Documents\GestionDeDatos\Austral\Lab3\Lab3-MCD-main\Lab3-MCD-main\AutogluonModels\ag-20250531_013034'
AutoGluon Version:  1.3.1
Python Version:     3.12.4
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          24
GPU Count:          0
Memory Avail:       0.92 GB / 15.67 GB (5.9%)
Disk Space Avail:   120.72 GB / 446.17 GB (27.1%)
Setting presets to: fast_training

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'ME',
 'hyperparameters': 'very_light',
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'target',
 'time_limit': 60,
 'verbosity': 2}

train_data with frequency 'MS' has been resampled to frequen

ValueError: Trainer has no fit models that can predict.

In [13]:
predictions_v1 = predictions.copy()
predictions_v1 = predictions_v1.reset_index()
predictions_v1 = predictions_v1[["item_id", "timestamp", "mean"]]
predictions_v1 = predictions_v1[predictions_v1.timestamp == "2020-02-29"]
predictions_v1 = predictions_v1.drop(columns = {"timestamp"})
predictions_v1 = predictions_v1.rename(columns = {"item_id":"product_id", "mean":"tn"})
predictions_v1.head(5)

Unnamed: 0,product_id,tn
1,20001,1302.976815
3,20002,1102.488131
5,20003,666.269509
7,20004,550.978395
9,20005,577.40509


In [17]:
predictions_v1 = predictions_v1[predictions_v1['product_id'].isin(productos_ids)]

In [21]:
predictions_v1.to_csv("./kaggle/predictions_autogluon.csv", index=False)