In [1]:
import pandas as pd
import numpy as np
import gc

In [2]:
df = pd.read_csv("./datasets/dataset.csv", sep=',')
df.shape

(17173448, 17)

In [3]:
df.drop(columns=['nacimiento_cliente_dt','cat1','cat2','cat3','brand'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17173448 entries, 0 to 17173447
Data columns (total 12 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   product_id             int64  
 1   customer_id            int64  
 2   periodo                int64  
 3   nacimiento_producto    int64  
 4   muerte_de_producto     int64  
 5   sku_size               float64
 6   stock_final            float64
 7   tn                     float64
 8   cust_request_tn        float64
 9   cust_request_qty       float64
 10  plan_precios_cuidados  float64
 11  target                 float64
dtypes: float64(7), int64(5)
memory usage: 1.5 GB


In [4]:
df.drop(columns=['sku_size','stock_final','cust_request_tn','cust_request_qty','plan_precios_cuidados'], inplace=True)

In [5]:
gc.collect()

7

In [6]:
# Convertir periodo a datetime para cálculos temporales
df['periodo_dt'] = pd.to_datetime(df['periodo'].astype(str), format='%Y%m')
df['crisis'] = (df['periodo_dt'].dt.year == 2019) & (df['periodo_dt'].dt.month == 8)


# 🧩 1️⃣ Lags de tn por producto y cliente
lags = [1, 2, 3, 6, 12, 14, 16, 18, 20, 22, 24]
for lag in lags:
    df[f'tn_lag_{lag}'] = df.groupby(['product_id', 'customer_id'])['tn'].shift(lag)

# 🧩 2️⃣ Deltas (diferencias) entre lags
for lag in lags:
    df[f'tn_delta_{lag}'] = df['tn'] - df[f'tn_lag_{lag}']

# 🧩 3️⃣ Rolling mean, std por producto y cliente
windows = [3, 6, 12]
for window in windows:
    df[f'tn_roll_mean_{window}'] = (
        df.groupby(['product_id', 'customer_id'])['tn']
        .rolling(window=window, min_periods=1).mean().reset_index(level=[0,1], drop=True)
    )
    df[f'tn_roll_std_{window}'] = (
        df.groupby(['product_id', 'customer_id'])['tn']
        .rolling(window=window, min_periods=1).std().reset_index(level=[0,1], drop=True)
    )

# 🧩 4️⃣ Tendencia (diferencia entre rolling mean de largo y corto plazo)
df['tn_trend'] = df['tn_roll_mean_12'] - df['tn_roll_mean_3']

# 🧩 5️⃣ Ratio entre tn y rolling mean
df['tn_ratio_mean_3'] = df['tn'] / (df['tn_roll_mean_3'] + 1e-6)  # Evitar división por cero

# 🧩 6️⃣ Velocidad de crecimiento (delta con respecto al mes anterior)
df['tn_growth_1'] = df['tn_delta_1'] / (df['tn_lag_1'] + 1e-6)

# 🧩 7️⃣ Codificación de categorías (con numeración o one-hot)
# for col in ['cat1', 'cat2', 'cat3', 'brand']:
#     df[f'{col}_code'] = df[col].astype('category').cat.codes

# 🧩 8️⃣ Rolling features de stock_final (igual que tn)
# for lag in lags:
#     df[f'stock_lag_{lag}'] = df.groupby(['product_id', 'customer_id'])['stock_final'].shift(lag)
# for window in windows:
#     df[f'stock_roll_mean_{window}'] = (
#         df.groupby(['product_id', 'customer_id'])['stock_final']
#         .rolling(window=window, min_periods=1).mean().reset_index(level=[0,1], drop=True)
#     )

# 🧩 9️⃣ Tendencia del stock
# df['stock_trend'] = df['stock_roll_mean_12'] - df['stock_roll_mean_3']

# 🧩 🔟 Interacciones entre tn y stock
# df['tn_stock_ratio'] = df['tn'] / (df['stock_final'] + 1e-6)
# df.drop(columns=['stock_final'], inplace=True)
# gc.collect()

# 🧩 🔟.5 Codificación temporal (año, mes, trimestre)
df['year'] = df['periodo_dt'].dt.year
df['month'] = df['periodo_dt'].dt.month
df['quarter'] = df['periodo_dt'].dt.quarter

# 🧩 🔟.6 Diferencia de periodo (en meses) respecto al nacimiento del producto
df['months_since_nacimiento_producto'] = (
    (df['periodo_dt'] - pd.to_datetime(df['nacimiento_producto'], format='%Y%m')).dt.days // 30
)
df.drop(columns=['nacimiento_producto','muerte_de_producto', 'periodo_dt' ], inplace=True)
gc.collect()


# 🧩 🔟.7 Posición temporal (crecimiento acumulado por cliente-producto)
df['tn_cumsum'] = df.groupby(['product_id', 'customer_id'])['tn'].cumsum()

# 🧩 🔟.8 Recuento de compras anteriores por cliente-producto
df['compra_count'] = df.groupby(['product_id', 'customer_id']).cumcount()

# 🧩 🔟.9 Flag de plan_precios_cuidados (puede ser útil)
# df['plan_precios_cuidados_flag'] = (df['plan_precios_cuidados'] > 0).astype(int)
# df.drop(columns=['plan_precios_cuidados'], inplace=True)
# gc.collect()

# 🧩 🔟🔟 Si querés, normalizar tn (z-score) por producto
df['tn_norm_producto'] = (
    df.groupby('product_id')['tn'].transform(lambda x: (x - x.mean()) / (x.std() + 1e-6))
)
# gc.collect()

# 🧹 Eliminá los registros sin target (por ejemplo si querés predecir)
df['target'].fillna(0, inplace=True)

print(f"✅ Feature engineering completo. Dataset tiene {df.shape[1]} columnas y {len(df):,} filas.")


✅ Feature engineering completo. Dataset tiene 44 columnas y 17,173,448 filas.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['target'].fillna(0, inplace=True)


In [7]:
dt_kgl = df[df["periodo"].isin([201912])]
df = df.drop(df[df["periodo"].isin([201911,201912])].index,axis=0)

In [8]:
import gc
gc.collect()

0

In [9]:
import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 🧹 Preparar features y target
feature_cols = [col for col in df.columns if col not in ['periodo_dt', 'target', 'nacimiento_producto', 'muerte_de_producto', 'periodo']]
X = df[feature_cols].astype(np.float32)  # Convertir a float32
y = df['target'].astype(np.float32)


# Definir el modelo LightGBM
lgb_reg = lgb.LGBMRegressor(random_state=12345)

# Entrenar el modelo
lgb_reg.fit(X, y)

print("Modelo LightGBM entrenado con éxito.")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.817451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9270
[LightGBM] [Info] Number of data points in the train set: 16058944, number of used features: 42
[LightGBM] [Info] Start training from score 0.077271
Modelo LightGBM entrenado con éxito.


In [12]:
X_kgl = dt_kgl.drop(columns=['target','periodo'])
y_pred = lgb_reg.predict(X_kgl)

In [13]:
productos_ok = pd.read_csv("https://storage.googleapis.com/open-courses/austral2025-af91/labo3v/product_id_apredecir201912.txt", sep="\t")
result = pd.DataFrame({"product_id": X_kgl["product_id"],  "tn": y_pred})
result = result[result["product_id"].isin(productos_ok["product_id"])]
result = result.groupby("product_id").agg({"tn":"sum"}).reset_index()
result

Unnamed: 0,product_id,tn
0,20001,1175.854757
1,20002,1056.297877
2,20003,837.931074
3,20004,619.455506
4,20005,490.001896
...,...,...
775,21263,2.932130
776,21265,2.943556
777,21266,2.943556
778,21267,2.946382


In [14]:
result.to_csv("./kaggle/lgb-probando12345.csv", index=False, sep=",")