In [1]:
import pandas as pd
import numpy as np
import gc

In [2]:
df = pd.read_csv("./datasets/dataset.csv", sep=',')
df.shape

(17173448, 17)

In [3]:
df = df[df['periodo'] >= 201801]
df.shape

(12115915, 17)

In [4]:
df.drop(columns=['nacimiento_cliente_dt'], inplace=True)
df.drop(columns=['cat1','cat2','cat3','brand'], inplace=True)
df.drop(columns=['sku_size','stock_final','cust_request_tn','cust_request_qty','plan_precios_cuidados'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12115915 entries, 12 to 17173447
Data columns (total 7 columns):
 #   Column               Dtype  
---  ------               -----  
 0   product_id           int64  
 1   customer_id          int64  
 2   periodo              int64  
 3   nacimiento_producto  int64  
 4   muerte_de_producto   int64  
 5   tn                   float64
 6   target               float64
dtypes: float64(2), int64(5)
memory usage: 739.5 MB


In [5]:
gc.collect()

10

In [6]:
# Asegurate de tener tu DataFrame df cargado y con columnas:
# ['product_id', 'customer_id', 'periodo', 'tn', 'nacimiento_producto', 'muerte_de_producto', 'cat1', 'cat2', 'cat3', 'brand', 'sku_size', 'stock_final', 'cust_request_tn', 'cust_request_qty', 'plan_precios_cuidados']

# Convertir periodo a datetime
df['periodo_dt'] = pd.to_datetime(df['periodo'].astype(str), format='%Y%m')

# 1️⃣ Lags
lags = [1, 2, 3, 6, 12, 14]
for lag in lags:
    df[f'tn_lag_{lag}'] = df.groupby(['product_id', 'customer_id'])['tn'].shift(lag)

# 2️⃣ Deltas
for lag in lags:
    df[f'tn_delta_{lag}'] = df['tn'] - df[f'tn_lag_{lag}']

# 3️⃣ Rolling mean, std
windows = [3, 6, 12]
for window in windows:
    df[f'tn_roll_mean_{window}'] = df.groupby(['product_id', 'customer_id'])['tn'].rolling(window=window, min_periods=1).mean().reset_index(level=[0,1], drop=True)
    df[f'tn_roll_std_{window}'] = df.groupby(['product_id', 'customer_id'])['tn'].rolling(window=window, min_periods=1).std().reset_index(level=[0,1], drop=True)

# 4️⃣ Tendencia
df['tn_trend'] = df['tn_roll_mean_12'] - df['tn_roll_mean_3']

# 5️⃣ Ratio y growth
df['tn_ratio_mean_3'] = df['tn'] / (df['tn_roll_mean_3'] + 1e-6)
df['tn_growth_1'] = df['tn_delta_1'] / (df['tn_lag_1'] + 1e-6)

# # 6️⃣ Codificación categorías
# for col in ['cat1', 'cat2', 'cat3', 'brand']:
#     df[f'{col}_code'] = df[col].astype('category').cat.codes

# 7️⃣ Stock features
# for lag in lags:
#     df[f'stock_lag_{lag}'] = df.groupby(['product_id', 'customer_id'])['stock_final'].shift(lag)
# for window in windows:
#     df[f'stock_roll_mean_{window}'] = df.groupby(['product_id', 'customer_id'])['stock_final'].rolling(window=window, min_periods=1).mean().reset_index(level=[0,1], drop=True)
# df['stock_trend'] = df['stock_roll_mean_12'] - df['stock_roll_mean_3']
# df['tn_stock_ratio'] = df['tn'] / (df['stock_final'] + 1e-6)

# 8️⃣ Temporal features
df['year'] = df['periodo_dt'].dt.year
df['month'] = df['periodo_dt'].dt.month
df['quarter'] = df['periodo_dt'].dt.quarter
# df['months_since_nacimiento_producto'] = (df['periodo_dt'] - pd.to_datetime(df['nacimiento_producto'], format='%Y%m')).dt.days // 30
df['tn_cumsum'] = df.groupby(['product_id', 'customer_id'])['tn'].cumsum()
df['compra_count'] = df.groupby(['product_id', 'customer_id']).cumcount()
# df['plan_precios_cuidados_flag'] = (df['plan_precios_cuidados'] > 0).astype(int)

# 9️⃣ Z-score por producto
df['tn_norm_producto'] = df.groupby('product_id')['tn'].transform(lambda x: (x - x.mean()) / (x.std() + 1e-6))

# 🔟 Quantiles
df['tn_quantile_25'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(lambda x: x.quantile(0.25))
df['tn_quantile_75'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(lambda x: x.quantile(0.75))

# 🔟.1 Total compras por cliente-producto
df['total_compras_cliente_producto'] = df.groupby(['product_id', 'customer_id'])['tn'].transform('count')

# 🔟.2 Días desde última compra
df['last_purchase'] = df.groupby(['product_id', 'customer_id'])['periodo_dt'].transform('max')
df['days_since_last_purchase'] = (df['periodo_dt'] - df['last_purchase']).dt.days
df.drop(columns=['last_purchase'], inplace=True)

# # 🔟.3 Interacciones de categorías con producto
# for cat_col in ['cat1', 'cat2', 'cat3', 'brand']:
#     df[f'{cat_col}_product_interaction'] = df['product_id'].astype(str) + '_' + df[cat_col].astype(str)
#     df[f'{cat_col}_product_interaction'] = df[f'{cat_col}_product_interaction'].astype('category').cat.codes

# 🔟.4 Exponential smoothing
df['tn_ewm_3'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(lambda x: x.ewm(span=3, adjust=False).mean())
df['tn_ewm_6'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(lambda x: x.ewm(span=6, adjust=False).mean())

# 🔟.5 Rolling range
df['tn_roll_range_3'] = df['tn_roll_mean_3'] - df.groupby(['product_id', 'customer_id'])['tn'].rolling(window=3).min().reset_index(level=[0,1], drop=True)

# 🔟.6 Flags de eventos
df['is_december'] = (df['month'] == 12).astype(int)
df['is_black_friday'] = ((df['month'] == 11) & (df['periodo'] % 100 == 11)).astype(int)

# 🔟.7 Coeficiente de variación
df['tn_cv'] = df['tn_roll_std_12'] / (df['tn_roll_mean_12'] + 1e-6)

# 🔟.8 Ratios frente a total del producto
df['tn_total_producto'] = df.groupby('product_id')['tn'].transform('sum')
df['tn_ratio_total_producto'] = df['tn'] / (df['tn_total_producto'] + 1e-6)

# 🔟.9 Flag de últimos meses
# df['is_last_3_months'] = df['months_since_nacimiento_producto'] >= (df.groupby('product_id')['months_since_nacimiento_producto'].transform('max') - 3)

print(f"✅ Feature engineering completo. Total columnas: {df.shape[1]}, total filas: {len(df):,}")


✅ Feature engineering completo. Total columnas: 47, total filas: 12,115,915


In [7]:
dt_kgl = df[df["periodo"].isin([201912])]
df = df.drop(df[df["periodo"].isin([201911,201912])].index,axis=0)

In [8]:
import gc
gc.collect()

17

In [None]:
df.drop(columns=['nacimiento_producto','muerte_de_producto','periodo_dt'])

Unnamed: 0,product_id,customer_id,periodo,tn,target,tn_lag_1,tn_lag_2,tn_lag_3,tn_lag_6,tn_lag_12,...,total_compras_cliente_producto,days_since_last_purchase,tn_ewm_3,tn_ewm_6,tn_roll_range_3,is_december,is_black_friday,tn_cv,tn_total_producto,tn_ratio_total_producto
12,20524,10234,201801,0.0,0.0,,,,,,...,24,-699,0.0,0.0,,0,0,,182.78990,0.0
13,20524,10234,201802,0.0,0.0,0.0,,,,,...,24,-668,0.0,0.0,,0,0,0.0,182.78990,0.0
14,20524,10234,201803,0.0,0.0,0.0,0.0,,,,...,24,-640,0.0,0.0,0.0,0,0,0.0,182.78990,0.0
15,20524,10234,201804,0.0,0.0,0.0,0.0,0.0,,,...,24,-609,0.0,0.0,0.0,0,0,0.0,182.78990,0.0
16,20524,10234,201805,0.0,0.0,0.0,0.0,0.0,,,...,24,-579,0.0,0.0,0.0,0,0,0.0,182.78990,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17170438,21054,10504,201910,0.0,0.0,,,,,,...,3,-61,0.0,0.0,,0,0,,5.26171,0.0
17170441,21054,10455,201910,0.0,0.0,,,,,,...,3,-61,0.0,0.0,,0,0,,5.26171,0.0
17170444,21054,10479,201910,0.0,0.0,,,,,,...,3,-61,0.0,0.0,,0,0,,5.26171,0.0
17170447,21054,10538,201910,0.0,0.0,,,,,,...,3,-61,0.0,0.0,,0,0,,5.26171,0.0


In [10]:
import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 🧹 Preparar features y target
feature_cols = [col for col in df.columns if col not in ['periodo_dt', 'target', 'nacimiento_producto', 'muerte_de_producto', 'periodo']]
X = df[feature_cols].astype(np.float32)  # Convertir a float32
y = df['target'].astype(np.float32)


# Definir el modelo LightGBM
lgb_reg = lgb.LGBMRegressor(random_state=12345)

# Entrenar el modelo
lgb_reg.fit(X, y)

print("Modelo LightGBM entrenado con éxito.")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.523928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8809
[LightGBM] [Info] Number of data points in the train set: 11001411, number of used features: 42
[LightGBM] [Info] Start training from score 0.068016
Modelo LightGBM entrenado con éxito.


In [11]:
# X_kgl = dt_kgl.drop(columns=['target','periodo'])
X_kgl = dt_kgl[feature_cols].astype(np.float32)  # Convertir a float32
y_pred = lgb_reg.predict(X_kgl)

In [12]:
productos_ok = pd.read_csv("https://storage.googleapis.com/open-courses/austral2025-af91/labo3v/product_id_apredecir201912.txt", sep="\t")
result = pd.DataFrame({"product_id": X_kgl["product_id"],  "tn": y_pred})
result = result[result["product_id"].isin(productos_ok["product_id"])]
result = result.groupby("product_id").agg({"tn":"sum"}).reset_index()
result

Unnamed: 0,product_id,tn
0,20001.0,1330.381758
1,20002.0,862.522429
2,20003.0,594.492258
3,20004.0,459.706791
4,20005.0,391.587775
...,...,...
775,21263.0,2.058279
776,21265.0,2.076579
777,21266.0,2.085033
778,21267.0,2.068006


In [13]:
result.to_csv("./kaggle/lgb-probando123456.csv", index=False, sep=",")