In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
import lightgbm as lgb
import xgboost as xgb
import optuna
import joblib

In [3]:
import pandas as pd
sellin = pd.read_csv("datasets/sell-in.csv", sep='\t')
productos = pd.read_csv("datasets/tb_productos.csv", sep='\t')
stocks = pd.read_csv("datasets/tb_stocks.csv", sep='\t')
productos = productos.drop_duplicates(subset=['product_id'], keep='first')

# Merge inicial
df = sellin.merge(productos, on="product_id", how="left").merge(stocks, on=["product_id", "periodo"], how="left")
df['periodo_dt'] = pd.to_datetime(df['periodo'].astype(str), format='%Y%m')

# Sumarizar ventas por periodo y producto
ventas = df.groupby(['periodo', 'product_id']).agg({'tn':'sum'}).reset_index()
ventas['periodo_dt'] = pd.to_datetime(ventas['periodo'].astype(str), format='%Y%m')

# Determinar min y max periodo por producto
vida_producto = ventas.groupby('product_id')['periodo_dt'].agg(['min','max']).reset_index()

# Crear DataFrame expandido con todos <periodo, producto>
all_periods = []
for _, row in vida_producto.iterrows():
    pid = row['product_id']
    periods = pd.date_range(row['min'], row['max'], freq='MS')
    for p in periods:
        all_periods.append((pid, p))
df_full = pd.DataFrame(all_periods, columns=['product_id','periodo_dt'])

# Unir con ventas reales
ventas = ventas[['product_id','periodo_dt','tn']]
df_pp = df_full.merge(ventas, on=['product_id','periodo_dt'], how='left')
df_pp['tn'] = df_pp['tn'].fillna(0)

# Merge con información adicional
df_pp['periodo'] = df_pp['periodo_dt'].dt.strftime('%Y%m').astype(int)
df_pp = df_pp.merge(productos, on="product_id", how="left").merge(stocks, on=["product_id", "periodo"], how="left")


In [4]:
# Feature engineering avanzado
df_pp = df_pp.sort_values(['product_id','periodo_dt'])
for lag in range(1,13):
    df_pp[f'tn_lag_{lag}'] = df_pp.groupby('product_id')['tn'].shift(lag)
    df_pp[f'stock_lag_{lag}'] = df_pp.groupby('product_id')['stock_final'].shift(lag)

df_pp['tn_diff'] = df_pp['tn'] - df_pp['tn_lag_1']
df_pp['stock_diff'] = df_pp['stock_final'] - df_pp['stock_lag_1']
df_pp['stock_ratio'] = df_pp.apply(lambda x: x['tn']/x['stock_final'] if x['stock_final']>0 else 0, axis=1)
df_pp['month'] = df_pp['periodo_dt'].dt.month
df_pp['quarter'] = df_pp['periodo_dt'].dt.quarter
df_pp['year'] = df_pp['periodo_dt'].dt.year

# Rolling means
for window in [3,6,12]:
    df_pp[f'tn_roll_mean_{window}'] = df_pp.groupby('product_id')['tn'].rolling(window).mean().reset_index(0,drop=True)
    df_pp[f'stock_roll_mean_{window}'] = df_pp.groupby('product_id')['stock_final'].rolling(window).mean().reset_index(0,drop=True)

df_pp['tn_target'] = df_pp.groupby('product_id')['tn'].shift(-2)

In [5]:
# Preparar X e y
feature_columns = [col for col in df_pp.columns if col not in ['periodo_dt','tn_target']]
# Preparar features
X = df_pp[feature_columns].copy()

# Identificar y codificar columnas categóricas
cols_categoricas = X.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Columnas categóricas: {cols_categoricas}")

for col in cols_categoricas:
    X[col] = X[col].astype('category').cat.codes

# Convertir a float32
X = X.astype(np.float32)
y = df_pp['tn_target'].fillna(0)

# Codificar categorías si las hay
for col in X.select_dtypes(include=['object','category']).columns:
    X[col] = X[col].astype('category').cat.codes

Columnas categóricas: ['cat1', 'cat2', 'cat3', 'brand']


In [None]:

# Función objetivo con Optuna
def objective(trial):
    tscv = TimeSeriesSplit(n_splits=3)
    rmses = []
    for train_idx, val_idx in tscv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        lgb_params = {
            'objective':'regression',
            'metric':'rmse',
            'num_leaves': trial.suggest_int('num_leaves',20,150),
            'max_depth': trial.suggest_int('max_depth',3,15),
            'learning_rate': trial.suggest_float('learning_rate',1e-3,0.1,log=True),
            'n_estimators': trial.suggest_int('n_estimators',100,1000)
        }
        xgb_params = {
            'objective':'reg:squarederror',
            'max_depth': trial.suggest_int('xgb_max_depth',3,15),
            'learning_rate': trial.suggest_float('xgb_learning_rate',1e-3,0.1,log=True),
            'n_estimators': trial.suggest_int('xgb_n_estimators',100,1000)
        }

        # Modelos
        lgb_model = lgb.LGBMRegressor(**lgb_params)
        xgb_model = xgb.XGBRegressor(**xgb_params)

        # Entrenamiento con early stopping
        lgb_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )
        xgb_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        preds_lgb = lgb_model.predict(X_val)
        preds_xgb = xgb_model.predict(X_val)
        preds_ensemble = (preds_lgb + preds_xgb) / 2
        rmse = mean_squared_error(y_val, preds_ensemble, squared=False)
        rmses.append(rmse)

    return np.mean(rmses)

# Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)
print("✅ Mejores parámetros encontrados:", study.best_params)

# Entrenamiento final
best_params = study.best_params
lgb_model = lgb.LGBMRegressor(num_leaves=best_params['num_leaves'],
                              max_depth=best_params['max_depth'],
                              learning_rate=best_params['learning_rate'],
                              n_estimators=best_params['n_estimators'])
xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                             max_depth=best_params['xgb_max_depth'],
                             learning_rate=best_params['xgb_learning_rate'],
                             n_estimators=best_params['xgb_n_estimators'])

lgb_model.fit(X, y)
xgb_model.fit(X, y)



[I 2025-05-30 18:50:35,206] A new study created in memory with name: no-name-3f41c218-899f-4909-ab89-f759297c7629


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9289
[LightGBM] [Info] Number of data points in the train set: 7882, number of used features: 45
[LightGBM] [Info] Start training from score 134.240064
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[249]	valid_0's rmse: 12.0568




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9373
[LightGBM] [Info] Number of data points in the train set: 15762, number of used features: 45
[LightGBM] [Info] Start training from score 75.331341
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[268]	valid_0's rmse: 6.08461




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002180 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9423
[LightGBM] [Info] Number of data points in the train set: 23642, number of used features: 45
[LightGBM] [Info] Start training from score 52.155359
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[269]	valid_0's rmse: 2.69197


[I 2025-05-30 18:50:44,123] Trial 0 finished with value: 16.169454064339654 and parameters: {'num_leaves': 129, 'max_depth': 10, 'learning_rate': 0.028715768811065898, 'n_estimators': 269, 'xgb_max_depth': 7, 'xgb_learning_rate': 0.0018478407710437695, 'xgb_n_estimators': 709}. Best is trial 0 with value: 16.169454064339654.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9289
[LightGBM] [Info] Number of data points in the train set: 7882, number of used features: 45
[LightGBM] [Info] Start training from score 134.240064
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[713]	valid_0's rmse: 60.5111




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001652 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9373
[LightGBM] [Info] Number of data points in the train set: 15762, number of used features: 45
[LightGBM] [Info] Start training from score 75.331341
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[713]	valid_0's rmse: 32.8238




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001881 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9423
[LightGBM] [Info] Number of data points in the train set: 23642, number of used features: 45
[LightGBM] [Info] Start training from score 52.155359
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[713]	valid_0's rmse: 23.7556


[I 2025-05-30 18:51:01,287] Trial 1 finished with value: 22.19268787351942 and parameters: {'num_leaves': 81, 'max_depth': 3, 'learning_rate': 0.0014668902916720574, 'n_estimators': 713, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.01097135460071146, 'xgb_n_estimators': 542}. Best is trial 0 with value: 16.169454064339654.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9289
[LightGBM] [Info] Number of data points in the train set: 7882, number of used features: 45
[LightGBM] [Info] Start training from score 134.240064
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[789]	valid_0's rmse: 24.5731




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001825 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9373
[LightGBM] [Info] Number of data points in the train set: 15762, number of used features: 45
[LightGBM] [Info] Start training from score 75.331341
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[789]	valid_0's rmse: 12.2377




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001641 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9423
[LightGBM] [Info] Number of data points in the train set: 23642, number of used features: 45
[LightGBM] [Info] Start training from score 52.155359
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[789]	valid_0's rmse: 7.74204


[I 2025-05-30 18:51:05,735] Trial 2 finished with value: 10.682470439077116 and parameters: {'num_leaves': 61, 'max_depth': 4, 'learning_rate': 0.003560417722248154, 'n_estimators': 789, 'xgb_max_depth': 6, 'xgb_learning_rate': 0.024870973709153595, 'xgb_n_estimators': 402}. Best is trial 2 with value: 10.682470439077116.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9289
[LightGBM] [Info] Number of data points in the train set: 7882, number of used features: 45
[LightGBM] [Info] Start training from score 134.240064
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[688]	valid_0's rmse: 12.6358




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001836 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9373
[LightGBM] [Info] Number of data points in the train set: 15762, number of used features: 45
[LightGBM] [Info] Start training from score 75.331341
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[693]	valid_0's rmse: 6.20181




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001951 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9423
[LightGBM] [Info] Number of data points in the train set: 23642, number of used features: 45
[LightGBM] [Info] Start training from score 52.155359
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[681]	valid_0's rmse: 3.44193


[I 2025-05-30 18:51:21,483] Trial 3 finished with value: 6.743348971352766 and parameters: {'num_leaves': 131, 'max_depth': 4, 'learning_rate': 0.041093608412445935, 'n_estimators': 693, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.03151071695885424, 'xgb_n_estimators': 816}. Best is trial 3 with value: 6.743348971352766.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001505 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9289
[LightGBM] [Info] Number of data points in the train set: 7882, number of used features: 45
[LightGBM] [Info] Start training from score 134.240064
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[740]	valid_0's rmse: 48.0401




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001364 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9373
[LightGBM] [Info] Number of data points in the train set: 15762, number of used features: 45
[LightGBM] [Info] Start training from score 75.331341
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[740]	valid_0's rmse: 27.0981




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001790 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9423
[LightGBM] [Info] Number of data points in the train set: 23642, number of used features: 45
[LightGBM] [Info] Start training from score 52.155359
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[740]	valid_0's rmse: 18.654


[I 2025-05-30 18:51:30,626] Trial 4 finished with value: 17.905366137058817 and parameters: {'num_leaves': 73, 'max_depth': 6, 'learning_rate': 0.0015017128975264886, 'n_estimators': 740, 'xgb_max_depth': 6, 'xgb_learning_rate': 0.05447936466510139, 'xgb_n_estimators': 728}. Best is trial 3 with value: 6.743348971352766.


✅ Mejores parámetros encontrados: {'num_leaves': 131, 'max_depth': 4, 'learning_rate': 0.041093608412445935, 'n_estimators': 693, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.03151071695885424, 'xgb_n_estimators': 816}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9467
[LightGBM] [Info] Number of data points in the train set: 31522, number of used features: 45
[LightGBM] [Info] Start training from score 39.421567


LightGBMError: The number of features in data (44) is not the same as it was in training data (45).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.

✅ Ensemble entrenado, predicciones guardadas en ./kaggle/ensemble_lgb_xgb.csv y modelos almacenados.


In [10]:
# Entrenar modelos finales
best_params = study.best_params
lgb_model = lgb.LGBMRegressor(num_leaves=best_params['num_leaves'], max_depth=best_params['max_depth'],
                              learning_rate=best_params['learning_rate'], n_estimators=best_params['n_estimators'])
xgb_model = xgb.XGBRegressor(max_depth=best_params['xgb_max_depth'], learning_rate=best_params['xgb_learning_rate'],
                             n_estimators=best_params['xgb_n_estimators'], objective='reg:squarederror')
lgb_model.fit(X,y)
xgb_model.fit(X,y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9467
[LightGBM] [Info] Number of data points in the train set: 31522, number of used features: 45
[LightGBM] [Info] Start training from score 39.421567


In [11]:
# Predicción sobre periodo 201912
X_test = df_pp[df_pp['periodo'] == 201912].drop(columns=['periodo_dt', 'tn_target'], errors='ignore')
for col in X_test.select_dtypes(include=['object','category']).columns:
    X_test[col] = X_test[col].astype('category').cat.codes


preds_lgb = lgb_model.predict(X_test)
preds_xgb = xgb_model.predict(X_test)
preds_ensemble = (preds_lgb + preds_xgb) / 2

result = pd.DataFrame({'product_id': df_pp[df_pp['periodo'] == 201912]['product_id'], 'tn': preds_ensemble})
result['product_id'] = result['product_id'].astype(int)
result.to_csv('./kaggle/ensemble_lgb_xgb.csv', index=False)
joblib.dump((lgb_model, xgb_model), './models/ensemble_models.pkl')
print("✅ Ensemble entrenado, predicciones guardadas en ./kaggle/ensemble_lgb_xgb.csv y modelos almacenados.")


✅ Ensemble entrenado, predicciones guardadas en ./kaggle/ensemble_lgb_xgb.csv y modelos almacenados.


In [12]:
result

Unnamed: 0,product_id,tn
35,20001,52.086784
71,20002,17.778145
107,20003,-4.862595
143,20004,10.286376
179,20005,-5.612878
...,...,...
31384,21265,-0.539231
31394,21266,-0.539231
31404,21267,-0.452304
31449,21271,-0.458562
