In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error
import lightgbm as lgb
import xgboost as xgb
import optuna
import joblib

In [2]:
# Cargar datasets
sellin = pd.read_csv("datasets/sell-in.csv", sep='\t')
productos = pd.read_csv("datasets/tb_productos.csv", sep='\t')
stocks = pd.read_csv("datasets/tb_stocks.csv", sep='\t')
productos = productos.drop_duplicates(subset=['product_id'], keep='first')

# Merge inicial
df = sellin.merge(productos, on="product_id", how="left").merge(stocks, on=["product_id", "periodo"], how="left")
df['periodo_dt'] = pd.to_datetime(df['periodo'].astype(str), format='%Y%m')

In [3]:
# Feature engineering avanzado
df = df.sort_values(['product_id','periodo_dt'])
for lag in range(1,13):
    df[f'tn_lag_{lag}'] = df.groupby('product_id')['tn'].shift(lag)
    df[f'stock_lag_{lag}'] = df.groupby('product_id')['stock_final'].shift(lag)

df['tn_diff'] = df['tn'] - df['tn_lag_1']
df['stock_diff'] = df['stock_final'] - df['stock_lag_1']
df['stock_ratio'] = df.apply(lambda x: x['tn']/x['stock_final'] if x['stock_final']>0 else 0, axis=1)
df['month'] = df['periodo_dt'].dt.month
df['quarter'] = df['periodo_dt'].dt.quarter
df['year'] = df['periodo_dt'].dt.year

# Rolling means
for window in [3,6,12]:
    df[f'tn_roll_mean_{window}'] = df.groupby('product_id')['tn'].rolling(window).mean().reset_index(0,drop=True)
    df[f'stock_roll_mean_{window}'] = df.groupby('product_id')['stock_final'].rolling(window).mean().reset_index(0,drop=True)

df['tn_target'] = df.groupby('product_id')['tn'].shift(-2)

In [4]:
# Preparar X e y
feature_columns = [col for col in df.columns if col not in ['periodo','periodo_dt','tn','tn_target']]
# Preparar features
X = df[feature_columns].copy()

# Identificar y codificar columnas categóricas
cols_categoricas = X.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Columnas categóricas: {cols_categoricas}")

for col in cols_categoricas:
    X[col] = X[col].astype('category').cat.codes

# Convertir a float32
X = X.astype(np.float32)
y = df['tn_target'].fillna(0)

# Codificar categorías si las hay
for col in X.select_dtypes(include=['object','category']).columns:
    X[col] = X[col].astype('category').cat.codes

Columnas categóricas: ['cat1', 'cat2', 'cat3', 'brand']


In [7]:


# Función objetivo con Optuna
def objective(trial):
    tscv = TimeSeriesSplit(n_splits=3)
    rmses = []
    for train_idx, val_idx in tscv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        lgb_params = {
            'objective':'regression',
            'metric':'rmse',
            'num_leaves': trial.suggest_int('num_leaves',20,150),
            'max_depth': trial.suggest_int('max_depth',3,15),
            'learning_rate': trial.suggest_float('learning_rate',1e-3,0.1,log=True),
            'n_estimators': trial.suggest_int('n_estimators',100,1000)
        }
        xgb_params = {
            'objective':'reg:squarederror',
            'max_depth': trial.suggest_int('xgb_max_depth',3,15),
            'learning_rate': trial.suggest_float('xgb_learning_rate',1e-3,0.1,log=True),
            'n_estimators': trial.suggest_int('xgb_n_estimators',100,1000)
        }

        # Modelos
        lgb_model = lgb.LGBMRegressor(**lgb_params)
        xgb_model = xgb.XGBRegressor(**xgb_params)

        # Entrenamiento con early stopping
        lgb_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )
        xgb_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        preds_lgb = lgb_model.predict(X_val)
        preds_xgb = xgb_model.predict(X_val)
        preds_ensemble = (preds_lgb + preds_xgb) / 2
        rmse = mean_squared_error(y_val, preds_ensemble, squared=False)
        rmses.append(rmse)

    return np.mean(rmses)

# Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)
print("✅ Mejores parámetros encontrados:", study.best_params)

# Entrenamiento final
best_params = study.best_params
lgb_model = lgb.LGBMRegressor(num_leaves=best_params['num_leaves'],
                              max_depth=best_params['max_depth'],
                              learning_rate=best_params['learning_rate'],
                              n_estimators=best_params['n_estimators'])
xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                             max_depth=best_params['xgb_max_depth'],
                             learning_rate=best_params['xgb_learning_rate'],
                             n_estimators=best_params['xgb_n_estimators'])

lgb_model.fit(X, y)
xgb_model.fit(X, y)

# Predicción sobre periodo 201912
X_test = df_pp[df_pp['periodo'] == 201912].drop(columns=['periodo_dt', 'tn_target', 'tn'], errors='ignore')
for col in X_test.select_dtypes(include=['object','category']).columns:
    X_test[col] = X_test[col].astype('category').cat.codes
X_test = X_test.astype(np.float32)

preds_lgb = lgb_model.predict(X_test)
preds_xgb = xgb_model.predict(X_test)
preds_ensemble = (preds_lgb + preds_xgb) / 2

result = pd.DataFrame({'product_id': df_pp[df_pp['periodo'] == 201912]['product_id'], 'tn': preds_ensemble})
result['product_id'] = result['product_id'].astype(int)
result.to_csv('./kaggle/ensemble_lgb_xgb.csv', index=False)
joblib.dump((lgb_model, xgb_model), './models/ensemble_models.pkl')
print("✅ Ensemble entrenado, predicciones guardadas en ./kaggle/ensemble_lgb_xgb.csv y modelos almacenados.")


[I 2025-05-30 14:25:49,559] A new study created in memory with name: no-name-d247d021-553b-44e4-a1b7-ee048971562f


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9457
[LightGBM] [Info] Number of data points in the train set: 736456, number of used features: 47
[LightGBM] [Info] Start training from score 1.386727
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[543]	valid_0's rmse: 0.827289




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9599
[LightGBM] [Info] Number of data points in the train set: 1472910, number of used features: 47
[LightGBM] [Info] Start training from score 0.819672
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[264]	valid_0's rmse: 0.383333




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9660
[LightGBM] [Info] Number of data points in the train set: 2209364, number of used features: 47
[LightGBM] [Info] Start training from score 0.586431
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[317]	valid_0's rmse: 0.154923


[I 2025-05-30 14:30:22,523] Trial 0 finished with value: 0.5198215910692864 and parameters: {'num_leaves': 141, 'max_depth': 8, 'learning_rate': 0.0537887103460302, 'n_estimators': 808, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.00163280130319715, 'xgb_n_estimators': 522}. Best is trial 0 with value: 0.5198215910692864.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020533 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9457
[LightGBM] [Info] Number of data points in the train set: 736456, number of used features: 47
[LightGBM] [Info] Start training from score 1.386727
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[449]	valid_0's rmse: 0.820357




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9599
[LightGBM] [Info] Number of data points in the train set: 1472910, number of used features: 47
[LightGBM] [Info] Start training from score 0.819672
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[451]	valid_0's rmse: 0.384361




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048203 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9660
[LightGBM] [Info] Number of data points in the train set: 2209364, number of used features: 47
[LightGBM] [Info] Start training from score 0.586431
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[449]	valid_0's rmse: 0.159319


[I 2025-05-30 14:33:57,223] Trial 1 finished with value: 0.5354560101399806 and parameters: {'num_leaves': 29, 'max_depth': 6, 'learning_rate': 0.02698879281154298, 'n_estimators': 452, 'xgb_max_depth': 7, 'xgb_learning_rate': 0.0011879809045334674, 'xgb_n_estimators': 541}. Best is trial 0 with value: 0.5198215910692864.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9457
[LightGBM] [Info] Number of data points in the train set: 736456, number of used features: 47
[LightGBM] [Info] Start training from score 1.386727
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[816]	valid_0's rmse: 0.835044




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9599
[LightGBM] [Info] Number of data points in the train set: 1472910, number of used features: 47
[LightGBM] [Info] Start training from score 0.819672
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[817]	valid_0's rmse: 0.382796




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9660
[LightGBM] [Info] Number of data points in the train set: 2209364, number of used features: 47
[LightGBM] [Info] Start training from score 0.586431
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[815]	valid_0's rmse: 0.156291


[I 2025-05-30 14:42:51,151] Trial 2 finished with value: 0.5014549354836089 and parameters: {'num_leaves': 73, 'max_depth': 6, 'learning_rate': 0.010364254601710296, 'n_estimators': 817, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.0014039579607277375, 'xgb_n_estimators': 948}. Best is trial 2 with value: 0.5014549354836089.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9457
[LightGBM] [Info] Number of data points in the train set: 736456, number of used features: 47
[LightGBM] [Info] Start training from score 1.386727
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[411]	valid_0's rmse: 0.8309




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9599
[LightGBM] [Info] Number of data points in the train set: 1472910, number of used features: 47
[LightGBM] [Info] Start training from score 0.819672
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[410]	valid_0's rmse: 0.385383




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052828 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9660
[LightGBM] [Info] Number of data points in the train set: 2209364, number of used features: 47
[LightGBM] [Info] Start training from score 0.586431
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[411]	valid_0's rmse: 0.161854


[I 2025-05-30 14:47:07,589] Trial 3 finished with value: 0.4839191380496221 and parameters: {'num_leaves': 126, 'max_depth': 15, 'learning_rate': 0.016211180252534055, 'n_estimators': 411, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.006649623877393885, 'xgb_n_estimators': 488}. Best is trial 3 with value: 0.4839191380496221.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019743 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9457
[LightGBM] [Info] Number of data points in the train set: 736456, number of used features: 47
[LightGBM] [Info] Start training from score 1.386727
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[453]	valid_0's rmse: 1.0221




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040514 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9599
[LightGBM] [Info] Number of data points in the train set: 1472910, number of used features: 47
[LightGBM] [Info] Start training from score 0.819672
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[453]	valid_0's rmse: 0.506728




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052760 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9660
[LightGBM] [Info] Number of data points in the train set: 2209364, number of used features: 47
[LightGBM] [Info] Start training from score 0.586431
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[453]	valid_0's rmse: 0.316013


[I 2025-05-30 14:58:30,674] Trial 4 finished with value: 0.5803677837278326 and parameters: {'num_leaves': 47, 'max_depth': 13, 'learning_rate': 0.0020895825923078525, 'n_estimators': 453, 'xgb_max_depth': 13, 'xgb_learning_rate': 0.016127359639733517, 'xgb_n_estimators': 904}. Best is trial 3 with value: 0.4839191380496221.


✅ Mejores parámetros encontrados: {'num_leaves': 126, 'max_depth': 15, 'learning_rate': 0.016211180252534055, 'n_estimators': 411, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.006649623877393885, 'xgb_n_estimators': 488}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9748
[LightGBM] [Info] Number of data points in the train set: 2945818, number of used features: 47
[LightGBM] [Info] Start training from score 0.449562


NameError: name 'df_pp' is not defined

In [None]:
# Entrenar modelos finales
best_params = study.best_params
lgb_model = lgb.LGBMRegressor(num_leaves=best_params['num_leaves'], max_depth=best_params['max_depth'],
                              learning_rate=best_params['learning_rate'], n_estimators=best_params['n_estimators'])
xgb_model = xgb.XGBRegressor(max_depth=best_params['xgb_max_depth'], learning_rate=best_params['xgb_learning_rate'],
                             n_estimators=best_params['xgb_n_estimators'], objective='reg:squarederror')
lgb_model.fit(X,y)
xgb_model.fit(X,y)

In [None]:
# Predicciones para periodo 201912
test_data = df[df['periodo']==201912]
X_test = test_data[feature_columns].astype(np.float32)
for col in X_test.select_dtypes(include=['object','category']).columns:
    X_test[col] = X_test[col].astype('category').cat.codes

preds_lgb = lgb_model.predict(X_test)
preds_xgb = xgb_model.predict(X_test)
preds_ensemble = (preds_lgb + preds_xgb)/2

result = pd.DataFrame({'product_id':test_data['product_id'],'tn':preds_ensemble})
result['product_id'] = result['product_id'].astype(int)
result.to_csv('./kaggle/ensemble_lgb_xgb.csv', index=False)
joblib.dump((lgb_model,xgb_model),'./models/ensemble_models.pkl')
print("✅ Ensemble entrenado y predicciones guardadas.")