# Entrenamiento del modelo LGBM hardcodeando los hiperparámetros seleccionados para el modelo final

En esta notebook se ajusta el modelo LGBM sin realizar la optimización de hiperparámetros con Optuna.

In [1]:
import psutil
import pandas as pd
import numpy as np
import seaborn as sns
import lightgbm as lgb
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Obtain the project base directory from the project helper; every data path
# used in this notebook is built relative to it.
from src.utils.utils import get_base_dir
base_dir = get_base_dir()
# Bare expression: shown as the cell output so the resolved location is visible.
base_dir

WindowsPath('C:/Users/lauta/Desktop/Lautaro/maestria_ds/labo3/repo-entrega')

### 1. Carga de datos

In [3]:
# Load the list of products that have to be predicted.
# The file is tab-separated, which is the default delimiter of pd.read_table.
predict_file = base_dir / "data/predict/raw/product_id_apredecir201912.txt"
df_pred_orig = pd.read_table(predict_file, encoding="utf-8")

# Report how many rows (products to predict) were read, then preview them.
print(f"Registros a predecir: {len(df_pred_orig)}")
df_pred_orig.head()

Registros a predecir: 780


Unnamed: 0,product_id
0,20001
1,20002
2,20003
3,20004
4,20005


In [4]:
# Read the already processed train / validation / predict partitions straight
# from their feather files. The paths are listed in the same order as the
# names on the left-hand side of the assignment.
df_train, df_validation, df_predict = (
    pd.read_feather(base_dir / feather_path)
    for feather_path in (
        "data/processed/df_train.feather",
        "data/processed/df_validation.feather",
        "data/processed/df_predict.feather",
    )
)

# Shapes of the processed partitions, shown as (predict, train, validation)
df_predict.shape, df_train.shape, df_validation.shape

((465660, 79), (12516572, 79), (4128320, 79))

In [5]:
# Feature columns used by the model.
# The names are grouped below purely by their naming pattern; the concatenation
# order reproduces the original flat list exactly, because column order is part
# of what the model is fitted on.
_lag_cols = [f"venta_t-{lag}" for lag in (2, 3, 6, 9, 12, 24)]

# One block of five statistics per window length, windows in ascending order.
_rolling_cols = [
    f"{stat}_{window}m"
    for window in (3, 6, 9, 12, 24)
    for stat in ("media", "std", "suma", "max", "min")
]

_change_cols = [
    'pct_change_2_4m', 'pct_change_4_7m',
    'tendencia_2_4m', 'tendencia_4_6m',
    'venta_año_ant',
]

_calendar_cols = [
    'mes_sin', 'mes_cos', 'trimestre_sin', 'trimestre_cos',
    'es_enero', 'es_diciembre', 'es_verano', 'es_invierno',
]

_history_cols = [
    'producto_media_historica', 'producto_volatilidad', 'cliente_media_historica',
    'momentum_3m', 'momentum_6m', 'aceleracion',
    'categoria_media', 'vs_categoria', 'marca_media', 'vs_marca',
    'meses_sin_venta', 'frecuencia_compras_12m',
]

_external_cols = [
    'VarMensualGral', 'VarMensualAlim',
    'IndiceMensualGral', 'IndiceMensualAlim',
    'dolar',
]

new_cols = (
    _lag_cols
    + _rolling_cols
    + _change_cols
    + _calendar_cols
    + _history_cols
    + _external_cols
)

# Numeric inputs first, then the columns handed to LightGBM as categorical.
numerical_features = [*new_cols, "mes", "año", "trimestre", "a_predecir"]
categorical_features = [
    "product_id", "customer_id",
    'cat1', 'cat2', 'cat3',
    'brand', 'descripcion',
]
model_features = [*numerical_features, *categorical_features]

In [6]:
# Report how much RAM is currently available before the training cell runs.
mem = psutil.virtual_memory()
available_gib = mem.available / 2**30  # bytes -> GiB
print(f"Memoria RAM disponible: {available_gib:.2f} GB")

Memoria RAM disponible: 6.22 GB


## Entrenar modelos

In [7]:
# Train an ensemble of `n_models` LightGBM regressors. All models share the
# hard-coded hyperparameters below and differ only in their random seed.
#
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is > 0. It is not set
# here, so the subsample value below is presumably inert. Confirm against the
# tuning run before changing it, because enabling it would alter the model.
best_params = {'learning_rate': 0.033298202348152534, 'num_leaves': 121, 'max_depth': 12, 'colsample_bytree': 0.7702280076072465, 'subsample': 0.9451048961270575, 'min_child_samples': 63, 'reg_alpha': 0.8261485784018764, 'reg_lambda': 0.2801548449048477, 'n_estimators': 335}

n_models = 15
models_list = []
mae_valid_list = []
rmse_valid_list = []
correlation_list = []

# Slice the feature matrices once. Selecting a list of columns copies the data,
# so repeating it for every model (and twice for validation) is wasted work on
# frames of this size.
X_train, y_train = df_train[model_features], df_train["target"]
X_valid, y_valid = df_validation[model_features], df_validation["target"]

# Running sum of the validation predictions, used for the ensemble metrics.
valid_pred_sum = np.zeros(len(X_valid))

for i in range(n_models):
    best_model = lgb.LGBMRegressor(
        **best_params,
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        random_state=42 + i,  # a different seed for every model
        verbose=-1
    )

    # Early stopping monitors RMSE on the validation partition.
    best_model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='rmse',
        categorical_feature=categorical_features,
        callbacks=[lgb.early_stopping(50)]
    )
    models_list.append(best_model)

    # Validation predictions of this single model
    y_pred_valid = best_model.predict(X_valid)
    valid_pred_sum += y_pred_valid

    # Per-model evaluation metrics
    mae_valid = mean_absolute_error(y_valid, y_pred_valid)
    rmse_valid = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
    # Pearson correlation between actual values and predictions
    correlation = np.corrcoef(y_valid, y_pred_valid)[0, 1]

    mae_valid_list.append(mae_valid)
    rmse_valid_list.append(rmse_valid)
    correlation_list.append(correlation)

# Mean and spread of the metrics across the individual models
print("Métricas en validación:")
print(f"MAE: {np.mean(mae_valid_list):.4f} ± {np.std(mae_valid_list):.4f}")
print(f"RMSE: {np.mean(rmse_valid_list):.4f} ± {np.std(rmse_valid_list):.4f}")
print(f"Correlación: {np.mean(correlation_list):.4f} ± {np.std(correlation_list):.4f}")

# Metrics of the averaged ensemble. The prediction cell averages the models,
# so this is the figure that corresponds to what is actually submitted.
y_pred_valid_ensemble = valid_pred_sum / len(models_list)
mae_ensemble = mean_absolute_error(y_valid, y_pred_valid_ensemble)
rmse_ensemble = np.sqrt(mean_squared_error(y_valid, y_pred_valid_ensemble))
correlation_ensemble = np.corrcoef(y_valid, y_pred_valid_ensemble)[0, 1]
print(f"Métricas del ensamble ({len(models_list)} modelos promediados):")
print(f"MAE: {mae_ensemble:.4f}")
print(f"RMSE: {rmse_ensemble:.4f}")
print(f"Correlación: {correlation_ensemble:.4f}")

# Release the sliced copies; the fitted models do not need them any more.
del X_train, X_valid

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[252]	valid_0's rmse: 0.748459
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[278]	valid_0's rmse: 0.749062
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[161]	valid_0's rmse: 0.753943
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[154]	valid_0's rmse: 0.752781
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[287]	valid_0's rmse: 0.748173
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[269]	valid_0's rmse: 0.750242
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[298]	valid_0's rmse: 0.751533
Training until validation scores don't improve for 50 rounds
Did not meet early stopp

## Predicciones para febrero 2020

In [8]:
# Average the predictions of every fitted model.
# Fail loudly when there is nothing to average: with warnings silenced, a 0/0
# division would otherwise fill the predictions with NaN without any notice.
if not models_list:
    raise ValueError("models_list está vacío: ejecutar primero la celda de entrenamiento")

# Slice the feature matrix once instead of once per model.
X_predict = df_predict[model_features]

preds_ensemble = np.zeros(df_predict.shape[0])
for model in models_list:
    preds_ensemble += model.predict(X_predict)
# Divide by the number of models actually available, so an interrupted training
# run (fewer than n_models fitted) still yields a correct mean.
preds_ensemble /= len(models_list)

# Store the predictions on df_predict (a later cell reads "predicted_tn").
df_predict["predicted_target"] = preds_ensemble
# Rebuild tn by adding the "venta_t-2" column back to the predicted target.
# NOTE(review): presumably the target is a difference against venta_t-2;
# confirm against the feature pipeline.
df_predict["predicted_tn"] = df_predict["predicted_target"] + df_predict["venta_t-2"]

# Final results at the row level of df_predict, with the prediction named "tn"
results_final = df_predict[[
        'product_id', 'customer_id', 'periodo', 'predicted_tn'
    ]].rename(columns={'predicted_tn': 'tn'})

# Sanity check: how many of the requested products received a prediction
common_products = set(results_final['product_id']).intersection(set(df_pred_orig['product_id']))
print(f"Productos en común con objetivo: {len(common_products)} de {len(df_pred_orig)}")

Productos en común con objetivo: 780 de 780


In [9]:
# Aggregate the predictions to product level: sum predicted_tn over all rows
# of each product and name the result "tn".
tn_by_product = (
    df_predict
    .groupby(['product_id'])["predicted_tn"]
    .sum()
    .rename("tn")
    .reset_index()
)

# Left-join onto the list of requested products so each of them keeps its row.
df_prediction_grouped_product_id = df_pred_orig.merge(tn_by_product, on='product_id', how='left')
df_prediction_grouped_product_id

Unnamed: 0,product_id,tn
0,20001,1108.548208
1,20002,1224.437310
2,20003,675.753929
3,20004,539.719046
4,20005,536.671452
...,...,...
775,21263,0.694170
776,21265,0.737044
777,21266,0.738548
778,21267,0.710705


## Guardar predicciones para subir a Kaggle

In [10]:
# Blend two sources of predictions and write the submission file:
#   * top products (ranked by total tn in validation) keep the model prediction
#   * every other product takes the value from the "magicos" CSV

# Products are "top" while their cumulative share of validation tn is below this.
top_share_pct = 20

# Rank products by total tn in validation, largest first.
validation_tn_ranking = (
    df_validation.groupby('product_id')['tn'].sum()
    .reset_index()
    .sort_values(by='tn', ascending=False)
    .reset_index(drop=True)
)
# Cumulative tn expressed as a percentage of its maximum
tn_acum_pct = validation_tn_ranking["tn"].cumsum()
tn_acum_pct = tn_acum_pct / tn_acum_pct.max() * 100
# top_productos is only ever a list of product ids
top_productos = validation_tn_ranking.loc[tn_acum_pct < top_share_pct, "product_id"].unique().tolist()

# NOTE(review): assumes this CSV has `product_id` and `tn` columns; its `tn`
# becomes `tn_magico` through the merge suffix. TODO confirm.
predict_magicos = pd.read_csv(base_dir / "data/predict/final/product_id_clase_6_modelo_reg_simple_v1_magicos.csv")
blended = df_prediction_grouped_product_id.merge(predict_magicos, on='product_id', how='left', suffixes=('', '_magico'))
blended["es_top"] = blended["product_id"].isin(top_productos)

# Non-top products that are absent from the magicos file would end up as NaN
# in the submission; report them and fall back to the model prediction.
n_sin_magico = int((~blended["es_top"] & blended["tn_magico"].isna()).sum())
if n_sin_magico:
    print(f"Productos sin predicción mágica (se usa la del modelo): {n_sin_magico}")

# Top products keep the model value, the rest take the magicos value.
blended["tn2"] = (
    blended["tn"]
    .where(blended["es_top"], blended["tn_magico"])
    .fillna(blended["tn"])
)
df_prediction_grouped_product_id = blended[["product_id", "tn2"]].rename(columns={"tn2": "tn"})

# Timestamped file name so earlier submissions are not overwritten.
time_tag = pd.Timestamp.now().strftime("%Y%m%d_%H%M")
predict_file_processed = base_dir / f"data/predict/final/predicciones_final_{time_tag}.csv"
df_prediction_grouped_product_id[["product_id","tn"]].to_csv(predict_file_processed, index=False)


In [37]:
# Display the final per-product table (columns product_id, tn) that was written to the CSV.
df_prediction_grouped_product_id

Unnamed: 0,product_id,tn
0,20001,1108.548208
1,20002,1224.437310
2,20003,675.753929
3,20004,539.719046
4,20005,536.671452
...,...,...
775,21263,0.029993
776,21265,0.089541
777,21266,0.094659
778,21267,0.092835
