In [17]:
import pandas as pd
import numpy as np
import importlib
import gc
import sys
import warnings
sys.path.append('./scripts')  
import preprocesamiento
import feature_engineering
import model_lgb
# Re-import the project modules so that edits made under ./scripts are picked
# up without restarting the kernel. Reload order matches the original cell.
for _modulo in (preprocesamiento, model_lgb, feature_engineering):
    importlib.reload(_modulo)

# NOTE(review): this silences every warning for the rest of the session,
# including pandas/LightGBM deprecation and performance warnings.
warnings.filterwarnings("ignore")

# Experimento 7: 
- LGBM
- Usando función de entrenamiento: semillerio_en_prediccion
- Aumento cantidad de variables
- cambio pesos de logaritmo a y_max
- sqlite:///optuna_studies_v18.db
- Kaggle =  


##### Levantamos el dataset con target ya calculado

In [2]:
# Load the period-by-product table (per the file name, the target column is
# already included) and display its dimensions as the cell output.
df = pd.read_csv(
    "./datasets/periodo_x_producto_con_target.csv",
    sep=',',
    encoding='utf-8',
)
df.shape

(31362, 19)

In [3]:
# Snapshot of the column names as loaded, taken before any feature
# engineering cell adds more columns to df.
columnas_baseline = list(df.columns)
columnas_baseline

['product_id',
 'periodo',
 'nacimiento_producto',
 'muerte_producto',
 'mes_n',
 'total_meses',
 'producto_nuevo',
 'ciclo_de_vida_inicial',
 'cat1',
 'cat2',
 'cat3',
 'brand',
 'sku_size',
 'stock_final',
 'tn',
 'plan_precios_cuidados',
 'cust_request_qty',
 'cust_request_tn',
 'target']

##### Preprocesamiento a la mínima expresión :)

In [50]:
# ##### aplicamos OHE
# df = preprocesamiento.aplicarOHE(df)
# df.shape

### Feature Engineering

##### Neural Prophet

In [4]:
# Load the precomputed NeuralProphet components and left-join them onto df
# using (periodo, product_id) as the key.
neural_prophet_fe = pd.read_csv("./datasets/features_neuralprophet_completo.csv", sep=',', encoding='utf-8')
neural_prophet_fe['ds'] = pd.to_datetime(neural_prophet_fe['ds'], errors='coerce')

# Build the YYYYMM key with the vectorized .dt accessor instead of a row-wise
# lambda. Dates that failed to parse (NaT) produce NaN here, just as the
# previous lambda produced None for them.
neural_prophet_fe['periodo'] = (
    neural_prophet_fe['ds'].dt.year * 100 + neural_prophet_fe['ds'].dt.month
)
neural_prophet_fe = neural_prophet_fe[['periodo', 'product_id', 'trend', "season_yearly", "season_monthly"]]

# A left join silently multiplies rows when the right side has duplicated
# keys, so check that the row count is unchanged.
_filas_antes = len(df)
df = df.merge(neural_prophet_fe, on=['periodo', 'product_id'], how='left')
assert len(df) == _filas_antes, (
    "NeuralProphet merge changed the row count: duplicated (periodo, product_id) keys in the feature file"
)
df.shape

(31362, 22)

##### Prophet

In [5]:
# Load the precomputed Prophet components (additive and multiplicative) and
# left-join them onto df using (periodo, product_id) as the key.
prophet_fe = pd.read_csv("./datasets/prophet_features_tn_zscore.csv", sep=',', encoding='utf-8')
prophet_fe['ds'] = pd.to_datetime(prophet_fe['ds'], errors='coerce')

# Build the YYYYMM key with the vectorized .dt accessor instead of a row-wise
# lambda. Dates that failed to parse (NaT) produce NaN here, just as the
# previous lambda produced None for them.
prophet_fe['periodo'] = prophet_fe['ds'].dt.year * 100 + prophet_fe['ds'].dt.month
prophet_fe = prophet_fe[['periodo', 'product_id', 'trend_add', "yearly_add", "additive_terms", 'trend_mult', 'yearly_mult', 'multiplicative_terms']]

# A left join silently multiplies rows when the right side has duplicated
# keys, so check that the row count is unchanged.
_filas_antes = len(df)
df = df.merge(prophet_fe, on=['periodo', 'product_id'], how='left')
assert len(df) == _filas_antes, (
    "Prophet merge changed the row count: duplicated (periodo, product_id) keys in the feature file"
)
df.shape

(31362, 28)

##### FE Móviles

In [6]:
# Lag and rolling-window features derived from the "tn" column.
# Each helper is called as helper(df, "tn", <number>) and its result replaces df.
# NOTE(review): 201912 looks like a cutoff period and 24 like a maximum lag;
# confirm against feature_engineering.
_pasos_tn = (
    (feature_engineering.get_lags, 201912),
    (feature_engineering.get_delta_lags, 24),
    (feature_engineering.get_rolling_means, 201912),
    (feature_engineering.get_rolling_stds, 201912),
    (feature_engineering.get_rolling_mins, 201912),
    (feature_engineering.get_rolling_maxs, 201912),
    (feature_engineering.get_rolling_medians, 201912),
    (feature_engineering.get_rolling_skewness, 201912),
    (feature_engineering.get_autocorrelaciones, 201912),
)
for _generar, _parametro in _pasos_tn:
    df = _generar(df, "tn", _parametro)
df.shape

(31362, 590)

In [7]:
# Lag and rolling-window features derived from "cust_request_qty".
# Same helpers as the "tn" cell, minus medians, skewness and autocorrelations.
_pasos_cust = (
    (feature_engineering.get_lags, 201912),
    (feature_engineering.get_delta_lags, 24),
    (feature_engineering.get_rolling_means, 201912),
    (feature_engineering.get_rolling_stds, 201912),
    (feature_engineering.get_rolling_mins, 201912),
    (feature_engineering.get_rolling_maxs, 201912),
)
for _generar, _parametro in _pasos_cust:
    df = _generar(df, "cust_request_qty", _parametro)

In [8]:
# Lag and rolling-window features derived from "stock_final".
# Each helper is called as helper(df, "stock_final", <number>) and its result
# replaces df; the final shape is shown as the cell output.
_pasos_stock = (
    (feature_engineering.get_lags, 201912),
    (feature_engineering.get_delta_lags, 24),
    (feature_engineering.get_rolling_means, 201912),
    (feature_engineering.get_rolling_stds, 201912),
    (feature_engineering.get_rolling_mins, 201912),
    (feature_engineering.get_rolling_maxs, 201912),
)
for _generar, _parametro in _pasos_stock:
    df = _generar(df, "stock_final", _parametro)
df.shape

(31362, 1500)

Features Diana

In [9]:
# Differences and ratios against the moving averages, applied in that order.
df = (
    df
    .pipe(feature_engineering.calcular_diferencia_con_medias_moviles)
    .pipe(feature_engineering.calcular_ratios_con_medias_moviles)
)

##### FE Móviles sobre otras variables

In [56]:
# #  stock final
# df = feature_engineering.get_lagsEspecificos(df, col='stock_final_zscore')
# df = feature_engineering.get_delta_lags_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_means_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_stds_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_medians_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_mins_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_maxs_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_skewness_especificos(df, col='stock_final_zscore')

#  cust_request_qty
# df = feature_engineering.get_lagsEspecificos(df, col='cust_request_qty')
# df = feature_engineering.get_delta_lags_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_means_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_stds_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_mins_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_maxs_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_medians_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_skewness_especificos(df, col='cust_request_qty')

##### FE Calendario

In [10]:
# Calendar-related features: ids, temporal components and the "anomalias
# politicas" flags, applied in that order.
df = (
    df
    .pipe(feature_engineering.generar_ids)
    .pipe(feature_engineering.get_componentesTemporales)
    .pipe(feature_engineering.get_anomaliasPoliticas)
)
# Disabled step, kept for reference:
# df = feature_engineering.descomposicion_serie_temporal(df, col='tn')
df.shape

(31362, 1561)

##### FE sobre FE

In [11]:
# Second-order features: series features computed for "tn", followed by the
# holiday-month features.
df = (
    df
    .pipe(feature_engineering.chatGPT_features_serie, "tn")
    .pipe(feature_engineering.mes_con_feriado)
)
df.shape

(31362, 1590)

##### Variables Exógenas

In [12]:
# Attach the exchange-rate and CPI series to df.
df = feature_engineering.get_dolar(df)
df = feature_engineering.get_IPC(df)

# Both series arrive as strings with a decimal comma; convert them to float.
# Re-running this cell on already-converted columns fails, because the .str
# accessor requires string data.
for _columna in ('ipc', 'dolar'):
    df[_columna] = df[_columna].str.replace(',', '.').astype(float)

# df.drop(columns=['ds'], inplace=True)

# NOTE(review): this fills every NaN in every column with 0, including the
# lag/rolling features and `target` if it has missing values. Still marked
# as something to experiment with.
df.fillna(0, inplace=True) ##### EXPERIMENTAR

df = feature_engineering.correlacion_exogenas(df)
df = feature_engineering.get_mes_receso_escolar(df)
df.shape

(31362, 1593)

##### Nuevas FE

In [13]:
# Additional feature families. Each builder takes the frame and returns the
# enriched frame; they run in the order listed.
_constructores = (
    feature_engineering.create_ratio_features,
    feature_engineering.enhance_lifecycle_features,
    feature_engineering.create_category_features,
    feature_engineering.create_regime_features,
    feature_engineering.create_nonlinear_trends,
    feature_engineering.create_temporal_interactions,
    feature_engineering.create_asymmetric_window_features,
    feature_engineering.recomendaciones_deepseek,
    feature_engineering.get_nuevas_features,
)
for _construir in _constructores:
    df = _construir(df)
df.shape

(31362, 1635)

##### Elimino aquellas que no sirven

In [None]:
# importantes = pd.read_csv("./feature_importance/exp04_3.csv", sep=',', encoding='utf-8')
# no_importantes = importantes[importantes['importance'] <= 100]
# no_importantes = no_importantes[~no_importantes['feature'].isin(columnas_baseline)]
# no_importantes

Unnamed: 0,feature,importance
67,tn_vs_prev_year,100
68,tn_delta_lag5_lag8,99
69,tn_rolling_min_4,96
70,tn_lag_5,95
71,tn_rolling_std_6,95
...,...,...
677,dia_del_year,0
678,cat2_TE,0
680,cat2_PIEL1,0
681,cat2_OTROS,0


In [None]:
# cols_a_eliminar = no_importantes.feature.unique()
# print(f"Antes de eliminar: {df.shape[1]} columnas")
# df = df.drop(columns=cols_a_eliminar, errors='ignore')
# print(f"Después de eliminar: {df.shape[1]} columnas")

Antes de eliminar: 1144 columnas
Después de eliminar: 684 columnas


Eliminar columnas de tipo datetime y object

In [14]:
# Drop datetime and object-dtype columns; every other dtype is kept
# (category columns, if any exist, are not excluded by this filter).
df = df.select_dtypes(
    exclude=['datetime', 'datetime64', 'object'],
)

Train Test Split

In [15]:
# Split by period: train takes every row up to and including 201912,
# test takes only the 201912 rows.
# NOTE(review): the 201912 rows are therefore in both frames. Confirm this is
# intended (e.g. `test` being the rows to score rather than a hold-out set).
_periodo = df['periodo']
train = df[_periodo <= 201912]
test = df[_periodo == 201912]

Entrenamiento

In [16]:
# Launch the Optuna hyper-parameter search on the training frame.
# Per the captured log, this creates the study "lightgbm_optimization_v18" in
# an RDB, and individual trials take several minutes each, so 500 trials is a
# very long run.
model_lgb.optimizar_con_optuna_con_semillerio_db(
    train,
    version="v18",
    n_trials=500,
    pesos="max",
)


Para visualizar los resultados en tiempo real:
1. Abre otra terminal y ejecuta:
   optuna-dashboard sqlite:///optuna_studies_v18.db
2. Abre en tu navegador: http://127.0.0.1:8080/


[I 2025-07-05 09:34:02,190] A new study created in RDB with name: lightgbm_optimization_v18
[I 2025-07-05 09:40:08,664] Trial 0 finished with value: 51.816161217243554 and parameters: {'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}. Best is trial 0 with value: 51.816161217243554.


Mejor trial hasta ahora: RMSE=51.816161, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-07-05 09:50:50,771] Trial 1 finished with value: 259.6389303683741 and parameters: {'num_leaves': 41, 'learning_rate': 0.05958389350068958, 'feature_fraction': 0.7727780074568463, 'bagging_fraction': 0.7873687420594125, 'bagging_freq': 7, 'lambda_l1': 1.8007140198129195e-07, 'lambda_l2': 4.258943089524393e-06, 'min_child_samples': 25, 'max_depth': 6, 'max_bin': 414, 'min_data_in_leaf': 36, 'extra_trees': False, 'early_stopping_rounds': 11, 'path_smooth': 0.6075448519014384, 'min_gain_to_split': 0.08526206184364576}. Best is trial 0 with value: 51.816161217243554.


Mejor trial hasta ahora: RMSE=51.816161, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-07-05 09:54:59,191] Trial 2 finished with value: 495.949895189545 and parameters: {'num_leaves': 20, 'learning_rate': 0.2521267904777921, 'feature_fraction': 0.9862528132298237, 'bagging_fraction': 0.9425192044349383, 'bagging_freq': 4, 'lambda_l1': 7.569183361880229e-08, 'lambda_l2': 0.014391207615728067, 'min_child_samples': 28, 'max_depth': 3, 'max_bin': 298, 'min_data_in_leaf': 22, 'extra_trees': True, 'early_stopping_rounds': 37, 'path_smooth': 0.31171107608941095, 'min_gain_to_split': 0.2600340105889054}. Best is trial 0 with value: 51.816161217243554.


Mejor trial hasta ahora: RMSE=51.816161, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-07-05 10:02:57,899] Trial 3 finished with value: 2193.8122488455087 and parameters: {'num_leaves': 62, 'learning_rate': 0.01875220945578641, 'feature_fraction': 0.9878338511058234, 'bagging_fraction': 0.9325398470083344, 'bagging_freq': 10, 'lambda_l1': 1.1309571585271483, 'lambda_l2': 0.002404915432737351, 'min_child_samples': 47, 'max_depth': 3, 'max_bin': 178, 'min_data_in_leaf': 23, 'extra_trees': False, 'early_stopping_rounds': 21, 'path_smooth': 0.8287375091519293, 'min_gain_to_split': 0.17837666334679464}. Best is trial 0 with value: 51.816161217243554.


Mejor trial hasta ahora: RMSE=51.816161, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-07-05 10:09:35,907] Trial 4 finished with value: 3229.5596097533826 and parameters: {'num_leaves': 39, 'learning_rate': 0.06333268775321843, 'feature_fraction': 0.6563696899899051, 'bagging_fraction': 0.9406590942262119, 'bagging_freq': 1, 'lambda_l1': 7.620481786158549, 'lambda_l2': 0.08916674715636537, 'min_child_samples': 18, 'max_depth': 3, 'max_bin': 427, 'min_data_in_leaf': 77, 'extra_trees': False, 'early_stopping_rounds': 13, 'path_smooth': 0.3584657285442726, 'min_gain_to_split': 0.05793452976256486}. Best is trial 0 with value: 51.816161217243554.


Mejor trial hasta ahora: RMSE=51.816161, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-07-05 10:19:50,179] Trial 5 finished with value: 120.32902290226943 and parameters: {'num_leaves': 89, 'learning_rate': 0.08330803890301997, 'feature_fraction': 0.7323592099410596, 'bagging_fraction': 0.7190675050858071, 'bagging_freq': 4, 'lambda_l1': 8.445977074223802e-06, 'lambda_l2': 0.036851536911881845, 'min_child_samples': 36, 'max_depth': 10, 'max_bin': 289, 'min_data_in_leaf': 29, 'extra_trees': False, 'early_stopping_rounds': 33, 'path_smooth': 0.770967179954561, 'min_gain_to_split': 0.24689779818219537}. Best is trial 0 with value: 51.816161217243554.


Mejor trial hasta ahora: RMSE=51.816161, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-07-05 10:26:41,795] Trial 6 finished with value: 168.39593357885548 and parameters: {'num_leaves': 59, 'learning_rate': 0.042808491617570936, 'feature_fraction': 0.610167650697638, 'bagging_fraction': 0.7323674280979913, 'bagging_freq': 1, 'lambda_l1': 0.005341874754868531, 'lambda_l2': 6.748446817464346e-06, 'min_child_samples': 30, 'max_depth': 10, 'max_bin': 199, 'min_data_in_leaf': 53, 'extra_trees': True, 'early_stopping_rounds': 13, 'path_smooth': 0.289751452913768, 'min_gain_to_split': 0.08061064362700221}. Best is trial 0 with value: 51.816161217243554.


Mejor trial hasta ahora: RMSE=51.816161, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-07-05 10:38:34,645] Trial 7 finished with value: 23.690437334931165 and parameters: {'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 10:45:25,016] Trial 8 finished with value: 687.576238989167 and parameters: {'num_leaves': 15, 'learning_rate': 0.05681142678077596, 'feature_fraction': 0.7669644012595116, 'bagging_fraction': 0.7666323431412191, 'bagging_freq': 2, 'lambda_l1': 1.0927895733904103e-05, 'lambda_l2': 3.0632845126552133, 'min_child_samples': 23, 'max_depth': 7, 'max_bin': 381, 'min_data_in_leaf': 49, 'extra_trees': True, 'early_stopping_rounds': 20, 'path_smooth': 0.49724850589238545, 'min_gain_to_split': 0.15043915490838483}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 11:00:21,483] Trial 9 finished with value: 3425.932399393901 and parameters: {'num_leaves': 39, 'learning_rate': 0.011336695817840537, 'feature_fraction': 0.8438257335919588, 'bagging_fraction': 0.8508037069686585, 'bagging_freq': 1, 'lambda_l1': 3.21972053981427e-06, 'lambda_l2': 1.49414578394363, 'min_child_samples': 19, 'max_depth': 4, 'max_bin': 296, 'min_data_in_leaf': 99, 'extra_trees': False, 'early_stopping_rounds': 41, 'path_smooth': 0.23763754399239967, 'min_gain_to_split': 0.3641081743059298}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 11:10:11,183] Trial 10 finished with value: 225.14631889102816 and parameters: {'num_leaves': 99, 'learning_rate': 0.13416032434032454, 'feature_fraction': 0.8784471213433285, 'bagging_fraction': 0.9862830691251867, 'bagging_freq': 10, 'lambda_l1': 0.001502701089682037, 'lambda_l2': 6.3791369398325e-05, 'min_child_samples': 11, 'max_depth': 9, 'max_bin': 497, 'min_data_in_leaf': 70, 'extra_trees': False, 'early_stopping_rounds': 47, 'path_smooth': 0.990907327492855, 'min_gain_to_split': 0.49687192922818807}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 11:16:49,134] Trial 11 finished with value: 54.911064089237314 and parameters: {'num_leaves': 76, 'learning_rate': 0.28971664054261004, 'feature_fraction': 0.9052045095657504, 'bagging_fraction': 0.8588256102468118, 'bagging_freq': 7, 'lambda_l1': 4.3380793966630313e-08, 'lambda_l2': 1.7270812116459777e-08, 'min_child_samples': 50, 'max_depth': 8, 'max_bin': 496, 'min_data_in_leaf': 42, 'extra_trees': True, 'early_stopping_rounds': 26, 'path_smooth': 0.07846238496678071, 'min_gain_to_split': 0.47411998848815184}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 11:23:01,989] Trial 12 finished with value: 175.75346518011474 and parameters: {'num_leaves': 77, 'learning_rate': 0.16741718695640792, 'feature_fraction': 0.925913487601959, 'bagging_fraction': 0.8855023408544529, 'bagging_freq': 8, 'lambda_l1': 0.00014211618981379697, 'lambda_l2': 1.31299430222377e-08, 'min_child_samples': 41, 'max_depth': 6, 'max_bin': 357, 'min_data_in_leaf': 70, 'extra_trees': True, 'early_stopping_rounds': 27, 'path_smooth': 0.08788444971810856, 'min_gain_to_split': 0.370576595589974}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 11:27:26,307] Trial 13 finished with value: 95.35411319719863 and parameters: {'num_leaves': 48, 'learning_rate': 0.14519268913318212, 'feature_fraction': 0.826308855197473, 'bagging_fraction': 0.89875512954508, 'bagging_freq': 4, 'lambda_l1': 8.163671809851979e-07, 'lambda_l2': 1.102565393878713e-06, 'min_child_samples': 37, 'max_depth': 8, 'max_bin': 101, 'min_data_in_leaf': 38, 'extra_trees': True, 'early_stopping_rounds': 20, 'path_smooth': 0.5625128570253415, 'min_gain_to_split': 0.012753625520683445}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 11:33:11,961] Trial 14 finished with value: 527.7685121772287 and parameters: {'num_leaves': 71, 'learning_rate': 0.19168857979413848, 'feature_fraction': 0.9406932611185447, 'bagging_fraction': 0.9951570865434065, 'bagging_freq': 6, 'lambda_l1': 1.6340250254137068e-08, 'lambda_l2': 0.0003365583633830093, 'min_child_samples': 44, 'max_depth': 5, 'max_bin': 442, 'min_data_in_leaf': 58, 'extra_trees': False, 'early_stopping_rounds': 31, 'path_smooth': 0.7958677618935732, 'min_gain_to_split': 0.39500659174426994}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 11:38:43,128] Trial 15 finished with value: 329.57611889858936 and parameters: {'num_leaves': 29, 'learning_rate': 0.12386192844163976, 'feature_fraction': 0.8624175337686146, 'bagging_fraction': 0.7997359893631422, 'bagging_freq': 9, 'lambda_l1': 9.341148350501848e-05, 'lambda_l2': 1.6500464738094685e-07, 'min_child_samples': 34, 'max_depth': 8, 'max_bin': 362, 'min_data_in_leaf': 45, 'extra_trees': True, 'early_stopping_rounds': 23, 'path_smooth': 0.9984987252001404, 'min_gain_to_split': 0.2719512092425288}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 11:48:34,009] Trial 16 finished with value: 232.6441202121376 and parameters: {'num_leaves': 88, 'learning_rate': 0.030792216974309318, 'feature_fraction': 0.6984494811659784, 'bagging_fraction': 0.8252020862566151, 'bagging_freq': 3, 'lambda_l1': 0.03141951598789899, 'lambda_l2': 0.3117720452365209, 'min_child_samples': 40, 'max_depth': 7, 'max_bin': 448, 'min_data_in_leaf': 32, 'extra_trees': True, 'early_stopping_rounds': 17, 'path_smooth': 0.4514328155142508, 'min_gain_to_split': 0.16756019390833982}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 11:54:11,262] Trial 17 finished with value: 481.4486270378329 and parameters: {'num_leaves': 54, 'learning_rate': 0.10296044495960273, 'feature_fraction': 0.8147324663134726, 'bagging_fraction': 0.8976369323205313, 'bagging_freq': 5, 'lambda_l1': 9.216515731670393e-07, 'lambda_l2': 0.00014593568357602004, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 231, 'min_data_in_leaf': 100, 'extra_trees': False, 'early_stopping_rounds': 34, 'path_smooth': 0.6725859207420777, 'min_gain_to_split': 0.31120079283829877}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 11:59:40,977] Trial 18 finished with value: 440.4736849183969 and parameters: {'num_leaves': 67, 'learning_rate': 0.2181832153087695, 'feature_fraction': 0.8937453835988592, 'bagging_fraction': 0.9452904666198787, 'bagging_freq': 6, 'lambda_l1': 2.280946906991963e-05, 'lambda_l2': 0.0022584545819610774, 'min_child_samples': 43, 'max_depth': 9, 'max_bin': 337, 'min_data_in_leaf': 87, 'extra_trees': False, 'early_stopping_rounds': 26, 'path_smooth': 0.16448804475298406, 'min_gain_to_split': 0.430515477520969}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 12:05:40,390] Trial 19 finished with value: 783.7873679363437 and parameters: {'num_leaves': 99, 'learning_rate': 0.08641148551188793, 'feature_fraction': 0.9405278000235617, 'bagging_fraction': 0.966150963007428, 'bagging_freq': 8, 'lambda_l1': 1.020155732268282e-08, 'lambda_l2': 6.1815752406034745, 'min_child_samples': 50, 'max_depth': 5, 'max_bin': 407, 'min_data_in_leaf': 20, 'extra_trees': True, 'early_stopping_rounds': 17, 'path_smooth': 0.4448660691865179, 'min_gain_to_split': 0.320574167909676}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 12:09:54,610] Trial 20 finished with value: 325.056827194063 and parameters: {'num_leaves': 30, 'learning_rate': 0.27695181472908825, 'feature_fraction': 0.7870075896462443, 'bagging_fraction': 0.9124714230256978, 'bagging_freq': 3, 'lambda_l1': 4.7085484185731894e-07, 'lambda_l2': 1.274818793866283e-07, 'min_child_samples': 10, 'max_depth': 7, 'max_bin': 461, 'min_data_in_leaf': 64, 'extra_trees': True, 'early_stopping_rounds': 50, 'path_smooth': 0.01478120517468584, 'min_gain_to_split': 0.20721559622698366}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 12:14:45,465] Trial 21 finished with value: 45.30830653500514 and parameters: {'num_leaves': 81, 'learning_rate': 0.2800708641106189, 'feature_fraction': 0.9137297806001223, 'bagging_fraction': 0.8498901197745152, 'bagging_freq': 8, 'lambda_l1': 6.222247264455146e-08, 'lambda_l2': 1.0636605953302615e-08, 'min_child_samples': 50, 'max_depth': 8, 'max_bin': 496, 'min_data_in_leaf': 42, 'extra_trees': True, 'early_stopping_rounds': 27, 'path_smooth': 0.16173608671122056, 'min_gain_to_split': 0.49937823147175053}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 12:21:02,841] Trial 22 finished with value: 116.46086897485577 and parameters: {'num_leaves': 87, 'learning_rate': 0.18545119947871933, 'feature_fraction': 0.8671818675015122, 'bagging_fraction': 0.8726182101432829, 'bagging_freq': 9, 'lambda_l1': 7.672709160593344e-08, 'lambda_l2': 1.2822835644335922e-07, 'min_child_samples': 46, 'max_depth': 8, 'max_bin': 491, 'min_data_in_leaf': 30, 'extra_trees': True, 'early_stopping_rounds': 27, 'path_smooth': 0.1724297152314208, 'min_gain_to_split': 0.45061727320671363}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 12:27:04,511] Trial 23 finished with value: 74.36183365236836 and parameters: {'num_leaves': 83, 'learning_rate': 0.2919505189195406, 'feature_fraction': 0.9144917526190501, 'bagging_fraction': 0.826752896569488, 'bagging_freq': 8, 'lambda_l1': 1.8330210432421913e-06, 'lambda_l2': 4.0428230960960376e-07, 'min_child_samples': 39, 'max_depth': 9, 'max_bin': 393, 'min_data_in_leaf': 54, 'extra_trees': True, 'early_stopping_rounds': 36, 'path_smooth': 0.18487092956980355, 'min_gain_to_split': 0.4364833350247777}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 12:33:50,372] Trial 24 finished with value: 47.90270289216812 and parameters: {'num_leaves': 93, 'learning_rate': 0.2087511543581648, 'feature_fraction': 0.975256060400439, 'bagging_fraction': 0.8305014798202526, 'bagging_freq': 9, 'lambda_l1': 2.244591103786046e-07, 'lambda_l2': 2.2452869495924043e-08, 'min_child_samples': 47, 'max_depth': 10, 'max_bin': 458, 'min_data_in_leaf': 46, 'extra_trees': True, 'early_stopping_rounds': 41, 'path_smooth': 0.3653599236797689, 'min_gain_to_split': 0.40446788762687114}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}


[I 2025-07-05 12:41:15,821] Trial 25 finished with value: 49.24820547076514 and parameters: {'num_leaves': 94, 'learning_rate': 0.11777044730344836, 'feature_fraction': 0.9657779127740821, 'bagging_fraction': 0.8188400459203229, 'bagging_freq': 9, 'lambda_l1': 4.014447999501781e-05, 'lambda_l2': 1.713482889508348e-05, 'min_child_samples': 48, 'max_depth': 10, 'max_bin': 460, 'min_data_in_leaf': 45, 'extra_trees': True, 'early_stopping_rounds': 40, 'path_smooth': 0.41036922644443014, 'min_gain_to_split': 0.40334472418034417}. Best is trial 7 with value: 23.690437334931165.


Mejor trial hasta ahora: RMSE=23.690437, Parámetros={'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}
Estudio guardado en: sqlite:///optuna_studies_v18.db

Mejores hiperparámetros encontrados:
num_leaves: 94
learning_rate: 0.156203869845265
feature_fraction: 0.8533615026041694
bagging_fraction: 0.9614381770563153
bagging_freq: 9
lambda_l1: 4.776728196949699e-07
lambda_l2: 1.0790237065789294
min_child_samples: 32
max_depth: 9
max_bin: 459
min_data_in_leaf: 45
extra_trees: False
early_stopping_rounds: 27
path_smooth: 0.8180147659224931
min_gain_to_split: 0.4303652916281717


(<optuna.study.study.Study at 0x1e6048d9890>,
 {'num_leaves': 94,
  'learning_rate': 0.156203869845265,
  'feature_fraction': 0.8533615026041694,
  'bagging_fraction': 0.9614381770563153,
  'bagging_freq': 9,
  'lambda_l1': 4.776728196949699e-07,
  'lambda_l2': 1.0790237065789294,
  'min_child_samples': 32,
  'max_depth': 9,
  'max_bin': 459,
  'min_data_in_leaf': 45,
  'extra_trees': False,
  'early_stopping_rounds': 27,
  'path_smooth': 0.8180147659224931,
  'min_gain_to_split': 0.4303652916281717,
  'objective': 'regression',
  'metric': 'rmse',
  'boosting_type': 'gbdt',
  'verbosity': -1})

Predicción

In [18]:
# Train and predict through the project helper; the resulting frame is kept in `df_future`.
# NOTE(review): presumably pesos="max" selects the y_max sample weighting described in the
# experiment notes, and version="v18" ties the run to the v18 Optuna study - confirm in model_lgb.
df_future = model_lgb.semillerio_en_prediccion_con_pesos(
    train,
    test,
    version="v18",
    pesos="max",
)

In [19]:
# Display the frame returned by the prediction helper (pandas truncates long frames on display).
df_future

Unnamed: 0,periodo,product_id,target,pred
30476,201912,20001,0.0,813.458405
30477,201912,20002,0.0,368.541620
30478,201912,20003,0.0,68.682950
30479,201912,20004,0.0,-19.712524
30480,201912,20005,0.0,94.846860
...,...,...,...,...
31357,201912,21265,0.0,-3.468574
31358,201912,21266,0.0,-4.426887
31359,201912,21267,0.0,-7.239316
31360,201912,21271,0.0,-6.789859


Filtramos los 780 productos

In [20]:
# List of product ids that must be predicted for 201912 (tab-separated file fetched by URL).
productos_ok = pd.read_csv(
    "https://storage.googleapis.com/open-courses/austral2025-af91/labo3v/product_id_apredecir201912.txt",
    sep="\t",
)
# Keep only rows of period 201912 whose product_id appears in that list.
df_future = df_future.loc[
    (df_future['periodo'] == 201912)
    & df_future['product_id'].isin(productos_ok['product_id'].unique())
]


In [21]:
# Display the frame after the period / product-list filter.
df_future

Unnamed: 0,periodo,product_id,target,pred
30476,201912,20001,0.0,813.458405
30477,201912,20002,0.0,368.541620
30478,201912,20003,0.0,68.682950
30479,201912,20004,0.0,-19.712524
30480,201912,20005,0.0,94.846860
...,...,...,...,...
31355,201912,21263,0.0,0.067347
31357,201912,21265,0.0,-3.468574
31358,201912,21266,0.0,-4.426887
31359,201912,21267,0.0,-7.239316


Vemos cuántas predicciones negativas hay

In [22]:
# Rows whose predicted value is below zero.
df_future.loc[df_future['pred'].lt(0)]

Unnamed: 0,periodo,product_id,target,pred
30479,201912,20004,0.0,-19.712524
30481,201912,20006,0.0,-20.043549
30482,201912,20007,0.0,-47.520696
30483,201912,20008,0.0,-34.543915
30484,201912,20009,0.0,-0.037305
...,...,...,...,...
31351,201912,21256,0.0,-5.154890
31354,201912,21262,0.0,-0.830648
31357,201912,21265,0.0,-3.468574
31358,201912,21266,0.0,-4.426887


Reemplazamos las predicciones negativas por el promedio de los últimos 12 meses

In [23]:
# Per-product fallback values from the project helper; the merge below brings in its `tn` column.
# NOTE(review): presumably `tn` is the mean of the last 12 months per product - confirm in model_lgb.
promedio780 = model_lgb.promedio_12_meses_780p()

# Attach the fallback by product_id and discard the columns that are no longer needed.
df_future = (
    df_future
    .merge(promedio780, on='product_id', how='left')
    .drop(columns=['target', 'periodo'])
)

# Wherever the prediction is negative, use the fallback `tn` of the same row instead.
df_future['pred'] = df_future['pred'].mask(df_future['pred'].lt(0), df_future['tn'])
df_future



Unnamed: 0,product_id,pred,tn
0,20001,813.458405,1454.732720
1,20002,368.541620,1175.437142
2,20003,68.682950,784.976407
3,20004,627.215328,627.215328
4,20005,94.846860,668.270104
...,...,...,...
775,21263,0.067347,0.029993
776,21265,0.089541,0.089541
777,21266,0.094659,0.094659
778,21267,0.092835,0.092835


Guardamos el archivo

In [24]:
# Reshape the working frame into the submission layout (prediction exposed as `tn`) and save it.
# The reshape is guarded so that re-running this cell is safe: once `pred` has been renamed to
# `tn`, dropping `tn` again would discard the predictions, the rename would silently do nothing,
# and the CSV would be overwritten without any prediction column.
if 'pred' in df_future.columns:
    # Remove the merged fallback column `tn`, then publish the prediction under that name.
    df_future = df_future.drop(columns=['tn']).rename(columns={'pred': 'tn'})
df_future.to_csv("./outputs/predicciones_exp_07_lgb_v5.csv", index=False, sep=',')