In [17]:
import pandas as pd
import numpy as np
import importlib
import gc
import sys
import warnings
sys.path.append('./scripts')  
import preprocesamiento
import feature_engineering
import model_lgb
# Reload the project modules so edits saved after the kernel first imported
# them take effect without restarting the kernel.
for _modulo in (preprocesamiento, model_lgb, feature_engineering):
    importlib.reload(_modulo)
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Experimento 7: 
- LGBM
- Usando la función de entrenamiento: semillerio_en_prediccion
- Aumento cantidad de variables
- sqlite:///optuna_studies_v15.db
- Kaggle =  


##### Levantamos el dataset con target ya calculado

In [3]:
# Read the period-by-product table; its 'target' column was computed beforehand.
ruta_dataset = "./datasets/periodo_x_producto_con_target.csv"
df = pd.read_csv(ruta_dataset, encoding='utf-8', sep=',')
df.shape

(31362, 19)

In [4]:
# Snapshot of the column names as loaded, taken before the feature cells below add more.
columnas_baseline = list(df.columns)
columnas_baseline

['product_id',
 'periodo',
 'nacimiento_producto',
 'muerte_producto',
 'mes_n',
 'total_meses',
 'producto_nuevo',
 'ciclo_de_vida_inicial',
 'cat1',
 'cat2',
 'cat3',
 'brand',
 'sku_size',
 'stock_final',
 'tn',
 'plan_precios_cuidados',
 'cust_request_qty',
 'cust_request_tn',
 'target']

##### Preprocesamiento a la mínima expresión :)

In [50]:
# ##### aplicamos OHE
# df = preprocesamiento.aplicarOHE(df)
# df.shape

### Feature Engineering

##### Neural Prophet

In [5]:
# Load the NeuralProphet component features and join them onto df by (periodo, product_id).
neural_prophet_fe = pd.read_csv("./datasets/features_neuralprophet_completo.csv", sep=',', encoding='utf-8')
neural_prophet_fe['ds'] = pd.to_datetime(neural_prophet_fe['ds'], errors='coerce')
# Build the YYYYMM key with the vectorized .dt accessor instead of a row-wise lambda.
# Dates that failed to parse (NaT) end up with a NaN key.
fechas_np = neural_prophet_fe['ds']
neural_prophet_fe['periodo'] = fechas_np.dt.year * 100 + fechas_np.dt.month
neural_prophet_fe = neural_prophet_fe[['periodo', 'product_id', 'trend', "season_yearly", "season_monthly"]]
# Merge into a temporary frame first so df stays untouched if the check below fails.
df_con_np = df.merge(neural_prophet_fe, on=['periodo', 'product_id'], how='left')
# A left join must not add rows; extra rows mean repeated (periodo, product_id) keys in the feature file.
assert len(df_con_np) == len(df), (
    "El merge con las features de NeuralProphet agregó filas: "
    "hay claves (periodo, product_id) repetidas en el archivo de features"
)
df = df_con_np
df.shape

(31362, 22)

##### Prophet

In [6]:
# Load the Prophet component features and join them onto df by (periodo, product_id).
prophet_fe = pd.read_csv("./datasets/prophet_features_tn_zscore.csv", sep=',', encoding='utf-8')
prophet_fe['ds'] = pd.to_datetime(prophet_fe['ds'], errors='coerce')
# Build the YYYYMM key with the vectorized .dt accessor instead of a row-wise lambda.
# Dates that failed to parse (NaT) end up with a NaN key.
fechas_prophet = prophet_fe['ds']
prophet_fe['periodo'] = fechas_prophet.dt.year * 100 + fechas_prophet.dt.month
prophet_fe = prophet_fe[['periodo', 'product_id', 'trend_add', "yearly_add", "additive_terms", 'trend_mult', 'yearly_mult', 'multiplicative_terms']]
# Merge into a temporary frame first so df stays untouched if the check below fails.
df_con_prophet = df.merge(prophet_fe, on=['periodo', 'product_id'], how='left')
# A left join must not add rows; extra rows mean repeated (periodo, product_id) keys in the feature file.
assert len(df_con_prophet) == len(df), (
    "El merge con las features de Prophet agregó filas: "
    "hay claves (periodo, product_id) repetidas en el archivo de features"
)
df = df_con_prophet
df.shape

(31362, 28)

##### FE Moviles

In [7]:
# Lag / rolling-window feature helpers applied to the 'tn' column, in this exact order.
# Each entry pairs a helper with the third positional argument it receives:
# 201912 for every helper except get_delta_lags, which gets 24
# (presumably a cutoff period and a lag count respectively — TODO confirm in feature_engineering).
pasos_tn = (
    (feature_engineering.get_lags, 201912),
    (feature_engineering.get_delta_lags, 24),
    (feature_engineering.get_rolling_means, 201912),
    (feature_engineering.get_rolling_stds, 201912),
    (feature_engineering.get_rolling_mins, 201912),
    (feature_engineering.get_rolling_maxs, 201912),
    (feature_engineering.get_rolling_medians, 201912),
    (feature_engineering.get_rolling_skewness, 201912),
    (feature_engineering.get_autocorrelaciones, 201912),
)
for construir, parametro in pasos_tn:
    df = construir(df, "tn", parametro)
df.shape

(31362, 590)

In [8]:
# Same lag / rolling-window helpers as for 'tn', now on 'cust_request_qty'
# (without the median, skewness and autocorrelation steps).
# Third positional argument: 201912 for all helpers except get_delta_lags (24) — meaning not visible here, TODO confirm.
pasos_cust_request = (
    (feature_engineering.get_lags, 201912),
    (feature_engineering.get_delta_lags, 24),
    (feature_engineering.get_rolling_means, 201912),
    (feature_engineering.get_rolling_stds, 201912),
    (feature_engineering.get_rolling_mins, 201912),
    (feature_engineering.get_rolling_maxs, 201912),
)
for construir, parametro in pasos_cust_request:
    df = construir(df, "cust_request_qty", parametro)

In [9]:
# Lag / rolling-window helpers applied to 'stock_final', in this exact order.
# Third positional argument: 201912 for all helpers except get_delta_lags (24) — meaning not visible here, TODO confirm.
pasos_stock = (
    (feature_engineering.get_lags, 201912),
    (feature_engineering.get_delta_lags, 24),
    (feature_engineering.get_rolling_means, 201912),
    (feature_engineering.get_rolling_stds, 201912),
    (feature_engineering.get_rolling_mins, 201912),
    (feature_engineering.get_rolling_maxs, 201912),
)
for construir, parametro in pasos_stock:
    df = construir(df, "stock_final", parametro)
df.shape

(31362, 1500)

##### FE Moviles sobre otras variables

In [56]:
# #  stock final
# df = feature_engineering.get_lagsEspecificos(df, col='stock_final_zscore')
# df = feature_engineering.get_delta_lags_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_means_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_stds_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_medians_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_mins_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_maxs_especificos(df, col='stock_final_zscore')
# df = feature_engineering.get_rolling_skewness_especificos(df, col='stock_final_zscore')

#  cust_request_qty
# df = feature_engineering.get_lagsEspecificos(df, col='cust_request_qty')
# df = feature_engineering.get_delta_lags_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_means_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_stds_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_mins_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_maxs_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_medians_especificos(df, col='cust_request_qty')
# df = feature_engineering.get_rolling_skewness_especificos(df, col='cust_request_qty')

##### FE Calendario

In [10]:
# Calendar feature helpers from the project module, applied one after another to df.
pasos_calendario = (
    feature_engineering.generar_ids,
    feature_engineering.get_componentesTemporales,
    feature_engineering.get_anomaliasPoliticas,
)
for agregar in pasos_calendario:
    df = agregar(df)
# Disabled step: df = feature_engineering.descomposicion_serie_temporal(df, col='tn')
df.shape

(31362, 1525)

##### FE sobre FE

In [11]:
# Second-order features: series features derived for 'tn', then the holiday-month helper.
df = df.pipe(feature_engineering.chatGPT_features_serie, "tn")
df = df.pipe(feature_engineering.mes_con_feriado)
df.shape

(31362, 1554)

##### Variables Exogenas

In [12]:
# Exogenous series: the project helpers add the 'dolar' and 'ipc' columns to df.
for agregar_exogena in (feature_engineering.get_dolar, feature_engineering.get_IPC):
    df = agregar_exogena(df)
# Both columns are handled as strings with a decimal comma: swap it for a dot and cast to float.
for columna in ('ipc', 'dolar'):
    df[columna] = df[columna].str.replace(',', '.').astype(float)
# Disabled step: df.drop(columns=['ds'], inplace=True)
# Experimental choice (flagged "EXPERIMENTAR" by the author): zero-fill every remaining NaN.
# NOTE(review): this applies to all columns of df, 'target' included if it has gaps — confirm that is intended.
df.fillna(0, inplace=True)
for derivar in (feature_engineering.correlacion_exogenas, feature_engineering.get_mes_receso_escolar):
    df = derivar(df)
df.shape

(31362, 1557)

##### Nuevas FE

In [13]:
# Additional feature helpers; each takes df and its result replaces df, in this order.
pasos_nuevas_fe = (
    feature_engineering.create_ratio_features,
    feature_engineering.enhance_lifecycle_features,
    feature_engineering.create_category_features,
    feature_engineering.create_regime_features,
    feature_engineering.create_nonlinear_trends,
    feature_engineering.create_temporal_interactions,
    feature_engineering.create_asymmetric_window_features,
    feature_engineering.recomendaciones_deepseek,
    feature_engineering.get_nuevas_features,
)
for transformar in pasos_nuevas_fe:
    df = transformar(df)
df.shape

(31362, 1599)

##### Elimino aquellas que no sirven

In [None]:
# importantes = pd.read_csv("./feature_importance/exp04_3.csv", sep=',', encoding='utf-8')
# no_importantes = importantes[importantes['importance'] <= 100]
# no_importantes = no_importantes[~no_importantes['feature'].isin(columnas_baseline)]
# no_importantes

Unnamed: 0,feature,importance
67,tn_vs_prev_year,100
68,tn_delta_lag5_lag8,99
69,tn_rolling_min_4,96
70,tn_lag_5,95
71,tn_rolling_std_6,95
...,...,...
677,dia_del_year,0
678,cat2_TE,0
680,cat2_PIEL1,0
681,cat2_OTROS,0


In [None]:
# cols_a_eliminar = no_importantes.feature.unique()
# print(f"Antes de eliminar: {df.shape[1]} columnas")
# df = df.drop(columns=cols_a_eliminar, errors='ignore')
# print(f"Después de eliminar: {df.shape[1]} columnas")

Antes de eliminar: 1144 columnas
Después de eliminar: 684 columnas


Eliminar columnas de tipo object y datetime

In [14]:
# Keep only the columns whose dtype is neither datetime nor object.
tipos_descartados = ['datetime', 'datetime64', 'object']
df = df.select_dtypes(exclude=tipos_descartados)

Train Test Split

In [15]:
# Rows with periodo up to and including 201912 form train; the 201912 rows alone form test.
# NOTE(review): as written, every test row is also in train — confirm this overlap is intended.
periodos = df['periodo']
train = df.loc[periodos <= 201912]
test = df.loc[periodos == 201912]

Entrenamiento

In [16]:
# Launch the project's Optuna search on the training frame.
# Per the recorded output, version "v15" produced the study
# lightgbm_optimization_v15 stored in optuna_studies_v15.db.
version_estudio = "v15"
model_lgb.optimizar_con_optuna_con_semillerio_db(train, version=version_estudio)


Para visualizar los resultados en tiempo real:
1. Abre otra terminal y ejecuta:
   optuna-dashboard sqlite:///optuna_studies_v15.db
2. Abre en tu navegador: http://127.0.0.1:8080/


[I 2025-06-30 22:05:36,488] A new study created in RDB with name: lightgbm_optimization_v15
[I 2025-06-30 22:10:45,212] Trial 0 finished with value: 8.484416635084013 and parameters: {'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}. Best is trial 0 with value: 8.484416635084013.


Mejor trial hasta ahora: RMSE=8.484417, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-06-30 22:21:27,633] Trial 1 finished with value: 17.15767005667181 and parameters: {'num_leaves': 41, 'learning_rate': 0.05958389350068958, 'feature_fraction': 0.7727780074568463, 'bagging_fraction': 0.7873687420594125, 'bagging_freq': 7, 'lambda_l1': 1.8007140198129195e-07, 'lambda_l2': 4.258943089524393e-06, 'min_child_samples': 25, 'max_depth': 6, 'max_bin': 414, 'min_data_in_leaf': 36, 'extra_trees': False, 'early_stopping_rounds': 11, 'path_smooth': 0.6075448519014384, 'min_gain_to_split': 0.08526206184364576}. Best is trial 0 with value: 8.484416635084013.


Mejor trial hasta ahora: RMSE=8.484417, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-06-30 22:25:19,594] Trial 2 finished with value: 44.991070785300806 and parameters: {'num_leaves': 20, 'learning_rate': 0.2521267904777921, 'feature_fraction': 0.9862528132298237, 'bagging_fraction': 0.9425192044349383, 'bagging_freq': 4, 'lambda_l1': 7.569183361880229e-08, 'lambda_l2': 0.014391207615728067, 'min_child_samples': 28, 'max_depth': 3, 'max_bin': 298, 'min_data_in_leaf': 22, 'extra_trees': True, 'early_stopping_rounds': 37, 'path_smooth': 0.31171107608941095, 'min_gain_to_split': 0.2600340105889054}. Best is trial 0 with value: 8.484416635084013.


Mejor trial hasta ahora: RMSE=8.484417, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-06-30 22:35:07,804] Trial 3 finished with value: 110.30584542497763 and parameters: {'num_leaves': 62, 'learning_rate': 0.01875220945578641, 'feature_fraction': 0.9878338511058234, 'bagging_fraction': 0.9325398470083344, 'bagging_freq': 10, 'lambda_l1': 1.1309571585271483, 'lambda_l2': 0.002404915432737351, 'min_child_samples': 47, 'max_depth': 3, 'max_bin': 178, 'min_data_in_leaf': 23, 'extra_trees': False, 'early_stopping_rounds': 21, 'path_smooth': 0.8287375091519293, 'min_gain_to_split': 0.17837666334679464}. Best is trial 0 with value: 8.484416635084013.


Mejor trial hasta ahora: RMSE=8.484417, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-06-30 22:41:49,159] Trial 4 finished with value: 106.69240781582789 and parameters: {'num_leaves': 39, 'learning_rate': 0.06333268775321843, 'feature_fraction': 0.6563696899899051, 'bagging_fraction': 0.9406590942262119, 'bagging_freq': 1, 'lambda_l1': 7.620481786158549, 'lambda_l2': 0.08916674715636537, 'min_child_samples': 18, 'max_depth': 3, 'max_bin': 427, 'min_data_in_leaf': 77, 'extra_trees': False, 'early_stopping_rounds': 13, 'path_smooth': 0.3584657285442726, 'min_gain_to_split': 0.05793452976256486}. Best is trial 0 with value: 8.484416635084013.


Mejor trial hasta ahora: RMSE=8.484417, Parámetros={'num_leaves': 47, 'learning_rate': 0.2536999076681772, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8795975452591109, 'bagging_freq': 2, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.3323645788192616e-08, 'min_child_samples': 45, 'max_depth': 7, 'max_bin': 383, 'min_data_in_leaf': 21, 'extra_trees': True, 'early_stopping_rounds': 18, 'path_smooth': 0.18182496720710062, 'min_gain_to_split': 0.09170225492671691}


[I 2025-06-30 22:52:47,108] Trial 5 finished with value: 8.205587190202099 and parameters: {'num_leaves': 89, 'learning_rate': 0.08330803890301997, 'feature_fraction': 0.7323592099410596, 'bagging_fraction': 0.7190675050858071, 'bagging_freq': 4, 'lambda_l1': 8.445977074223802e-06, 'lambda_l2': 0.036851536911881845, 'min_child_samples': 36, 'max_depth': 10, 'max_bin': 289, 'min_data_in_leaf': 29, 'extra_trees': False, 'early_stopping_rounds': 33, 'path_smooth': 0.770967179954561, 'min_gain_to_split': 0.24689779818219537}. Best is trial 5 with value: 8.205587190202099.


Mejor trial hasta ahora: RMSE=8.205587, Parámetros={'num_leaves': 89, 'learning_rate': 0.08330803890301997, 'feature_fraction': 0.7323592099410596, 'bagging_fraction': 0.7190675050858071, 'bagging_freq': 4, 'lambda_l1': 8.445977074223802e-06, 'lambda_l2': 0.036851536911881845, 'min_child_samples': 36, 'max_depth': 10, 'max_bin': 289, 'min_data_in_leaf': 29, 'extra_trees': False, 'early_stopping_rounds': 33, 'path_smooth': 0.770967179954561, 'min_gain_to_split': 0.24689779818219537}


[I 2025-06-30 23:00:18,136] Trial 6 finished with value: 5.382897215379932 and parameters: {'num_leaves': 59, 'learning_rate': 0.042808491617570936, 'feature_fraction': 0.610167650697638, 'bagging_fraction': 0.7323674280979913, 'bagging_freq': 1, 'lambda_l1': 0.005341874754868531, 'lambda_l2': 6.748446817464346e-06, 'min_child_samples': 30, 'max_depth': 10, 'max_bin': 199, 'min_data_in_leaf': 53, 'extra_trees': True, 'early_stopping_rounds': 13, 'path_smooth': 0.289751452913768, 'min_gain_to_split': 0.08061064362700221}. Best is trial 6 with value: 5.382897215379932.


Mejor trial hasta ahora: RMSE=5.382897, Parámetros={'num_leaves': 59, 'learning_rate': 0.042808491617570936, 'feature_fraction': 0.610167650697638, 'bagging_fraction': 0.7323674280979913, 'bagging_freq': 1, 'lambda_l1': 0.005341874754868531, 'lambda_l2': 6.748446817464346e-06, 'min_child_samples': 30, 'max_depth': 10, 'max_bin': 199, 'min_data_in_leaf': 53, 'extra_trees': True, 'early_stopping_rounds': 13, 'path_smooth': 0.289751452913768, 'min_gain_to_split': 0.08061064362700221}


[I 2025-06-30 23:12:02,766] Trial 7 finished with value: 5.75373309312771 and parameters: {'num_leaves': 94, 'learning_rate': 0.156203869845265, 'feature_fraction': 0.8533615026041694, 'bagging_fraction': 0.9614381770563153, 'bagging_freq': 9, 'lambda_l1': 4.776728196949699e-07, 'lambda_l2': 1.0790237065789294, 'min_child_samples': 32, 'max_depth': 9, 'max_bin': 459, 'min_data_in_leaf': 45, 'extra_trees': False, 'early_stopping_rounds': 27, 'path_smooth': 0.8180147659224931, 'min_gain_to_split': 0.4303652916281717}. Best is trial 6 with value: 5.382897215379932.


Mejor trial hasta ahora: RMSE=5.382897, Parámetros={'num_leaves': 59, 'learning_rate': 0.042808491617570936, 'feature_fraction': 0.610167650697638, 'bagging_fraction': 0.7323674280979913, 'bagging_freq': 1, 'lambda_l1': 0.005341874754868531, 'lambda_l2': 6.748446817464346e-06, 'min_child_samples': 30, 'max_depth': 10, 'max_bin': 199, 'min_data_in_leaf': 53, 'extra_trees': True, 'early_stopping_rounds': 13, 'path_smooth': 0.289751452913768, 'min_gain_to_split': 0.08061064362700221}


[I 2025-06-30 23:18:44,866] Trial 8 finished with value: 25.76518359328828 and parameters: {'num_leaves': 15, 'learning_rate': 0.05681142678077596, 'feature_fraction': 0.7669644012595116, 'bagging_fraction': 0.7666323431412191, 'bagging_freq': 2, 'lambda_l1': 1.0927895733904103e-05, 'lambda_l2': 3.0632845126552133, 'min_child_samples': 23, 'max_depth': 7, 'max_bin': 381, 'min_data_in_leaf': 49, 'extra_trees': True, 'early_stopping_rounds': 20, 'path_smooth': 0.49724850589238545, 'min_gain_to_split': 0.15043915490838483}. Best is trial 6 with value: 5.382897215379932.


Mejor trial hasta ahora: RMSE=5.382897, Parámetros={'num_leaves': 59, 'learning_rate': 0.042808491617570936, 'feature_fraction': 0.610167650697638, 'bagging_fraction': 0.7323674280979913, 'bagging_freq': 1, 'lambda_l1': 0.005341874754868531, 'lambda_l2': 6.748446817464346e-06, 'min_child_samples': 30, 'max_depth': 10, 'max_bin': 199, 'min_data_in_leaf': 53, 'extra_trees': True, 'early_stopping_rounds': 13, 'path_smooth': 0.289751452913768, 'min_gain_to_split': 0.08061064362700221}


[I 2025-06-30 23:33:36,593] Trial 9 finished with value: 77.29682170667218 and parameters: {'num_leaves': 39, 'learning_rate': 0.011336695817840537, 'feature_fraction': 0.8438257335919588, 'bagging_fraction': 0.8508037069686585, 'bagging_freq': 1, 'lambda_l1': 3.21972053981427e-06, 'lambda_l2': 1.49414578394363, 'min_child_samples': 19, 'max_depth': 4, 'max_bin': 296, 'min_data_in_leaf': 99, 'extra_trees': False, 'early_stopping_rounds': 41, 'path_smooth': 0.23763754399239967, 'min_gain_to_split': 0.3641081743059298}. Best is trial 6 with value: 5.382897215379932.


Mejor trial hasta ahora: RMSE=5.382897, Parámetros={'num_leaves': 59, 'learning_rate': 0.042808491617570936, 'feature_fraction': 0.610167650697638, 'bagging_fraction': 0.7323674280979913, 'bagging_freq': 1, 'lambda_l1': 0.005341874754868531, 'lambda_l2': 6.748446817464346e-06, 'min_child_samples': 30, 'max_depth': 10, 'max_bin': 199, 'min_data_in_leaf': 53, 'extra_trees': True, 'early_stopping_rounds': 13, 'path_smooth': 0.289751452913768, 'min_gain_to_split': 0.08061064362700221}


[I 2025-06-30 23:41:19,236] Trial 10 finished with value: 7.467538121738829 and parameters: {'num_leaves': 70, 'learning_rate': 0.029068799010326083, 'feature_fraction': 0.6030189277265172, 'bagging_fraction': 0.7053885626844458, 'bagging_freq': 6, 'lambda_l1': 0.016301353379407614, 'lambda_l2': 1.785697818123025e-05, 'min_child_samples': 11, 'max_depth': 9, 'max_bin': 104, 'min_data_in_leaf': 67, 'extra_trees': True, 'early_stopping_rounds': 47, 'path_smooth': 0.07644843397269116, 'min_gain_to_split': 0.016311559769174172}. Best is trial 6 with value: 5.382897215379932.


Mejor trial hasta ahora: RMSE=5.382897, Parámetros={'num_leaves': 59, 'learning_rate': 0.042808491617570936, 'feature_fraction': 0.610167650697638, 'bagging_fraction': 0.7323674280979913, 'bagging_freq': 1, 'lambda_l1': 0.005341874754868531, 'lambda_l2': 6.748446817464346e-06, 'min_child_samples': 30, 'max_depth': 10, 'max_bin': 199, 'min_data_in_leaf': 53, 'extra_trees': True, 'early_stopping_rounds': 13, 'path_smooth': 0.289751452913768, 'min_gain_to_split': 0.08061064362700221}


[I 2025-06-30 23:48:51,665] Trial 11 finished with value: 3.2461153050439577 and parameters: {'num_leaves': 96, 'learning_rate': 0.1251653145864116, 'feature_fraction': 0.8834227868828812, 'bagging_fraction': 0.9812246562585575, 'bagging_freq': 10, 'lambda_l1': 0.0017917827394915457, 'lambda_l2': 5.258219487669763e-05, 'min_child_samples': 34, 'max_depth': 9, 'max_bin': 494, 'min_data_in_leaf': 49, 'extra_trees': True, 'early_stopping_rounds': 27, 'path_smooth': 0.9812035852809249, 'min_gain_to_split': 0.47411998848815184}. Best is trial 11 with value: 3.2461153050439577.


Mejor trial hasta ahora: RMSE=3.246115, Parámetros={'num_leaves': 96, 'learning_rate': 0.1251653145864116, 'feature_fraction': 0.8834227868828812, 'bagging_fraction': 0.9812246562585575, 'bagging_freq': 10, 'lambda_l1': 0.0017917827394915457, 'lambda_l2': 5.258219487669763e-05, 'min_child_samples': 34, 'max_depth': 9, 'max_bin': 494, 'min_data_in_leaf': 49, 'extra_trees': True, 'early_stopping_rounds': 27, 'path_smooth': 0.9812035852809249, 'min_gain_to_split': 0.47411998848815184}


[I 2025-06-30 23:54:08,816] Trial 12 finished with value: 1.299332378968511 and parameters: {'num_leaves': 79, 'learning_rate': 0.11440654892094684, 'feature_fraction': 0.9053986232371537, 'bagging_fraction': 0.7988287677667018, 'bagging_freq': 8, 'lambda_l1': 0.003857574161640363, 'lambda_l2': 3.8925679417681855e-05, 'min_child_samples': 39, 'max_depth': 10, 'max_bin': 208, 'min_data_in_leaf': 58, 'extra_trees': True, 'early_stopping_rounds': 27, 'path_smooth': 0.9901597696400357, 'min_gain_to_split': 0.44819072914026564}. Best is trial 12 with value: 1.299332378968511.


Mejor trial hasta ahora: RMSE=1.299332, Parámetros={'num_leaves': 79, 'learning_rate': 0.11440654892094684, 'feature_fraction': 0.9053986232371537, 'bagging_fraction': 0.7988287677667018, 'bagging_freq': 8, 'lambda_l1': 0.003857574161640363, 'lambda_l2': 3.8925679417681855e-05, 'min_child_samples': 39, 'max_depth': 10, 'max_bin': 208, 'min_data_in_leaf': 58, 'extra_trees': True, 'early_stopping_rounds': 27, 'path_smooth': 0.9901597696400357, 'min_gain_to_split': 0.44819072914026564}


[I 2025-07-01 00:00:26,650] Trial 13 finished with value: 5.552548771971881 and parameters: {'num_leaves': 80, 'learning_rate': 0.12135983719521656, 'feature_fraction': 0.9178581456448356, 'bagging_fraction': 0.8121933117224753, 'bagging_freq': 8, 'lambda_l1': 0.0006418643074595679, 'lambda_l2': 0.0001582574379358235, 'min_child_samples': 39, 'max_depth': 8, 'max_bin': 492, 'min_data_in_leaf': 66, 'extra_trees': True, 'early_stopping_rounds': 27, 'path_smooth': 0.9559291326495888, 'min_gain_to_split': 0.4900967110055443}. Best is trial 12 with value: 1.299332378968511.


Mejor trial hasta ahora: RMSE=1.299332, Parámetros={'num_leaves': 79, 'learning_rate': 0.11440654892094684, 'feature_fraction': 0.9053986232371537, 'bagging_fraction': 0.7988287677667018, 'bagging_freq': 8, 'lambda_l1': 0.003857574161640363, 'lambda_l2': 3.8925679417681855e-05, 'min_child_samples': 39, 'max_depth': 10, 'max_bin': 208, 'min_data_in_leaf': 58, 'extra_trees': True, 'early_stopping_rounds': 27, 'path_smooth': 0.9901597696400357, 'min_gain_to_split': 0.44819072914026564}


[I 2025-07-01 00:05:45,890] Trial 14 finished with value: 1.0106892200633344 and parameters: {'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 00:10:23,148] Trial 15 finished with value: 2.10479020322729 and parameters: {'num_leaves': 79, 'learning_rate': 0.17109224279309102, 'feature_fraction': 0.9485257618766799, 'bagging_fraction': 0.8958654007917661, 'bagging_freq': 8, 'lambda_l1': 0.19311805214142994, 'lambda_l2': 3.421572849180321e-08, 'min_child_samples': 41, 'max_depth': 10, 'max_bin': 220, 'min_data_in_leaf': 84, 'extra_trees': True, 'early_stopping_rounds': 33, 'path_smooth': 0.6538839629254711, 'min_gain_to_split': 0.3721830905975507}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 00:14:12,152] Trial 16 finished with value: 11.944736460363904 and parameters: {'num_leaves': 83, 'learning_rate': 0.10054887949195469, 'feature_fraction': 0.8227026716857875, 'bagging_fraction': 0.822061595670232, 'bagging_freq': 9, 'lambda_l1': 0.057278386004176896, 'lambda_l2': 5.46078271227366e-07, 'min_child_samples': 50, 'max_depth': 5, 'max_bin': 144, 'min_data_in_leaf': 79, 'extra_trees': True, 'early_stopping_rounds': 41, 'path_smooth': 0.9864440343486612, 'min_gain_to_split': 0.3565405469003663}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 00:23:30,604] Trial 17 finished with value: 5.4934539018805255 and parameters: {'num_leaves': 99, 'learning_rate': 0.036622033681778327, 'feature_fraction': 0.9287211378997813, 'bagging_fraction': 0.8976369323205313, 'bagging_freq': 7, 'lambda_l1': 8.306630840098401e-05, 'lambda_l2': 5.54450238813181e-07, 'min_child_samples': 43, 'max_depth': 8, 'max_bin': 243, 'min_data_in_leaf': 96, 'extra_trees': True, 'early_stopping_rounds': 23, 'path_smooth': 0.7087397162307963, 'min_gain_to_split': 0.29370324178919}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 00:27:52,466] Trial 18 finished with value: 2.8313129467650398 and parameters: {'num_leaves': 70, 'learning_rate': 0.18707824638286116, 'feature_fraction': 0.8748882346435806, 'bagging_fraction': 0.7633081745592074, 'bagging_freq': 9, 'lambda_l1': 0.00011815497621861733, 'lambda_l2': 0.0010816765158133525, 'min_child_samples': 38, 'max_depth': 8, 'max_bin': 251, 'min_data_in_leaf': 61, 'extra_trees': True, 'early_stopping_rounds': 32, 'path_smooth': 0.9085597177400111, 'min_gain_to_split': 0.4207498089725614}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 00:35:01,375] Trial 19 finished with value: 1.7926265318393881 and parameters: {'num_leaves': 88, 'learning_rate': 0.08239946405325899, 'feature_fraction': 0.9592829493986843, 'bagging_fraction': 0.8560114029693604, 'bagging_freq': 5, 'lambda_l1': 0.29019950769397734, 'lambda_l2': 7.188763352094568e-07, 'min_child_samples': 49, 'max_depth': 9, 'max_bin': 345, 'min_data_in_leaf': 90, 'extra_trees': True, 'early_stopping_rounds': 39, 'path_smooth': 0.49148803447070455, 'min_gain_to_split': 0.3127213185315013}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 00:38:26,676] Trial 20 finished with value: 4.342914619026405 and parameters: {'num_leaves': 71, 'learning_rate': 0.27695181472908825, 'feature_fraction': 0.7987169132990556, 'bagging_fraction': 0.9921770866603778, 'bagging_freq': 10, 'lambda_l1': 5.646099525142642, 'lambda_l2': 1.2688802305139627e-08, 'min_child_samples': 43, 'max_depth': 6, 'max_bin': 131, 'min_data_in_leaf': 71, 'extra_trees': True, 'early_stopping_rounds': 50, 'path_smooth': 0.8744625619759305, 'min_gain_to_split': 0.4017824311708472}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 00:45:03,435] Trial 21 finished with value: 3.228538761393019 and parameters: {'num_leaves': 88, 'learning_rate': 0.0856778490792569, 'feature_fraction': 0.9615262820385787, 'bagging_fraction': 0.8498901197745152, 'bagging_freq': 5, 'lambda_l1': 0.27433635712274546, 'lambda_l2': 7.23766171586278e-07, 'min_child_samples': 50, 'max_depth': 9, 'max_bin': 333, 'min_data_in_leaf': 91, 'extra_trees': True, 'early_stopping_rounds': 38, 'path_smooth': 0.4715116714365277, 'min_gain_to_split': 0.30962118932291743}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 00:52:16,082] Trial 22 finished with value: 3.8326630542110074 and parameters: {'num_leaves': 99, 'learning_rate': 0.0799021879712553, 'feature_fraction': 0.9181046678091449, 'bagging_fraction': 0.8139292933591038, 'bagging_freq': 5, 'lambda_l1': 0.013665203725712225, 'lambda_l2': 1.5468616329274978e-07, 'min_child_samples': 47, 'max_depth': 10, 'max_bin': 345, 'min_data_in_leaf': 87, 'extra_trees': True, 'early_stopping_rounds': 37, 'path_smooth': 0.41655835255999285, 'min_gain_to_split': 0.33389971119681044}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 00:57:36,663] Trial 23 finished with value: 3.989614703967129 and parameters: {'num_leaves': 88, 'learning_rate': 0.11505759709059823, 'feature_fraction': 0.9620323392128296, 'bagging_fraction': 0.8615575260217088, 'bagging_freq': 7, 'lambda_l1': 0.6925493413868201, 'lambda_l2': 2.587577084494008e-06, 'min_child_samples': 40, 'max_depth': 8, 'max_bin': 262, 'min_data_in_leaf': 75, 'extra_trees': True, 'early_stopping_rounds': 30, 'path_smooth': 0.6308189452654209, 'min_gain_to_split': 0.4420082508084827}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 01:01:52,936] Trial 24 finished with value: 6.159770385483371 and parameters: {'num_leaves': 74, 'learning_rate': 0.14568924319434676, 'feature_fraction': 0.9896176725148149, 'bagging_fraction': 0.7858982672977801, 'bagging_freq': 6, 'lambda_l1': 0.05871381583986674, 'lambda_l2': 5.4662403953602196e-05, 'min_child_samples': 46, 'max_depth': 9, 'max_bin': 166, 'min_data_in_leaf': 58, 'extra_trees': True, 'early_stopping_rounds': 44, 'path_smooth': 0.5594887642857059, 'min_gain_to_split': 0.22747772415856038}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}


[I 2025-07-01 01:05:55,584] Trial 25 finished with value: 1.5762878385446588 and parameters: {'num_leaves': 90, 'learning_rate': 0.20130980409705235, 'feature_fraction': 0.9016437302675341, 'bagging_fraction': 0.7460290598414079, 'bagging_freq': 8, 'lambda_l1': 0.0011025917499955906, 'lambda_l2': 1.3774897562296066e-07, 'min_child_samples': 36, 'max_depth': 10, 'max_bin': 215, 'min_data_in_leaf': 83, 'extra_trees': True, 'early_stopping_rounds': 25, 'path_smooth': 0.7226957415743933, 'min_gain_to_split': 0.38937888415620175}. Best is trial 14 with value: 1.0106892200633344.


Mejor trial hasta ahora: RMSE=1.010689, Parámetros={'num_leaves': 100, 'learning_rate': 0.11852639795705067, 'feature_fraction': 0.9325299021978121, 'bagging_fraction': 0.9913043905607574, 'bagging_freq': 10, 'lambda_l1': 0.10351163679178885, 'lambda_l2': 1.3129174195141697e-07, 'min_child_samples': 42, 'max_depth': 9, 'max_bin': 220, 'min_data_in_leaf': 78, 'extra_trees': True, 'early_stopping_rounds': 31, 'path_smooth': 0.9652325178791951, 'min_gain_to_split': 0.3734857029985921}
Estudio guardado en: sqlite:///optuna_studies_v15.db

Mejores hiperparámetros encontrados:
num_leaves: 100
learning_rate: 0.11852639795705067
feature_fraction: 0.9325299021978121
bagging_fraction: 0.9913043905607574
bagging_freq: 10
lambda_l1: 0.10351163679178885
lambda_l2: 1.3129174195141697e-07
min_child_samples: 42
max_depth: 9
max_bin: 220
min_data_in_leaf: 78
extra_trees: True
early_stopping_rounds: 31
path_smooth: 0.9652325178791951
min_gain_to_split: 0.3734857029985921


(<optuna.study.study.Study at 0x21580c09510>,
 {'num_leaves': 100,
  'learning_rate': 0.11852639795705067,
  'feature_fraction': 0.9325299021978121,
  'bagging_fraction': 0.9913043905607574,
  'bagging_freq': 10,
  'lambda_l1': 0.10351163679178885,
  'lambda_l2': 1.3129174195141697e-07,
  'min_child_samples': 42,
  'max_depth': 9,
  'max_bin': 220,
  'min_data_in_leaf': 78,
  'extra_trees': True,
  'early_stopping_rounds': 31,
  'path_smooth': 0.9652325178791951,
  'min_gain_to_split': 0.3734857029985921,
  'objective': 'regression',
  'metric': 'rmse',
  'boosting_type': 'gbdt',
  'verbosity': -1})

Predicción

In [18]:
# Produce the prediction frame via the project helper in scripts/model_lgb.py.
# NOTE(review): the helper's name suggests a weighted multi-seed ensemble, and
# version="v15" presumably selects the Optuna study saved in optuna_studies_v15.db;
# confirm both against the helper's implementation.
df_future = model_lgb.semillerio_en_prediccion_con_pesos(train, test, version="v15")

In [19]:
# Display the prediction frame returned by the previous cell.
df_future

Unnamed: 0,periodo,product_id,target,pred
30476,201912,20001,0.0,1312.098207
30477,201912,20002,0.0,1016.446887
30478,201912,20003,0.0,728.742897
30479,201912,20004,0.0,717.139216
30480,201912,20005,0.0,678.526221
...,...,...,...,...
31357,201912,21265,0.0,-1.218135
31358,201912,21266,0.0,-2.318349
31359,201912,21267,0.0,-2.595775
31360,201912,21271,0.0,1.564772


Filtramos los 780 productos a predecir

In [20]:
# Tab-separated list of the product_ids to predict for 201912 (per the file name).
productos_ok = pd.read_csv(
    "https://storage.googleapis.com/open-courses/austral2025-af91/labo3v/product_id_apredecir201912.txt",
    sep="\t",
)
# Keep only rows of period 201912 whose product_id appears in that list.
df_future = df_future[
    (df_future['periodo'] == 201912)
    & df_future['product_id'].isin(productos_ok['product_id'].unique())
]


In [21]:
# Display the frame after restricting it to period 201912 and the listed products.
df_future

Unnamed: 0,periodo,product_id,target,pred
30476,201912,20001,0.0,1312.098207
30477,201912,20002,0.0,1016.446887
30478,201912,20003,0.0,728.742897
30479,201912,20004,0.0,717.139216
30480,201912,20005,0.0,678.526221
...,...,...,...,...
31355,201912,21263,0.0,1.518800
31357,201912,21265,0.0,-1.218135
31358,201912,21266,0.0,-2.318349
31359,201912,21267,0.0,-2.595775


Vemos cuántos negativos hay

In [22]:
# Show the rows whose prediction is strictly below zero.
df_future.loc[df_future['pred'].lt(0)]

Unnamed: 0,periodo,product_id,target,pred
30549,201912,20079,0.0,-5.693166
30570,201912,20102,0.0,-17.693306
30629,201912,20168,0.0,-1.682108
30643,201912,20183,0.0,-1.710660
30647,201912,20187,0.0,-1.216176
...,...,...,...,...
31344,201912,21233,0.0,-1.073537
31354,201912,21262,0.0,-0.944677
31357,201912,21265,0.0,-1.218135
31358,201912,21266,0.0,-2.318349


Reemplazamos las predicciones negativas por el promedio de los últimos 12 meses

In [24]:
# Per-product fallback values in column `tn`, obtained from the project helper.
# NOTE(review): per the helper's name this is a 12-month average for the 780
# products; confirm in scripts/model_lgb.py.
promedio780 = model_lgb.promedio_12_meses_780p()

# Attach the fallback column without mutating in place. Dropping with
# errors='ignore' (including any stale 'tn' left by a previous run) keeps the
# cell re-runnable: a second plain merge would create tn_x/tn_y and the drop of
# 'target'/'periodo' would raise KeyError.
df_future = (
    df_future
    .drop(columns=['target', 'periodo', 'tn'], errors='ignore')
    .merge(promedio780, on='product_id', how='left')
)

# Overwrite negative predictions with the fallback value; the right-hand side is
# aligned to the selected rows by index. A product missing from promedio780 gets
# NaN here because of the left merge.
df_future.loc[df_future['pred'] < 0, 'pred'] = df_future['tn']
df_future



Unnamed: 0,product_id,pred,tn
0,20001,1312.098207,1454.732720
1,20002,1016.446887,1175.437142
2,20003,728.742897,784.976407
3,20004,717.139216,627.215328
4,20005,678.526221,668.270104
...,...,...,...
775,21263,1.518800,0.029993
776,21265,0.089541,0.089541
777,21266,0.094659,0.094659
778,21267,0.092835,0.092835


Guardamos el archivo

In [25]:
# Shape the frame into the output layout (product_id, tn), where `tn` holds the
# final prediction. The guard makes the cell safe to re-run: without it, a second
# run would drop the already-renamed 'tn' column (the predictions themselves) and
# silently write a CSV containing only product_id.
if 'pred' in df_future.columns:
    df_future = (
        df_future
        .drop(columns=['tn'], errors='ignore')  # discard the fallback column
        .rename(columns={'pred': 'tn'})
    )
df_future.to_csv("./outputs/predicciones_exp_07_lgb_v2.csv", index=False, sep=',')