In [None]:
import pandas as pd
import gc
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import linregress
from statsmodels.tsa.seasonal import seasonal_decompose




df = pd.read_csv("../datasets/sell-in.txt.gz", sep="\t")
productos = pd.read_csv("../datasets/tb_productos.txt", sep="\t")
productos = productos.drop_duplicates(subset=["product_id"],keep="first")  # Eliminar duplicados por si acaso
stocks = pd.read_csv("../datasets/tb_stocks.txt", sep="\t")

productos_ok = pd.read_csv("https://storage.googleapis.com/open-courses/austral2025-af91/labo3v/product_id_apredecir201912.txt", sep="\t")
#df = df[df["product_id"].isin(productos_ok["product_id"])]
df = df.groupby(by=["periodo","product_id"]).agg({
    "tn":"sum",
    "plan_precios_cuidados":"first",
    "customer_id":"nunique",
    }).reset_index()


productos_list = df['product_id'].unique()
periodos = df['periodo'].unique()
idx = pd.MultiIndex.from_product([productos_list, periodos], names=['product_id', 'periodo'])
completo = idx.to_frame(index=False)
# 4 filtrar combinaciones periodo_producto
completo = completo.merge(df, on=['periodo', 'product_id'], how='left')
nacimiento_producto = df.groupby('product_id')['periodo'].min().rename('nacimiento_producto')

# sumar 3 periodos al nacimiento del producto
nacimiento_producto = nacimiento_producto.apply(lambda x: pd.to_datetime(x, format="%Y%m") + pd.DateOffset(months=3))
nacimiento_producto = nacimiento_producto.dt.strftime("%Y%m").astype(int)

# merge nacimiento_producto con completo
completo['periodo'] = completo['periodo'].astype(int)
# debe quedar cada producto desde su nacimiento hasta el final
completo = completo.merge(nacimiento_producto, on='product_id', how='left')
completo = completo[completo['periodo'] >= completo['nacimiento_producto']]



df = pd.merge(completo, productos, how="left", on="product_id")
df = df.merge(stocks, how="left", on=["product_id", "periodo"])


def add_decomposition_columns(df, value_col='tn', group_col='product_id', period=12):
    """
    Añade columnas de descomposición directamente al DataFrame original
    """
    df = df.copy()
    df['trend'] = np.nan
    df['seasonal'] = np.nan
    df['resid'] = np.nan
    
    for product_id, group in df.groupby(group_col):
        idx = group.index
        clean_series = group[value_col].dropna()
        
        if len(clean_series) >= 2 * period:
            try:
                decomp = seasonal_decompose(clean_series, model='additive', period=period)
                df.loc[idx, 'trend_s'] = decomp.trend.reindex(idx)
                df.loc[idx, 'seasonal_s'] = decomp.seasonal.reindex(idx)
                df.loc[idx, 'resid_s'] = decomp.resid.reindex(idx)
            except:
                continue
                
    return df


df["periodo"] = pd.to_datetime(df["periodo"], format="%Y%m")

df["mes"] = df["periodo"].dt.month
df["year"] = df["periodo"].dt.year
df["quarter"] = df["periodo"].dt.quarter

# total periodo general
df["tn_total"] = df.groupby("periodo")["tn"].transform("sum")

# totales por cat1 por periodo
df["cat1_total"] = df.groupby(["periodo", "cat1"])["tn"].transform("sum")
df["i_cat1_total"] = df["cat1_total"] / df["tn_total"]
df["cat1_mean"] = df.groupby(["periodo", "cat1"])["tn"].transform("mean")
df["cat1_mean_12"] = df.groupby(["periodo", "cat1"])["tn"].transform(lambda x: x.rolling(12).mean())
df["cat1_customers"] = df.groupby(["periodo", "cat1"])["customer_id"].transform("nunique")
# totales por cat2 por periodo
df["cat2_total"] = df.groupby(["periodo", "cat2"])["tn"].transform("sum")
df["i_cat2_total"] =  df["cat2_total"] / df["tn_total"]
df["cat2_mean"] = df.groupby(["periodo", "cat2"])["tn"].transform("mean")
df["cat2_mean_12"] = df.groupby(["periodo", "cat2"])["tn"].transform(lambda x: x.rolling(12).mean())
df["cat2_customers"] = df.groupby(["periodo", "cat2"])["customer_id"].transform("nunique")
# totales por cat3 por periodo
df["cat3_total"] = df.groupby(["periodo", "cat3"])["tn"].transform("sum")
df["i_cat3_total"] = df["cat3_total"] / df["tn_total"]
df["cat3_mean"] = df.groupby(["periodo", "cat3"])["tn"].transform("mean")
df["cat3_mean_12"] = df.groupby(["periodo", "cat3"])["tn"].transform(lambda x: x.rolling(12).mean())
df["cat3_customers"] = df.groupby(["periodo", "cat3"])["customer_id"].transform("nunique")
# totales por brand
df["brand_total"] = df.groupby(["periodo", "brand"])["tn"].transform("sum")
df["i_brand_total"] = df["brand_total"] / df["tn_total"]
df["brand_mean"] = df.groupby(["periodo", "brand"])["tn"].transform("mean")
df["brand_mean_12"] = df.groupby(["periodo", "brand"])["tn"].transform(lambda x: x.rolling(12).mean())
df["brand_customers"] = df.groupby(["periodo", "brand"])["customer_id"].transform("nunique")


df['media_movil_3m'] = df.groupby('product_id')['tn'].transform(lambda x: x.rolling(3).mean())
df['media_movil_6m'] = df.groupby('product_id')['tn'].transform(lambda x: x.rolling(6).mean())
df['media_movil_12m'] = df.groupby('product_id')['tn'].transform(lambda x: x.rolling(12).mean())
df['i_media_3_6'] = df['media_movil_3m'] / df['media_movil_6m']
df['i_media_6_12'] = df['media_movil_6m'] / df['media_movil_12m']
df['i_media_3_12'] = df['media_movil_3m'] / df['media_movil_12m']
df["trend_corta"] = (df["i_media_3_6"]>1).astype(int)
df["trend_larga"] = (df["i_media_6_12"]>1).astype(int)
df["trend_media"] = (df["i_media_3_12"]>1).astype(int)
df["trend"] = df["trend_corta"] + df["trend_larga"] + df["trend_media"]

df["producto_grande"] = (df["product_id"].isin(list(productos_list)[:150])).astype(int)
df = add_decomposition_columns(df)

# shift t-1 a t12
meses = ["tn"]
for i in range(1, 13):
    df[f"tn_t{i}"] = df.groupby("product_id")["tn"].shift(i)
    df[f"delta_t{i}"] = df[f"tn"] / df[f"tn_t{i}"]
    df[f"diff_t{i}"] = df[f"tn"] - df[f"tn_t{i}"]
    meses.append(f"tn_t{i}")
    df[f"cat1_t{i}"] = df.groupby("product_id")["cat1_total"].shift(i)
    df[f"cat2_t{i}"] = df.groupby("product_id")["cat2_total"].shift(i)
    df[f"cat3_t{i}"] = df.groupby("product_id")["cat3_total"].shift(i)
    df[f"customer_id_t{i}"] = df.groupby("product_id")["customer_id"].shift(i)
    df[f"trend_{i}"] = df.groupby("product_id")["trend"].shift(i)
    # brand
    df[f"brand_t{i}"] = df.groupby("product_id")["brand_total"].shift(i)
    df[f"brand_cust_t{i}"] = df.groupby("product_id")["brand_customers"].shift(i)
    df[f"trend_s_t{i}"] = df.groupby("product_id")["trend_s"].shift(i)
    df[f"seasonal_s_t{i}"] = df.groupby("product_id")["seasonal_s"].shift(i)
    df[f"resid_s_t{i}"] = df.groupby("product_id")["resid_s"].shift(i)

df[f"tn_t13"] = df.groupby("product_id")["tn"].shift(13)

df[f"delta_t12_t13"] = df[f"tn_t12"] / df[f"tn_t13"]
df[f"diff_t12_13"] = df[f"tn_t12"] - df[f"tn_t13"]

df["i_tn_t12"] = df["tn"] / df["tn_t12"]
df["i_tn_t6"] = df["tn"] / df["tn_t6"]
df["i_tn_t3"] = df["tn"] / df["tn_t3"]

df["i_tn_customers"] = df["tn"] / df["customer_id"]

df["promedio_t12"] = df[meses].mean(axis=1)
df["promedio_t6"] = df[meses[:6]].mean(axis=1)
df["promedio_t3"] = df[meses[:3]].mean(axis=1)

df["mediana_t12"] = df[meses].median(axis=1)
df["mediana_t6"] = df[meses[:6]].median(axis=1)
df["mediana_t3"] = df[meses[:3]].median(axis=1)

df["maximo_t12"] = df[meses].max(axis=1)
df["maximo_t6"] = df[meses[:6]].max(axis=1)
df["maximo_t3"] = df[meses[:3]].max(axis=1)

df["i_tn_maximo_t12"] = df["tn"] / df["maximo_t12"]
df["i_tn_maximo_t6"] = df["tn"] / df["maximo_t6"]
df["i_tn_maximo_t3"] = df["tn"] / df["maximo_t3"]

df["minimo_t12"] = df[meses].min(axis=1)
df["minimo_t6"] = df[meses[:6]].min(axis=1)
df["minimo_t3"] = df[meses[:3]].min(axis=1)

df["std_t12"] = df[meses].std(axis=1)
df["std_t6"] = df[meses[:6]].std(axis=1)
df["std_t3"] = df[meses[:3]].std(axis=1)

df["variacion_12"] = df["std_t12"]/df["promedio_t12"]
df["variacion_6"] = df["std_t6"]/df["promedio_t6"]
df["variacion_3"] = df["std_t3"]/df["promedio_t3"]

# interaccion desvio, promedio, tendencia
df["trend_var_12"] = df["trend"] * df["variacion_12"]
df["trend_var_6"] = df["trend"] * df["variacion_6"]
df["trend_var_3"] = df["trend"] * df["variacion_3"]

df["i_tn_minimo_t12"] = df["tn"] / df["minimo_t12"]
df["i_tn_minimo_t6"] = df["tn"] / df["minimo_t6"]
df["i_tn_minimo_t3"] = df["tn"] / df["minimo_t3"]

df["rango_t12"] = df["maximo_t12"] - df["minimo_t12"]
df["rango_t6"] = df["maximo_t6"] - df["minimo_t6"]
df["rango_t3"] = df["maximo_t3"] - df["minimo_t3"]

df["antiguedad"] = (df["periodo"] - pd.to_datetime(df["nacimiento_producto"], format="%Y%m")).dt.days


# Función para calcular la pendiente de una serie temporal
def calcular_pendiente(serie):
    datos = serie.dropna().values
    if len(datos) < 2:  # Mínimo 2 puntos para una regresión
        return np.nan
    x = np.arange(len(datos))  # [0, 1, 2, ..., n-1] (representa el tiempo)
    slope, _, _, _, _ = linregress(x, datos)
    return slope

# Aplicamos la función por grupo (producto) y reindexamos para alinear con df
df["pendiente_reg_12"] = (
    df.groupby("product_id")[meses]
    .apply(lambda grupo: grupo.apply(calcular_pendiente, axis=1))
    .reset_index(level=0, drop=True)  # Eliminamos el nivel de grupo para alinear índices
)
df["tn_pendiente"] = df["tn"] * df["pendiente_reg_12"]


df["diff_tn_mm3"] = df["tn"] - df['media_movil_3m']
df["diff_tn_mm6"] = df["tn"] - df['media_movil_6m']
df["diff_tn_mm12"] = df["tn"] - df['media_movil_12m']
df["i_tn_mm3"] = df["tn"] / df['media_movil_3m']
df["i_tn_mm6"] = df["tn"] / df['media_movil_6m']
df["i_tn_mm12"] = df["tn"] / df['media_movil_12m']

from scipy.stats import mode
def rolling_mode(x):
    return x.shift(1).rolling(6, min_periods=1).apply(lambda x: mode(x, keepdims=True)[0][0], raw=False)
#Moda o patrón de cantidad
df['modo_6m'] = df.groupby(['product_id'])['tn'].transform(rolling_mode)
df['modo_diff'] = df['tn'] - df['modo_6m']

# sku_size
df["sku_size_cat1"] = df.groupby("cat1")["sku_size"].transform("mean")
df["sku_size_cat2"] = df.groupby("cat2")["sku_size"].transform("mean")
df["sku_size_cat3"] = df.groupby("cat3")["sku_size"].transform("mean")
df["i_tn_sku_size"] = df["tn"] / (df["sku_size"]+0.00001)
df['tn_diff1'] = df.groupby('product_id')['tn'].transform(lambda x: x.diff())
# target t+2
df["target"] = df.groupby("product_id")["tn"].shift(-2)




cat_cols = ['cat1', 'cat2', 'cat3', 'brand', 'descripcion']
for col in cat_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

del completo, productos, stocks
gc.collect()
df

  df[f"trend_s_t{i}"] = df.groupby("product_id")["trend_s"].shift(i)
  df[f"seasonal_s_t{i}"] = df.groupby("product_id")["seasonal_s"].shift(i)
  df[f"resid_s_t{i}"] = df.groupby("product_id")["resid_s"].shift(i)
  df[f"tn_t{i}"] = df.groupby("product_id")["tn"].shift(i)
  df[f"delta_t{i}"] = df[f"tn"] / df[f"tn_t{i}"]
  df[f"diff_t{i}"] = df[f"tn"] - df[f"tn_t{i}"]
  df[f"cat1_t{i}"] = df.groupby("product_id")["cat1_total"].shift(i)
  df[f"cat2_t{i}"] = df.groupby("product_id")["cat2_total"].shift(i)
  df[f"cat3_t{i}"] = df.groupby("product_id")["cat3_total"].shift(i)
  df[f"customer_id_t{i}"] = df.groupby("product_id")["customer_id"].shift(i)
  df[f"trend_{i}"] = df.groupby("product_id")["trend"].shift(i)
  df[f"brand_t{i}"] = df.groupby("product_id")["brand_total"].shift(i)
  df[f"brand_cust_t{i}"] = df.groupby("product_id")["brand_customers"].shift(i)
  df[f"trend_s_t{i}"] = df.groupby("product_id")["trend_s"].shift(i)
  df[f"seasonal_s_t{i}"] = df.groupby("product_id")["seasonal_s

Unnamed: 0,product_id,periodo,tn,plan_precios_cuidados,customer_id,nacimiento_producto,cat1,cat2,cat3,brand,...,i_tn_mm6,i_tn_mm12,modo_6m,modo_diff,sku_size_cat1,sku_size_cat2,sku_size_cat3,i_tn_sku_size,tn_diff1,target
0,20001,2017-04-01,1069.96130,0.0,104.0,201704,1,10,51,0,...,,,,,1220.621087,1552.858258,1283.608458,0.356654,,1520.06539
1,20001,2017-05-01,1502.20132,0.0,238.0,201704,1,10,51,0,...,,,1069.96130,432.24002,1220.621087,1552.858258,1283.608458,0.500734,432.24002,1030.67391
2,20001,2017-06-01,1520.06539,0.0,220.0,201704,1,10,51,0,...,,,1069.96130,450.10409,1220.621087,1552.858258,1283.608458,0.506688,17.86407,1267.39462
3,20001,2017-07-01,1030.67391,0.0,151.0,201704,1,10,51,0,...,,,1069.96130,-39.28739,1220.621087,1552.858258,1283.608458,0.343558,-489.39148,1316.94604
4,20001,2017-08-01,1267.39462,0.0,236.0,201704,1,10,51,0,...,,,1030.67391,236.72071,1220.621087,1552.858258,1283.608458,0.422465,236.72071,1439.75563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32191,20962,2019-12-01,1.99182,0.0,57.0,201912,0,5,32,11,...,,,,,155.702416,320.618812,180.000000,0.011066,,
32192,20975,2019-12-01,1.69045,0.0,51.0,201912,0,5,32,11,...,,,,,155.702416,320.618812,180.000000,0.009391,,
32193,20995,2019-12-01,1.55285,0.0,51.0,201912,0,5,32,11,...,,,,,155.702416,320.618812,180.000000,0.008627,,
32194,21087,2019-12-01,1.02205,0.0,51.0,201912,2,3,3,23,...,,,,,270.621299,81.557603,90.919147,0.015724,,


In [3]:
df_kgl = df[df["periodo"] == "2019-12-01"].copy()
df_kgl["periodo"] = df_kgl["periodo"].astype(int)

promedio_12_per = df.query("periodo >= '2019-01-01'")
promedio_12_per = promedio_12_per.groupby("product_id")["tn"].mean().reset_index()
promedio_12_per.columns = ["product_id", "promedio_12"]

#df = df.drop(columns=["product_id"])
df = df[~df["periodo"].isin(["2019-11-01", "2019-12-01"])].copy()
df_kgl

  df = df[~df["periodo"].isin(["2019-11-01", "2019-12-01"])].copy()


Unnamed: 0,product_id,periodo,tn,plan_precios_cuidados,customer_id,nacimiento_producto,cat1,cat2,cat3,brand,...,i_tn_mm6,i_tn_mm12,modo_6m,modo_diff,sku_size_cat1,sku_size_cat2,sku_size_cat3,i_tn_sku_size,tn_diff1,target
32,20001,1575158400000000000,1504.68856,0.0,176.0,201704,1,10,51,0,...,0.996053,1.034340,1109.93769,394.75087,1220.621087,1552.858258,1283.608458,0.501563,107.31625,
65,20002,1575158400000000000,1087.30855,0.0,98.0,201704,1,10,51,15,...,0.874412,0.925025,813.78215,273.52640,1220.621087,1552.858258,1283.608458,0.362436,-336.26884,
98,20003,1575158400000000000,892.50129,0.0,161.0,201704,0,0,52,22,...,1.021805,1.136978,635.59563,256.90566,155.702416,356.697723,421.108534,1.878950,-55.79264,
131,20004,1575158400000000000,637.90002,0.0,158.0,201704,0,0,52,22,...,0.907707,1.017035,482.13372,155.76630,155.702416,356.697723,421.108534,2.657917,-86.04204,
164,20005,1575158400000000000,593.24443,0.0,142.0,201704,0,0,52,22,...,0.816600,0.887732,536.66800,56.57643,155.702416,356.697723,421.108534,4.943703,-13.66730,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32191,20962,1575158400000000000,1.99182,0.0,57.0,201912,0,5,32,11,...,,,,,155.702416,320.618812,180.000000,0.011066,,
32192,20975,1575158400000000000,1.69045,0.0,51.0,201912,0,5,32,11,...,,,,,155.702416,320.618812,180.000000,0.009391,,
32193,20995,1575158400000000000,1.55285,0.0,51.0,201912,0,5,32,11,...,,,,,155.702416,320.618812,180.000000,0.008627,,
32194,21087,1575158400000000000,1.02205,0.0,51.0,201912,2,3,3,23,...,,,,,270.621299,81.557603,90.919147,0.015724,,


In [4]:

# Separar features y target
X = df.drop(columns=["target"])
y = df["target"]
X_train = X[X["periodo"] < '2019-10-01']
X_train["periodo"] = X_train["periodo"].astype(int)
y_train = y[X["periodo"] < '2019-10-01']

X_val = X[X["periodo"] == '2019-10-01']
X_val["periodo"] = X_val["periodo"].astype(int)
y_val = y[X["periodo"] == '2019-10-01']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["periodo"] = X_train["periodo"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val["periodo"] = X_val["periodo"].astype(int)


In [None]:
# import lightgbm as lgb
# import optuna
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# import os

# def custom_mape_loss(y_true, y_pred):
#     numerator = np.sum(np.abs(y_true - y_pred))
#     denominator = np.sum(np.abs(y_true))
#     epsilon = np.finfo(float).eps  # Valor muy pequeño para evitar división por cero
#     return numerator / (denominator + epsilon)

# # === 2. Definición del objetivo para Optuna ===
# def objective(trial):
#     params = {
#         "sample_weight": X_train["tn"].to_list(),
#         "objective": "regression",
#         "metric": ["rmse", "mape"],
#         "boosting_type": "gbdt",
#         "verbosity": -1,
#         "n_jobs": -1,
#         "seed": 42,
#         "num_leaves": trial.suggest_int("num_leaves", 20, 100),
#         "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 0.9),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.7, 0.95),
#         "bagging_freq": trial.suggest_int("bagging_freq", 0, 5),
#         "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
#         "lambda_l1": trial.suggest_float("lambda_l1", 1e-3, 5.0, log=True),
#         "lambda_l2": trial.suggest_float("lambda_l2", 1e-3, 5.0, log=True),
#         "max_depth": trial.suggest_int("max_depth", 3, 15),
#         "max_bin": trial.suggest_int("max_bin", 10, 4000),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
#         "extra_trees": trial.suggest_categorical("extra_trees", [True, False]),
#         "path_smooth": trial.suggest_float("path_smooth", 0, 1),  # Nuevo parámetro útil
#     }

#     model = lgb.train(
#         params,
#         lgb.Dataset(X_train, label=y_train),
#         valid_sets=[lgb.Dataset(X_val, label=y_val)],
#         num_boost_round=500,
#         callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
#     )

#     preds = model.predict(X_val)
    
    
    
#     mape = custom_mape_loss(y_val, preds)
#     # rmse = mean_squared_error(y_val, preds)
#     # rmse = np.sqrt(rmse)
    
#     #rmse = mean_squared_error(y_val, preds, squared=False)
#     return mape

# # === 3. Configurar almacenamiento SQLite para Optuna ===
# os.makedirs("optuna_storage", exist_ok=True)
# DB_PATH = "optuna_storage/optuna.db"
# STUDY_NAME = study_name
# storage_url = f"sqlite:///{DB_PATH}"

# # === 4. Crear o cargar estudio ===
# study = optuna.create_study(
#     study_name=STUDY_NAME,
#     storage=storage_url,
#     direction="minimize",
#     load_if_exists=True
# )

# # === 5. Ejecutar optimización ===
# study.optimize(objective, n_trials=150)

# # === 6. Mostrar resultados ===
# print("Mejores hiperparámetros encontrados:")
# print(study.best_params)
# print(f"Mejor MAE: {study.best_value:.4f}")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-07-20 11:21:50,787] A new study created in RDB with name: lightgbm_x_product_verified
[I 2025-07-20 11:21:55,321] Trial 0 finished with value: 0.31478110342308185 and parameters: {'num_leaves': 55, 'learning_rate': 0.12277738834237925, 'feature_fraction': 0.7591929069398793, 'bagging_fraction': 0.9285746341164004, 'bagging_freq': 4, 'min_child_samples': 81, 'lambda_l1': 0.1331534666138763, 'lambda_l2': 0.009870013788357704, 'max_depth': 5, 'max_bin': 2436, 'min_data_in_leaf': 180, 'extra_trees': True, 'path_smooth': 0.414567237219457}. Best is trial 0 with value: 0.31478110342308185.
[I 2025-07-20 11:22:03,310] Trial 1 finished with value: 0.3244510792200001 and parameters: {'num_leaves': 41, 'learning_rate': 0.17336965138646762, 'feature_fraction': 0.8562940351936206, 'bagging_fraction': 0.9238230287899742, 'bagging_freq': 0, 'min_child_samples': 58, 'lambda_l1': 0.004923727392010157, 'lambda_l2': 0.007167651250067889, 'max_dep

Mejores hiperparámetros encontrados:
{'num_leaves': 63, 'learning_rate': 0.018031400731504475, 'feature_fraction': 0.6132695816782275, 'bagging_fraction': 0.9298379246413645, 'bagging_freq': 0, 'min_child_samples': 73, 'lambda_l1': 0.0018689100257572961, 'lambda_l2': 0.001428729332633474, 'max_depth': 15, 'max_bin': 2162, 'min_data_in_leaf': 24, 'extra_trees': True, 'path_smooth': 0.7671959515378658}
Mejor MAE: 0.2508


# LGBM KGL

In [8]:
import lightgbm as lgb
best_params = {'num_leaves': 63,
 'learning_rate': 0.018031400731504475,
 'feature_fraction': 0.6132695816782275,
 'bagging_fraction': 0.9298379246413645,
 'bagging_freq': 0,
 'min_child_samples': 73,
 'lambda_l1': 0.0018689100257572961,
 'lambda_l2': 0.001428729332633474,
 'max_depth': 15,
 'max_bin': 2162,
 'min_data_in_leaf': 24,
 'extra_trees': True,
 'path_smooth': 0.7671959515378658,
 'objective': 'regression',
 'metric': ['rmse', 'mape'],
 'verbosity': -1,
 'n_jobs': -1,
 'seed': 42}

model = lgb.train(
    best_params,
    lgb.Dataset(X_train, label=y_train),
    valid_sets=[lgb.Dataset(X_val, label=y_val)],
    num_boost_round=500,
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

# Asegurar las mismas columnas
X_kgl = df_kgl[X_train.columns]  # Misma estructura

# === 9. Hacer predicción sobre nuevos datos ===
preds_kgl = model.predict(X_kgl)


# LGBM Validacion

In [None]:
def custom_mape_loss(y_true, y_pred):
    numerator = np.sum(np.abs(y_true - y_pred))
    denominator = np.sum(np.abs(y_true))
    epsilon = np.finfo(float).eps  # Valor muy pequeño para evitar división por cero
    return numerator / (denominator + epsilon)

model = lgb.train(
    best_params,
    lgb.Dataset(X_train, label=y_train),
    valid_sets=[lgb.Dataset(X_val, label=y_val)],
    num_boost_round=500,
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

preds_val = model.predict(X_val)

val = pd.DataFrame({"product_id": X_val["product_id"],  "y_pred_val": preds_val, "y_true_val": X_val["tn"]})
val = val[val["product_id"].isin(productos_ok["product_id"])]
val["score"] = val.apply(lambda row: custom_mape_loss(row["y_true_val"], row["y_pred_val"]), axis=1)

val

Unnamed: 0,product_id,y_pred_val,y_true_val,score
30,20001,1315.472810,1561.50552,0.157561
63,20002,1090.395735,1979.53635,0.449166
96,20003,786.574670,1081.36645,0.272610
129,20004,641.656797,1064.69633,0.397334
162,20005,593.317928,996.78275,0.404767
...,...,...,...,...
32140,21035,1.781902,2.14477,0.169187
32143,21039,1.425192,1.58773,0.102372
32146,21079,1.481543,1.87330,0.209127
32149,21109,1.587781,1.93394,0.178991


In [11]:
result = pd.DataFrame({"product_id": X_kgl["product_id"],  "lgbm": preds_kgl})
result = result[result["product_id"].isin(productos_ok["product_id"])]
result.loc[result["lgbm"] < 0, "lgbm"] = 0  # Asegurar que no haya valores negativos
result = result.merge(val[["product_id", "score"]], on="product_id", how="left")
best_lgbm = pd.read_csv("lightgbm10.csv", sep=",")
best_lgbm.rename(columns={"tn": "best_lgbm"}, inplace=True)
result = result.merge(best_lgbm, on="product_id", how="left")

result

Unnamed: 0,product_id,lgbm,score,best_lgbm
0,20001,1185.874155,0.157561,1384.421698
1,20002,939.011338,0.449166,1105.965210
2,20003,631.533430,0.272610,892.024268
3,20004,510.314093,0.397334,593.078658
4,20005,510.501060,0.404767,587.307835
...,...,...,...,...
775,20962,1.770523,,2.960315
776,20975,1.515817,,2.821064
777,20995,1.472004,,2.821064
778,21087,1.142025,,2.489595


# Regresion lineal

In [12]:
df = pd.read_csv("../datasets/sell-in.txt.gz", sep="\t")
df = df.groupby(by=["periodo","product_id"]).agg({"tn":"sum"}).reset_index()
df["periodo"] = pd.to_datetime(df["periodo"], format="%Y%m")
df_pivot = df.pivot(index="periodo", columns="product_id", values="tn").reset_index()

magicos = [ "periodo",20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
   20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046,  20049,
   20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
   20193, 20320, 20532, 20612, 20637, 20807, 20838
 ]
X_train = df_pivot[magicos].query("periodo >= '2018-01-01' & periodo <= '2018-12-31' ")
X_train = X_train.T.iloc[1:]
X_train.columns = [f"t-{11-k}" for k in range(12)]


X_kgl = df_pivot.query("periodo >= '2019-01-01' & periodo <= '2019-12-31'")
X_kgl = X_kgl.T.iloc[1:]
X_kgl.columns = [f"t-{11-k}" for k in range(12)]
promedio = X_kgl.mean(axis=1)
promedio = promedio.fillna(0)


y = df_pivot[magicos].query("periodo == '2019-02-01'").T.iloc[1:]
y.columns = ["target"]

prod_menos12 = X_kgl.index[X_kgl.isna().sum(axis=1)> 0]
X_kgl = X_kgl[~X_kgl.index.isin(prod_menos12)]
promedio_menos12 = promedio[prod_menos12]


from sklearn.linear_model import LinearRegression

reg_model = LinearRegression()
reg_model.fit(X_train, y)


pred = pd.DataFrame({"product_id": X_kgl.index, "tn": reg_model.predict(X_kgl).flatten()})
nuevas_filas = []
for prod in productos_ok["product_id"]:
    if prod not in pred["product_id"].values:
        nuevas_filas.append({"product_id": prod, "tn": promedio[prod]})

pred = pd.concat([pred, pd.DataFrame(nuevas_filas)], ignore_index=True)
pred = pred[pred["product_id"].isin(productos_ok["product_id"])]
pred.rename(columns={"tn": "reg_lin"}, inplace=True)
pred.loc[pred["reg_lin"] < 0, "reg_lin"] = 0  # Asegurar que no haya valores negativos
pred

  promedio = promedio.fillna(0)
  y = df_pivot[magicos].query("periodo == '2019-02-01'").T.iloc[1:]


Unnamed: 0,product_id,reg_lin
0,20001,1162.707525
1,20002,1183.640604
2,20003,684.763931
3,20004,580.484961
4,20005,563.560780
...,...,...
861,21252,0.178011
862,21265,0.089541
863,21266,0.094659
864,21267,0.092835


In [13]:
result = result.merge(pred, on="product_id", how="left")
result

Unnamed: 0,product_id,lgbm,score,best_lgbm,reg_lin
0,20001,1185.874155,0.157561,1384.421698,1162.707525
1,20002,939.011338,0.449166,1105.965210,1183.640604
2,20003,631.533430,0.272610,892.024268,684.763931
3,20004,510.314093,0.397334,593.078658,580.484961
4,20005,510.501060,0.404767,587.307835,563.560780
...,...,...,...,...,...
775,20962,1.770523,,2.960315,3.915682
776,20975,1.515817,,2.821064,3.583990
777,20995,1.472004,,2.821064,3.365322
778,21087,1.142025,,2.489595,0.907423


In [None]:
result.loc[result["score"] <= 0.15, "tn_final"] = result["lgbm"]
result.loc[result["score"] > 0.15, "tn_final"] = result["reg_lin"]
result.loc[result["tn_final"].isna(), "tn_final"] = result["lgbm"] 
result_csv = result[["product_id", "tn_final"]]
result_csv.rename(columns={"tn_final": "tn"}, inplace=True)
result_csv.to_csv(f"newlgbm_reglin.csv",sep=',', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_csv.rename(columns={"tn_final": "tn"}, inplace=True)


In [21]:
result.loc[result["score"] <= 0.15, "tn_final"] = result["best_lgbm"]
result.loc[result["score"] > 0.15, "tn_final"] = result["reg_lin"]
result.loc[result["tn_final"].isna(), "tn_final"] = result["best_lgbm"] # Asegurar que no haya valores negativos
result_csv = result[["product_id", "tn_final"]]
result_csv.rename(columns={"tn_final": "tn"}, inplace=True)
result_csv.to_csv(f"bestlgbm_reglin.csv",sep=',', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_csv.rename(columns={"tn_final": "tn"}, inplace=True)


In [19]:
result_csv

Unnamed: 0,product_id,tn
0,20001,1162.707525
1,20002,1183.640604
2,20003,684.763931
3,20004,580.484961
4,20005,563.560780
...,...,...
775,20962,1.770523
776,20975,1.515817
777,20995,1.472004
778,21087,1.142025
