In [48]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# 🔧 PASO 1 – Preparar dataset

In [49]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Crear función de preparación con lags y clase
def preparar_dataset_para_regresion(df):
    df = df.sort_values(['product_id', 'periodo'])
    
    # Generar lags tn_1 a tn_11 por producto
    for i in range(1, 12):
        df[f'tn_{i}'] = df.groupby('product_id')['tn'].shift(i)
    
    # Crear campo "clase" con tn en periodo+2
    df['clase'] = df.groupby('product_id')['tn'].shift(-2)
    
    return df

In [50]:
df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
df_prep = preparar_dataset_para_regresion(df)  # Preparar el dataset
df_prep

Unnamed: 0,periodo,product_id,tn,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11,clase
0,201701,20001,934.77222,,,,,,,,,,,,1303.35771
785,201702,20001,798.01620,934.77222,,,,,,,,,,,1069.96130
1566,201703,20001,1303.35771,798.01620,934.77222,,,,,,,,,,1502.20132
2352,201704,20001,1069.96130,1303.35771,798.01620,934.77222,,,,,,,,,1520.06539
3136,201705,20001,1502.20132,1069.96130,1303.35771,798.01620,934.77222,,,,,,,,1030.67391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
783,201701,21295,0.00699,,,,,,,,,,,,
6435,201708,21296,0.00651,,,,,,,,,,,,
784,201701,21297,0.00579,,,,,,,,,,,,
6436,201708,21298,0.00573,,,,,,,,,,,,


# 📚 PASO 2 – Entrenar regresión lineal

In [51]:
# Lista de productos mágicos
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]

# Entrenar regresión lineal en periodo base con productos mágicos
def entrenar_regresion_lineal(df, periodo_base, magicos,  modelo='ridge', alpha=1.0, random_state=42):
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
    
    # Filtrar registros del periodo base y productos mágicos
    df_train = df[(df['periodo'] == periodo_base) & (df['product_id'].isin(magicos))].copy()
    
    # Eliminar filas con datos faltantes
    df_train = df_train.dropna(subset=columnas_tn + ['clase'])

    # Entrenar modelo
    X = df_train[columnas_tn]
    y = df_train['clase']

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    if modelo == 'ridge':
        reg = Ridge(alpha=alpha, random_state=random_state)
    elif modelo == 'lasso':
        reg = Lasso(alpha=alpha, random_state=random_state, max_iter=5000)
    else:
        reg = LinearRegression()
    
    reg.fit(X_scaled, y)
    y_pred = reg.predict(X_scaled)
    
    
    # Mostrar coeficientes
    coef = pd.Series([reg.intercept_] + reg.coef_.tolist(), index=['intercept'] + columnas_tn)
    return reg, scaler, coef.sort_values(ascending=False) 

   


In [58]:
import optuna
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Lista de productos mágicos
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]

def optimizar_ridge_con_optuna(df, n_trials=50, random_state=42):
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
    
    # Filtrar registros del periodo base y productos mágicos
    df_train = df[(df['periodo'] == 201812) & (df['product_id'].isin(magicos))].copy()
    
    # Eliminar filas con datos faltantes
    df_train = df_train.dropna(subset=columnas_tn + ['clase'])

    # Entrenar modelo
    X = df_train[columnas_tn]
    y = df_train['clase']


    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    def objective(trial):
        alpha = trial.suggest_float("alpha", 1e-4, 100.0, log=True)
        fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])
        tol = trial.suggest_float("tol", 1e-6, 1e-2, log=True)
        solver = trial.suggest_categorical("solver", ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"])
        # solver = trial.suggest_categorical("solver", ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"])

        model = Ridge(
            alpha=alpha,
            fit_intercept=fit_intercept,
            tol=tol,
            solver=solver,
            random_state=random_state
        )

        model.fit(X_scaled, y)
        y_pred = model.predict(X_scaled)
        rmse = root_mean_squared_error(y, y_pred)
        return rmse

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    modelo_final = Ridge(**best_params, random_state=random_state)
    modelo_final.fit(X_scaled, y)

    coef = pd.Series([modelo_final.intercept_] + modelo_final.coef_.tolist(), index=['intercept'] + columnas_tn)
    print(f"✅ Mejor combinación: {best_params} | RMSE: {study.best_value:.4f}")

    return modelo_final, scaler, coef.sort_values(ascending=False)


In [53]:
import optuna
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.preprocessing import StandardScaler

# Lista de productos mágicos
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]

def optimizar_lasso_con_optuna(df, n_trials=30, random_state=42):
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
    
    # Filtrar registros del periodo base y productos mágicos
    df_train = df[(df['periodo'] == 201812) & (df['product_id'].isin(magicos))].copy()
    
    # Eliminar filas con datos faltantes
    df_train = df_train.dropna(subset=columnas_tn + ['clase'])

    # Entrenar modelo
    X = df_train[columnas_tn]
    y = df_train['clase']

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    def objective(trial):
        alpha = trial.suggest_float("alpha", 1e-4, 10.0, log=True)

        model = Lasso(alpha=alpha, random_state=random_state, max_iter=5000)
        model.fit(X_scaled, y)
        y_pred = model.predict(X_scaled)
        rmse = root_mean_squared_error(y, y_pred)
        return rmse

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    # Entrenar modelo final con mejor alpha
    best_alpha = study.best_params['alpha']
    modelo_final = Lasso(alpha=best_alpha, random_state=random_state, max_iter=5000)
    modelo_final.fit(X_scaled, y)
    
    coef = pd.Series([modelo_final.intercept_] + modelo_final.coef_.tolist(), index=['intercept'] + columnas_tn)
    print(f"✅ Mejor alpha: {best_alpha:.5f} | RMSE: {study.best_value:.4f}")
    return modelo_final, scaler, coef.sort_values(ascending=False)

In [61]:
# Lista de productos mágicos
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]

def optimizar_lasso_con_optuna_v2(df, n_trials=50, random_state=42):
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
    df_train = df[(df['periodo'] == 201812) & (df['product_id'].isin(magicos))].copy()
    df_train = df_train.dropna(subset=columnas_tn + ['clase'])

    X = df_train[columnas_tn]
    y = df_train['clase']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    def objective(trial):
        alpha = trial.suggest_float("alpha", 1e-4, 10.0, log=True)
        tol = trial.suggest_float("tol", 1e-6, 1e-2, log=True)
        max_iter = trial.suggest_int("max_iter", 1000, 10000, step=1000)
        selection = trial.suggest_categorical("selection", ['cyclic', 'random'])
        fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])

        model = Lasso(
            alpha=alpha,
            tol=tol,
            max_iter=max_iter,
            selection=selection,
            fit_intercept=fit_intercept,
            random_state=random_state
        )
        model.fit(X_scaled, y)
        y_pred = model.predict(X_scaled)
        rmse = root_mean_squared_error(y, y_pred)
        return rmse

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    modelo_final = Lasso(**best_params, random_state=random_state)
    modelo_final.fit(X_scaled, y)
    
    coef = pd.Series([modelo_final.intercept_] + modelo_final.coef_.tolist(), index=['intercept'] + columnas_tn)
    print(f"✅ Mejor combinación: {best_params} | RMSE: {study.best_value:.4f}")
    return modelo_final, scaler, coef.sort_values(ascending=False)


In [74]:
import optuna
from sklearn.linear_model import TheilSenRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Lista de productos mágicos
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]

def optimizar_theilsen_con_optuna(df, n_trials=30):
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
    
    # Filtrar registros del periodo base y productos mágicos
    df_train = df[(df['periodo'] == 201812) & (df['product_id'].isin(magicos))].copy()
    
    # Eliminar filas con datos faltantes
    df_train = df_train.dropna(subset=columnas_tn + ['clase'])

    # Entrenar modelo
    X = df_train[columnas_tn]
    y = df_train['clase']

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    def objective(trial):
        n_features = X.shape[1]
        max_subpopulation = trial.suggest_int("max_subpopulation", 50, 1000)
        
        # n_subsamples debe ser ≥ n_features
        n_subsamples = trial.suggest_int("n_subsamples", n_features, min(X.shape[0], 100))
        
        fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])

        model = TheilSenRegressor(
            max_subpopulation=max_subpopulation,
            n_subsamples=n_subsamples,
            fit_intercept=fit_intercept,
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_scaled, y)
        y_pred = model.predict(X_scaled)
        rmse = root_mean_squared_error(y, y_pred)
        return rmse

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    modelo_final = TheilSenRegressor(**best_params, random_state=42, n_jobs=-1)
    modelo_final.fit(X_scaled, y)

    coef = pd.Series([modelo_final.intercept_] + modelo_final.coef_.tolist(), index=['intercept'] + columnas_tn)
    print(f"✅ Mejor combinación: {best_params} | RMSE: {study.best_value:.4f}")

    return modelo_final, scaler, coef.sort_values(ascending=False)


In [76]:
import optuna
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Lista de productos mágicos
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]

def optimizar_elasticnet_con_optuna(df, n_trials=50, random_state=42):
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
    
    # Filtrar registros del periodo base y productos mágicos
    df_train = df[(df['periodo'] == 201812) & (df['product_id'].isin(magicos))].copy()
    
    # Eliminar filas con datos faltantes
    df_train = df_train.dropna(subset=columnas_tn + ['clase'])

    # Entrenar modelo
    X = df_train[columnas_tn]
    y = df_train['clase']

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    def objective(trial):
        alpha = trial.suggest_float("alpha", 1e-4, 10.0, log=True)
        l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)  # 0 = Ridge, 1 = Lasso

        model = ElasticNet(
            alpha=alpha,
            l1_ratio=l1_ratio,
            random_state=random_state,
            max_iter=5000
        )
        model.fit(X_scaled, y)
        y_pred = model.predict(X_scaled)
        rmse = root_mean_squared_error(y, y_pred)
        return rmse

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    modelo_final = ElasticNet(**best_params, random_state=random_state, max_iter=5000)
    modelo_final.fit(X_scaled, y)

    coef = pd.Series([modelo_final.intercept_] + modelo_final.coef_.tolist(), index=['intercept'] + columnas_tn)
    print(f"✅ Mejor combinación: {best_params} | RMSE: {study.best_value:.4f}")

    return modelo_final, scaler, coef.sort_values(ascending=False)

# 🔮 PASO 3 – Predecir con el modelo entrenado

In [55]:
def productos_con_historia():
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir

    contador = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
    contador = contador.groupby('product_id').size().reset_index(name='count')
    product_id = contador[contador['count']==12]['product_id'].unique()

    return product_id

In [None]:
# df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
# df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
# productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
# df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir
# df = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
# df = df[df['product_id'].isin(productos_con_historia())]  # Filtrar productos con historia
# df_pred = df[df['periodo'] == 201912].copy() 
# df_pred['tn'].sum()

np.float64(23439.27676)

In [56]:
# Predecir para un periodo futuro
def predecir_regresion(model, scaler):
    
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir
    df = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
    df = df[df['product_id'].isin(productos_con_historia())]  # Filtrar productos con historia
    
    df = df.sort_values(['product_id', 'periodo'])
    
    # Generar lags tn_1 a tn_11 por producto
    for i in range(1, 12):
        df[f'tn_{i}'] = df.groupby('product_id')['tn'].shift(i)
    
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
    
    

    df_pred = df[df['periodo'] == 201912].copy() 

    X_holdout = scaler.transform(df_pred[columnas_tn])
    

    # Predecir usando regresión lineal
    df_pred['pred'] = model.predict(X_holdout)   
    
    return df_pred

def predecir_no_completos():
    
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir
    df = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
    df = df[~df['product_id'].isin(productos_con_historia())]  # Filtrar productos con historia
    
    df = df.sort_values(['product_id', 'periodo'])
    
    df = df.groupby('product_id').agg({'tn':'mean'}).reset_index()  # Tomar los últimos 12 meses de cada producto
    
    return df

In [43]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

def predecir_regresion_semillerio(modelo_base, scaler, seeds=[42, 101, 202, 303, 404]):
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')
    df = df.groupby(['periodo', 'product_id']).agg({'tn': 'sum'}).reset_index()
    
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')
    df = df[df['product_id'].isin(productos['product_id'].unique())]
    df = df[(df['periodo'] >= 201901) & (df['periodo'] <= 201912)]
    df = df[df['product_id'].isin(productos_con_historia())]
    
    df = df.sort_values(['product_id', 'periodo'])

    for i in range(1, 12):
        df[f'tn_{i}'] = df.groupby('product_id')['tn'].shift(i)

    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
    df_pred = df[df['periodo'] == 201912].copy()

    X_holdout = scaler.transform(df_pred[columnas_tn])
    y_true = df_pred['tn'].values

    preds = []
    rmses = []

    for seed in seeds:
        # Clonar el modelo base (esto requiere que modelo_base sea un tipo de sklearn)
        model = modelo_base.__class__(**modelo_base.get_params())
        if hasattr(model, 'random_state'):
            model.random_state = seed

        # Re-entrenar con el mismo scaler y datos (simulamos variación si selection='random')
        X_train = scaler.transform(df_pred[columnas_tn])  # Reutilizamos porque no entrenamos de nuevo
        y_pred = model.fit(X_train, y_true).predict(X_holdout)

        preds.append(y_pred)
        rmse = root_mean_squared_error(y_true, y_pred)
        rmses.append(rmse)

    # Promediar predicciones y errores
    df_pred['pred'] = np.mean(preds, axis=0)
    df_pred['rmse_promedio'] = np.mean(rmses)

    print(f"📉 RMSE por semilla: {[round(r, 2) for r in rmses]}")
    print(f"✅ RMSE promedio: {df_pred['rmse_promedio'].iloc[0]:.2f}")

    return df_pred


In [None]:
# prods = predecir_regresion(modelo)
# prods.sort_values(['product_id','periodo'], inplace=True)
# prods

Unnamed: 0,periodo,product_id,tn,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11,pred
30316,201912,20001,1504.68856,1397.37231,1561.50552,1660.00561,1261.34529,1678.99318,1109.93769,1629.78233,1647.63848,1470.65653,1259.09363,1275.77351,1162.707525
30317,201912,20002,1087.30855,1423.57739,1979.53635,1090.18771,813.78215,1066.44999,928.36431,1034.98927,1287.62346,1083.62552,1043.01349,1266.78751,1183.640604
30318,201912,20003,892.50129,948.29393,1081.36645,967.77116,635.59563,715.20314,662.38654,590.12515,565.33774,638.04010,758.32657,964.76919,684.763931
30319,201912,20004,637.90002,723.94206,1064.69633,786.17140,482.13372,521.71519,667.19411,603.31081,466.70901,619.77084,441.70332,511.33713,580.484961
30320,201912,20005,593.24443,606.91173,996.78275,879.52808,536.66800,745.74978,876.39696,897.26297,624.99880,488.21387,409.89950,363.58438,563.560780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31229,201912,21248,0.01129,0.02964,0.01270,0.01411,0.02117,0.02116,0.00988,0.01553,0.03106,0.05365,0.06209,0.02962,0.468061
31232,201912,21256,0.01271,0.02682,0.00847,0.00423,0.02965,0.02822,0.00988,0.01553,0.01835,0.05930,0.05081,0.03811,0.463856
31234,201912,21259,0.01412,0.02965,0.01975,0.00564,0.03106,0.04657,0.00988,0.01976,0.02117,0.06777,0.05080,0.04234,0.467856
31235,201912,21262,0.01834,0.02682,0.01693,0.01552,0.02258,0.03953,0.01270,0.01130,0.01412,0.06353,0.05786,0.02680,0.465820


# ▶️ PASO 4 – Ejecutar todo el flujo

In [21]:
# Paso A: preparar dataset (con tus datos originales en df)
# df_prep = preparar_dataset_para_regresion(df)

# Paso B: entrenar el modelo con periodo 201812
modelo,scaler, coeficientes = entrenar_regresion_lineal(df_prep, periodo_base=201812, magicos=magicos, modelo='lasso', alpha=1.0, random_state=42)
print("Coeficientes encontrados:")
print(coeficientes)

# Paso C: predecir para registros del periodo 201912 → objetivo: predecir 202002
df_pred_con_historia = predecir_regresion(modelo, scaler)
df_pred_con_historia = df_pred_con_historia[['product_id', 'pred']].rename(columns={'pred': 'tn'})

df_pred_sin_historia = predecir_no_completos()

df_pred = pd.concat([df_pred_con_historia, df_pred_sin_historia], ignore_index=True)

# Paso D: ver primeras predicciones
df_pred

Coeficientes encontrados:
intercept    246.749568
tn_1          84.835718
tn_8          45.777292
tn_6          39.353392
tn_2          38.235749
tn_10         33.631495
tn_11         28.215369
tn_9          15.300660
tn             7.283986
tn_3           0.000000
tn_4           0.000000
tn_5           0.000000
tn_7           0.000000
dtype: float64


Unnamed: 0,product_id,tn
0,20001,1190.054107
1,20002,1094.308084
2,20003,692.947197
3,20004,541.181072
4,20005,541.943462
...,...,...
775,21252,0.178011
776,21265,0.089541
777,21266,0.094659
778,21267,0.092835


probando lasso

In [44]:
# Paso A: preparar dataset (con tus datos originales en df)
# df_prep = preparar_dataset_para_regresion(df)

# Paso B: entrenar el modelo con periodo 201812
# modelo,scaler, coeficientes = entrenar_regresion_lineal(df_prep, periodo_base=201812, magicos=magicos, modelo='lasso', alpha=1.0, random_state=42)
modelo, scaler, coef = optimizar_lasso_con_optuna_v2(df_prep, n_trials=50)
print("Coeficientes encontrados:")
print(coeficientes)

# Paso C: predecir para registros del periodo 201912 → objetivo: predecir 202002
df_pred_con_historia = predecir_regresion_semillerio(modelo, scaler)
df_pred_con_historia = df_pred_con_historia[['product_id', 'pred']].rename(columns={'pred': 'tn'})

df_pred_sin_historia = predecir_no_completos()

df_pred = pd.concat([df_pred_con_historia, df_pred_sin_historia], ignore_index=True)

# Paso D: ver primeras predicciones
df_pred

[I 2025-07-01 11:47:09,520] A new study created in memory with name: no-name-3f2aa97b-b93f-43f7-ac3a-57b7c06909f1
[I 2025-07-01 11:47:09,524] Trial 0 finished with value: 248.67735368193385 and parameters: {'alpha': 0.016901684938805332, 'tol': 0.0003059513819309678, 'max_iter': 4000, 'selection': 'random', 'fit_intercept': False}. Best is trial 0 with value: 248.67735368193385.
[I 2025-07-01 11:47:09,524] Trial 1 finished with value: 32.11460653729057 and parameters: {'alpha': 1.059685036357688, 'tol': 0.0014104782142781854, 'max_iter': 2000, 'selection': 'cyclic', 'fit_intercept': True}. Best is trial 1 with value: 32.11460653729057.
[I 2025-07-01 11:47:09,530] Trial 2 finished with value: 32.146854990287295 and parameters: {'alpha': 2.018615879609104, 'tol': 0.00025038290872528296, 'max_iter': 5000, 'selection': 'cyclic', 'fit_intercept': True}. Best is trial 1 with value: 32.11460653729057.
[I 2025-07-01 11:47:09,532] Trial 3 finished with value: 248.87346789038696 and parameters: 

✅ Mejor combinación: {'alpha': 0.0001018801256537752, 'tol': 6.421911631182504e-05, 'max_iter': 8000, 'selection': 'random', 'fit_intercept': True} | RMSE: 30.9021
Coeficientes encontrados:
intercept    246.749568
tn_1          84.835718
tn_8          45.777292
tn_6          39.353392
tn_2          38.235749
tn_10         33.631495
tn_11         28.215369
tn_9          15.300660
tn             7.283986
tn_3           0.000000
tn_4           0.000000
tn_5           0.000000
tn_7           0.000000
dtype: float64
📉 RMSE por semilla: [0.09, 0.06, 0.0, 0.0, 0.08]
✅ RMSE promedio: 0.05


Unnamed: 0,product_id,tn
0,20001,1504.660450
1,20002,1087.211975
2,20003,892.539472
3,20004,637.983008
4,20005,593.216173
...,...,...
775,21252,0.178011
776,21265,0.089541
777,21266,0.094659
778,21267,0.092835


In [59]:
# Paso A: preparar dataset (con tus datos originales en df)
# df_prep = preparar_dataset_para_regresion(df)

# Paso B: entrenar el modelo con periodo 201812
# modelo,scaler, coeficientes = entrenar_regresion_lineal(df_prep, periodo_base=201812, magicos=magicos, modelo='lasso', alpha=1.0, random_state=42)
modelo, scaler, coef = optimizar_ridge_con_optuna(df_prep, n_trials=50)
print("Coeficientes encontrados:")
print(coeficientes)

# Paso C: predecir para registros del periodo 201912 → objetivo: predecir 202002
df_pred_con_historia = predecir_regresion_semillerio(modelo, scaler)
df_pred_con_historia = df_pred_con_historia[['product_id', 'pred']].rename(columns={'pred': 'tn'})

df_pred_sin_historia = predecir_no_completos()

df_pred = pd.concat([df_pred_con_historia, df_pred_sin_historia], ignore_index=True)

# Paso D: ver primeras predicciones
df_pred

[I 2025-07-01 12:03:01,997] A new study created in memory with name: no-name-2d3b38f0-2a20-42f4-9a59-5a1aa231f269
[I 2025-07-01 12:03:02,013] Trial 0 finished with value: 248.76846163500056 and parameters: {'alpha': 0.0023117915782605095, 'fit_intercept': False, 'tol': 0.0017864945045497108, 'solver': 'lsqr'}. Best is trial 0 with value: 248.76846163500056.
[I 2025-07-01 12:03:02,017] Trial 1 finished with value: 248.6910873614599 and parameters: {'alpha': 0.0014243287748664322, 'fit_intercept': False, 'tol': 2.0412655474081734e-05, 'solver': 'sag'}. Best is trial 1 with value: 248.6910873614599.
[I 2025-07-01 12:03:02,018] Trial 2 finished with value: 251.12185355365455 and parameters: {'alpha': 39.9733759385943, 'fit_intercept': False, 'tol': 2.6749020325566944e-06, 'solver': 'auto'}. Best is trial 1 with value: 248.6910873614599.
[I 2025-07-01 12:03:02,018] Trial 3 finished with value: 31.09269360169335 and parameters: {'alpha': 0.006691430663471013, 'fit_intercept': True, 'tol': 0.

✅ Mejor combinación: {'alpha': 0.00010034375429476364, 'fit_intercept': True, 'tol': 2.343804651438914e-05, 'solver': 'svd'} | RMSE: 30.9021
Coeficientes encontrados:
intercept    246.749568
tn_1          84.835718
tn_8          45.777292
tn_6          39.353392
tn_2          38.235749
tn_10         33.631495
tn_11         28.215369
tn_9          15.300660
tn             7.283986
tn_3           0.000000
tn_4           0.000000
tn_5           0.000000
tn_7           0.000000
dtype: float64
📉 RMSE por semilla: [0.0, 0.0, 0.0, 0.0, 0.0]
✅ RMSE promedio: 0.00


Unnamed: 0,product_id,tn
0,20001,1504.683689
1,20002,1087.309721
2,20003,892.498875
3,20004,637.898753
4,20005,593.243558
...,...,...
775,21252,0.178011
776,21265,0.089541
777,21266,0.094659
778,21267,0.092835


In [75]:
# Paso A: preparar dataset (con tus datos originales en df)
# df_prep = preparar_dataset_para_regresion(df)

# Paso B: entrenar el modelo con periodo 201812
# modelo,scaler, coeficientes = entrenar_regresion_lineal(df_prep, periodo_base=201812, magicos=magicos, modelo='lasso', alpha=1.0, random_state=42)
modelo, scaler, coef = optimizar_theilsen_con_optuna(df_prep)
print("Coeficientes encontrados:")
print(coeficientes)

# Paso C: predecir para registros del periodo 201912 → objetivo: predecir 202002
df_pred_con_historia = predecir_regresion_semillerio(modelo, scaler)
df_pred_con_historia = df_pred_con_historia[['product_id', 'pred']].rename(columns={'pred': 'tn'})

df_pred_sin_historia = predecir_no_completos()

df_pred = pd.concat([df_pred_con_historia, df_pred_sin_historia], ignore_index=True)

# Paso D: ver primeras predicciones
df_pred

[I 2025-07-01 12:22:53,043] A new study created in memory with name: no-name-9be7c3af-801d-4f22-8cda-e39e9adb2b0d
[I 2025-07-01 12:22:53,066] Trial 0 finished with value: 32.1410839661141 and parameters: {'max_subpopulation': 922, 'n_subsamples': 23, 'fit_intercept': True}. Best is trial 0 with value: 32.1410839661141.
[I 2025-07-01 12:22:53,086] Trial 1 finished with value: 30.9300798040687 and parameters: {'max_subpopulation': 995, 'n_subsamples': 30, 'fit_intercept': True}. Best is trial 1 with value: 30.9300798040687.
[I 2025-07-01 12:22:53,106] Trial 2 finished with value: 32.27795271425134 and parameters: {'max_subpopulation': 826, 'n_subsamples': 22, 'fit_intercept': True}. Best is trial 1 with value: 30.9300798040687.
[I 2025-07-01 12:22:53,125] Trial 3 finished with value: 343.77174338710506 and parameters: {'max_subpopulation': 831, 'n_subsamples': 21, 'fit_intercept': False}. Best is trial 1 with value: 30.9300798040687.
[I 2025-07-01 12:22:53,148] Trial 4 finished with valu

✅ Mejor combinación: {'max_subpopulation': 71, 'n_subsamples': 33, 'fit_intercept': True} | RMSE: 30.9021
Coeficientes encontrados:
intercept    246.749568
tn_1          84.835718
tn_8          45.777292
tn_6          39.353392
tn_2          38.235749
tn_10         33.631495
tn_11         28.215369
tn_9          15.300660
tn             7.283986
tn_3           0.000000
tn_4           0.000000
tn_5           0.000000
tn_7           0.000000
dtype: float64
📉 RMSE por semilla: [0.0, 0.0, 0.0, 0.0, 0.0]
✅ RMSE promedio: 0.00


Unnamed: 0,product_id,tn
0,20001,1504.688560
1,20002,1087.308550
2,20003,892.501290
3,20004,637.900020
4,20005,593.244430
...,...,...
775,21252,0.178011
776,21265,0.089541
777,21266,0.094659
778,21267,0.092835


In [77]:
# Paso A: preparar dataset (con tus datos originales en df)
# df_prep = preparar_dataset_para_regresion(df)

# Paso B: entrenar el modelo con periodo 201812
# modelo,scaler, coeficientes = entrenar_regresion_lineal(df_prep, periodo_base=201812, magicos=magicos, modelo='lasso', alpha=1.0, random_state=42)
modelo, scaler, coef = optimizar_elasticnet_con_optuna(df_prep, n_trials=50)
print("Coeficientes encontrados:")
print(coeficientes)

# Paso C: predecir para registros del periodo 201912 → objetivo: predecir 202002
df_pred_con_historia = predecir_regresion_semillerio(modelo, scaler)
df_pred_con_historia = df_pred_con_historia[['product_id', 'pred']].rename(columns={'pred': 'tn'})

df_pred_sin_historia = predecir_no_completos()

df_pred = pd.concat([df_pred_con_historia, df_pred_sin_historia], ignore_index=True)

# Paso D: ver primeras predicciones
df_pred

[I 2025-07-01 12:29:24,186] A new study created in memory with name: no-name-870824eb-bfd0-4694-a2fd-cffcc827e84d
[I 2025-07-01 12:29:24,188] Trial 0 finished with value: 30.90354233576271 and parameters: {'alpha': 0.00023094345611295643, 'l1_ratio': 0.19918988480810518}. Best is trial 0 with value: 30.90354233576271.
[I 2025-07-01 12:29:24,188] Trial 1 finished with value: 31.648565701676244 and parameters: {'alpha': 0.011299215869363671, 'l1_ratio': 0.030570638411832385}. Best is trial 0 with value: 30.90354233576271.
[I 2025-07-01 12:29:24,188] Trial 2 finished with value: 34.76510489675546 and parameters: {'alpha': 0.8089669966926882, 'l1_ratio': 0.8570991325609568}. Best is trial 0 with value: 30.90354233576271.
[I 2025-07-01 12:29:24,188] Trial 3 finished with value: 37.41762149046516 and parameters: {'alpha': 0.3786106930052284, 'l1_ratio': 0.07964572431241956}. Best is trial 0 with value: 30.90354233576271.
[I 2025-07-01 12:29:24,188] Trial 4 finished with value: 32.08549418383

✅ Mejor combinación: {'alpha': 0.00017714865103452468, 'l1_ratio': 0.9997946086270396} | RMSE: 30.9021
Coeficientes encontrados:
intercept    246.749568
tn_1          84.835718
tn_8          45.777292
tn_6          39.353392
tn_2          38.235749
tn_10         33.631495
tn_11         28.215369
tn_9          15.300660
tn             7.283986
tn_3           0.000000
tn_4           0.000000
tn_5           0.000000
tn_7           0.000000
dtype: float64
📉 RMSE por semilla: [0.0, 0.0, 0.0, 0.0, 0.0]
✅ RMSE promedio: 0.00


Unnamed: 0,product_id,tn
0,20001,1504.681745
1,20002,1087.303495
2,20003,892.497284
3,20004,637.897351
4,20005,593.242194
...,...,...
775,21252,0.178011
776,21265,0.089541
777,21266,0.094659
778,21267,0.092835


# 💾 PASO 5 – Exportar a CSV (opcional)

In [60]:
# Exportar archivo para enviar a Kaggle (ajustar columnas si necesario)
df_pred[['product_id', 'tn']].to_csv("./outputs/predicciones_regresion_lineal_v7_ridge_optuna_5kfold.csv", index=False)

In [61]:
a = pd.read_csv('./outputs/predicciones_regresion_lineal_v1.csv', sep=',')  # Cargar el dataset
b = pd.read_csv('./outputs/prediccion_autogluon_2ventanas.csv', sep=',')  # Cargar el dataset

a = a.merge(b, on='product_id', how='left')
a['mean'] = a[['tn_x', 'tn_y']].mean(axis=1)
a = a[['product_id', 'mean']].rename(columns={'mean': 'tn'})
a.to_csv("./outputs/predicciones_regresion_lineal_v2.csv", index=False)