In [None]:
# ============================
# 0) SETUP: rutas e imports
# ============================
import sys
import os

# Añadir la raíz del proyecto (donde está tu carpeta src/)
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import numpy as np

# ============================
# 1) DATA LOADING
# ============================
# All three inputs are tab-separated text files under ../data; they are read
# in a fixed order and unpacked into their respective frames.
sell_in, prod_vigentes, productos = (
    pd.read_csv(path, sep='\t')
    for path in (
        '../data/sell-in.txt',
        '../data/product_id_apredecir201912.txt',
        '../data/tb_productos_05262025.txt',
    )
)

# ============================
# 2) DATE PARSING
# ============================
# `periodo` is rendered as text and suffixed with '01' so that it can be
# parsed with a full year-month-day format; the result replaces the column.
periodo_text = sell_in['periodo'].astype(str)
first_day_text = periodo_text + '01'
sell_in['periodo'] = pd.to_datetime(first_day_text, format='%Y%m%d')
print("Dimensiones sell_in:", sell_in.shape)

# ============================
# 3) INITIAL AGGREGATES
# ============================
# Total tn per (periodo, product_id).
sell_in_agg = sell_in.groupby(['periodo', 'product_id'], as_index=False)['tn'].sum()

# Keep only the products listed in prod_vigentes.
sell_in_agg = pd.merge(
    sell_in_agg,
    prod_vigentes[['product_id']],
    how='inner',
    on='product_id',
)

# Per-product view of tn, reused for the target and every lag below.
# NOTE(review): shift() moves by rows within each product, not by calendar
# months. If a product has no row for some month, the shifted values will not
# line up with the intended month offset -- confirm the data has no gaps.
tn_by_product = sell_in_agg.groupby('product_id')['tn']

# Target: tn two rows ahead within the same product.
sell_in_agg['tn_mas_2'] = tn_by_product.shift(-2)

# Lag features tn_1 .. tn_11: tn from 1 to 11 rows back within the product.
for months_back in range(1, 12):
    sell_in_agg[f'tn_{months_back}'] = tn_by_product.shift(months_back)

# The current-period tn becomes tn_0 so it matches the lag naming scheme.
sell_in_agg = sell_in_agg.rename(columns={'tn': 'tn_0'})

# ============================
# 4) BUILD TRAINING DATA (December 2018 only)
# ============================
# A row is usable only when the target and all twelve tn_* columns are present.
required_cols = ['tn_mas_2'] + [f'tn_{i}' for i in range(12)]
is_dec_2018 = sell_in_agg['periodo'] == '2018-12-01'
dataset_training = sell_in_agg.loc[is_dec_2018].dropna(subset=required_cols).copy()

# Product ids whose rows are used to fit the model.
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046,
    20049, 20051, 20052, 20053, 20055, 20008, 20001, 20017,
    20086, 20180, 20193, 20320, 20532, 20612, 20637, 20807, 20838,
]

# Restrict to those products, discard the (constant) period column and index
# the frame by product_id.
in_magicos = dataset_training['product_id'].isin(magicos)
df_magicos = (
    dataset_training.loc[in_magicos]
    .drop(columns=['periodo'])
    .set_index('product_id')
)
df_magicos.info()

# ============================
# 5) DEFINE X and y
# ============================
# Features are tn_0 .. tn_11; the target is tn_mas_2.
feature_cols = ['tn_' + str(i) for i in range(12)]
X_train = df_magicos.loc[:, feature_cols]
y_train = df_magicos.loc[:, 'tn_mas_2']

# ============================
# 6) TRAIN Lasso with GridSearchCV (number of folds set by `folds`)
# ============================
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Alpha grid. Lasso's regularisation strength must be non-negative, so the
# grid only contains positive values (a previous -1 entry was invalid and has
# been removed: it either makes the corresponding fits fail or produces a
# meaningless "negative penalty" model, depending on the scikit-learn version).
# Alternative finer grid: np.logspace(1, 2, 100)
param_grid = {'alpha': [1, 10, 50, 100, 200, 500, 1000, 2000, 5000]}

# Base model
lasso = Lasso(max_iter=5000, random_state=42)
folds = 25

# Grid search over alpha using `folds`-fold cross-validation. The scorer is
# the negated MSE because GridSearchCV always maximises its score.
grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    cv=folds,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Fit
grid_search.fit(X_train, y_train)

best_alpha  = grid_search.best_params_['alpha']
best_lasso  = grid_search.best_estimator_
# Flip the sign back so the reported value is a regular (positive) MSE.
best_cv_mse = -grid_search.best_score_
print("alpha óptimo:", best_alpha)

print(f"Mejor alpha encontrado: {best_alpha}")
print(f"MSE CV promedio ({folds} folds): {best_cv_mse:.4f}")

# Per-alpha cross-validation summary (scores are negated MSE).
cv_results = pd.DataFrame(grid_search.cv_results_)
print(cv_results[['param_alpha', 'mean_test_score', 'std_test_score']])

# ============================
# 7) METRICS and COEFFICIENTS
# ============================
# Label each fitted coefficient with its feature name and show them from
# largest to smallest.
coef = pd.Series(data=best_lasso.coef_, index=feature_cols)
coef_desc = coef.sort_values(ascending=False)
print("\nCoeficientes ordenados (de mayor a menor):")
print(coef_desc)

intercept = best_lasso.intercept_
print("\nIntercepto:", intercept)

# In-sample error: predictions on the same rows the model was fitted on.
y_train_pred = best_lasso.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)
print(f"\nMSE en training: {mse_train:.4f}")

# ============================
# 8) BUILD PREDICTION DATA (2019-12)
# ============================
def _fill_with_row_mean(row):
    """Replace the missing values of one row with the mean of its present values."""
    return row.fillna(row.mean())


# Rows for December 2019, restricted to the products in prod_vigentes, without
# the target and period columns.
# NOTE(review): sell_in_agg was already inner-joined against prod_vigentes
# when it was built, so this second merge looks redundant -- confirm.
rows_201912 = sell_in_agg[sell_in_agg['periodo'] == '2019-12-01']
dataset_201912 = pd.merge(
    rows_201912,
    prod_vigentes[['product_id']],
    how='inner',
    on='product_id',
).drop(columns=['tn_mas_2', 'periodo'])

# Split by whether any feature is missing; rows with gaps get each missing
# feature replaced by the mean of that row's available features.
has_gap = dataset_201912[feature_cols].isna().any(axis=1)
complete = dataset_201912.dropna(subset=feature_cols)
incomplete = dataset_201912[has_gap].copy()
incomplete[feature_cols] = incomplete[feature_cols].apply(_fill_with_row_mean, axis=1)

# Complete rows first, imputed rows after, with a fresh 0..n-1 index.
dataset_final = pd.concat([complete, incomplete], ignore_index=True)

# ============================
# 9) PREDICT and SAVE
# ============================
X_new = dataset_final[feature_cols]
y_pred = best_lasso.predict(X_new)

# One row per entry of dataset_final: its product_id next to the prediction,
# ordered by product_id before being written out without the index column.
predicciones = (
    dataset_final[['product_id']]
    .assign(tn_predicho=y_pred)
    .sort_values(by='product_id')
)

output_path = '../data/predicciones_lasso_v9.csv'
predicciones.to_csv(output_path, index=False)
print("Predicciones guardadas en ../data/predicciones_lasso_v9.csv")


Dimensiones sell_in: (2945818, 7)
<class 'pandas.core.frame.DataFrame'>
Index: 33 entries, 20001 to 20838
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tn_0      33 non-null     float64
 1   tn_mas_2  33 non-null     float64
 2   tn_1      33 non-null     float64
 3   tn_2      33 non-null     float64
 4   tn_3      33 non-null     float64
 5   tn_4      33 non-null     float64
 6   tn_5      33 non-null     float64
 7   tn_6      33 non-null     float64
 8   tn_7      33 non-null     float64
 9   tn_8      33 non-null     float64
 10  tn_9      33 non-null     float64
 11  tn_10     33 non-null     float64
 12  tn_11     33 non-null     float64
dtypes: float64(13)
memory usage: 3.6 KB
Fitting 25 folds for each of 100 candidates, totalling 2500 fits
alpha óptimo: 100.0
Mejor alpha encontrado: 100.0
MSE CV promedio (25 folds): 3125.2845
    param_alpha  mean_test_score  std_test_score
0     10.000000     -6317.870328 