In [None]:
import sys
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

# Make the project root (the parent of the current working directory,
# where the src/ folder lives) importable.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(project_root)


# ============================
# 1) DATA LOADING
# ============================
# All three inputs are read as tab-separated text files.
tsv_kwargs = {'sep': '\t'}
sell_in = pd.read_csv('../data/sell-in.txt', **tsv_kwargs)
prod_vigentes = pd.read_csv('../data/product_id_apredecir201912.txt', **tsv_kwargs)
productos = pd.read_csv('../data/tb_productos_05262025.txt', **tsv_kwargs)

# ============================
# 2) DATE PARSING
# ============================
# Append a day component ('01') to the period so that it matches the
# '%Y%m%d' format and parses as a first-of-month timestamp.
period_with_day = sell_in['periodo'].astype(str) + '01'
sell_in['periodo'] = pd.to_datetime(period_with_day, format='%Y%m%d')
print("Dimensiones sell_in:", sell_in.shape)

# ============================
# 3) INITIAL AGGREGATES
# ============================
# Total tn per (periodo, product_id); the grouping keys stay as columns.
tn_by_period = sell_in.groupby(['periodo', 'product_id'], as_index=False)['tn'].sum()

# Keep only the products listed in prod_vigentes.
current_products = prod_vigentes[['product_id']]
sell_in_agg = tn_by_period.merge(current_products, how='inner', on='product_id')

# Per-product view of tn, built once and reused for the target and all lags.
# NOTE(review): shift() moves by row position inside each product, so the
# offsets below equal calendar periods only if every product has one row for
# every consecutive period -- confirm against the data.
tn_by_product = sell_in_agg.groupby('product_id')['tn']

# Target: tn two periods into the future.
sell_in_agg['tn_mas_2'] = tn_by_product.shift(-2)

# Lag features tn_1 .. tn_11: tn from the previous period back to 11 periods ago.
for n_back in range(1, 12):
    sell_in_agg[f'tn_{n_back}'] = tn_by_product.shift(n_back)

# The current-period tn becomes tn_0 so it lines up with the lag column names.
sell_in_agg = sell_in_agg.rename(columns={'tn': 'tn_0'})

# ============================
# 4) TRAINING DATA (December 2018 only)
# ============================
# Model inputs: the current tn (tn_0) plus the eleven lags tn_1 .. tn_11.
feature_cols = [f'tn_{i}' for i in range(12)]

# Rows for 2018-12-01 that have both the target and every feature present.
is_dec_2018 = sell_in_agg['periodo'] == '2018-12-01'
dataset_training = (
    sell_in_agg.loc[is_dec_2018]
    .dropna(subset=['tn_mas_2', *feature_cols])
    .copy()
)

# Hand-picked product ids used to fit the model (the selection criterion is
# not visible in this file).
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046,
    20049, 20051, 20052, 20053, 20055, 20008, 20001, 20017,
    20086, 20180, 20193, 20320, 20532, 20612, 20637, 20807, 20838,
]

# Restrict to those products, drop the (now constant) period column and
# index the frame by product_id.
is_magic = dataset_training['product_id'].isin(magicos)
df_magicos = (
    dataset_training.loc[is_magic]
    .drop(columns=['periodo'])
    .set_index('product_id')
)
df_magicos.info()

# ============================
# 5) DEFINE X and y
# ============================
y_train = df_magicos['tn_mas_2']
X_train = df_magicos[feature_cols]

# ------------------------------------------------------------------------------------------------

# Hyperparameter grid explored by the search.
# NOTE(review): the grid includes alpha=0.0, which scikit-learn's ElasticNet
# documentation advises against for numerical reasons -- confirm it is wanted.
# NOTE(review): np.arange with a float step can produce values carrying
# rounding error (e.g. 0.6000000000000001); they show up in best_params_.
param_grid = {
    'alpha':    [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0],
    'l1_ratio': np.arange(0.40, 1.00, 0.10),
    'tol':      [0.0001, 0.001],
}

# Cross-validation scheme: 10 folds, repeated 3 times, with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Base estimator; its hyperparameters are overridden by the grid.
model = ElasticNet()

# Exhaustive grid search scored by negated MAE, using all available cores.
search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training set.
results = search.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign before reporting it.
best_cv_mae = -results.best_score_
print(f"MAE CV óptimo: {best_cv_mae:.3f}")
print(f"Mejores parámetros: {results.best_params_}")

# Estimator exposed by the search for the best parameter combination.
best_elasticnet = results.best_estimator_

# Rows for 2019-12-01, restricted to the products in prod_vigentes, without
# the target and period columns.
is_dec_2019 = sell_in_agg['periodo'] == '2019-12-01'
dataset_201912 = (
    sell_in_agg.loc[is_dec_2019]
    .merge(prod_vigentes[['product_id']], how='inner', on='product_id')
    .drop(columns=['tn_mas_2', 'periodo'])
)

# Split the rows by whether any feature is missing.
has_missing = dataset_201912[feature_cols].isna().any(axis=1)
complete = dataset_201912.loc[~has_missing]
incomplete = dataset_201912.loc[has_missing].copy()

# Fill each row's missing features with the mean of the features that row
# does have (mean() skips NaN by default).
incomplete_features = incomplete[feature_cols]
incomplete[feature_cols] = incomplete_features.apply(
    lambda fila: fila.fillna(fila.mean()),
    axis=1,
)

# Complete rows first, then the filled ones, under a fresh 0..n-1 index.
dataset_final = pd.concat([complete, incomplete], ignore_index=True)

# ============================
# 9) PREDICTION AND SAVING
# ============================
X_new = dataset_final[feature_cols]
y_pred = best_elasticnet.predict(X_new)

predicciones = pd.DataFrame({
    'product_id': dataset_final['product_id'],
    'tn_predicho': y_pred
})

# Fallback for predictions that are not strictly positive (zero, negative or
# NaN): use the product's tn_0 from dataset_201912 instead.
# The product_id -> tn_0 lookup is built once (first row per product, matching
# the previous `.values[0]` behaviour) instead of re-scanning dataset_201912
# for every prediction row; this also works when there are no rows at all.
tn_0_by_product = (
    dataset_201912
    .drop_duplicates(subset='product_id')
    .set_index('product_id')['tn_0']
)
fallback_tn = predicciones['product_id'].map(tn_0_by_product)

# where() keeps the prediction when the condition holds and takes the
# fallback otherwise; NaN > 0 is False, so NaN predictions are replaced too.
predicciones['tn_predicho'] = predicciones['tn_predicho'].where(
    predicciones['tn_predicho'] > 0,
    fallback_tn,
)

# Write the result ordered by product_id, without the index column.
predicciones.sort_values(by='product_id', inplace=True)
predicciones.to_csv('../data/predicciones_elastivnet_v1.csv', index=False)
print("Predicciones guardadas en ../data/predicciones_elastivnet_v1.csv")
