# 🔧 PASO 1 – Preparar dataset

In [55]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Crear función de preparación con lags y clase
def preparar_dataset_para_regresion(df):
    df = df.sort_values(['product_id', 'periodo'])
    
    # Generar lags tn_1 a tn_11 por producto
    for i in range(1, 12):
        df[f'tn_{i}'] = df.groupby('product_id')['tn'].shift(i)
    

    df['delta_1_2'] = df['tn_1'] / df['tn_2'].replace(0, np.nan)
    df['delta_1_3'] = df['tn_1'] / df['tn_3'].replace(0, np.nan)
    df['ratio_12'] = (df['tn_1'] - df['tn_2']) / df['tn_2'].replace(0, np.nan)

    # Crear campo "clase" con tn en periodo+2
    df['clase'] = df.groupby('product_id')['tn'].shift(-2)
    
    return df

In [56]:
df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
df_prep = preparar_dataset_para_regresion(df)  # Preparar el dataset
df_prep['year'] = df_prep['periodo'].astype(str).str[:4].astype(int)
df_prep['month'] = df_prep['periodo'] % 100
df_prep['is_q1'] = df_prep['month'].isin([1,2,3]).astype(int)
df_prep['is_q2'] = df_prep['month'].isin([4,5,6]).astype(int)
df_prep['is_end_of_year'] = df_prep['month'].isin([11,12]).astype(int)
df_prep['is_summer'] = df_prep['month'].isin([1,2,12]).astype(int)  # hemisferio sur
df_prep

Unnamed: 0,periodo,product_id,tn,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,...,delta_1_2,delta_1_3,ratio_12,clase,year,month,is_q1,is_q2,is_end_of_year,is_summer
0,201701,20001,934.77222,,,,,,,,...,,,,1303.35771,2017,1,1,0,0,1
785,201702,20001,798.01620,934.77222,,,,,,,...,,,,1069.96130,2017,2,1,0,0,1
1566,201703,20001,1303.35771,798.01620,934.77222,,,,,,...,0.853701,,-0.146299,1502.20132,2017,3,1,0,0,0
2352,201704,20001,1069.96130,1303.35771,798.01620,934.77222,,,,,...,1.633247,1.394305,0.633247,1520.06539,2017,4,0,1,0,0
3136,201705,20001,1502.20132,1069.96130,1303.35771,798.01620,934.77222,,,,...,0.820927,1.340776,-0.179073,1030.67391,2017,5,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
783,201701,21295,0.00699,,,,,,,,...,,,,,2017,1,1,0,0,1
6435,201708,21296,0.00651,,,,,,,,...,,,,,2017,8,0,0,0,0
784,201701,21297,0.00579,,,,,,,,...,,,,,2017,1,1,0,0,1
6436,201708,21298,0.00573,,,,,,,,...,,,,,2017,8,0,0,0,0


# 📚 PASO 2 – Entrenar regresión lineal

In [57]:
def get_productos_36_meses():
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto


    # Paso 1: calcular la cantidad de registros por product_id
    conteo = df.groupby('product_id').size()

    # Paso 2: quedarte solo con los que tienen 36 registros
    productos_36_meses = conteo[conteo == 36].index.tolist()

    # Paso 3: filtrar el DataFrame original
    df_filtrado = df[df['product_id'].isin(productos_36_meses)].copy()

    return df_filtrado

In [58]:
# Lista de productos mágicos
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]


# Entrenar regresión lineal en periodo base con productos mágicos
def entrenar_regresion_lineal(df, periodo_base, magicos):
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)] + ['year'] + ['month'] + ['is_q1'] + ['is_q2'] + ['is_end_of_year'] + ['is_summer'] + ['delta_1_2'] + ['delta_1_3'] + ['ratio_12']
    
    # Filtrar registros del periodo base y productos mágicos
    df_train = df[(df['periodo'] == periodo_base) & (df['product_id'].isin(magicos))].copy()
    
    # Eliminar filas con datos faltantes
    df_train = df_train.dropna(subset=columnas_tn + ['clase'])

    # Entrenar modelo
    X = df_train[columnas_tn]
    y = df_train['clase']
    model = LinearRegression()
    model.fit(X, y)

    # Mostrar coeficientes
    coef = pd.Series([model.intercept_] + model.coef_.tolist(), index=['intercept'] + columnas_tn)
    return model, coef


# 🔮 PASO 3 – Predecir con el modelo entrenado

In [59]:
def productos_con_historia():
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir

    contador = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
    contador = contador.groupby('product_id').size().reset_index(name='count')
    product_id = contador[contador['count']==12]['product_id'].unique()

    return product_id

In [60]:
# df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
# df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
# productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
# df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir
# df = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
# df = df[df['product_id'].isin(productos_con_historia())]  # Filtrar productos con historia
# df_pred = df[df['periodo'] == 201912].copy() 
# df_pred['tn'].sum()

In [None]:
# Predecir para un periodo futuro
def predecir_regresion(model):
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn': 'sum'}).reset_index()
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()
    df = df[(df['periodo'] >= 201901) & (df['periodo'] <= 201912)]
    df = df[df['product_id'].isin(productos_con_historia())]
    df['year'] = df['periodo'].astype(str).str[:4].astype(int)
    df_prep['month'] = df_prep['periodo'] % 100
    df_prep['is_q1'] = df_prep['month'].isin([1,2,3]).astype(int)
    df_prep['is_q2'] = df_prep['month'].isin([4,5,6]).astype(int)
    df_prep['is_end_of_year'] = df_prep['month'].isin([11,12]).astype(int)
    df_prep['is_summer'] = df_prep['month'].isin([1,2,12]).astype(int)  # hemisferio sur
    df = df.sort_values(['product_id', 'periodo'])

    # Generar lags tn_1 a tn_11 por producto
    for i in range(1, 12):
        df[f'tn_{i}'] = df.groupby('product_id')['tn'].shift(i)

    # Agregar aquí las features que faltaban
    df['delta_1_2'] = df['tn_1'] / df['tn_2'].replace(0, np.nan)
    df['delta_1_3'] = df['tn_1'] / df['tn_3'].replace(0, np.nan)
    df['ratio_12'] = (df['tn_1'] - df['tn_2']) / df['tn_2'].replace(0, np.nan)

    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)] + ['year'] + ['month'] + ['is_q1'] + ['is_q2'] + ['is_end_of_year'] + ['is_summer'] + ['delta_1_2'] + ['delta_1_3'] + ['ratio_12']

    # Filtrar periodo de predicción
    df_pred = df[df['periodo'] == 201912].copy()

    # Asegurar que no haya nulls
    df_pred = df_pred.dropna(subset=columnas_tn)

    # Predecir
    df_pred['pred'] = model.predict(df_pred[columnas_tn])

    return df_pred


def predecir_no_completos():
    
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir
    df = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
    df = df[~df['product_id'].isin(productos_con_historia())]  # Filtrar productos con historia
    
    df = df.sort_values(['product_id', 'periodo'])
    
    df = df.groupby('product_id').agg({'tn':'mean'}).reset_index()  # Tomar los últimos 12 meses de cada producto
    
    return df

In [62]:
# prods = predecir_regresion(modelo)
# prods.sort_values(['product_id','periodo'], inplace=True)
# prods

# ▶️ PASO 4 – Ejecutar todo el flujo

In [63]:
# Paso A: preparar dataset (con tus datos originales en df)
# df_prep = preparar_dataset_para_regresion(df)

# Paso B: entrenar el modelo con periodo 201812
modelo, coeficientes = entrenar_regresion_lineal(df_prep, periodo_base=201812, magicos=magicos)
print("Coeficientes encontrados:")
print(coeficientes)

# Paso C: predecir para registros del periodo 201912 → objetivo: predecir 202002
df_pred_con_historia = predecir_regresion(modelo)
df_pred_con_historia = df_pred_con_historia[['product_id', 'pred']].rename(columns={'pred': 'tn'})

df_pred_sin_historia = predecir_no_completos()

df_pred = pd.concat([df_pred_con_historia, df_pred_sin_historia], ignore_index=True)

# Paso D: ver primeras predicciones
df_pred

Coeficientes encontrados:
intercept        -18.682075
tn                 0.121090
tn_1               0.207221
tn_2               0.055012
tn_3               0.028440
tn_4              -0.112903
tn_5              -0.035531
tn_6               0.144624
tn_7              -0.001131
tn_8               0.157050
tn_9               0.108254
tn_10              0.086909
tn_11              0.106625
year               0.000000
month              0.000000
is_q1              0.000000
is_q2              0.000000
is_end_of_year     0.000000
is_summer          0.000000
delta_1_2        -16.461077
delta_1_3         33.231533
ratio_12         -16.461077
dtype: float64


KeyError: ['month', 'is_q1', 'is_q2', 'is_end_of_year', 'is_summer']

# 💾 PASO 5 – Exportar a CSV (opcional)

In [None]:
# # Exportar archivo para enviar a Kaggle (ajustar columnas si necesario)
# df_pred[['product_id', 'tn']].to_csv("./outputs/predicciones_regresion_lineal_avg3.csv", index=False)

In [None]:
# a = pd.read_csv('./outputs/predicciones_regresion_lineal_v1.csv', sep=',')  # Cargar el dataset
# b = pd.read_csv('./outputs/prediccion_autogluon_2ventanas.csv', sep=',')  # Cargar el dataset

# a = a.merge(b, on='product_id', how='left')
# a['mean'] = a[['tn_x', 'tn_y']].mean(axis=1)
# a = a[['product_id', 'mean']].rename(columns={'mean': 'tn'})
# a.to_csv("./outputs/predicciones_regresion_lineal_v2.csv", index=False)