In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# üîß PASO 1 ‚Äì Preparar dataset

In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Crear funci√≥n de preparaci√≥n con lags y clase
def preparar_dataset_para_regresion(df):
    df = df.sort_values(['product_id', 'periodo'])
    
    # Generar lags tn_1 a tn_11 por producto
    for i in range(1, 36):
        df[f'tn_{i}'] = df.groupby('product_id')['tn'].shift(i)
    
    # Crear campo "clase" con tn en periodo+2
    df['clase'] = df.groupby('product_id')['tn'].shift(-2)
    
    return df

In [46]:
def get_productos_36_meses():
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto


    # Paso 1: calcular la cantidad de registros por product_id
    conteo = df.groupby('product_id').size()

    # Paso 2: quedarte solo con los que tienen 36 registros
    productos_36_meses = conteo[conteo == 36].index.tolist()

    # Paso 3: filtrar el DataFrame original
    df_filtrado = df[df['product_id'].isin(productos_36_meses)].copy()

    return df_filtrado


def get_dataset_especial():
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    # df = df[df['product_id'].isin(get_productos_36_meses()['product_id'].unique())]
    df_prep = preparar_dataset_para_regresion(df)  # Preparar el dataset
    df_prep = df_prep[df_prep['periodo'] <= 201910]
    df_prep = df_prep[df_prep['product_id'].isin(get_productos_36_meses()['product_id'].unique())]
    df_prep = df_prep.sort_values('periodo').groupby('product_id').tail(1) 
    df_prep.drop(columns={'tn_34', 'tn_35'})
    return df_prep



# columnas_lags = [f'tn_{i}' for i in range(1, 36)]
# columnas_necesarias = columnas_lags + ['clase']

# df_prep = df.dropna(subset=columnas_necesarias).copy()

# df_prep = df_prep.sort_values('periodo').groupby('product_id').tail(1)
# df_prep

# üìö PASO 2 ‚Äì Entrenar regresi√≥n lineal

In [47]:
# Lista de productos m√°gicos
magicos = get_productos_36_meses()['product_id'].unique()


# Entrenar regresi√≥n lineal en periodo base con productos m√°gicos
def entrenar_regresion_lineal(df, periodo_base, magicos,  modelo='ridge', alpha=1.0, random_state=42):
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 33)]
    
    # Filtrar registros del periodo base y productos m√°gicos
    df_train = get_dataset_especial()
    
    # Eliminar filas con datos faltantes
    df_train = df_train.dropna(subset=columnas_tn + ['clase'])

    # Entrenar modelo
    X = df_train[columnas_tn]
    y = df_train['clase']

    # scaler = StandardScaler()
    # X_scaled = scaler.fit_transform(X)
    
    # if modelo == 'ridge':
    #     reg = Ridge(alpha=alpha, random_state=random_state)
    # elif modelo == 'lasso':
    #     reg = Lasso(alpha=alpha, random_state=random_state, max_iter=5000)
    # else:
    #     reg = LinearRegression()
    
    reg = LinearRegression()

    reg.fit(X, y)
    y_pred = reg.predict(X)
    
    
    # Mostrar coeficientes
    coef = pd.Series([reg.intercept_] + reg.coef_.tolist(), index=['intercept'] + columnas_tn)
    return reg, coef.sort_values(ascending=False) 


# üîÆ PASO 3 ‚Äì Predecir con el modelo entrenado

In [49]:
def productos_con_historia():
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir

    contador = df[(df['periodo']>=201701) & (df['periodo']<=201912) ] 
    contador = contador.groupby('product_id').size().reset_index(name='count')
    product_id = contador[contador['count']==36]['product_id'].unique()

    return product_id

In [None]:
# df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
# df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
# productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
# df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir
# df = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
# df = df[df['product_id'].isin(productos_con_historia())]  # Filtrar productos con historia
# df_pred = df[df['periodo'] == 201912].copy() 
# df_pred['tn'].sum()

np.float64(23439.27676)

In [62]:
# Predecir para un periodo futuro
def predecir_regresion(model):
    
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir
    df = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
    df = df[df['product_id'].isin(productos_con_historia())]  # Filtrar productos con historia
    
    df = df.sort_values(['product_id', 'periodo'])
    
    # Generar lags tn_1 a tn_11 por producto
    for i in range(1, 33):
        df[f'tn_{i}'] = df.groupby('product_id')['tn'].shift(i)
    
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 33)]
    
    
    df_pred = df[df['periodo'] == 201912].copy() 
    df_pred.fillna(0, inplace=True)

    # Predecir usando regresi√≥n lineal
    df_pred['pred'] = model.predict(df_pred[columnas_tn])   
    
    return df_pred

def predecir_no_completos():
    
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir
    df = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
    df = df[~df['product_id'].isin(productos_con_historia())]  # Filtrar productos con historia
    
    df = df.sort_values(['product_id', 'periodo'])
    
    df = df.groupby('product_id').agg({'tn':'mean'}).reset_index()  # Tomar los √∫ltimos 12 meses de cada producto
    
    return df

In [57]:
hol = predecir_regresion("asd")
hol

Unnamed: 0,periodo,product_id,tn,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,...,tn_23,tn_24,tn_25,tn_26,tn_27,tn_28,tn_29,tn_30,tn_31,tn_32
30316,201912,20001,1504.68856,1397.37231,1561.50552,1660.00561,1261.34529,1678.99318,1109.93769,1629.78233,...,,,,,,,,,,
30317,201912,20002,1087.30855,1423.57739,1979.53635,1090.18771,813.78215,1066.44999,928.36431,1034.98927,...,,,,,,,,,,
30318,201912,20003,892.50129,948.29393,1081.36645,967.77116,635.59563,715.20314,662.38654,590.12515,...,,,,,,,,,,
30319,201912,20004,637.90002,723.94206,1064.69633,786.17140,482.13372,521.71519,667.19411,603.31081,...,,,,,,,,,,
30320,201912,20005,593.24443,606.91173,996.78275,879.52808,536.66800,745.74978,876.39696,897.26297,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31196,201912,21182,0.06688,0.04370,0.15562,0.03166,0.14046,0.02274,0.00089,0.18552,...,,,,,,,,,,
31200,201912,21192,0.01238,0.04005,0.04441,0.04369,0.02840,0.09466,0.06480,0.15288,...,,,,,,,,,,
31206,201912,21202,0.04587,0.02927,0.05069,0.07474,0.08585,0.08480,0.05822,0.24567,...,,,,,,,,,,
31212,201912,21218,0.03348,0.03129,0.03966,0.07060,0.04513,0.08153,0.02874,0.19473,...,,,,,,,,,,


In [None]:
# prods = predecir_regresion(modelo)
# prods.sort_values(['product_id','periodo'], inplace=True)
# prods

Unnamed: 0,periodo,product_id,tn,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11,pred
30316,201912,20001,1504.68856,1397.37231,1561.50552,1660.00561,1261.34529,1678.99318,1109.93769,1629.78233,1647.63848,1470.65653,1259.09363,1275.77351,1162.707525
30317,201912,20002,1087.30855,1423.57739,1979.53635,1090.18771,813.78215,1066.44999,928.36431,1034.98927,1287.62346,1083.62552,1043.01349,1266.78751,1183.640604
30318,201912,20003,892.50129,948.29393,1081.36645,967.77116,635.59563,715.20314,662.38654,590.12515,565.33774,638.04010,758.32657,964.76919,684.763931
30319,201912,20004,637.90002,723.94206,1064.69633,786.17140,482.13372,521.71519,667.19411,603.31081,466.70901,619.77084,441.70332,511.33713,580.484961
30320,201912,20005,593.24443,606.91173,996.78275,879.52808,536.66800,745.74978,876.39696,897.26297,624.99880,488.21387,409.89950,363.58438,563.560780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31229,201912,21248,0.01129,0.02964,0.01270,0.01411,0.02117,0.02116,0.00988,0.01553,0.03106,0.05365,0.06209,0.02962,0.468061
31232,201912,21256,0.01271,0.02682,0.00847,0.00423,0.02965,0.02822,0.00988,0.01553,0.01835,0.05930,0.05081,0.03811,0.463856
31234,201912,21259,0.01412,0.02965,0.01975,0.00564,0.03106,0.04657,0.00988,0.01976,0.02117,0.06777,0.05080,0.04234,0.467856
31235,201912,21262,0.01834,0.02682,0.01693,0.01552,0.02258,0.03953,0.01270,0.01130,0.01412,0.06353,0.05786,0.02680,0.465820


# ‚ñ∂Ô∏è PASO 4 ‚Äì Ejecutar todo el flujo

In [63]:
# Paso A: preparar dataset (con tus datos originales en df)
# df_prep = preparar_dataset_para_regresion(df)

# Paso B: entrenar el modelo con periodo 201812
modelo, coeficientes = entrenar_regresion_lineal(df_prep, periodo_base=201812, magicos=magicos, modelo='ivan', alpha=1.0, random_state=42)
print("Coeficientes encontrados:")
print(coeficientes)

# Paso C: predecir para registros del periodo 201912 ‚Üí objetivo: predecir 202002
df_pred_con_historia = predecir_regresion(modelo)
df_pred_con_historia = df_pred_con_historia[['product_id', 'pred']].rename(columns={'pred': 'tn'})

df_pred_sin_historia = predecir_no_completos()

df_pred = pd.concat([df_pred_con_historia, df_pred_sin_historia], ignore_index=True)

# Paso D: ver primeras predicciones
df_pred

Coeficientes encontrados:
tn_2         0.250816
tn_10        0.232305
tn_7         0.216191
tn_6         0.196566
tn_21        0.193240
tn_23        0.187153
tn_12        0.180523
tn_4         0.170946
tn           0.125537
tn_1         0.111837
tn_18        0.091976
tn_29        0.066872
tn_24        0.053014
tn_9         0.052922
tn_15        0.042940
tn_28        0.038882
tn_22        0.035626
tn_16       -0.002907
tn_20       -0.007872
tn_25       -0.008782
tn_17       -0.017711
tn_5        -0.030610
tn_13       -0.034343
tn_3        -0.049857
tn_27       -0.056633
tn_19       -0.069283
tn_26       -0.075328
tn_14       -0.085351
tn_31       -0.093627
tn_32       -0.119390
tn_11       -0.223615
tn_8        -0.226298
tn_30       -0.244874
intercept   -1.815694
dtype: float64


Unnamed: 0,product_id,tn
0,20001,1099.177980
1,20002,973.731887
2,20003,650.055379
3,20004,630.553932
4,20005,680.177027
...,...,...
775,21263,0.029993
776,21265,0.089541
777,21266,0.094659
778,21267,0.092835


probando lasso

# üíæ PASO 5 ‚Äì Exportar a CSV (opcional)

In [64]:
# Exportar archivo para enviar a Kaggle (ajustar columnas si necesario)
df_pred[['product_id', 'tn']].to_csv("./outputs/predicciones_regresion_lineal_v8_36m.csv", index=False)

In [61]:
a = pd.read_csv('./outputs/predicciones_regresion_lineal_v1.csv', sep=',')  # Cargar el dataset
b = pd.read_csv('./outputs/prediccion_autogluon_2ventanas.csv', sep=',')  # Cargar el dataset

a = a.merge(b, on='product_id', how='left')
a['mean'] = a[['tn_x', 'tn_y']].mean(axis=1)
a = a[['product_id', 'mean']].rename(columns={'mean': 'tn'})
a.to_csv("./outputs/predicciones_regresion_lineal_v2.csv", index=False)