In [1]:
import sys
import os

# Añadir la raíz del proyecto (la carpeta donde está tu src/)
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [3]:
import pandas as pd
import numpy as np

# 1) Carga de datos
sell_in       = pd.read_csv('../data/sell-in.txt', sep='\t')
prod_vigentes = pd.read_csv('../data/product_id_apredecir201912.txt', sep='\t')  # contiene al menos product_id
productos = pd.read_csv('../data/tb_productos_05262025.txt', sep='\t')  # contiene al menos product_id


In [4]:
# 2) Parseo de 'periodo' a datetime
sell_in['periodo'] = pd.to_datetime(
    sell_in['periodo'].astype(str) + '01',
    format='%Y%m%d'
)

In [5]:
sell_in.shape

(2945818, 7)

In [6]:
# 3) Agregar cust_request_qty, cust_request_tn y tn por (periodo, customer_id, product_id)

sell_in_agg = (
    sell_in
    .groupby(['periodo','product_id'], as_index=False)
    .agg({
        'tn': 'sum',
        'cust_request_qty': 'sum',
        'cust_request_tn': 'sum'
    })
)

In [7]:
# hacer un join con prod_vigentes para quedarnos solo con los productos vigentes
sell_in_agg = sell_in_agg.merge(
    prod_vigentes[['product_id']],
    on='product_id',
    how='inner'
)

In [8]:
sell_in_agg.drop(columns=['cust_request_qty', 'cust_request_tn'], inplace=True)
sell_in_agg

Unnamed: 0,periodo,product_id,tn
0,2017-01-01,20001,934.77222
1,2017-01-01,20002,550.15707
2,2017-01-01,20003,1063.45835
3,2017-01-01,20004,555.91614
4,2017-01-01,20005,494.27011
...,...,...,...
22344,2019-12-01,21263,0.01270
22345,2019-12-01,21265,0.05007
22346,2019-12-01,21266,0.05121
22347,2019-12-01,21267,0.01569


In [9]:
# Crear un nuevo campo llamado tn_mas_2 que es el campo tn de 2 periodos a futuro
sell_in_agg['tn_mas_2'] = sell_in_agg.groupby(['product_id'])['tn'].shift(-2)


In [10]:
sell_in_agg

Unnamed: 0,periodo,product_id,tn,tn_mas_2
0,2017-01-01,20001,934.77222,1303.35771
1,2017-01-01,20002,550.15707,834.73521
2,2017-01-01,20003,1063.45835,917.16548
3,2017-01-01,20004,555.91614,489.91328
4,2017-01-01,20005,494.27011,563.89955
...,...,...,...,...
22344,2019-12-01,21263,0.01270,
22345,2019-12-01,21265,0.05007,
22346,2019-12-01,21266,0.05121,
22347,2019-12-01,21267,0.01569,


In [11]:
sell_in_agg[sell_in_agg['periodo'] == '2019-12-01']

Unnamed: 0,periodo,product_id,tn,tn_mas_2
21569,2019-12-01,20001,1504.68856,
21570,2019-12-01,20002,1087.30855,
21571,2019-12-01,20003,892.50129,
21572,2019-12-01,20004,637.90002,
21573,2019-12-01,20005,593.24443,
...,...,...,...,...
22344,2019-12-01,21263,0.01270,
22345,2019-12-01,21265,0.05007,
22346,2019-12-01,21266,0.05121,
22347,2019-12-01,21267,0.01569,


In [12]:
sell_in_agg['mean_last_3m'] = (
    sell_in_agg
      .groupby('product_id')['tn']
      .apply(lambda x: x.rolling(window=3, min_periods=1).mean())
      .reset_index(level=0, drop=True)
)

In [13]:
for lag in range (1, 12):
    sell_in_agg[f'tn_{lag}'] = (sell_in_agg
        .groupby(['product_id'])['tn']
        .shift(lag)
    )

In [14]:
# Renombrar el campo tn a tn_0
sell_in_agg.rename(columns={'tn': 'tn_0'}, inplace=True)

In [15]:
sell_in_agg


Unnamed: 0,periodo,product_id,tn_0,tn_mas_2,mean_last_3m,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11
0,2017-01-01,20001,934.77222,1303.35771,934.772220,,,,,,,,,,,
1,2017-01-01,20002,550.15707,834.73521,550.157070,,,,,,,,,,,
2,2017-01-01,20003,1063.45835,917.16548,1063.458350,,,,,,,,,,,
3,2017-01-01,20004,555.91614,489.91328,555.916140,,,,,,,,,,,
4,2017-01-01,20005,494.27011,563.89955,494.270110,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22344,2019-12-01,21263,0.01270,,0.020230,0.03247,0.01552,0.01128,0.03388,0.03387,0.00988,0.02258,0.01835,0.06636,0.05927,0.04376
22345,2019-12-01,21265,0.05007,,0.075093,0.06600,0.10921,0.01707,0.01593,0.02959,0.05121,0.17635,0.36405,0.01593,,
22346,2019-12-01,21266,0.05121,,0.078883,0.06713,0.11831,0.02844,0.01480,0.05916,0.05235,0.17634,0.36178,0.01707,,
22347,2019-12-01,21267,0.01569,,0.050990,0.04052,0.09676,0.01830,0.04054,0.07452,0.05882,0.24451,0.12291,0.21578,,


In [16]:
from sklearn.linear_model import LinearRegression

# 2) Filtras sólo diciembre 2018 y eliminas filas incompletas:
dataset_training = (
    sell_in_agg[sell_in_agg['periodo'] == '2018-12-01']
      .dropna(subset=['tn_mas_2'] + [f'tn_{i}' for i in range(12)])
)

In [17]:
dataset_training

Unnamed: 0,periodo,product_id,tn_0,tn_mas_2,mean_last_3m,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11
12906,2018-12-01,20001,1486.68669,1259.09363,1864.966707,1813.01511,2295.19832,1438.67455,1800.96168,1470.41009,1150.79169,1293.89788,1251.28462,1856.83534,1043.76470,1169.07532
12907,2018-12-01,20002,1009.45458,1043.01349,1384.918527,1766.81068,1378.49032,954.23575,1161.88430,977.40239,1033.82845,1103.39191,999.20934,966.86044,712.00087,984.80167
12908,2018-12-01,20003,769.82869,758.32657,1096.696177,1206.91773,1313.34211,912.34156,955.97079,656.22700,660.73323,784.35885,765.47838,778.55594,788.30749,907.56304
12909,2018-12-01,20004,585.56477,441.70332,732.527440,802.34669,809.67086,948.86342,936.42001,653.42310,447.84475,641.37063,611.51237,488.92473,503.65326,415.52538
12910,2018-12-01,20005,372.63428,409.89950,578.546193,469.26344,893.74086,761.77520,874.88924,502.34077,547.62513,637.11135,496.41774,559.98671,399.20878,417.53208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13545,2018-12-01,21207,0.12478,0.18451,0.125490,0.11475,0.13694,0.12438,0.18884,0.21173,0.24206,0.21064,0.19744,0.35279,0.27239,0.39088
13547,2018-12-01,21212,0.09869,0.12051,0.159813,0.13082,0.24993,0.12490,0.17085,0.16847,0.21680,0.28263,0.19330,0.22093,0.22882,0.24341
13548,2018-12-01,21218,0.03092,0.05751,0.045113,0.02618,0.07824,0.07952,0.20672,0.16725,0.09627,0.14214,0.19183,0.16635,0.03003,0.04950
13549,2018-12-01,21222,0.02839,0.03786,0.049983,0.04150,0.08006,0.06261,0.09394,0.09829,0.08301,0.09830,0.09173,0.06919,0.04879,0.12522


In [18]:

dataset_training.drop(columns=['periodo'], inplace=True)

In [19]:
dataset_training

Unnamed: 0,product_id,tn_0,tn_mas_2,mean_last_3m,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11
12906,20001,1486.68669,1259.09363,1864.966707,1813.01511,2295.19832,1438.67455,1800.96168,1470.41009,1150.79169,1293.89788,1251.28462,1856.83534,1043.76470,1169.07532
12907,20002,1009.45458,1043.01349,1384.918527,1766.81068,1378.49032,954.23575,1161.88430,977.40239,1033.82845,1103.39191,999.20934,966.86044,712.00087,984.80167
12908,20003,769.82869,758.32657,1096.696177,1206.91773,1313.34211,912.34156,955.97079,656.22700,660.73323,784.35885,765.47838,778.55594,788.30749,907.56304
12909,20004,585.56477,441.70332,732.527440,802.34669,809.67086,948.86342,936.42001,653.42310,447.84475,641.37063,611.51237,488.92473,503.65326,415.52538
12910,20005,372.63428,409.89950,578.546193,469.26344,893.74086,761.77520,874.88924,502.34077,547.62513,637.11135,496.41774,559.98671,399.20878,417.53208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13545,21207,0.12478,0.18451,0.125490,0.11475,0.13694,0.12438,0.18884,0.21173,0.24206,0.21064,0.19744,0.35279,0.27239,0.39088
13547,21212,0.09869,0.12051,0.159813,0.13082,0.24993,0.12490,0.17085,0.16847,0.21680,0.28263,0.19330,0.22093,0.22882,0.24341
13548,21218,0.03092,0.05751,0.045113,0.02618,0.07824,0.07952,0.20672,0.16725,0.09627,0.14214,0.19183,0.16635,0.03003,0.04950
13549,21222,0.02839,0.03786,0.049983,0.04150,0.08006,0.06261,0.09394,0.09829,0.08301,0.09830,0.09173,0.06919,0.04879,0.12522


In [20]:
# # 3) Definir X e y:
# feature_cols = [f'tn_{i}' for i in range(12)]   # ['tn_0','tn_1',…,'tn_11']
# X_train = dataset_training[feature_cols]
# y_train = dataset_training['tn_mas_2']

In [21]:
# # 4) Ajustar la regresión lineal:
# lr = LinearRegression()
# lr.fit(X_train, y_train)


In [22]:


# # 5) Echa un vistazo a R² en el mismo train (o en cross-val):
# print("R² en diciembre 2018:", lr.score(X_train, y_train))

# # 6) Cuando quieras predecir febrero 2020, construye otro 
# #    DataFrame análogo para periodo='2019-12-01' y usa:
# # X_new = df[df['periodo']=='2019-12-01'][feature_cols]
# # y_pred = lr.predict(X_new)

In [23]:
magicos = [
 20002,20003,20006,20010,20011,20018,20019,20021,
 20026,20028,20035,20039,20042,20044,20045,20046,
 20049,20051,20052,20053,20055,20008,20001,20017,
 20086,20180,20193,20320,20532,20612,20637,20807,20838
]

In [24]:
df_magicos = dataset_training.copy()  

In [25]:



# 3) Extraigo sólo diciembre-2019 y los productos “mágicos”:
df_magicos = (
    df_magicos[
      (df_magicos['product_id'].isin(magicos))
    ]
    .set_index('product_id')
)

In [26]:
df_magicos.drop(columns=['mean_last_3m'], inplace=True)
df_magicos

Unnamed: 0_level_0,tn_0,tn_mas_2,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
20001,1486.68669,1259.09363,1813.01511,2295.19832,1438.67455,1800.96168,1470.41009,1150.79169,1293.89788,1251.28462,1856.83534,1043.7647,1169.07532
20002,1009.45458,1043.01349,1766.81068,1378.49032,954.23575,1161.8843,977.40239,1033.82845,1103.39191,999.20934,966.86044,712.00087,984.80167
20003,769.82869,758.32657,1206.91773,1313.34211,912.34156,955.97079,656.227,660.73323,784.35885,765.47838,778.55594,788.30749,907.56304
20006,407.75925,479.99914,566.66809,513.15472,478.04388,615.70617,515.20419,468.1526,865.28861,748.44391,862.19361,588.56272,470.33785
20008,426.32899,476.98787,433.5017,532.45644,436.96269,554.82147,526.38149,554.57063,707.59267,691.53246,765.98901,506.25385,469.29224
20010,285.02947,337.76009,414.97753,612.50721,480.60235,582.83104,331.96807,223.87746,227.24082,171.74107,653.77607,477.48363,298.25586
20011,321.09714,431.62938,289.13976,177.75576,189.5985,191.0727,300.26178,437.7555,484.04538,562.70214,526.99374,601.26066,340.75314
20017,259.32724,308.7106,286.83676,331.23254,288.35292,374.95908,351.60065,316.45841,533.53335,550.29417,488.79258,377.84497,291.70926
20018,326.01506,265.84135,371.52958,161.58557,282.43485,375.61778,325.03223,420.33781,388.43687,543.06908,510.33171,337.54792,342.16945
20019,446.69747,323.66178,532.98143,552.71975,417.95455,387.73155,351.0561,262.33076,356.42982,290.39581,321.26878,629.89543,243.71984


In [27]:
df_magicos.info()


<class 'pandas.core.frame.DataFrame'>
Index: 33 entries, 20001 to 20838
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tn_0      33 non-null     float64
 1   tn_mas_2  33 non-null     float64
 2   tn_1      33 non-null     float64
 3   tn_2      33 non-null     float64
 4   tn_3      33 non-null     float64
 5   tn_4      33 non-null     float64
 6   tn_5      33 non-null     float64
 7   tn_6      33 non-null     float64
 8   tn_7      33 non-null     float64
 9   tn_8      33 non-null     float64
 10  tn_9      33 non-null     float64
 11  tn_10     33 non-null     float64
 12  tn_11     33 non-null     float64
dtypes: float64(13)
memory usage: 3.6 KB


In [28]:
# 3) Definir X e y:
feature_cols = [f'tn_{i}' for i in range(12)]   # ['tn_0','tn_1',…,'tn_11']
X_train = df_magicos[feature_cols]
y_train = df_magicos['tn_mas_2']

In [29]:
# 4) Ajustar la regresión lineal:
lr = LinearRegression()
lr.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [30]:
# Entrenar el modelo

In [31]:
# 5) Echa un vistazo a R² en el mismo train (o en cross-val):
print("R² en diciembre 2018:", lr.score(X_train, y_train))

R² en diciembre 2018: 0.9882870446930067


In [32]:
# Mostrar coeficientes de la regresión
coeficientes = pd.Series(lr.coef_, index=feature_cols)
coeficientes.sort_values(ascending=False)


tn_1     0.236558
tn_2     0.178208
tn_6     0.151936
tn_8     0.142839
tn_10    0.119211
tn_9     0.103804
tn_11    0.073671
tn_7     0.043933
tn_0    -0.001339
tn_5    -0.007775
tn_3    -0.060031
tn_4    -0.161875
dtype: float64

In [33]:
# Mostrar el intercepto
print("Intercepto:", lr.intercept_)

Intercepto: 0.4414671835401691


In [34]:
# Realizar un merge entre sell_in para el periodo 2019-12-01 con prod_vigentes para quedarnos sólo con los productos vigentes

# Crear un nuevo campo llamado tn_mas_2 que es el campo tn de 2 periodos a futuro
# sell_in_agg['tn_mas_2'] = sell_in_agg.groupby(['product_id'])['tn'].shift(-2)
dataset_201912_vigentes = (
    sell_in_agg[sell_in_agg['periodo'] == '2019-12-01']
      .merge(prod_vigentes[['product_id']], on='product_id', how='inner')
)
dataset_201912_vigentes

Unnamed: 0,periodo,product_id,tn_0,tn_mas_2,mean_last_3m,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11
0,2019-12-01,20001,1504.68856,,1487.855463,1397.37231,1561.50552,1660.00561,1261.34529,1678.99318,1109.93769,1629.78233,1647.63848,1470.65653,1259.09363,1275.77351
1,2019-12-01,20002,1087.30855,,1496.807430,1423.57739,1979.53635,1090.18771,813.78215,1066.44999,928.36431,1034.98927,1287.62346,1083.62552,1043.01349,1266.78751
2,2019-12-01,20003,892.50129,,974.053890,948.29393,1081.36645,967.77116,635.59563,715.20314,662.38654,590.12515,565.33774,638.04010,758.32657,964.76919
3,2019-12-01,20004,637.90002,,808.846137,723.94206,1064.69633,786.17140,482.13372,521.71519,667.19411,603.31081,466.70901,619.77084,441.70332,511.33713
4,2019-12-01,20005,593.24443,,732.312970,606.91173,996.78275,879.52808,536.66800,745.74978,876.39696,897.26297,624.99880,488.21387,409.89950,363.58438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,2019-12-01,21263,0.01270,,0.020230,0.03247,0.01552,0.01128,0.03388,0.03387,0.00988,0.02258,0.01835,0.06636,0.05927,0.04376
776,2019-12-01,21265,0.05007,,0.075093,0.06600,0.10921,0.01707,0.01593,0.02959,0.05121,0.17635,0.36405,0.01593,,
777,2019-12-01,21266,0.05121,,0.078883,0.06713,0.11831,0.02844,0.01480,0.05916,0.05235,0.17634,0.36178,0.01707,,
778,2019-12-01,21267,0.01569,,0.050990,0.04052,0.09676,0.01830,0.04054,0.07452,0.05882,0.24451,0.12291,0.21578,,


In [35]:
dataset_201912_vigentes.drop(columns=['tn_mas_2'], inplace=True)


In [36]:
dataset_201912_vigentes

Unnamed: 0,periodo,product_id,tn_0,mean_last_3m,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11
0,2019-12-01,20001,1504.68856,1487.855463,1397.37231,1561.50552,1660.00561,1261.34529,1678.99318,1109.93769,1629.78233,1647.63848,1470.65653,1259.09363,1275.77351
1,2019-12-01,20002,1087.30855,1496.807430,1423.57739,1979.53635,1090.18771,813.78215,1066.44999,928.36431,1034.98927,1287.62346,1083.62552,1043.01349,1266.78751
2,2019-12-01,20003,892.50129,974.053890,948.29393,1081.36645,967.77116,635.59563,715.20314,662.38654,590.12515,565.33774,638.04010,758.32657,964.76919
3,2019-12-01,20004,637.90002,808.846137,723.94206,1064.69633,786.17140,482.13372,521.71519,667.19411,603.31081,466.70901,619.77084,441.70332,511.33713
4,2019-12-01,20005,593.24443,732.312970,606.91173,996.78275,879.52808,536.66800,745.74978,876.39696,897.26297,624.99880,488.21387,409.89950,363.58438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,2019-12-01,21263,0.01270,0.020230,0.03247,0.01552,0.01128,0.03388,0.03387,0.00988,0.02258,0.01835,0.06636,0.05927,0.04376
776,2019-12-01,21265,0.05007,0.075093,0.06600,0.10921,0.01707,0.01593,0.02959,0.05121,0.17635,0.36405,0.01593,,
777,2019-12-01,21266,0.05121,0.078883,0.06713,0.11831,0.02844,0.01480,0.05916,0.05235,0.17634,0.36178,0.01707,,
778,2019-12-01,21267,0.01569,0.050990,0.04052,0.09676,0.01830,0.04054,0.07452,0.05882,0.24451,0.12291,0.21578,,


In [37]:
dataset_656 = dataset_201912_vigentes.dropna()
dataset_656

Unnamed: 0,periodo,product_id,tn_0,mean_last_3m,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11
0,2019-12-01,20001,1504.68856,1487.855463,1397.37231,1561.50552,1660.00561,1261.34529,1678.99318,1109.93769,1629.78233,1647.63848,1470.65653,1259.09363,1275.77351
1,2019-12-01,20002,1087.30855,1496.807430,1423.57739,1979.53635,1090.18771,813.78215,1066.44999,928.36431,1034.98927,1287.62346,1083.62552,1043.01349,1266.78751
2,2019-12-01,20003,892.50129,974.053890,948.29393,1081.36645,967.77116,635.59563,715.20314,662.38654,590.12515,565.33774,638.04010,758.32657,964.76919
3,2019-12-01,20004,637.90002,808.846137,723.94206,1064.69633,786.17140,482.13372,521.71519,667.19411,603.31081,466.70901,619.77084,441.70332,511.33713
4,2019-12-01,20005,593.24443,732.312970,606.91173,996.78275,879.52808,536.66800,745.74978,876.39696,897.26297,624.99880,488.21387,409.89950,363.58438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770,2019-12-01,21248,0.01129,0.017877,0.02964,0.01270,0.01411,0.02117,0.02116,0.00988,0.01553,0.03106,0.05365,0.06209,0.02962
772,2019-12-01,21256,0.01271,0.016000,0.02682,0.00847,0.00423,0.02965,0.02822,0.00988,0.01553,0.01835,0.05930,0.05081,0.03811
773,2019-12-01,21259,0.01412,0.021173,0.02965,0.01975,0.00564,0.03106,0.04657,0.00988,0.01976,0.02117,0.06777,0.05080,0.04234
774,2019-12-01,21262,0.01834,0.020697,0.02682,0.01693,0.01552,0.02258,0.03953,0.01270,0.01130,0.01412,0.06353,0.05786,0.02680


In [38]:
# Obtener el complemento de los 124 elementos
dataset_124 = dataset_201912_vigentes[dataset_201912_vigentes.isna().any(axis=1)]
dataset_124

Unnamed: 0,periodo,product_id,tn_0,mean_last_3m,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11
31,2019-12-01,20032,527.79811,728.554807,906.69823,751.16808,629.90072,703.14059,698.02627,605.54931,488.43471,361.38220,573.57324,247.9988,
111,2019-12-01,20127,170.32792,273.648603,463.80054,186.81735,12.80399,,,,,,,,
144,2019-12-01,20174,91.11780,102.493790,68.84756,147.51601,129.99999,120.97136,171.23158,73.99801,,,,,
171,2019-12-01,20210,58.78261,106.976320,134.38972,127.75663,7.78032,,,,,,,,
174,2019-12-01,20213,58.84043,87.258370,104.19056,98.74412,86.10789,42.71804,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,2019-12-01,21252,0.08560,0.101890,0.11982,0.10025,0.14427,0.13446,0.02691,0.07824,0.47194,0.23471,0.38391,,
776,2019-12-01,21265,0.05007,0.075093,0.06600,0.10921,0.01707,0.01593,0.02959,0.05121,0.17635,0.36405,0.01593,,
777,2019-12-01,21266,0.05121,0.078883,0.06713,0.11831,0.02844,0.01480,0.05916,0.05235,0.17634,0.36178,0.01707,,
778,2019-12-01,21267,0.01569,0.050990,0.04052,0.09676,0.01830,0.04054,0.07452,0.05882,0.24451,0.12291,0.21578,,


In [39]:
lags = [f"tn_{i}" for i in range(1, 12)]

# ejemplo de DataFrame
# df = pd.read_csv(… )

# 1) Calcula la media por fila (ignorando NaN) en las columnas tn_1…tn_11
row_means = dataset_124[lags].mean(axis=1)

# 2) Rellena los NaN de cada columna con la media de su propia fila
dataset_124[lags] = dataset_124[lags].apply(
    lambda row: row.fillna(row.mean()),
    axis=1
)

# —– o equivalentemente —–

dataset_124[lags] = dataset_124[lags].fillna(row_means, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_124[lags] = dataset_124[lags].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_124[lags] = dataset_124[lags].fillna(row_means, axis=0)


In [40]:
dataset_124

Unnamed: 0,periodo,product_id,tn_0,mean_last_3m,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11
31,2019-12-01,20032,527.79811,728.554807,906.69823,751.16808,629.90072,703.140590,698.026270,605.549310,488.434710,361.382200,573.573240,247.998800,596.587215
111,2019-12-01,20127,170.32792,273.648603,463.80054,186.81735,12.80399,221.140627,221.140627,221.140627,221.140627,221.140627,221.140627,221.140627,221.140627
144,2019-12-01,20174,91.11780,102.493790,68.84756,147.51601,129.99999,120.971360,171.231580,73.998010,118.760752,118.760752,118.760752,118.760752,118.760752
171,2019-12-01,20210,58.78261,106.976320,134.38972,127.75663,7.78032,89.975557,89.975557,89.975557,89.975557,89.975557,89.975557,89.975557,89.975557
174,2019-12-01,20213,58.84043,87.258370,104.19056,98.74412,86.10789,42.718040,82.940153,82.940153,82.940153,82.940153,82.940153,82.940153,82.940153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771,2019-12-01,21252,0.08560,0.101890,0.11982,0.10025,0.14427,0.134460,0.026910,0.078240,0.471940,0.234710,0.383910,0.188279,0.188279
776,2019-12-01,21265,0.05007,0.075093,0.06600,0.10921,0.01707,0.015930,0.029590,0.051210,0.176350,0.364050,0.015930,0.093927,0.093927
777,2019-12-01,21266,0.05121,0.078883,0.06713,0.11831,0.02844,0.014800,0.059160,0.052350,0.176340,0.361780,0.017070,0.099487,0.099487
778,2019-12-01,21267,0.01569,0.050990,0.04052,0.09676,0.01830,0.040540,0.074520,0.058820,0.244510,0.122910,0.215780,0.101407,0.101407


In [41]:
# Sumar todos los tn_0 del dataset dataset_656
tn_0 = dataset_656['tn_0'].sum()
tn_0

23447.85905

In [42]:
tn_0_nan = dataset_124['tn_0'].sum()
tn_0_nan

1697.39303

In [43]:
# Merge de dataset_124 con dataset_656
dataset_final = pd.concat([dataset_124, dataset_656], ignore_index=True)
dataset_final.drop(columns=['mean_last_3m','periodo'], inplace=True)
dataset_final

Unnamed: 0,product_id,tn_0,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11
0,20032,527.79811,906.69823,751.16808,629.90072,703.140590,698.026270,605.549310,488.434710,361.382200,573.573240,247.998800,596.587215
1,20127,170.32792,463.80054,186.81735,12.80399,221.140627,221.140627,221.140627,221.140627,221.140627,221.140627,221.140627,221.140627
2,20174,91.11780,68.84756,147.51601,129.99999,120.971360,171.231580,73.998010,118.760752,118.760752,118.760752,118.760752,118.760752
3,20210,58.78261,134.38972,127.75663,7.78032,89.975557,89.975557,89.975557,89.975557,89.975557,89.975557,89.975557,89.975557
4,20213,58.84043,104.19056,98.74412,86.10789,42.718040,82.940153,82.940153,82.940153,82.940153,82.940153,82.940153,82.940153
...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,21248,0.01129,0.02964,0.01270,0.01411,0.021170,0.021160,0.009880,0.015530,0.031060,0.053650,0.062090,0.029620
776,21256,0.01271,0.02682,0.00847,0.00423,0.029650,0.028220,0.009880,0.015530,0.018350,0.059300,0.050810,0.038110
777,21259,0.01412,0.02965,0.01975,0.00564,0.031060,0.046570,0.009880,0.019760,0.021170,0.067770,0.050800,0.042340
778,21262,0.01834,0.02682,0.01693,0.01552,0.022580,0.039530,0.012700,0.011300,0.014120,0.063530,0.057860,0.026800


In [44]:
# Realizar la predicción con el modelo entrenado para los productos mágicos
X_new = dataset_final
y_pred = lr.predict(X_new[feature_cols])
# Mostrar las predicciones
predicciones = pd.DataFrame({
    'product_id': X_new['product_id'],
    'tn_mas_2_pred': y_pred
})


In [45]:
# Ordernar por product_id
predicciones.sort_values(by='product_id', inplace=True)
predicciones

Unnamed: 0,product_id,tn_mas_2_pred
124,20001,1162.707525
125,20002,1183.640604
126,20003,684.763931
127,20004,580.484961
128,20005,563.560780
...,...,...
779,21263,0.467764
120,21265,0.559941
121,21266,0.562138
122,21267,0.539228


In [46]:
# renombrar campo tn_mas_2_pred a tn
predicciones.rename(columns={'tn_mas_2_pred': 'tn'}, inplace=True)

In [47]:
# Guardar las predicciones en un archivo CSV
predicciones.to_csv('../data/predicciones_regresion_lineal_v1.csv', index=False)