In [1]:
import pandas as pd
import numpy as np

DATA_PATH = "data/processed"

df = pd.read_parquet(f"{DATA_PATH}/monthly_full.parquet")

print(df.shape)
df.head()


(10913850, 5)


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_cnt_month_clipped
0,0,59,22154,1.0,1.0
1,0,59,2552,0.0,0.0
2,0,59,2554,0.0,0.0
3,0,59,2555,0.0,0.0
4,0,59,2564,0.0,0.0


In [2]:
df = df.sort_values(["date_block_num", "shop_id", "item_id"]).reset_index(drop=True)

df[["date_block_num", "shop_id", "item_id"]].head()


Unnamed: 0,date_block_num,shop_id,item_id
0,0,0,19
1,0,0,27
2,0,0,28
3,0,0,29
4,0,0,32


In [3]:
def add_lag(df, lags, col):
    for lag in lags:
        df[f"{col}_lag_{lag}"] = (
            df
            .groupby(["shop_id", "item_id"])[col]
            .shift(lag)
        )
    return df


In [4]:
LAGS = [1, 2, 3, 6, 12]

df = add_lag(df, LAGS, "item_cnt_month_clipped")

df[[f"item_cnt_month_clipped_lag_{l}" for l in LAGS]].head(15)


Unnamed: 0,item_cnt_month_clipped_lag_1,item_cnt_month_clipped_lag_2,item_cnt_month_clipped_lag_3,item_cnt_month_clipped_lag_6,item_cnt_month_clipped_lag_12
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [5]:
lag_cols = [f"item_cnt_month_clipped_lag_{l}" for l in LAGS]

df[lag_cols].isna().mean().sort_values(ascending=False)


item_cnt_month_clipped_lag_12    0.745040
item_cnt_month_clipped_lag_6     0.469699
item_cnt_month_clipped_lag_3     0.269581
item_cnt_month_clipped_lag_2     0.191327
item_cnt_month_clipped_lag_1     0.104107
dtype: float64

In [6]:
df[lag_cols] = df[lag_cols].fillna(0)

df[lag_cols].describe()


Unnamed: 0,item_cnt_month_clipped_lag_1,item_cnt_month_clipped_lag_2,item_cnt_month_clipped_lag_3,item_cnt_month_clipped_lag_6,item_cnt_month_clipped_lag_12
count,10913850.0,10913850.0,10913850.0,10913850.0,10913850.0
mean,0.2882394,0.2768729,0.2651453,0.2282167,0.141348
std,1.205493,1.185778,1.167193,1.104924,0.9053981
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,20.0,20.0,20.0,20.0,20.0


In [7]:
# Promedio mensual de ventas por tienda 
shop_month_mean = (
    df.groupby(["date_block_num", "shop_id"], as_index=False)["item_cnt_month_clipped"]
      .mean()
      .rename(columns={"item_cnt_month_clipped": "shop_month_mean"})
)

# Promedio mensual de ventas por producto 
item_month_mean = (
    df.groupby(["date_block_num", "item_id"], as_index=False)["item_cnt_month_clipped"]
      .mean()
      .rename(columns={"item_cnt_month_clipped": "item_month_mean"})
)

df = df.merge(shop_month_mean, on=["date_block_num", "shop_id"], how="left")
df = df.merge(item_month_mean, on=["date_block_num", "item_id"], how="left")

df[["shop_month_mean", "item_month_mean"]].describe()



Unnamed: 0,shop_month_mean,item_month_mean
count,10913850.0,10913850.0
mean,0.2982399,0.2982399
std,0.2197987,0.848358
min,0.0,0.0
25%,0.1722945,0.03921569
50%,0.2341354,0.08695652
75%,0.3216047,0.2391304
max,1.799243,19.55556


In [8]:
for col in ["shop_month_mean", "item_month_mean"]:
    df = add_lag(df, [1], col)

df[["shop_month_mean_lag_1", "item_month_mean_lag_1"]].isna().mean()


shop_month_mean_lag_1    0.104107
item_month_mean_lag_1    0.104107
dtype: float64

In [9]:
df[["shop_month_mean_lag_1", "item_month_mean_lag_1"]] = df[["shop_month_mean_lag_1", "item_month_mean_lag_1"]].fillna(0)
df[["shop_month_mean_lag_1", "item_month_mean_lag_1"]].describe()


Unnamed: 0,shop_month_mean_lag_1,item_month_mean_lag_1
count,10913850.0,10913850.0
mean,0.2694319,0.2871383
std,0.2277702,0.840351
min,0.0,0.0
25%,0.1492471,0.02222222
50%,0.2231922,0.08333333
75%,0.3085226,0.2272727
max,1.799243,19.55556


In [10]:
df.isna().mean().sort_values(ascending=False).head(15)


date_block_num                   0.0
shop_id                          0.0
item_id                          0.0
item_cnt_month                   0.0
item_cnt_month_clipped           0.0
item_cnt_month_clipped_lag_1     0.0
item_cnt_month_clipped_lag_2     0.0
item_cnt_month_clipped_lag_3     0.0
item_cnt_month_clipped_lag_6     0.0
item_cnt_month_clipped_lag_12    0.0
shop_month_mean                  0.0
item_month_mean                  0.0
shop_month_mean_lag_1            0.0
item_month_mean_lag_1            0.0
dtype: float64

In [11]:
OUT_PATH = "data/processed/features.parquet"

df.to_parquet(OUT_PATH, index=False)

print("Features guardadas en:", OUT_PATH)
print("Shape final:", df.shape)



Features guardadas en: data/processed/features.parquet
Shape final: (10913850, 14)
