# PREPROCESSING

In [None]:
import pandas as pd

# Read the raw training table and parse its date column.
train = pd.read_csv("train.csv")
train["Date"] = pd.to_datetime(train["Date"])

# Key every row by a "<Store>_<Dept>" series identifier.
store_key = train["Store"].astype(str)
dept_key = train["Dept"].astype(str)
train["series_id"] = store_key + "_" + dept_key

# Rank the series by total Weekly_Sales and keep the ids of the 40 largest.
sales_per_series = train.groupby("series_id")["Weekly_Sales"].sum()
ranked_sales = sales_per_series.sort_values(ascending=False)
top_series = ranked_sales.head(40).index

# Restrict to those series; .copy() makes the subset independent of `train`.
keep_mask = train["series_id"].isin(top_series)
train_ds = train[keep_mask].copy()

# Reshape to the long unique_id / ds / y layout (Nixtla panel format),
# ordered by series and then by date.
panel_columns = {"series_id": "unique_id", "Date": "ds", "Weekly_Sales": "y"}
Y_df = train_ds.rename(columns=panel_columns)
Y_df = Y_df[["unique_id", "ds", "y"]]
Y_df = Y_df.sort_values(["unique_id", "ds"])

# Persist the panel to CSV without writing the row index.
Y_df.to_csv("downsampled_df.csv", index=False)

print("Done! Saved downsampled_df.csv")
print("Number of series:", Y_df["unique_id"].nunique())


Done! Saved downsampled_df.csv
Number of series: 40


In [None]:
import pandas as pd

# Reload the saved panel. read_csv is not asked to parse dates here, so the
# "ds" column is converted to datetime explicitly right after loading.
Y_df = pd.read_csv("downsampled_df.csv").assign(
    ds=lambda frame: pd.to_datetime(frame["ds"])
)

# Quick sanity check: first rows and the number of distinct series.
print(Y_df.head())
print("Number of series:", Y_df["unique_id"].nunique())


  unique_id         ds          y
0      10_2 2010-02-05  123952.48
1      10_2 2010-02-12  119209.48
2      10_2 2010-02-19  121430.80
3      10_2 2010-02-26  120292.15
4      10_2 2010-03-05  113163.91
Number of series: 40
