In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

np.random.seed(42)

dates = pd.date_range("2020-01-01", "2023-12-31", freq="D")
stores = [f"S{i}" for i in range(1, 4)]       # 3 stores
products = [f"P{j}" for j in range(1, 4)]     # 3 products

# create simple synthetic data
records = []
for s in stores:
    for p in products:
        base = np.random.uniform(50, 200)      # base demand level
        seasonality = np.sin(np.linspace(0, 20, len(dates))) * 10
        noise = np.random.normal(0, 5, len(dates))
        sales = base + seasonality + noise
        records.extend(zip([s]*len(dates), [p]*len(dates), dates, sales))

df = pd.DataFrame(records, columns=["store","product","date","sales"])


In [4]:
df

Unnamed: 0,store,product,date,sales,week_start
0,S1,P1,2020-01-01,100.621617,2019-12-30
1,S1,P1,2020-01-02,107.912511,2019-12-30
2,S1,P1,2020-01-03,107.850163,2019-12-30
3,S1,P1,2020-01-04,111.644437,2019-12-30
4,S1,P1,2020-01-05,103.824298,2019-12-30
...,...,...,...,...,...
13144,S3,P3,2023-12-27,129.145189,2023-12-25
13145,S3,P3,2023-12-28,133.463730,2023-12-25
13146,S3,P3,2023-12-29,135.506390,2023-12-25
13147,S3,P3,2023-12-30,130.173824,2023-12-25


In [3]:
# aggregate daily → weekly
df["week_start"] = df["date"] - pd.to_timedelta(df["date"].dt.weekday, unit="D")
weekly = (df.groupby(["store","product","week_start"], as_index=False)
            .agg(sales_week=("sales","sum")))

# sort
weekly = weekly.sort_values(["store","product","week_start"])


In [4]:
weekly.head()

Unnamed: 0,store,product,week_start,sales_week
0,S1,P1,2019-12-30,531.853026
1,S1,P1,2020-01-06,728.968924
2,S1,P1,2020-01-13,750.115904
3,S1,P1,2020-01-20,764.055714
4,S1,P1,2020-01-27,756.200977


In [4]:
weekly[['store','product']].stack().unique()

array(['S1', 'P1', 'P2', 'P3', 'S2', 'S3'], dtype=object)

In [9]:
weekly["sales_lag_1"] = (weekly.groupby(["store","product"])["sales_week"]
                         .shift(1))
weekly["sales_avg_4"] = (weekly.groupby(["store","product"])["sales_week"]
                         .shift(1).rolling(4).mean().reset_index(0, drop=True))
weekly["weekofyear"] = weekly["week_start"].dt.isocalendar().week.astype(int)


In [9]:
weekly.head()

Unnamed: 0,store,product,week_start,sales_week,sales_lag_1,sales_avg_4,weekofyear
0,S1,P1,2019-12-30,531.853026,,,1
1,S1,P1,2020-01-06,728.968924,531.853026,,2
2,S1,P1,2020-01-13,750.115904,728.968924,,3
3,S1,P1,2020-01-20,764.055714,750.115904,,4
4,S1,P1,2020-01-27,756.200977,764.055714,693.748392,5


In [10]:
cutoff = weekly["week_start"].max() - pd.Timedelta(weeks=20)
train_mask = weekly["week_start"] <= cutoff

feat_cols_num = ["sales_lag_1","sales_avg_4","weekofyear"]
feat_cols_cat = ["store","product"]

In [11]:
X_num = weekly[feat_cols_num]
X_cat = pd.get_dummies(weekly[feat_cols_cat], drop_first=False)
X = pd.concat([X_num, X_cat], axis=1)
y = weekly["sales_week"]

X_train, X_test = X[train_mask], X[~train_mask]
y_train, y_test = y[train_mask], y[~train_mask]

In [12]:
X_train

Unnamed: 0,sales_lag_1,sales_avg_4,weekofyear,store_S1,store_S2,store_S3,product_P1,product_P2,product_P3
0,,,1,True,False,False,True,False,False
1,531.853026,,2,True,False,False,True,False,False
2,728.968924,,3,True,False,False,True,False,False
3,750.115904,,4,True,False,False,True,False,False
4,764.055714,693.748392,5,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...
1856,777.171756,758.178293,28,False,False,True,False,False,True
1857,772.663118,765.047648,29,False,False,True,False,False,True
1858,821.003455,779.701725,30,False,False,True,False,False,True
1859,782.785362,788.405923,31,False,False,True,False,False,True


In [13]:
y_train

0       531.853026
1       728.968924
2       750.115904
3       764.055714
4       756.200977
           ...    
1856    772.663118
1857    821.003455
1858    782.785362
1859    800.216478
1860    797.009147
Name: sales_week, Length: 1701, dtype: float64

In [14]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

xgb = XGBRegressor(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    objective="reg:squarederror",
    tree_method="hist",   # fast & scalable
    random_state=42
)

xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
print("MAE (last ~20 weeks):", mean_absolute_error(y_test, pred))


MAE (last ~20 weeks): 24.396029317259632


In [15]:
pred

array([ 669.676  ,  682.5288 ,  690.81714,  716.7318 ,  698.394  ,
        705.4565 ,  696.72253,  739.59375,  725.8655 ,  717.0958 ,
        757.0271 ,  751.3508 ,  768.65704,  770.0109 ,  765.64703,
        761.282  ,  784.36426,  766.8677 ,  796.75806,  807.0404 ,
        481.57468,  473.77206,  481.31595,  472.47296,  500.07718,
        497.29593,  516.32104,  550.7522 ,  523.15106,  523.6612 ,
        540.42255,  538.59845,  531.22614,  548.97064,  550.8762 ,
        574.49994,  578.6291 ,  570.92145,  599.6609 ,  613.45197,
        631.9963 ,  623.16077,  635.49664,  621.75726,  646.09863,
        625.74414,  674.0051 ,  654.74457,  675.85   ,  663.7953 ,
        677.65155,  675.9738 ,  699.8283 ,  717.9369 ,  719.943  ,
        695.8164 ,  736.6289 ,  738.60895,  713.95703,  726.6258 ,
       1034.0353 , 1040.8956 , 1047.7347 , 1026.3146 , 1031.595  ,
       1032.9064 , 1049.2106 , 1066.0693 , 1048.4924 , 1070.4683 ,
       1071.4362 , 1083.1727 , 1083.4216 , 1107.1764 , 1150.45

In [24]:
# Build next-week feature rows from the latest history
last_week = weekly["week_start"].max()
next_week = last_week + timedelta(weeks=1)
next_week_woy = int(pd.Timestamp(next_week).isocalendar().week)

# For each (store, product), compute the features as of "last_week"
latest_per_sp = (weekly.sort_values(["store","product","week_start"])
                        .groupby(["store","product"])
                        .tail(4)  # to compute avg_4
                        .copy())

# compute lag_1 (last week's sales) and avg_4 (prev 4 weeks) per group
lag_1 = (latest_per_sp.groupby(["store","product"])["sales_week"]
         .tail(1).reset_index(drop=True))
avg_4 = (latest_per_sp.groupby(["store","product"])["sales_week"]
         .mean().reset_index(drop=True))

sp_index = (latest_per_sp.groupby(["store","product"])
            .tail(1)[["store","product"]].reset_index(drop=True))
sp_index["sales_lag_1"] = lag_1
sp_index["sales_avg_4"] = avg_4
sp_index["weekofyear"] = next_week_woy

In [25]:
sp_index

Unnamed: 0,store,product,sales_lag_1,sales_avg_4,weekofyear
0,S1,P1,807.65673,808.165042,1
1,S1,P2,624.790339,611.261195,1
2,S1,P3,756.378472,744.270484,1
3,S2,P1,1156.893241,1143.03392,1
4,S2,P2,639.325926,624.286269,1
5,S2,P3,875.88912,881.544909,1
6,S3,P1,1165.434503,1173.979302,1
7,S3,P2,670.375934,658.97148,1
8,S3,P3,928.827694,908.433269,1


In [26]:
# align feature columns (same one-hots as training)
X_next_num = sp_index[["sales_lag_1","sales_avg_4","weekofyear"]]
X_next_cat = pd.get_dummies(sp_index[["store","product"]], drop_first=False)

# ensure the columns match training dummy columns (add missing, order same)
for col in X_cat.columns:
    if col not in X_next_cat.columns:
        X_next_cat[col] = 0
X_next_cat = X_next_cat[X_cat.columns]

X_next = pd.concat([X_next_num, X_next_cat], axis=1)

In [27]:
X_next

Unnamed: 0,sales_lag_1,sales_avg_4,weekofyear,store_S1,store_S2,store_S3,product_P1,product_P2,product_P3
0,807.65673,808.165042,1,True,False,False,True,False,False
1,624.790339,611.261195,1,True,False,False,False,True,False
2,756.378472,744.270484,1,True,False,False,False,False,True
3,1156.893241,1143.03392,1,False,True,False,True,False,False
4,639.325926,624.286269,1,False,True,False,False,True,False
5,875.88912,881.544909,1,False,True,False,False,False,True
6,1165.434503,1173.979302,1,False,False,True,True,False,False
7,670.375934,658.97148,1,False,False,True,False,True,False
8,928.827694,908.433269,1,False,False,True,False,False,True


In [28]:
sp_index["pred_sales_next_week"] = xgb.predict(X_next)
sp_index = sp_index.sort_values(["store","product"]).reset_index(drop=True)

print("\nNext-week forecast (store, product):")
print(sp_index.head(10))


Next-week forecast (store, product):
  store product  sales_lag_1  sales_avg_4  weekofyear  pred_sales_next_week
0    S1      P1   807.656730   808.165042           1            820.225891
1    S1      P2   624.790339   611.261195           1            604.597290
2    S1      P3   756.378472   744.270484           1            753.178772
3    S2      P1  1156.893241  1143.033920           1           1155.732788
4    S2      P2   639.325926   624.286269           1            627.167480
5    S2      P3   875.889120   881.544909           1            859.397583
6    S3      P1  1165.434503  1173.979302           1           1171.335083
7    S3      P2   670.375934   658.971480           1            650.643188
8    S3      P3   928.827694   908.433269           1            890.633423
