## Programing – Hands-On Assignment

In [70]:
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [71]:
# Load Dataset
df = pd.read_csv("data/sales_pred_case.csv")
print(df.shape)
df.head()

(143273, 20)


Unnamed: 0,Key,YearWeek,Sales,Material,Customer,CustomerGroup,Category,Week,Month,Qtr,New_Year,Christmas_Day,Easter_Monday,Other_Holidays,DiscountedPrice,PromoShipment,Objective1,Objective2,PromoMethod,PromoStatus
0,0_25,2020-03,2.0,0,25,13,0,3,1,1,0,0,0,0,5.92,0,7,3,8,7
1,0_25,2020-04,0.0,0,25,13,0,4,1,1,0,0,0,0,0.0,0,7,3,8,7
2,0_25,2020-05,0.0,0,25,13,0,5,2,1,0,0,0,0,0.0,0,7,3,8,7
3,0_25,2020-06,0.0,0,25,13,0,6,2,1,0,0,0,0,0.0,0,7,3,8,7
4,0_25,2020-07,0.0,0,25,13,0,7,2,1,0,0,0,0,0.0,0,7,3,8,7


### Exploratory Data Analysis

In [73]:
df.columns

Index(['Key', 'YearWeek', 'Sales', 'Material', 'Customer', 'CustomerGroup', 'Category', 'Week', 'Month', 'Qtr', 'New_Year', 'Christmas_Day', 'Easter_Monday', 'Other_Holidays', 'DiscountedPrice', 'PromoShipment', 'Objective1', 'Objective2', 'PromoMethod', 'PromoStatus'], dtype='object')

In [74]:
df.dtypes

Key                 object
YearWeek            object
Sales              float64
Material             int64
Customer             int64
CustomerGroup        int64
Category             int64
Week                 int64
Month                int64
Qtr                  int64
New_Year             int64
Christmas_Day        int64
Easter_Monday        int64
Other_Holidays       int64
DiscountedPrice    float64
PromoShipment        int64
Objective1           int64
Objective2           int64
PromoMethod          int64
PromoStatus          int64
dtype: object

In [75]:
# Missing values if any.
df.isna().sum()

Key                0
YearWeek           0
Sales              0
Material           0
Customer           0
CustomerGroup      0
Category           0
Week               0
Month              0
Qtr                0
New_Year           0
Christmas_Day      0
Easter_Monday      0
Other_Holidays     0
DiscountedPrice    0
PromoShipment      0
Objective1         0
Objective2         0
PromoMethod        0
PromoStatus        0
dtype: int64

### The above result confirmed that we do not have any missing value present in the dataset.

In [76]:
df["Key"].nunique()

970

In [78]:
df["YearWeek"].min(), df["YearWeek"].max()

('2020-01', '2023-03')

In [79]:
# Lets sort the data by key and yearweek.
df = df.sort_values(["Key", "YearWeek"]).reset_index(drop=True)

In [80]:
# Sales distribution summary using stats for sales.
df["Sales"].describe()

count    143273.000000
mean        226.232961
std         640.523581
min           0.000000
25%           0.000000
50%           0.000000
75%         160.000000
max       21450.000000
Name: Sales, dtype: float64

- The above stats shows 25, and 50 percentile is zero. This means 50% of all sales records are zero.
- Quick galance, models will biased towards predicting zeros (highly imbalance data).
- WMAPE will be higher because of denominator factor.

In [81]:
# Rows per key distribution.
df.groupby("Key")["YearWeek"].count().describe()

count    970.000000
mean     147.704124
std       21.352902
min       77.000000
25%      150.000000
50%      158.000000
75%      159.000000
max      160.000000
Name: YearWeek, dtype: float64

- Each Key (Material–Customer pair) contains about 150 weeks of data, with most keys tightly grouped between 150 and 160 weeks. This indicates:
	- Consistent time coverage for almost all keys, with no major gaps.
	- A few keys have shorter histories (minimum ~77 weeks), which may limit seasonal patterns for those specific pairs.
	- The overall dataset is balanced across keys, making it well-suited for a global model rather than separate models per key.

In [None]:
# Convert YearWeek → datetime
# Convert "YYYY-WW" to datetime (ISO week format)
df["YearWeek_dt"] = pd.to_datetime(df["YearWeek"] + "-1", format="%Y-%W-%w")

# Create an integer time index for modeling (global ordering)
df["TimeIndex"] = df["YearWeek_dt"].rank(method="dense").astype(int)

# Sort again with new ordering
df = df.sort_values(["Key", "TimeIndex"]).reset_index(drop=True)

# Check
df[["YearWeek", "YearWeek_dt", "TimeIndex"]].head(10)

Unnamed: 0,YearWeek,YearWeek_dt,TimeIndex
0,2020-03,2020-01-20,3
1,2020-04,2020-01-27,4
2,2020-05,2020-02-03,5
3,2020-06,2020-02-10,6
4,2020-07,2020-02-17,7
5,2020-08,2020-02-24,8
6,2020-09,2020-03-02,9
7,2020-10,2020-03-09,10
8,2020-11,2020-03-16,11
9,2020-12,2020-03-23,12


In [34]:
# Step 4: Feature Engineering - Lags + Rolling Stats

lags = [1, 2, 3, 4, 7, 13, 26, 52]

for lag in lags:
    df[f"lag_{lag}"] = df.groupby("Key")["Sales"].shift(lag)

# Rolling features
df["rolling_mean_4"] = df.groupby("Key")["Sales"].shift(1).rolling(window=4).mean()
df["rolling_mean_8"] = df.groupby("Key")["Sales"].shift(1).rolling(window=8).mean()
df["rolling_std_4"]  = df.groupby("Key")["Sales"].shift(1).rolling(window=4).std()

# After feature creation, drop rows where lags cannot exist
min_lag = max(lags + [8])  # ensures rolling windows safe
df = df[df["TimeIndex"] > min_lag].reset_index(drop=True)

# Check result
df.head(10)

Unnamed: 0,Key,YearWeek,Sales,Material,Customer,CustomerGroup,Category,Week,Month,Qtr,New_Year,Christmas_Day,Easter_Monday,Other_Holidays,DiscountedPrice,PromoShipment,Objective1,Objective2,PromoMethod,PromoStatus,YearWeek_dt,TimeIndex,lag_1,lag_2,lag_3,lag_4,lag_7,lag_13,lag_26,lag_52,rolling_mean_4,rolling_mean_8,rolling_std_4
0,0_25,2020-53,0.0,0,25,13,0,53,12,4,0,1,0,1,0.0,0,7,3,8,7,2021-01-04,53,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,0.25,0.25,0.5
1,0_25,2021-01,0.0,0,25,13,0,1,1,1,1,0,0,0,0.0,0,7,3,8,7,2021-01-04,53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.25,0.0
2,0_25,2021-02,0.0,0,25,13,0,2,1,1,0,0,0,0,0.0,0,7,3,8,7,2021-01-11,54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.125,0.0
3,0_25,2021-03,0.0,0,25,13,0,3,1,1,0,0,0,0,0.0,0,7,3,8,7,2021-01-18,55,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.125,0.0
4,0_25,2021-04,0.0,0,25,13,0,4,1,1,0,0,0,0,0.0,0,7,3,8,7,2021-01-25,56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0
5,0_25,2021-05,0.0,0,25,13,0,5,2,1,0,0,0,0,0.0,0,7,3,8,7,2021-02-01,57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0_25,2021-06,0.0,0,25,13,0,6,2,1,0,0,0,0,0.0,0,7,3,8,7,2021-02-08,58,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,0_25,2021-07,0.0,0,25,13,0,7,2,1,0,0,0,1,0.0,0,7,3,8,7,2021-02-15,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0_25,2021-08,0.0,0,25,13,0,8,2,1,0,0,0,0,0.0,0,7,3,8,7,2021-02-22,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0_25,2021-09,0.0,0,25,13,0,9,3,1,0,0,0,0,0.0,0,7,3,8,7,2021-03-01,61,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0


In [35]:
# Step 5: Label Encoding for categorical string columns

from sklearn.preprocessing import LabelEncoder

cat_cols = ["Key"]  # Only true string categorical column

le_dict = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le  # Store encoders for later use if needed

print("Label encoding completed.")
df[["Key"]].head()

Label encoding completed.


Unnamed: 0,Key
0,0
1,0
2,0
3,0
4,0


In [36]:
# Step 6: Time-based split

# First convert YearWeek to comparable period index (YYYYWW as int)
df["YW_int"] = df["YearWeek"].str.replace("-", "").astype(int)

# Split conditions
train_end = 202239
val_start = 202240
val_end   = 202245
test_start = 202246   # forecast start
test_end   = 202302   # forecast end

train_df = df[df["YW_int"] <= train_end].copy()
val_df   = df[(df["YW_int"] >= val_start) & (df["YW_int"] <= val_end)].copy()
test_df  = df[(df["YW_int"] >= test_start) & (df["YW_int"] <= test_end)].copy()

print("Train size:", train_df.shape)
print("Validation size:", val_df.shape)
print("Test (forecast) rows:", test_df.shape)

# Quick sanity checks
print("\nTrain max YearWeek:", train_df["YearWeek"].max())
print("Validation min→max:", val_df["YearWeek"].min(), "→", val_df["YearWeek"].max())
print("Forecast horizon:", test_df["YearWeek"].min(), "→", test_df["YearWeek"].max())

Train size: (87616, 34)
Validation size: (5820, 34)
Test (forecast) rows: (8730, 34)

Train max YearWeek: 2022-39
Validation min→max: 2022-40 → 2022-45
Forecast horizon: 2022-46 → 2023-02


In [37]:
# Step 7: Feature selection

# Identify all lag and rolling features
lag_cols = [c for c in df.columns if c.startswith("lag_")]
roll_cols = [c for c in df.columns if c.startswith("rolling_")]

base_cols = [
    "Key", "Material", "Customer", "CustomerGroup", "Category",
    "Week", "Month", "Qtr",
    "New_Year", "Christmas_Day", "Easter_Monday", "Other_Holidays",
    "DiscountedPrice", "PromoShipment",
    "Objective1", "Objective2", "PromoMethod", "PromoStatus",
    "TimeIndex"
]

feature_cols = base_cols + lag_cols + roll_cols

print("Total features:", len(feature_cols))
feature_cols

Total features: 30


['Key',
 'Material',
 'Customer',
 'CustomerGroup',
 'Category',
 'Week',
 'Month',
 'Qtr',
 'New_Year',
 'Christmas_Day',
 'Easter_Monday',
 'Other_Holidays',
 'DiscountedPrice',
 'PromoShipment',
 'Objective1',
 'Objective2',
 'PromoMethod',
 'PromoStatus',
 'TimeIndex',
 'lag_1',
 'lag_2',
 'lag_3',
 'lag_4',
 'lag_7',
 'lag_13',
 'lag_26',
 'lag_52',
 'rolling_mean_4',
 'rolling_mean_8',
 'rolling_std_4']

In [38]:
X_train = train_df[feature_cols]
y_train = train_df["Sales"]

X_val = val_df[feature_cols]
y_val = val_df["Sales"]

In [40]:
import lightgbm as lgb

lgbm_model = lgb.LGBMRegressor(
    boosting_type="gbdt",
    objective="regression",
    num_leaves=255,
    learning_rate=0.05,
    n_estimators=500,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    random_state=42,
    verbose=-1
)

# LightGBM >= 4.0 requires using callbacks for early stopping
callbacks = [
    lgb.early_stopping(stopping_rounds=50, verbose=False),
    lgb.log_evaluation(period=50)
]

lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="l1",
    callbacks=callbacks
)

[50]	valid_0's l1: 195.279	valid_0's l2: 265803


In [42]:
val_pred = lgbm_model.predict(X_val)

wmape_val = np.sum(np.abs(y_val - val_pred)) / np.sum(y_val)
accuracy_val = 1 - wmape_val
bias_val = (np.sum(y_val) / np.sum(val_pred)) - 1

print("Validation WMAPE:", wmape_val)
print("Validation Accuracy:", accuracy_val)
print("Validation Bias:", bias_val)

Validation WMAPE: 0.7393882476007272
Validation Accuracy: 0.2606117523992728
Validation Bias: 0.023501200887705354


In [43]:
# Step 8: Prepare a copy for recursive forecasting
forecast_df = df.copy()

In [44]:
# Determine the future YearWeek sequence for forecast horizon
future_weeks = sorted(test_df["YearWeek"].unique())

print("Forecast horizon weeks:", future_weeks)

Forecast horizon weeks: ['2022-46', '2022-47', '2022-48', '2022-49', '2022-50', '2022-51', '2022-52', '2023-01', '2023-02']


In [45]:
predictions = []

for target_week in future_weeks:
    print("Predicting:", target_week)

    # Extract rows for this specific target week
    step_df = forecast_df[forecast_df["YearWeek"] == target_week].copy()

    # Build features for this step only
    step_X = step_df[feature_cols]

    # Predict
    step_df["Pred"] = lgbm_model.predict(step_X)

    # Store predictions
    predictions.append(step_df[["Key", "YearWeek", "Sales", "Pred"]])

    # Inject these predictions into forecast_df as if they were actual Sales
    # This is required to update lag features for the next step
    forecast_df.loc[step_df.index, "Sales"] = step_df["Pred"]

    # Regenerate lag features for all future steps
    for lag in lags:
        forecast_df[f"lag_{lag}"] = forecast_df.groupby("Key")["Sales"].shift(lag)

    forecast_df["rolling_mean_4"] = forecast_df.groupby("Key")["Sales"].shift(1).rolling(4).mean()
    forecast_df["rolling_mean_8"] = forecast_df.groupby("Key")["Sales"].shift(1).rolling(8).mean()
    forecast_df["rolling_std_4"]  = forecast_df.groupby("Key")["Sales"].shift(1).rolling(4).std()

# Combine predictions
final_preds = pd.concat(predictions).reset_index(drop=True)

Predicting: 2022-46
Predicting: 2022-47
Predicting: 2022-48
Predicting: 2022-49
Predicting: 2022-50
Predicting: 2022-51
Predicting: 2022-52
Predicting: 2023-01
Predicting: 2023-02


In [46]:
abs_err = np.abs(final_preds["Sales"] - final_preds["Pred"])
wmape_final = abs_err.sum() / final_preds["Sales"].sum()
accuracy_final = 1 - wmape_final
bias_final = (final_preds["Sales"].sum() / final_preds["Pred"].sum()) - 1

print("FINAL WMAPE:", wmape_final)
print("FINAL Accuracy:", accuracy_final)
print("FINAL Bias:", bias_final)

FINAL WMAPE: inf
FINAL Accuracy: -inf
FINAL Bias: -1.0


In [47]:
print("Sales sum in test_df:", test_df["Sales"].sum())
print(test_df.groupby("YearWeek")["Sales"].sum())

Sales sum in test_df: 0.0
YearWeek
2022-46    0.0
2022-47    0.0
2022-48    0.0
2022-49    0.0
2022-50    0.0
2022-51    0.0
2022-52    0.0
2023-01    0.0
2023-02    0.0
Name: Sales, dtype: float64


In [48]:
test_df["Sales"].describe()

count    8730.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: Sales, dtype: float64

In [49]:
# Identify keys that actually have non-zero sales in the forecast window
horizon_df = df[(df["YW_int"] >= test_start) & (df["YW_int"] <= test_end)]

valid_keys = horizon_df.groupby("Key")["Sales"].sum()
valid_keys = valid_keys[valid_keys > 0].index.tolist()

print("Total keys with real sales in forecast horizon:", len(valid_keys))
valid_keys[:15]  # preview

Total keys with real sales in forecast horizon: 0


[]

In [50]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    tree_method="hist"
)

xgb_model.fit(X_train, y_train)

val_pred_xgb = xgb_model.predict(X_val)

wmape_xgb = np.sum(np.abs(y_val - val_pred_xgb)) / np.sum(y_val)
accuracy_xgb = 1 - wmape_xgb
bias_xgb = (np.sum(y_val) / np.sum(val_pred_xgb)) - 1

print("XGBoost Validation WMAPE:", wmape_xgb)
print ("XGBoost Validation Accuracy:", accuracy_xgb)
print("XGBoost Validation Bias:", bias_xgb)

XGBoost Validation WMAPE: 0.7164333815824362
XGBoost Validation Accuracy: 0.2835666184175638
XGBoost Validation Bias: 0.06749629538736324


In [51]:
from catboost import CatBoostRegressor

cb_model = CatBoostRegressor(
    iterations=500,
    depth=8,
    learning_rate=0.05,
    loss_function="MAE",
    random_seed=42,
    verbose=False
)

cb_model.fit(X_train, y_train)

val_pred_cb = cb_model.predict(X_val)

wmape_cb = np.sum(np.abs(y_val - val_pred_cb)) / np.sum(y_val)
accuracy_cb = 1 - wmape_cb
bias_cb = (np.sum(y_val) / np.sum(val_pred_cb)) - 1

print("CatBoost Validation WMAPE:", wmape_cb)
print ("CatBoost Validation Accuracy:", accuracy_cb)
print("CatBoost Validation Bias:", bias_cb)

CatBoost Validation WMAPE: 0.6319652748894812
CatBoost Validation Accuracy: 0.3680347251105188
CatBoost Validation Bias: 0.33893658623989564


In [52]:
results = pd.DataFrame({
    "Model": ["LightGBM", "XGBoost", "CatBoost"],
    "WMAPE": [wmape_val, wmape_xgb, wmape_cb],
    "Accuracy": [accuracy_val, accuracy_xgb, accuracy_cb],
    "Bias": [bias_val, bias_xgb, bias_cb]
})

results

Unnamed: 0,Model,WMAPE,Accuracy,Bias
0,LightGBM,0.739388,0.260612,0.023501
1,XGBoost,0.716433,0.283567,0.067496
2,CatBoost,0.631965,0.368035,0.338937


### Improvement

In [54]:
df["Sales_log"] = np.log1p(df["Sales"])

train_df["Sales_log"] = np.log1p(train_df["Sales"])
val_df["Sales_log"] = np.log1p(val_df["Sales"])
test_df["Sales_log"] = np.log1p(test_df["Sales"])

print("Log-transform complete.")

Log-transform complete.


In [55]:
X_train = train_df[feature_cols]
y_train_log = train_df["Sales_log"]

X_val = val_df[feature_cols]
y_val = val_df["Sales"]         # real scale for evaluation
y_val_log = val_df["Sales_log"] # log scale for training feedback

In [56]:
lgbm_log = lgb.LGBMRegressor(
    boosting_type="gbdt",
    objective="regression",
    num_leaves=255,
    learning_rate=0.05,
    n_estimators=500,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    random_state=42,
    verbose=-1
)

callbacks = [
    lgb.early_stopping(stopping_rounds=50, verbose=False),
    lgb.log_evaluation(period=50)
]

lgbm_log.fit(
    X_train, y_train_log,
    eval_set=[(X_val, y_val_log)],
    eval_metric="l2",
    callbacks=callbacks
)

# Predict and invert
val_pred_log = lgbm_log.predict(X_val)
val_pred = np.expm1(val_pred_log)

wmape_lgbm = np.sum(np.abs(y_val - val_pred)) / np.sum(y_val)
bias_lgbm = (np.sum(y_val) / np.sum(val_pred)) - 1

print("LightGBM (log1p) WMAPE:", wmape_lgbm)
print("LightGBM (log1p) Bias:", bias_lgbm)

[50]	valid_0's l2: 3.38615
[100]	valid_0's l2: 3.43893
LightGBM (log1p) WMAPE: 0.7539533266964737
LightGBM (log1p) Bias: 2.0157327388665385


In [57]:
xgb_log = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    tree_method="hist"
)

xgb_log.fit(X_train, y_train_log)

val_pred_xgb_log = xgb_log.predict(X_val)
val_pred_xgb = np.expm1(val_pred_xgb_log)

wmape_xgb = np.sum(np.abs(y_val - val_pred_xgb)) / np.sum(y_val)
bias_xgb = (np.sum(y_val) / np.sum(val_pred_xgb)) - 1

print("XGBoost (log1p) WMAPE:", wmape_xgb)
print("XGBoost (log1p) Bias:", bias_xgb)

XGBoost (log1p) WMAPE: 0.7445672398240921
XGBoost (log1p) Bias: 1.2661116433340762


In [58]:
cb_log = CatBoostRegressor(
    iterations=500,
    depth=8,
    learning_rate=0.05,
    loss_function="RMSE",
    random_seed=42,
    verbose=False
)

cb_log.fit(X_train, y_train_log)

val_pred_cb_log = cb_log.predict(X_val)
val_pred_cb = np.expm1(val_pred_cb_log)

wmape_cb = np.sum(np.abs(y_val - val_pred_cb)) / np.sum(y_val)
bias_cb = (np.sum(y_val) / np.sum(val_pred_cb)) - 1

print("CatBoost (log1p) WMAPE:", wmape_cb)
print("CatBoost (log1p) Bias:", bias_cb)

CatBoost (log1p) WMAPE: 0.7100783838323692
CatBoost (log1p) Bias: 1.3363044899775214


In [59]:
results_log = pd.DataFrame({
    "Model": ["LightGBM_log1p", "XGBoost_log1p", "CatBoost_log1p"],
    "WMAPE": [wmape_lgbm, wmape_xgb, wmape_cb],
    "Bias": [bias_lgbm, bias_xgb, bias_cb]
})

results_log

Unnamed: 0,Model,WMAPE,Bias
0,LightGBM_log1p,0.753953,2.015733
1,XGBoost_log1p,0.744567,1.266112
2,CatBoost_log1p,0.710078,1.336304


### Let’s attempt 2-stage model.

In [60]:
# Stage 1 target
train_df["Sales_binary"] = (train_df["Sales"] > 0).astype(int)
val_df["Sales_binary"]   = (val_df["Sales"] > 0).astype(int)

In [61]:
X_train_cls = train_df[feature_cols]
y_train_cls = train_df["Sales_binary"]

X_val_cls = val_df[feature_cols]
y_val_cls = val_df["Sales_binary"]

In [62]:
train_df_nonzero = train_df[train_df["Sales"] > 0].copy()
val_df_nonzero   = val_df[val_df["Sales"] > 0].copy()

X_train_reg = train_df_nonzero[feature_cols]
y_train_reg = train_df_nonzero["Sales"]

X_val_reg = val_df_nonzero[feature_cols]
y_val_reg = val_df_nonzero["Sales"]

In [64]:
clf_model = lgb.LGBMClassifier(
    boosting_type="gbdt",
    num_leaves=255,
    learning_rate=0.05,
    n_estimators=500,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    random_state=42
)

callbacks_cls = [
    lgb.early_stopping(stopping_rounds=50, verbose=False),
    lgb.log_evaluation(period=50)
]

clf_model.fit(
    X_train_cls, y_train_cls,
    eval_set=[(X_val_cls, y_val_cls)],
    eval_metric="binary_logloss",
    callbacks=callbacks_cls
)

[LightGBM] [Info] Number of positive: 40482, number of negative: 47134
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3753
[LightGBM] [Info] Number of data points in the train set: 87616, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.462039 -> initscore=-0.152137
[LightGBM] [Info] Start training from score -0.152137
[50]	valid_0's binary_logloss: 0.332548


In [65]:
reg_model = lgb.LGBMRegressor(
    boosting_type="gbdt",
    num_leaves=255,
    learning_rate=0.05,
    n_estimators=500,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    random_state=42
)

callbacks_reg = [
    lgb.early_stopping(stopping_rounds=50, verbose=False),
    lgb.log_evaluation(period=50)
]

reg_model.fit(
    X_train_reg, y_train_reg,
    eval_set=[(X_val_reg, y_val_reg)],
    eval_metric="l1",
    callbacks=callbacks_reg
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3751
[LightGBM] [Info] Number of data points in the train set: 40482, number of used features: 29
[LightGBM] [Info] Start training from score 518.181266
[50]	valid_0's l1: 301.744	valid_0's l2: 489176
[100]	valid_0's l1: 296.329	valid_0's l2: 501098


In [66]:
# Stage 1: Probability Sales > 0
val_prob = clf_model.predict_proba(X_val_cls)[:, 1]

# Stage 2: Regression prediction (unconditional)
val_pred_reg = reg_model.predict(X_val_cls)

# Combined prediction
val_final_pred = val_prob * val_pred_reg



In [67]:
y_val_true = val_df["Sales"]

wmape_2stage = np.sum(np.abs(y_val_true - val_final_pred)) / np.sum(y_val_true)
bias_2stage  = (np.sum(y_val_true) / np.sum(val_final_pred)) - 1

print("2-Stage Validation WMAPE:", wmape_2stage)
print("2-Stage Validation Bias:", bias_2stage)

2-Stage Validation WMAPE: 0.6946983636318159
2-Stage Validation Bias: 0.051996158584622476


In [68]:
# Rebuild the forecast horizon based on YearWeek integer window
future_weeks = sorted(
    df[(df["YW_int"] >= test_start) & (df["YW_int"] <= test_end)]["YearWeek"].unique()
)
print("Forecast horizon weeks:", future_weeks)

Forecast horizon weeks: ['2022-46', '2022-47', '2022-48', '2022-49', '2022-50', '2022-51', '2022-52', '2023-01', '2023-02']


In [69]:
# Make a mutable copy of df for recursive forecasting
forecast_df = df.copy()

predictions_2stage = []

for target_week in future_weeks:
    print("Predicting week:", target_week)
    
    # Select rows for this specific forecast week
    step_mask = forecast_df["YearWeek"] == target_week
    step_df = forecast_df[step_mask].copy()
    
    # Feature matrix for this week
    step_X = step_df[feature_cols]
    
    # Stage 1: probability of Sales > 0
    prob_pos = clf_model.predict_proba(step_X)[:, 1]
    
    # Stage 2: regression prediction (sales amount)
    reg_pred = reg_model.predict(step_X)
    
    # Combined prediction
    step_pred = prob_pos * reg_pred
    
    # Optional: clip to non-negative
    step_pred = np.clip(step_pred, a_min=0, a_max=None)
    
    step_df["Pred"] = step_pred
    
    # Store predictions for this week
    predictions_2stage.append(step_df[["Key", "YearWeek", "Pred"]])
    
    # Inject predictions into forecast_df as "Sales" for future lag calculations
    forecast_df.loc[step_mask, "Sales"] = step_pred
    
    # Recompute lag and rolling features for subsequent weeks
    for lag in lags:
        forecast_df[f"lag_{lag}"] = forecast_df.groupby("Key")["Sales"].shift(lag)
    
    forecast_df["rolling_mean_4"] = (
        forecast_df.groupby("Key")["Sales"].shift(1).rolling(window=4).mean()
    )
    forecast_df["rolling_mean_8"] = (
        forecast_df.groupby("Key")["Sales"].shift(1).rolling(window=8).mean()
    )
    forecast_df["rolling_std_4"] = (
        forecast_df.groupby("Key")["Sales"].shift(1).rolling(window=4).std()
    )

# Combine all weekly predictions
final_preds_2stage = pd.concat(predictions_2stage).reset_index(drop=True)

# Sort for neatness
final_preds_2stage = final_preds_2stage.sort_values(["Key", "YearWeek"]).reset_index(drop=True)

final_preds_2stage.head()

Predicting week: 2022-46
Predicting week: 2022-47
Predicting week: 2022-48
Predicting week: 2022-49
Predicting week: 2022-50
Predicting week: 2022-51
Predicting week: 2022-52
Predicting week: 2023-01
Predicting week: 2023-02


Unnamed: 0,Key,YearWeek,Pred
0,0,2022-46,3.021559
1,0,2022-47,2.984895
2,0,2022-48,3.094203
3,0,2022-49,3.203354
4,0,2022-50,4.049884
