In [1]:
# Cell 1: Imports & basic setup

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt

pd.set_option("display.max_columns", 50)

In [2]:
# Cell 2: Load data

# Assumes `sales_pred_case.csv` is in the same directory as the notebook
data_path = "data/sales_pred_case.csv"
df = pd.read_csv(data_path)

print("Shape:", df.shape)
df.head()


Shape: (143273, 20)


Unnamed: 0,Key,YearWeek,Sales,Material,Customer,CustomerGroup,Category,Week,Month,Qtr,New_Year,Christmas_Day,Easter_Monday,Other_Holidays,DiscountedPrice,PromoShipment,Objective1,Objective2,PromoMethod,PromoStatus
0,0_25,2020-03,2.0,0,25,13,0,3,1,1,0,0,0,0,5.92,0,7,3,8,7
1,0_25,2020-04,0.0,0,25,13,0,4,1,1,0,0,0,0,0.0,0,7,3,8,7
2,0_25,2020-05,0.0,0,25,13,0,5,2,1,0,0,0,0,0.0,0,7,3,8,7
3,0_25,2020-06,0.0,0,25,13,0,6,2,1,0,0,0,0,0.0,0,7,3,8,7
4,0_25,2020-07,0.0,0,25,13,0,7,2,1,0,0,0,0,0.0,0,7,3,8,7


In [3]:
# Cell 3: Basic EDA and sanity checks

print("Columns:", df.columns.tolist())
print("\nDtypes:\n", df.dtypes)
print("\nYearWeek range:", df["YearWeek"].min(), "->", df["YearWeek"].max())
print("\nNumber of unique Keys:", df["Key"].nunique())

print("\nMissing values per column:")
print(df.isna().sum())

# Basic target statistics
print("\nSales description:")
print(df["Sales"].describe())

Columns: ['Key', 'YearWeek', 'Sales', 'Material', 'Customer', 'CustomerGroup', 'Category', 'Week', 'Month', 'Qtr', 'New_Year', 'Christmas_Day', 'Easter_Monday', 'Other_Holidays', 'DiscountedPrice', 'PromoShipment', 'Objective1', 'Objective2', 'PromoMethod', 'PromoStatus']

Dtypes:
 Key                 object
YearWeek            object
Sales              float64
Material             int64
Customer             int64
CustomerGroup        int64
Category             int64
Week                 int64
Month                int64
Qtr                  int64
New_Year             int64
Christmas_Day        int64
Easter_Monday        int64
Other_Holidays       int64
DiscountedPrice    float64
PromoShipment        int64
Objective1           int64
Objective2           int64
PromoMethod          int64
PromoStatus          int64
dtype: object

YearWeek range: 2020-01 -> 2023-03

Number of unique Keys: 970

Missing values per column:
Key                0
YearWeek           0
Sales              0
Material

In [4]:
# Cell 4: Helper functions for metrics (WMAPE, Bias)

def wmape(y_true, y_pred):
    """
    Weighted MAPE as defined in the problem:
    Accuracy = (1 - SUM(|Sales - Prediction|) / SUM(Sales))
    WMAPE itself is SUM(|Sales - Prediction|) / SUM(Sales)
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    denom = np.sum(np.abs(y_true))
    if denom == 0:
        return np.nan
    return np.sum(np.abs(y_true - y_pred)) / denom


def accuracy_from_wmape(y_true, y_pred):
    """
    Accuracy = 1 - WMAPE
    """
    return 1.0 - wmape(y_true, y_pred)


def bias_metric(y_true, y_pred):
    """
    Bias = (SUM(Sales) / SUM(Prediction) - 1)
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    num = np.sum(y_true)
    denom = np.sum(y_pred)
    if denom == 0:
        return np.nan
    return num / denom - 1.0



In [5]:
# Cell 5: Train/validation split based on YearWeek

# The assignment says we can use data on or before "2022-45" for training/validation,
# and we must forecast weeks from "2022-46" to "2023-02".

CUTOFF_TRAIN_MAX = "2022-45"
FORECAST_START = "2022-46"
FORECAST_END = "2023-02"

# It's convenient to treat YearWeek as string here because it's already in YYYY-WW format;
# for ordering, lexicographic sorting works for this range.

# Split train/validation and future test (forecast horizon)
df_train_full = df[df["YearWeek"] <= CUTOFF_TRAIN_MAX].copy()
df_forecast = df[(df["YearWeek"] >= FORECAST_START) & (df["YearWeek"] <= FORECAST_END)].copy()

print("Train/validation data shape:", df_train_full.shape)
print("Forecast horizon data shape:", df_forecast.shape)

# For model selection & hyperparameter tuning, we can create a time-based split
# by reserving the last few weeks before CUTOFF_TRAIN_MAX as a validation set.
# Example: treat the last 4 weeks in training as validation.

# Find the last N distinct YearWeeks in training set
N_VAL_WEEKS = 4
unique_weeks_train = sorted(df_train_full["YearWeek"].unique())
val_weeks = unique_weeks_train[-N_VAL_WEEKS:]
print("Validation weeks:", val_weeks)

train_mask = ~df_train_full["YearWeek"].isin(val_weeks)
val_mask = df_train_full["YearWeek"].isin(val_weeks)

df_train = df_train_full[train_mask].copy()
df_val = df_train_full[val_mask].copy()

print("Train shape:", df_train.shape, "Val shape:", df_val.shape)



Train/validation data shape: (133573, 20)
Forecast horizon data shape: (8730, 20)
Validation weeks: ['2022-42', '2022-43', '2022-44', '2022-45']
Train shape: (129693, 20) Val shape: (3880, 20)


In [6]:
# Cell 6: Feature definitions and preprocessing

# Target
TARGET_COL = "Sales"

# Features as per data dictionary
# Categorical features
cat_features = [
    "Key",
    "Material",
    "Customer",
    "CustomerGroup",
    "Category",
    "PromoShipment",
    "Objective1",
    "Objective2",
    "PromoMethod",
    "PromoStatus",
]

# Numeric features (including time/holiday and promotion numeric fields)
num_features = [
    "Week",
    "Month",
    "Qtr",
    "New_Year",
    "Christmas_Day",
    "Easter_Monday",
    "Other_Holidays",
    "DiscountedPrice",
]

# (Optional) You could also generate extra features such as:
# - Log-transformed price
# - Mean/median sales by Key
# - Lag features
# For this assignment-friendly baseline, we keep it simple and fully explainable.

# Prepare data matrices
X_train = df_train[cat_features + num_features]
y_train = df_train[TARGET_COL].values

X_val = df_val[cat_features + num_features]
y_val = df_val[TARGET_COL].values

X_full = df_train_full[cat_features + num_features]
y_full = df_train_full[TARGET_COL].values

X_forecast = df_forecast[cat_features + num_features]

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_forecast shape:", X_forecast.shape)



X_train shape: (129693, 18)
X_val shape: (3880, 18)
X_forecast shape: (8730, 18)


In [7]:
# Cell 7: Build the preprocessing + model pipeline

# OrdinalEncoder works well with tree-based models and is efficient for many categories.
categorical_transformer = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, cat_features),
        ("num", "passthrough", num_features),
    ]
)

# Model choice:
# HistGradientBoostingRegressor is a fast, regularized gradient boosting tree model
# available in scikit-learn, suitable for mixed numeric/categorical (with encoding),
# robust to non-linearities and interactions, and handles large tabular data well.

model = HistGradientBoostingRegressor(
    loss="absolute_error",  # L1 loss is robust to outliers and is aligned with MAE/WMAPE-like metrics
    max_depth=None,
    learning_rate=0.05,
    max_iter=300,
    min_samples_leaf=20,
    l2_regularization=0.0,
    random_state=42,
)

pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", model),
    ]
)

pipeline



In [8]:
# Cell 8: Fit on train, evaluate on validation

pipeline.fit(X_train, y_train)

# Validation predictions
val_pred = pipeline.predict(X_val)

val_mae = mean_absolute_error(y_val, val_pred)
val_wmape = wmape(y_val, val_pred)
val_accuracy = accuracy_from_wmape(y_val, val_pred)
val_bias = bias_metric(y_val, val_pred)

print("Validation MAE:", val_mae)
print("Validation WMAPE:", val_wmape)
print("Validation Accuracy (1 - WMAPE):", val_accuracy)
print("Validation Bias:", val_bias)



Validation MAE: 263.5936073144453
Validation WMAPE: 0.9981546546181835
Validation Accuracy (1 - WMAPE): 0.0018453453818164656
Validation Bias: 301.87362254712815


In [9]:
# Cell 9: (Optional) simple error analysis on validation

df_val_eval = df_val.copy()
df_val_eval["Pred"] = val_pred
df_val_eval["AbsError"] = (df_val_eval["Sales"] - df_val_eval["Pred"]).abs()

print("Top 10 rows with highest absolute error on validation:")
df_val_eval.sort_values("AbsError", ascending=False).head(10)

# You can also group by Key to see which combinations are hardest:
key_errors = (
    df_val_eval.groupby("Key")
    .agg(
        total_sales=("Sales", "sum"),
        total_abs_error=("AbsError", "sum"),
    )
    .assign(
        wmape=lambda d: d["total_abs_error"] / d["total_sales"].replace(0, np.nan)
    )
    .sort_values("wmape", ascending=False)
)

print("\nPer-Key WMAPE (top 10 hardest Keys):")
key_errors.head(10)



Top 10 rows with highest absolute error on validation:

Per-Key WMAPE (top 10 hardest Keys):


Unnamed: 0_level_0,total_sales,total_abs_error,wmape
Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
155_31,62.0,62.0,1.0
1_27,1076.0,1076.0,1.0
18_12,832.0,832.0,1.0
18_24,8.0,8.0,1.0
19_13,80.0,80.0,1.0
19_16,800.0,800.0,1.0
19_17,440.0,440.0,1.0
1_12,200.0,200.0,1.0
1_13,1444.0,1444.0,1.0
1_17,800.0,800.0,1.0


In [10]:
# Cell 10: Refit on full data up to 2022-45 and forecast 2022-46 to 2023-02

pipeline.fit(X_full, y_full)

forecast_pred = pipeline.predict(X_forecast)

df_forecast_out = df_forecast.copy()
df_forecast_out["Prediction"] = forecast_pred

print("Forecast sample:")
df_forecast_out.head()

# If the ground-truth Sales are also present in df_forecast (which they may be),
# we can compute held-out accuracy metrics on the forecast horizon as well.

if "Sales" in df_forecast_out.columns:
    y_true_forecast = df_forecast_out["Sales"].values
    y_pred_forecast = df_forecast_out["Prediction"].values

    forecast_mae = mean_absolute_error(y_true_forecast, y_pred_forecast)
    forecast_wmape = wmape(y_true_forecast, y_pred_forecast)
    forecast_accuracy = accuracy_from_wmape(y_true_forecast, y_pred_forecast)
    forecast_bias = bias_metric(y_true_forecast, y_pred_forecast)

    print("\n=== Forecast Horizon Metrics (2022-46 to 2023-02) ===")
    print("MAE:", forecast_mae)
    print("WMAPE:", forecast_wmape)
    print("Accuracy (1 - WMAPE):", forecast_accuracy)
    print("Bias:", forecast_bias)



Forecast sample:

=== Forecast Horizon Metrics (2022-46 to 2023-02) ===
MAE: 0.6401745314343326
WMAPE: nan
Accuracy (1 - WMAPE): nan
Bias: -1.0
