In [10]:
# 1. Setup & Imports

import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error

import warnings
# Blanket filter: every warning category is silenced for the rest of the session.
warnings.filterwarnings(action='ignore')

# Notebook display settings: show every column, and allow wide rows before wrapping.
for option_name, option_value in {
    'display.max_columns': None,
    'display.width': 1000,
}.items():
    pd.set_option(option_name, option_value)

# Single seed reused for NumPy's global RNG here and for the model later on.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [11]:
# 2. Load Data

# Read the raw sales table from the project-relative CSV.
data = pd.read_csv("data/sales_pred_case.csv")

# Structural overview: size, first rows, then column types and null counts.
print("Loaded:", data.shape)
display(data.head())
for heading, summary in (
    ("\nDtypes:", data.dtypes),
    ("\nMissing values:", data.isna().sum()),
):
    print(heading)
    print(summary)

# Panel coverage: number of distinct Key values and the YearWeek span.
# YearWeek is a string column, so min/max compare lexicographically.
n_unique_keys = data['Key'].nunique()
first_yearweek, last_yearweek = data['YearWeek'].min(), data['YearWeek'].max()
print("\nUnique Keys:", n_unique_keys)
print("YearWeek range:", first_yearweek, "→", last_yearweek)

Loaded: (143273, 20)


Unnamed: 0,Key,YearWeek,Sales,Material,Customer,CustomerGroup,Category,Week,Month,Qtr,New_Year,Christmas_Day,Easter_Monday,Other_Holidays,DiscountedPrice,PromoShipment,Objective1,Objective2,PromoMethod,PromoStatus
0,0_25,2020-03,2.0,0,25,13,0,3,1,1,0,0,0,0,5.92,0,7,3,8,7
1,0_25,2020-04,0.0,0,25,13,0,4,1,1,0,0,0,0,0.0,0,7,3,8,7
2,0_25,2020-05,0.0,0,25,13,0,5,2,1,0,0,0,0,0.0,0,7,3,8,7
3,0_25,2020-06,0.0,0,25,13,0,6,2,1,0,0,0,0,0.0,0,7,3,8,7
4,0_25,2020-07,0.0,0,25,13,0,7,2,1,0,0,0,0,0.0,0,7,3,8,7



Dtypes:
Key                 object
YearWeek            object
Sales              float64
Material             int64
Customer             int64
CustomerGroup        int64
Category             int64
Week                 int64
Month                int64
Qtr                  int64
New_Year             int64
Christmas_Day        int64
Easter_Monday        int64
Other_Holidays       int64
DiscountedPrice    float64
PromoShipment        int64
Objective1           int64
Objective2           int64
PromoMethod          int64
PromoStatus          int64
dtype: object

Missing values:
Key                0
YearWeek           0
Sales              0
Material           0
Customer           0
CustomerGroup      0
Category           0
Week               0
Month              0
Qtr                0
New_Year           0
Christmas_Day      0
Easter_Monday      0
Other_Holidays     0
DiscountedPrice    0
PromoShipment      0
Objective1         0
Objective2         0
PromoMethod        0
PromoStatus        0


In [12]:
# 4. Feature Engineering (Global Regression)
#
# Adds three calendar columns derived from YearWeek ("YYYY-WW"), declares the
# candidate feature list, and checks that every listed column is present.

# Parse YearWeek "YYYY-WW" with a single split into year / week parts.
yearweek_parts = data['YearWeek'].str.split('-', expand=True)
data['Year'] = yearweek_parts[0].astype(int)
data['WeekOfYear'] = yearweek_parts[1].astype(int)

# Numeric sortable year-week (for splitting only).
# Built arithmetically rather than by deleting the dash, so an unpadded week
# such as "2020-3" still becomes 202003 instead of 20203 (which would sort
# before every real week and silently end up in the training split).
data['YearWeek_int'] = data['Year'] * 100 + data['WeekOfYear']

# You already have Week, Month, Qtr, holiday flags, promo features, DiscountedPrice.
# We add Key as categorical as well.
# Do NOT create lags.
# NOTE(review): the existing 'Week' column looks identical to the derived
# 'WeekOfYear' in the sample rows — confirm before keeping both as features.

# Define candidate feature columns
base_cols = [
    'Key',              # high-cardinality identifier
    'Material',
    'Customer',
    'CustomerGroup',
    'Category',
    'Week',
    'Month',
    'Qtr',
    'New_Year',
    'Christmas_Day',
    'Easter_Monday',
    'Other_Holidays',
    'DiscountedPrice',
    'PromoShipment',
    'Objective1',
    'Objective2',
    'PromoMethod',
    'PromoStatus',
    'Year',
    'WeekOfYear'
]

# Ensure all exist; stop here with a clear message instead of failing later
# with an opaque KeyError when the feature matrix is sliced.
missing = [c for c in base_cols if c not in data.columns]
print("Missing feature columns:", missing)
if missing:
    raise KeyError(f"Feature columns missing from data: {missing}")

# Drop any rows with missing Sales (should be none)
data = data.dropna(subset=['Sales'])
print("After dropping missing Sales:", data.shape)

Missing feature columns: []
After dropping missing Sales: (143273, 23)


In [13]:
# 5. Time-based Train / Validation Split

# Inclusive boundaries on the YearWeek_int (YYYYWW) scale.
train_end = 202240   # up to week 40 inclusive
val_start = 202241   # weeks 41-45
val_end   = 202245

# Rows with YearWeek_int above val_end fall into neither frame.
week_key = data['YearWeek_int']
train_df = data.loc[week_key <= train_end].copy()
valid_df = data.loc[week_key.between(val_start, val_end)].copy()

for split_label, split_frame in (("Train shape:", train_df),
                                 ("Valid shape:", valid_df)):
    print(split_label, split_frame.shape)

target_col = 'Sales'
feature_cols = base_cols  # same list object as base_cols, not a copy

# Feature matrices are taken before the log column is added to the frames.
X_train = train_df[feature_cols].copy()
X_valid = valid_df[feature_cols].copy()

# ----------------------------------------
# LOG-TRANSFORM TARGET
# ----------------------------------------
# log1p keeps zero sales at zero; predictions are mapped back with expm1.

for split_frame in (train_df, valid_df):
    split_frame['Sales_log'] = np.log1p(split_frame[target_col])

y_train = train_df['Sales_log']
y_valid = valid_df['Sales_log']

for target_label, target_series in (("y_train shape:", y_train),
                                    ("y_valid shape:", y_valid)):
    print(target_label, target_series.shape)

Train shape: (128723, 23)
Valid shape: (4850, 23)
y_train shape: (128723,)
y_valid shape: (4850,)


In [14]:
# 6. CatBoost Pools

# ----------------------------------------
# BUILD CATBOOST POOLS
# ----------------------------------------

# Columns handed to CatBoost as categorical. Everything in the feature list
# except DiscountedPrice and Year is treated as a category.
categorical_cols = [
    'Key',
    'Material',
    'Customer',
    'CustomerGroup',
    'Category',
    'Week',
    'Month',
    'Qtr',
    'New_Year',
    'Christmas_Day',
    'Easter_Monday',
    'Other_Holidays',
    'PromoShipment',
    'Objective1',
    'Objective2',
    'PromoMethod',
    'PromoStatus',
    'WeekOfYear',
]

# Translate names into positional indices within feature_cols; names that are
# not in feature_cols are skipped rather than raising.
cat_features = [
    feature_cols.index(name)
    for name in categorical_cols
    if name in feature_cols
]

# Both pools share the same categorical layout.
pool_options = {'cat_features': cat_features}
train_pool = Pool(X_train, label=y_train, **pool_options)
valid_pool = Pool(X_valid, label=y_valid, **pool_options)

print("Categorical feature indices:", cat_features)

Categorical feature indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 19]


In [15]:
# 7. Train CatBoost Global Regressor

# ----------------------------------------
# TRAIN CATBOOST ON LOG-TRANSFORMED TARGET
# ----------------------------------------

# Hyperparameters gathered in one place so they are easy to scan and tweak.
catboost_params = dict(
    loss_function='RMSE',      # squared error measured on the log1p scale
    eval_metric='RMSE',
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3.0,
    random_seed=RANDOM_SEED,
    iterations=4000,           # upper bound; the overfitting detector may stop earlier
    od_type='Iter',
    od_wait=200,               # iterations to wait without eval improvement
    verbose=200,
)
model = CatBoostRegressor(**catboost_params)

# NOTE(review): valid_pool drives both early stopping / best-iteration
# selection here and the predictions scored below, so metrics computed on it
# are likely optimistic — consider a separate hold-out.
model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

# ----------------------------------------
# INVERSE LOG-TRANSFORM PREDICTIONS
# ----------------------------------------

valid_pred_log = model.predict(valid_pool)

# Undo log1p, then floor at zero because sales cannot be negative
# (expm1 of a negative log-prediction lands in (-1, 0)).
valid_pred = np.clip(np.expm1(valid_pred_log), 0, None)

print("Prediction sample:", valid_pred[:10])

0:	learn: 2.7485631	test: 2.8316270	best: 2.8316270 (0)	total: 28.3ms	remaining: 1m 53s
200:	learn: 1.4734976	test: 1.9654941	best: 1.9654941 (200)	total: 6.83s	remaining: 2m 9s
400:	learn: 1.4372201	test: 1.9657954	best: 1.9637451 (221)	total: 14.2s	remaining: 2m 7s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 1.963745131
bestIteration = 221

Shrink model to first 222 iterations.
Prediction sample: [0.01580909 0.0261562  0.01636696 0.         0.02917661 0.63458064
 1.24153064 0.85823877 0.57429288 0.67460444]


In [16]:
# ===========================================================
# 8. CORRECT WMAPE / ACCURACY / BIAS USING REAL SALES
# ===========================================================

def _sales_scale_metrics(actual, forecast):
    """Return ``(wmape, accuracy, bias)`` for forecasts on the original scale.

    wmape    = sum(|actual - forecast|) / sum(actual)
    accuracy = 1 - wmape
    bias     = sum(actual) / sum(forecast) - 1
               (positive when total actuals exceed total forecasts,
               i.e. the model under-forecasts in aggregate)

    No guarding is applied: a zero total in either denominator propagates
    as inf/nan from the NumPy division.
    """
    total_actual = np.sum(actual)
    wmape_value = np.sum(np.abs(actual - forecast)) / total_actual
    bias_value = (total_actual / np.sum(forecast)) - 1
    return wmape_value, 1 - wmape_value, bias_value


# Predict on the log1p scale, map back to Sales units and floor at zero.
valid_pred_log = model.predict(valid_pool)
valid_pred = np.clip(np.expm1(valid_pred_log), 0, None)

# Metrics are scored against the untransformed Sales column.
true_sales = valid_df["Sales"].values

wmape, accuracy, bias = _sales_scale_metrics(true_sales, valid_pred)

print("\n=== CORRECTED CATBOOST METRICS ===")
for metric_label, metric_value in (("WMAPE:", wmape),
                                   ("Accuracy:", accuracy),
                                   ("Bias:", bias)):
    print(metric_label, metric_value)


=== CORRECTED CATBOOST METRICS ===
WMAPE: 0.7504759585964518
Accuracy: 0.24952404140354822
Bias: 1.698259225556669
