In [1]:
# ----------------------------------------
# 1. Setup & Imports
# ----------------------------------------

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [2]:
# ----------------------------------------
# 2. Load Data
# ----------------------------------------

data = pd.read_csv("data/sales_pred_case.csv")
print("Loaded:", data.shape)
display(data.head())

Loaded: (143273, 20)


Unnamed: 0,Key,YearWeek,Sales,Material,Customer,CustomerGroup,Category,Week,Month,Qtr,New_Year,Christmas_Day,Easter_Monday,Other_Holidays,DiscountedPrice,PromoShipment,Objective1,Objective2,PromoMethod,PromoStatus
0,0_25,2020-03,2.0,0,25,13,0,3,1,1,0,0,0,0,5.92,0,7,3,8,7
1,0_25,2020-04,0.0,0,25,13,0,4,1,1,0,0,0,0,0.0,0,7,3,8,7
2,0_25,2020-05,0.0,0,25,13,0,5,2,1,0,0,0,0,0.0,0,7,3,8,7
3,0_25,2020-06,0.0,0,25,13,0,6,2,1,0,0,0,0,0.0,0,7,3,8,7
4,0_25,2020-07,0.0,0,25,13,0,7,2,1,0,0,0,0,0.0,0,7,3,8,7


In [3]:
# ----------------------------------------
# 3. EDA
# ----------------------------------------

print("\nDtypes:")
print(data.dtypes)

print("\nMissing values:")
print(data.isna().sum())

print("\nUnique Keys:", data['Key'].nunique())
print("YearWeek range:", data['YearWeek'].min(), "→", data['YearWeek'].max())


Dtypes:
Key                 object
YearWeek            object
Sales              float64
Material             int64
Customer             int64
CustomerGroup        int64
Category             int64
Week                 int64
Month                int64
Qtr                  int64
New_Year             int64
Christmas_Day        int64
Easter_Monday        int64
Other_Holidays       int64
DiscountedPrice    float64
PromoShipment        int64
Objective1           int64
Objective2           int64
PromoMethod          int64
PromoStatus          int64
dtype: object

Missing values:
Key                0
YearWeek           0
Sales              0
Material           0
Customer           0
CustomerGroup      0
Category           0
Week               0
Month              0
Qtr                0
New_Year           0
Christmas_Day      0
Easter_Monday      0
Other_Holidays     0
DiscountedPrice    0
PromoShipment      0
Objective1         0
Objective2         0
PromoMethod        0
PromoStatus        0


In [4]:
# ----------------------------------------
# 4. Corrected Stable Feature Engineering
# ----------------------------------------

# ----------------------------------------
# A. Correct Key Handling (CRITICAL FIX)
# ----------------------------------------

# Split Key into numeric Material and Customer
data[['Material_code', 'Customer_code']] = (
    data['Key'].str.split('_', expand=True).astype(int)
)

# ----------------------------------------
# B. Extract Time Features
# ----------------------------------------

data['Year'] = data['YearWeek'].str.split('-').str[0].astype(int)
data['WeekOfYear'] = data['YearWeek'].str.split('-').str[1].astype(int)

# Numeric sortable year-week
data['YearWeek_int'] = data['YearWeek'].str.replace('-', '').astype(int)

# ----------------------------------------
# C. Sort correctly by Key (using numeric codes) AND by time
# ----------------------------------------

data = data.sort_values(['Material_code', 'Customer_code', 'YearWeek_int'])


# ----------------------------------------
# D. Lag Features (robust set)
# ----------------------------------------

lag_list = [1, 2, 3, 4, 8, 12, 26, 52]

for lag in lag_list:
    data[f'lag_{lag}'] = data.groupby(['Material_code', 'Customer_code'])['Sales'].shift(lag)


# ----------------------------------------
# E. Rolling Window Features (safe only)
# ----------------------------------------

data['roll_mean_4'] = data.groupby(['Material_code','Customer_code'])['Sales'] \
    .shift(1).rolling(4, min_periods=1).mean()

data['roll_mean_8'] = data.groupby(['Material_code','Customer_code'])['Sales'] \
    .shift(1).rolling(8, min_periods=1).mean()

data['roll_std_4'] = data.groupby(['Material_code','Customer_code'])['Sales'] \
    .shift(1).rolling(4, min_periods=1).std()

data['roll_min_4'] = data.groupby(['Material_code','Customer_code'])['Sales'] \
    .shift(1).rolling(4, min_periods=1).min()

data['roll_max_4'] = data.groupby(['Material_code','Customer_code'])['Sales'] \
    .shift(1).rolling(4, min_periods=1).max()


# ----------------------------------------
# F. Promo & Price Features
# ----------------------------------------

data['is_discounted'] = (data['DiscountedPrice'] > 0).astype(int)


# ----------------------------------------
# G. Safe Categorical List
# ----------------------------------------

categorical_features = [
    'Material', 'Customer', 'CustomerGroup', 'Category',
    'PromoShipment', 'Objective1', 'Objective2',
    'PromoMethod', 'PromoStatus'
]


# ----------------------------------------
# H. Construct model-ready dataset
# ----------------------------------------

required_cols = (
    [f'lag_{l}' for l in lag_list] +
    ['roll_mean_4', 'roll_mean_8',
     'roll_std_4', 'roll_min_4', 'roll_max_4']
)

data_model = data.dropna(subset=required_cols)

print("Model-ready shape:", data_model.shape)

Model-ready shape: (92833, 39)


In [5]:
# ----------------------------------------
# 5. Train/Validation Split
# ----------------------------------------

train_end = 202240
val_start = 202241
val_end   = 202245

train_df = data_model[data_model['YearWeek_int'] <= train_end].copy()
valid_df = data_model[(data_model['YearWeek_int'] >= val_start) &
                      (data_model['YearWeek_int'] <= val_end)].copy()

drop_cols = ['Sales', 'YearWeek', 'YearWeek_int', 'Key']

target_col = 'Sales'
features = [c for c in data_model.columns if c not in drop_cols]

X_train = train_df[features]
y_train = train_df[target_col]

X_valid = valid_df[features]
y_valid = valid_df[target_col]

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_valid:", X_valid.shape, "y_valid:", y_valid.shape)

X_train: (78283, 35) y_train: (78283,)
X_valid: (4850, 35) y_valid: (4850,)


In [6]:
# ----------------------------------------
# 6. Interaction features
# ----------------------------------------

# Customer × Category
X_train['cust_cat'] = (train_df['Customer'].astype(str) + "_" + train_df['Category'].astype(str)).astype('category').cat.codes
X_valid['cust_cat'] = (valid_df['Customer'].astype(str) + "_" + valid_df['Category'].astype(str)).astype('category').cat.codes

In [7]:
# ----------------------------------------
# 7. LightGBM Training (Stable Version)
# ----------------------------------------

categorical_features = [
    'Material', 'Customer', 'CustomerGroup', 'Category',
    'PromoShipment', 'Objective1', 'Objective2',
    'PromoMethod', 'PromoStatus'
]

train_dataset = lgb.Dataset(
    X_train, y_train,
    categorical_feature=categorical_features
)

valid_dataset = lgb.Dataset(
    X_valid, y_valid,
    categorical_feature=categorical_features
)

params = {
    'objective': 'huber',
    'alpha': 0.9,
    'metric': 'mae',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'min_data_in_leaf': 30,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 3,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'seed': 42,
    'verbosity': -1
}

model = lgb.train(
    params,
    train_dataset,
    num_boost_round=3000,
    valid_sets=[train_dataset, valid_dataset],
    valid_names=['train','valid'],
    callbacks=[
        lgb.early_stopping(200),
        lgb.log_evaluation(200)
    ]
)

print("Best iteration:", model.best_iteration)

Training until validation scores don't improve for 200 rounds
[200]	train's l1: 345.85	valid's l1: 355.354
[400]	train's l1: 339.532	valid's l1: 349.555
[600]	train's l1: 333.229	valid's l1: 343.763
[800]	train's l1: 326.953	valid's l1: 337.986
[1000]	train's l1: 320.696	valid's l1: 332.238
[1200]	train's l1: 314.461	valid's l1: 326.513
[1400]	train's l1: 308.232	valid's l1: 320.807
[1600]	train's l1: 302.062	valid's l1: 315.153
[1800]	train's l1: 295.981	valid's l1: 309.558
[2000]	train's l1: 290.027	valid's l1: 304.041
[2200]	train's l1: 284.177	valid's l1: 298.602
[2400]	train's l1: 278.424	valid's l1: 293.218
[2600]	train's l1: 272.753	valid's l1: 287.89
[2800]	train's l1: 267.142	valid's l1: 282.606
[3000]	train's l1: 261.571	valid's l1: 277.375
Did not meet early stopping. Best iteration is:
[3000]	train's l1: 261.571	valid's l1: 277.375
Best iteration: 3000


In [8]:
# ----------------------------------------
# 8. Evaluation Metrics
# ----------------------------------------

valid_pred = model.predict(X_valid)

wmape = np.sum(np.abs(y_valid - valid_pred)) / np.sum(np.abs(y_valid))
accuracy = 1 - wmape
bias = (np.sum(y_valid) / np.sum(valid_pred)) - 1

print("Validation WMAPE:", wmape)
print("Validation Accuracy:", accuracy)
print("Validation Bias:", bias)

Validation WMAPE: 1.0430087785984596
Validation Accuracy: -0.04300877859845964
Validation Bias: 0.3775954745599521


In [9]:
print(valid_df.shape)
print(valid_df['YearWeek_int'].min(), valid_df['YearWeek_int'].max())
valid_df['YearWeek'].value_counts().sort_index()

(4850, 39)
202241 202245


YearWeek
2022-41    970
2022-42    970
2022-43    970
2022-44    970
2022-45    970
Name: count, dtype: int64

In [10]:
X_valid.describe().T[['mean','std']]

Unnamed: 0,mean,std
Material,101.63299,50.093554
Customer,15.94433,9.151178
CustomerGroup,10.568041,6.168227
Category,0.0,0.0
Week,43.0,1.414359
Month,10.4,0.489948
Qtr,4.0,0.0
New_Year,0.0,0.0
Christmas_Day,0.0,0.0
Easter_Monday,0.0,0.0


In [11]:
print("Mean y_valid:", y_valid.mean())
print("Mean valid_pred:", model.predict(X_valid).mean())

Mean y_valid: 265.93690721649483
Mean valid_pred: 193.0442659892183


In [12]:
lag_cols = [c for c in X_valid.columns if 'lag_' in c]
X_valid[lag_cols].isna().sum()

lag_1     0
lag_2     0
lag_3     0
lag_4     0
lag_8     0
lag_12    0
lag_26    0
lag_52    0
dtype: int64

In [13]:
for col in categorical_features:
    print(col, 
          "train unique:", X_train[col].nunique(),
          "valid unique:", X_valid[col].nunique())

Material train unique: 183 valid unique: 183
Customer train unique: 33 valid unique: 33
CustomerGroup train unique: 22 valid unique: 22
Category train unique: 1 valid unique: 1
PromoShipment train unique: 2 valid unique: 2
Objective1 train unique: 8 valid unique: 6
Objective2 train unique: 4 valid unique: 4
PromoMethod train unique: 8 valid unique: 6
PromoStatus train unique: 6 valid unique: 2


In [14]:
print(X_train.dtypes)

Material             int64
Customer             int64
CustomerGroup        int64
Category             int64
Week                 int64
Month                int64
Qtr                  int64
New_Year             int64
Christmas_Day        int64
Easter_Monday        int64
Other_Holidays       int64
DiscountedPrice    float64
PromoShipment        int64
Objective1           int64
Objective2           int64
PromoMethod          int64
PromoStatus          int64
Material_code        int64
Customer_code        int64
Year                 int64
WeekOfYear           int64
lag_1              float64
lag_2              float64
lag_3              float64
lag_4              float64
lag_8              float64
lag_12             float64
lag_26             float64
lag_52             float64
roll_mean_4        float64
roll_mean_8        float64
roll_std_4         float64
roll_min_4         float64
roll_max_4         float64
is_discounted        int64
cust_cat              int8
dtype: object


In [15]:
for col in X_train.columns:
    if col.startswith('lag_'):
        print(col, X_train[col].dtype)

lag_1 float64
lag_2 float64
lag_3 float64
lag_4 float64
lag_8 float64
lag_12 float64
lag_26 float64
lag_52 float64
