## Code Assessment

### 1. Baic setup

In [2]:
# Basic libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Warnings management.
import warnings
warnings.filterwarnings("ignore")

# Reproducibility.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Data/Results Path.
DATA_DIR = Path("data/")
RESULTS_DIR = Path("results/")

In [3]:
# Global parameters for plots.
plt.rcParams.update({
    "font.family": "Times New Roman",
    "font.size": 12,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "axes.edgecolor": "black",
    "axes.linewidth": 1,
    "xtick.color": "black",
    "ytick.color": "black",
    "xtick.direction": "out",
    "ytick.direction": "out",
    "xtick.bottom": True,
    "ytick.left": True,
    "xtick.top": False,
    "ytick.right": False,
    "figure.dpi": 300,
    "legend.frameon": True,
    "legend.facecolor": "white",
    "legend.edgecolor": "black",
    "legend.fontsize": 12
})

# Seaborn
sns.set_theme(context="notebook", style="ticks")

### 2. Load Data and Audit

In [6]:
data = pd.read_csv(DATA_DIR / "sales_pred_case.csv")

In [7]:
data.head()

Unnamed: 0,Key,YearWeek,Sales,Material,Customer,CustomerGroup,Category,Week,Month,Qtr,New_Year,Christmas_Day,Easter_Monday,Other_Holidays,DiscountedPrice,PromoShipment,Objective1,Objective2,PromoMethod,PromoStatus
0,0_25,2020-03,2.0,0,25,13,0,3,1,1,0,0,0,0,5.92,0,7,3,8,7
1,0_25,2020-04,0.0,0,25,13,0,4,1,1,0,0,0,0,0.0,0,7,3,8,7
2,0_25,2020-05,0.0,0,25,13,0,5,2,1,0,0,0,0,0.0,0,7,3,8,7
3,0_25,2020-06,0.0,0,25,13,0,6,2,1,0,0,0,0,0.0,0,7,3,8,7
4,0_25,2020-07,0.0,0,25,13,0,7,2,1,0,0,0,0,0.0,0,7,3,8,7


In [8]:
# Audit the data structure.
print ("Shape of the data:", data.shape)
print ("Data Information: \n", data.info())
print ("Missing values in each column: \n", data.isnull().sum())

Shape of the data: (143273, 20)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143273 entries, 0 to 143272
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Key              143273 non-null  object 
 1   YearWeek         143273 non-null  object 
 2   Sales            143273 non-null  float64
 3   Material         143273 non-null  int64  
 4   Customer         143273 non-null  int64  
 5   CustomerGroup    143273 non-null  int64  
 6   Category         143273 non-null  int64  
 7   Week             143273 non-null  int64  
 8   Month            143273 non-null  int64  
 9   Qtr              143273 non-null  int64  
 10  New_Year         143273 non-null  int64  
 11  Christmas_Day    143273 non-null  int64  
 12  Easter_Monday    143273 non-null  int64  
 13  Other_Holidays   143273 non-null  int64  
 14  DiscountedPrice  143273 non-null  float64
 15  PromoShipment    143273 non-null  int64  
 16  Object

In [13]:
data.describe(include='all')

Unnamed: 0,Key,YearWeek,Sales,Material,Customer,CustomerGroup,Category,Week,Month,Qtr,New_Year,Christmas_Day,Easter_Monday,Other_Holidays,DiscountedPrice,PromoShipment,Objective1,Objective2,PromoMethod,PromoStatus
count,143273,143273,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0,143273.0
unique,970,160,,,,,,,,,,,,,,,,,,
top,125_9,2023-02,,,,,,,,,,,,,,,,,,
freq,160,970,,,,,,,,,,,,,,,,,,
mean,,,226.232961,100.433906,15.740258,10.460589,0.0,26.856323,6.59707,2.526757,0.021141,0.025678,0.018601,0.138414,2.45147,0.388119,5.769147,2.156882,7.026272,5.305759
std,,,640.523581,49.288938,9.07891,6.190107,0.0,15.174052,3.45395,1.120696,0.143856,0.158174,0.135111,0.345335,45.946344,0.487324,1.962011,1.100601,1.970161,2.187815
min,,,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,-13606.23,0.0,0.0,0.0,0.0,0.0
25%,,,0.0,63.0,10.0,5.0,0.0,14.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,7.0,3.0
50%,,,0.0,107.0,15.0,10.0,0.0,27.0,7.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,3.0,8.0,7.0
75%,,,160.0,144.0,25.0,15.0,0.0,40.0,10.0,4.0,0.0,0.0,0.0,0.0,6.080606,1.0,7.0,3.0,8.0,7.0


### 3. Exploratory Data Analysis (EDA)

In [18]:
# --- Unique counts ---
print("\nUnique Counts:")
print("Unique Keys:", data['Key'].nunique())
print("Unique Materials:", data['Material'].nunique())
print("Unique Customers:", data['Customer'].nunique())
print("Unique Customer Groups:", data['CustomerGroup'].nunique())
print("Unique Categories:", data['Category'].nunique())


Unique Counts:
Unique Keys: 970
Unique Materials: 183
Unique Customers: 33
Unique Customer Groups: 22
Unique Categories: 1


In [19]:
# Rows per Key — sparsity check
rows_per_key = data['Key'].value_counts()
print("\nRows per Key summary:")
print(rows_per_key.describe())


Rows per Key summary:
count    970.000000
mean     147.704124
std       21.352902
min       77.000000
25%      150.000000
50%      158.000000
75%      159.000000
max      160.000000
Name: count, dtype: float64


In [20]:
# --- Time coverage ---
print("\nYearWeek range:")
print("Min:", data['YearWeek'].min(), "Max:", data['YearWeek'].max())


YearWeek range:
Min: 2020-01 Max: 2023-03


In [24]:
# Sort check
data_sorted = data.sort_values(['Key', 'YearWeek'])

In [25]:
data_sorted.head()

Unnamed: 0,Key,YearWeek,Sales,Material,Customer,CustomerGroup,Category,Week,Month,Qtr,New_Year,Christmas_Day,Easter_Monday,Other_Holidays,DiscountedPrice,PromoShipment,Objective1,Objective2,PromoMethod,PromoStatus
0,0_25,2020-03,2.0,0,25,13,0,3,1,1,0,0,0,0,5.92,0,7,3,8,7
1,0_25,2020-04,0.0,0,25,13,0,4,1,1,0,0,0,0,0.0,0,7,3,8,7
2,0_25,2020-05,0.0,0,25,13,0,5,2,1,0,0,0,0,0.0,0,7,3,8,7
3,0_25,2020-06,0.0,0,25,13,0,6,2,1,0,0,0,0,0.0,0,7,3,8,7
4,0_25,2020-07,0.0,0,25,13,0,7,2,1,0,0,0,0,0.0,0,7,3,8,7


In [36]:
# 1. Convert "2020-03" → 202003 (int)
data['YearWeek_int'] = data['YearWeek'].str.replace('-', '').astype(int)

# 2. Sort
data_sorted = data.sort_values(['Key', 'YearWeek_int'])

# 3. Check monotonicity within each Key
is_monotonic = data_sorted.groupby('Key')['YearWeek_int'].apply(lambda x: x.is_monotonic_increasing)

print("Is YearWeek sorted within each Key?:", is_monotonic.all())

Is YearWeek sorted within each Key?: True


In [37]:
# --- Sales distribution ---
print("\nSales Summary:")
display(data['Sales'].describe())
print("Zero-sales %:", (data['Sales'] == 0).mean())


Sales Summary:


count    143273.000000
mean        226.232961
std         640.523581
min           0.000000
25%           0.000000
50%           0.000000
75%         160.000000
max       21450.000000
Name: Sales, dtype: float64

Zero-sales %: 0.5621505796626022


In [38]:
# --- Promotion & price checks ---
promo_cols = ['PromoShipment', 'Objective1', 'Objective2', 'PromoMethod', 'PromoStatus']
print("\nPromo Columns:", promo_cols)


Promo Columns: ['PromoShipment', 'Objective1', 'Objective2', 'PromoMethod', 'PromoStatus']


In [39]:
print("\nPromo Feature Unique Levels:")
for col in promo_cols:
    print(col, ":", data[col].nunique())


Promo Feature Unique Levels:
PromoShipment : 2
Objective1 : 8
Objective2 : 4
PromoMethod : 9
PromoStatus : 8


In [40]:
print("\nDiscountedPrice Summary:")
display(data['DiscountedPrice'].describe())


DiscountedPrice Summary:


count    143273.000000
mean          2.451470
std          45.946344
min      -13606.230000
25%           0.000000
50%           0.000000
75%           6.080606
max          86.400000
Name: DiscountedPrice, dtype: float64

In [41]:
# --- Key consistency check ---
key_check = data.groupby('Key')[['Material','Customer']].nunique()
inconsistent_keys = key_check[(key_check['Material'] > 1) | (key_check['Customer'] > 1)]

In [42]:
print("\nKey consistency issues:", inconsistent_keys.shape[0])
if inconsistent_keys.shape[0] > 0:
    display(inconsistent_keys.head())


Key consistency issues: 0


### 3. Feature Engineering

In [44]:
# Make a numeric Year and Week-of-year
data['Year'] = data['YearWeek'].str.split('-').str[0].astype(int)
data['WeekOfYear'] = data['YearWeek'].str.split('-').str[1].astype(int)

In [45]:
# Convert YearWeek to sortable int for ordering
data['YearWeek_int'] = data['YearWeek'].str.replace('-', '').astype(int)

In [46]:
# Sort data properly
data = data.sort_values(['Key', 'YearWeek_int'])

In [None]:
# ----------------------------------------
# A. Lag Features
# ----------------------------------------

lag_features = [1, 2, 3, 4, 8, 52]

for lag in lag_features:
    data[f'lag_{lag}'] = data.groupby('Key')['Sales'].shift(lag)

In [None]:
# ----------------------------------------
# B. Rolling Window Features
# ----------------------------------------

data['roll_mean_4'] = data.groupby('Key')['Sales'].shift(1).rolling(4, min_periods=1).mean()
data['roll_mean_8'] = data.groupby('Key')['Sales'].shift(1).rolling(8, min_periods=1).mean()

data['roll_std_4'] = data.groupby('Key')['Sales'].shift(1).rolling(4, min_periods=1).std()
data['roll_min_4'] = data.groupby('Key')['Sales'].shift(1).rolling(4, min_periods=1).min()
data['roll_max_4'] = data.groupby('Key')['Sales'].shift(1).rolling(4, min_periods=1).max()

In [49]:
# ----------------------------------------
# C. Discount / Promotion Features
# ----------------------------------------

data['is_discounted'] = (data['DiscountedPrice'] > 0).astype(int)


In [50]:
# Optionally: data['discount_strength'] = data['DiscountedPrice']  # as-is

# ----------------------------------------
# D. Categorical Features (will be passed to LightGBM)
# ----------------------------------------

categorical_features = [
    'Material', 'Customer', 'CustomerGroup', 'Category',
    'PromoShipment', 'Objective1', 'Objective2', 'PromoMethod', 'PromoStatus'
]

In [51]:
# ----------------------------------------
# IMPORTANT: Drop rows with missing lag features before modeling
# ----------------------------------------

feature_cols_to_check = [f'lag_{l}' for l in lag_features] + ['roll_mean_4', 'roll_mean_8']

In [52]:
# training rows with full history
data_model = data.dropna(subset=feature_cols_to_check)
print("Shape after lag/rolling drop:", data_model.shape)

Shape after lag/rolling drop: (92833, 35)


### 4. Train / Validation Split (Time-Aware)

In [53]:
# ----------------------------------------
# 5. Train / Validation Split
# ----------------------------------------

# Define numeric year-week boundaries
train_end = 202240
val_start = 202241
val_end = 202245

In [54]:
# Training set: all rows with full lag history up to 2022-40
train_df = data_model[data_model['YearWeek_int'] <= train_end].copy()

In [55]:
# Validation set: next 5 weeks (41-45)
valid_df = data_model[
    (data_model['YearWeek_int'] >= val_start) &
    (data_model['YearWeek_int'] <= val_end)
].copy()

print("Train shape:", train_df.shape)
print("Validation shape:", valid_df.shape)

Train shape: (78283, 35)
Validation shape: (4850, 35)


In [56]:
# ----------------------------------------
# Features and target
# ----------------------------------------

target_col = 'Sales'

In [68]:
# All feature columns except target + YearWeek + identifiers
drop_cols = ['Sales', 'YearWeek', 'YearWeek_int', 'Key']

In [69]:
features = [c for c in data_model.columns if c not in drop_cols]

In [70]:
# X, y split
X_train = train_df[features]
y_train = train_df[target_col]

X_valid = valid_df[features]
y_valid = valid_df[target_col]

print("X_train:", X_train.shape, " y_train:", y_train.shape)
print("X_valid:", X_valid.shape, " y_valid:", y_valid.shape)

X_train: (78283, 31)  y_train: (78283,)
X_valid: (4850, 31)  y_valid: (4850,)


### 5. Model Training

In [71]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

In [72]:
# ----------------------------------------
# 6. Model Training (LightGBM)
# ----------------------------------------

# Define categorical features (they are already integer-encoded)
categorical_features = [
    'Material', 'Customer', 'CustomerGroup', 'Category',
    'PromoShipment', 'Objective1', 'Objective2', 'PromoMethod', 'PromoStatus'
]

In [73]:
# Prepare LightGBM datasets
train_dataset = lgb.Dataset(
    X_train, 
    label=y_train,
    categorical_feature=categorical_features,
    free_raw_data=False
)

In [74]:
valid_dataset = lgb.Dataset(
    X_valid, 
    label=y_valid,
    categorical_feature=categorical_features,
    free_raw_data=False
)

In [75]:
# LightGBM parameters
params = {
    'objective': 'regression_l1',    # MAE loss
    'metric': 'mae',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
    'verbosity': -1
}

In [76]:
model = lgb.train(
    params,
    train_dataset,
    num_boost_round=5000,
    valid_sets=[train_dataset, valid_dataset],
    valid_names=['train', 'valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=200, verbose=True),
        lgb.log_evaluation(period=200)
    ]
)

Training until validation scores don't improve for 200 rounds
[200]	train's l1: 129.208	valid's l1: 173.544
[400]	train's l1: 127.481	valid's l1: 173.235
[600]	train's l1: 126.273	valid's l1: 173.07
[800]	train's l1: 124.989	valid's l1: 173.082
Early stopping, best iteration is:
[619]	train's l1: 126.115	valid's l1: 173.034


In [77]:
print("Best iteration:", model.best_iteration)

Best iteration: 619


### 6. Evaluation 

In [78]:
# ----------------------------------------
# 7. Evaluation on Validation Set
# ----------------------------------------

# Make predictions
valid_pred = model.predict(X_valid)

# Compute WMAPE
wmape = np.sum(np.abs(y_valid - valid_pred)) / np.sum(np.abs(y_valid))
accuracy = 1 - wmape

# Compute Bias
bias = (np.sum(y_valid) / np.sum(valid_pred)) - 1

print("Validation WMAPE:", wmape)
print("Validation Accuracy:", accuracy)
print("Validation Bias:", bias)

Validation WMAPE: 0.650659012806655
Validation Accuracy: 0.34934098719334505
Validation Bias: 0.33054620895943043
