In [1]:
import sys
from pathlib import Path
import numpy as np
import polars as pl
import pickle
import logging

from IPython.display import Markdown, display

current_dir = Path.cwd()
if str(current_dir) not in sys.path:
    sys.path.insert(0, str(current_dir))

from src import (
    DataConfig, DataLoader, ModelingStrategy, ReleaseManager, BenchmarkPipeline, create_config
)
from datetime import date
from lets_plot import *
import optuna

LetsPlot.setup_html()
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
np.random.seed(42)


  from .autonotebook import tqdm as notebook_tqdm


## Get trainable SKUs

In [2]:
#get list of trainable SKUs
data_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/processed/train_data_features.feather"
df_clean = pl.read_ipc(data_path)

sku_tuples_all = [(d['productID'], d['storeID']) for d in df_clean.select(pl.col("productID"), pl.col("storeID")).unique().to_dicts()]
print("total unseen skus: ", len(sku_tuples_all))

sku_exclude = (df_clean
 .group_by("storeID","productID")
 .agg(pl.col("date").first())
 .filter(pl.col("date") >= date(2016,1,1))
 .select("productID","storeID")
 )

sku_exclude = [(d['productID'], d['storeID']) for d in sku_exclude.select(pl.col("productID"), pl.col("storeID")).unique().to_dicts()]

sku_tuples_complete =  [sku for sku in sku_tuples_all if sku not in sku_exclude]
print("total unseen skus available for training: ", len(sku_tuples_complete))

total unseen skus:  7634
total unseen skus available for training:  7632


In [6]:
data_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/unseen_sku/unseen.csv"

In [8]:
df_clean = pl.read_csv(data_path)

In [12]:
df_clean.select(pl.col("storeID")).unique()

storeID
i64
9
3
6
4
1
10
7
8
5
2


In [11]:
df_clean.select(pl.col("productID")).unique()

productID
i64
1450
2099
2790
411
2510
…
104
1176
902
506


In [20]:
sku_tuples_complete[:1]

[(2489, 1)]

In [21]:
sku_tuples_complete[0][0]

2489

In [22]:
sku_tuples_complete[0][1]

1

In [35]:
(df_clean
 .filter((pl.col("productID")==2484) & (pl.col("storeID")==7))
 .sort("date")
 .drop_nulls()
    .group_by("storeID", "productID")
    .agg([
        pl.col("target_lag_1").filter(pl.col("date") < date(2016,1,1 )).mean().alias("mean_train"),
        pl.col("target_lag_1").filter(pl.col("date") >= date(2016,1,1 )).mean().alias("mean_test")
    ])
)

storeID,productID,mean_train,mean_test
i64,i64,f64,f64
7,2484,0.319933,0.258741


## Check SKUs for training data before 1.1.2016

In [144]:
(df_clean
.filter(
    (pl.col("storeID") == 1) &
    (pl.col("productID").is_in([2991, 2258]))
)
.sort("date")
.group_by(["storeID", "productID"])
.head(1)
.select("storeID", "productID", "date")
)

storeID,productID,date
i64,i64,date
1,2258,2016-02-13
1,2991,2016-01-16


In [137]:
(df_clean
 .group_by("storeID","productID")
 .agg(pl.col("date").first())
 .filter(pl.col("date") >= date(2016,1,1)))

storeID,productID,date
i64,i64,date
1,2991,2016-01-16
1,2258,2016-02-13


In [None]:
split_date = date(2016,1,1)
train_summary=(df_clean
 .group_by("storeID", "productID")
 .agg([
     pl.col("date").min().alias("date_first"),
     pl.col("date").filter(pl.col("date") < split_date).count().alias("obs_train"),
     pl.col("date").filter(pl.col("date") >= split_date).count().alias("obs_test")
 ])
 .sort("storeID", "productID"))

In [139]:
train_summary.filter(pl.col("date_first") > split_date)

storeID,productID,date_first,obs_train,obs_test
i64,i64,date,u32,u32
1,2258,2016-02-13,0,100
1,2991,2016-01-16,0,128


In [142]:
train_summary.filter(pl.col("obs_train") < 150)

storeID,productID,date_first,obs_train,obs_test
i64,i64,date,u32,u32
1,664,2015-10-24,69,143
1,1020,2015-12-26,6,143
1,1098,2015-10-10,83,143
1,1669,2015-09-05,118,143
1,1790,2015-12-26,6,143
…,…,…,…,…
7,2130,2015-09-05,118,143
7,2677,2015-10-31,62,143
7,2692,2015-11-07,55,143
7,2888,2015-08-29,125,143


## Check pipeline with 3 SKUs and random HP

In [None]:
data_config = DataConfig(
    mapping_path = 'data/feature_mapping_train.pkl',
    features_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/processed/train_data_features.feather",
    target_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/train_data/train_data_target.feather",
    split_date="2016-01-01",
)

sku_tuples=[(1912, 7), (377, 1), (715, 7)]

quantiles = [0.5, 0.7, 0.9, 0.95, 0.99]
pipeline = BenchmarkPipeline(data_config)

total unseen skus:  7634


In [9]:
results_lightning_std = pipeline.run_experiment(
    sku_tuples=sku_tuples,
    modeling_strategy=ModelingStrategy.INDIVIDUAL,
    model_type="xgboost_quantile",
    quantile_alphas=quantiles,
    hyperparameters = {
        "eta": 0.05,
        "max_depth": 8,
        "min_child_weight": 20,
        "subsample": 0.8,
        "colsample_bytree": 0.7,
        "gamma": 1.0,   
        "lambda": 10.0,
        "alpha": 1.0,
        "tree_method": "hist",
        "num_boost_round": 100,
        "seed": 42
},
    experiment_name="xgb_quantile_test",
    evaluate_on_test=True
)

100%|██████████| 3/3 [00:00<00:00, 13.30it/s]
Training models: 100%|██████████| 3/3 [00:01<00:00,  2.13it/s]


In [20]:
sample_result_1 = results_lightning_std.training_results[0]
print(f"Model type: {sample_result_1.model_type}")
print(f"Strategy: {sample_result_1.modeling_strategy.value}")
print(f"SKU tuples: {sample_result_1.sku_tuples}")
print(f"Quantile level: {sample_result_1.quantile_level}")
print(f"quantile_score: {sample_result_1.performance_metrics.get('quantile_score', 'N/A')}")
print(f"predictions: {sample_result_1.performance_metrics.get('predictions', 'N/A')}")
print("overall stats: ----------------------------------------------------")
print(f"number of models trained: {len(results_lightning_std.training_results)}")

Model type: xgboost_quantile
Strategy: individual
SKU tuples: [(1912, 7)]
Quantile level: 0.5
quantile_score: N/A
predictions: [-0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0.
 -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0.
 -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0.
 -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0.
 -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0.
 -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0.
 -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0.
 -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0.]
overall stats: ----------------------------------------------------
number of models trained: 15


In [21]:
sample_result_1.performance_metrics

{'quantile_losses': array([0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. , 0. , 1. , 0. ,
        0. , 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ,
        0. , 0.5, 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 1. , 0. , 1. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0.5, 0. , 0. , 0. ,
        0. , 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. ,
        0. , 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]),
 'predictions': array([-0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0

In [9]:
release_manager = ReleaseManager()
output_dir = Path("./xgb_releases_2")
release_path = release_manager.create_complete_release(
    experiment_results=results_lightning_std,  # Your ExperimentResults from pipeline
    base_output_dir=output_dir
  )

# Check Data in pipeline and prototype for consistency

In [6]:
sku = (715, 7)
loader = DataLoader(data_config)
loader.load_data(lazy=False)
dataset = loader.prepare_modeling_dataset([sku], ModelingStrategy.INDIVIDUAL)

In [7]:
loader = DataLoader(data_config)
# Don't call load_data() at all - let prepare_modeling_dataset do it
dataset = loader.prepare_modeling_dataset([sku], ModelingStrategy.INDIVIDUAL)

In [8]:
# Access data
X_train_pipeline = dataset.X_train
y_train_pipeline = dataset.y_train
X_test_pipeline = dataset.X_test
y_test_pipeline = dataset.y_test

In [9]:
X_train_pipeline.shape

(741, 132)

In [34]:

# Data loading
mapping_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/feature_mapping_train.pkl"
features_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/processed/train_data_features.feather"
target_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/train_data/train_data_target.feather"

with open(mapping_path, 'rb') as f:
    mapping = pickle.load(f)

train_features = pl.read_ipc(features_path)
train_target = pl.read_ipc(target_path)

In [35]:
X = (train_features
     .lazy()
     .filter(pl.col('productID') == 715)
     .filter(pl.col('storeID') == 7)
     .drop_nulls()
     .sort("date","skuID")
     .collect())

X_bdIDs = (X
           .lazy()
           .select('bdID')
           .unique()
           .collect()
           .to_numpy()
           .flatten())

y = (train_target
     .lazy()
     .filter(pl.col("bdID").is_in(X_bdIDs))
     .join(
         X.lazy().select("bdID","skuID"), 
         on="bdID", 
         how="left")
     .sort("date","skuID")
     .collect())

In [36]:
split_date = pl.date(2016, 1, 1)
X_train = X.filter(pl.col("date") < split_date)
X_test = X.filter(pl.col("date") >= split_date)
y_train = y.filter(pl.col("date") < split_date)
y_test = y.filter(pl.col("date") >= split_date)

In [37]:
meta_cols = ['frequency',
 'idx',
 'bdID',
 'base_date',
 'date',
 'dateID',
 'skuID',
 'productID',
 'storeID',
 'companyID',
 'is_daily',
 'missing_value',
 'not_for_sale',
 'name',
 'name-2']

In [38]:
# Explicitly set dtype to float64 for all arrays
X_train = (X_train
            .sort("date","skuID")
            .select(pl.selectors.exclude(meta_cols)))

y_train = (y_train
            .sort("date","skuID")
            .select("target")
            .to_numpy())  # Also flatten to ensure 1D

X_test = (X_test
        .sort("date","skuID")
        .select(pl.selectors.exclude(meta_cols)))

y_test = (y_test
        .sort("date","skuID")
        .select("target"))

In [None]:
#wrap them into print statements to see the output
print("Train set shape test:   ", X_train.shape == X_train_pipeline.shape )
print("Train label shape test: ", y_train.shape == y_train_pipeline.shape )
print(30*"-")
print("Test set shape test:  ", X_test.shape == X_test_pipeline.shape )
print("Test label shape test:", y_test.shape == y_test_pipeline.shape ) 

Train set shape test:    True
Train label shape test:  True
------------------------------
Test set shape test:   True
Test label shape test: True


In [None]:
#test if prototype and pipeline data are the same
print("Training set check:    ", (X_train == X_train_pipeline).to_numpy().all())
print("Training label check:  ",(y_train == y_train_pipeline).all())
print(30*"-")
print("Test set check:   ", (X_test == X_test_pipeline).to_numpy().all())
print("Test label check: ",(y_test == y_test).to_numpy().all())  

Training set check:     True
Training label check:   True
------------------------------
Test set check:    True
Test label check:  True


# Test Hyperparameter mode

## Test Tune Run

In [3]:
data_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/processed/train_data_features.feather"
df_clean = pl.read_ipc(data_path)

sku_tuples = [(d['productID'], d['storeID']) for d in df_clean.select(pl.col("productID"), pl.col("storeID")).unique().to_dicts()]
len(sku_tuples)

7634

In [4]:
data_config = DataConfig(
    mapping_path = 'data/feature_mapping_train.pkl',
    features_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/processed/train_data_features.feather",
    target_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/train_data/train_data_target.feather",
    split_date="2016-01-01",
)

pipeline = BenchmarkPipeline(data_config)

In [None]:

# Step 1: Tune hyperparameters
tune_result = pipeline.run_experiment(
    sku_tuples=sku_tuples,
    modeling_strategy=ModelingStrategy.COMBINED,
    model_type="lightning_quantile",
    quantile_alphas=[0.5],
    mode="hp_tune",
    tune_on=10,
    tuning_config={'n_trials': 10, 'n_folds': 5}
)

Tuning dataset has 38247 training samples and 132 features.
Prepared training data with 38247 samples and 132 features.


In [15]:
tune_result.best_score

0.4355871870782814

In [16]:
tune_result.best_params

{'eta': 0.2251586017238223,
 'max_depth': 10,
 'min_child_weight': 26,
 'subsample': 0.9639446007829685,
 'colsample_bytree': 0.6507755877543546,
 'gamma': 0.6139963456942783,
 'reg_alpha': 6.768595449859619,
 'reg_lambda': 4.832153974114423,
 'n_estimators': 293}

## Run With tuned Params (done on server)

In [70]:
#Parameter from runs over 500 and over 1000 SKUs  each on 100 trials and 5 folds
hp_random = {
        "eta": 0.05,
        "max_depth": 8,
        "min_child_weight": 20,
        "subsample": 0.8,
        "colsample_bytree": 0.7,
        "gamma": 1.0,   
        "lambda": 10.0,
        "alpha": 1.0,
        "tree_method": "hist",
        "num_boost_round": 100,
        "seed": 42
}
hp_100 = {'eta': 0.299573707733717,
        'max_depth': 8,
        'min_child_weight': 19,
        'subsample': 0.696340395708422,
        'colsample_bytree': 0.6220507570163917,
        'gamma': 1.0789384910762259,
        'reg_alpha': 9.755231227570237,
        'reg_lambda': 8.295423481524228, 
        'n_estimators': 297}

hp_500 = {'eta': 0.2251586017238223,
        'max_depth': 10,
        'min_child_weight': 26,
        'subsample': 0.9639446007829685,
        'colsample_bytree': 0.6507755877543546,
        'gamma': 0.6139963456942783,
        'reg_alpha': 6.768595449859619,
        'reg_lambda': 4.832153974114423,
        'n_estimators': 293}

hp_1000 = {'eta': 0.2657057478526166, 
        'max_depth': 9,
        'min_child_weight': 19,
        'subsample': 0.6897467091557125, 
        'colsample_bytree': 0.9965497359024938,
        'gamma': 1.064228070424531, 
        'reg_alpha': 0.16585154768227728,
        'reg_lambda': 0.17000025072317992,
        'n_estimators': 297} 

In [71]:
hp_list = [hp_random,hp_100,hp_500,hp_1000]
quantiles = [0.5, 0.7, 0.9, 0.95, 0.99]

In [72]:
data_config = DataConfig(
    mapping_path = 'data/feature_mapping_train.pkl',
    features_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/processed/train_data_features.feather",
    target_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/train_data/train_data_target.feather",
    split_date="2016-01-01",
)

In [None]:
#get list of trainable SKUs
data_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/processed/train_data_features.feather"
df_clean = pl.read_ipc(data_path)

sku_tuples_all = [(d['productID'], d['storeID']) for d in df_clean.select(pl.col("productID"), pl.col("storeID")).unique().to_dicts()]
print("total unseen skus: ", len(sku_tuples_all))

sku_exclude = (df_clean
 .group_by("storeID","productID")
 .agg(pl.col("date").first())
 .filter(pl.col("date") >= date(2016,1,1))
 .select("productID","storeID")
 )

sku_exclude = [(d['productID'], d['storeID']) for d in sku_exclude.select(pl.col("productID"), pl.col("storeID")).unique().to_dicts()]

sku_tuples_complete =  [sku for sku in sku_tuples_all if sku not in sku_exclude]
print("total unseen skus available for training: ", len(sku_tuples_complete))

In [73]:
experiments= []
for i, hp in enumerate(hp_list, start=1):
    pipeline = BenchmarkPipeline(data_config)
    
    results = pipeline.run_experiment(
        sku_tuples= sku_tuples_complete[:100],
        modeling_strategy=ModelingStrategy.INDIVIDUAL,
        model_type="xgboost_quantile",
        quantile_alphas=quantiles,
        hyperparameters = hp,
        experiment_name=f"xgb_quantile_{i}",
        evaluate_on_test=True
    )
    release_manager = ReleaseManager()
    output_dir = Path("./xgb_releases_hp_full")
    release_path = release_manager.create_complete_release(
    experiment_results=results,  
    base_output_dir=output_dir
  )
    experiments.append([hp,results])
    print(f"Completed experiment {i}")


    

100%|██████████| 100/100 [00:06<00:00, 16.36it/s]
Training models: 100%|██████████| 100/100 [00:48<00:00,  2.05it/s]


Completed experiment 1


100%|██████████| 100/100 [00:05<00:00, 19.09it/s]
Training models: 100%|██████████| 100/100 [01:40<00:00,  1.00s/it]


Completed experiment 2


100%|██████████| 100/100 [00:05<00:00, 19.56it/s]
Training models: 100%|██████████| 100/100 [01:49<00:00,  1.09s/it]


Completed experiment 3


100%|██████████| 100/100 [00:05<00:00, 19.16it/s]
Training models: 100%|██████████| 100/100 [01:58<00:00,  1.19s/it]


Completed experiment 4


In [None]:
#give results quick hp_name for later processing
hp_type = ["0","100","500","1000"]
for i,experiment in enumerate(experiments, start=0):
    experiment.append(hp_type[i])

#combine results into one dataframe for easier processing
results_dfs = []
for experiment in experiments:

    results_df = pl.DataFrame({
        "productID": [result.sku_tuples[0][0] for result in experiment[1].training_results],
        "storeID": [result.sku_tuples[0][1] for result in experiment[1].training_results],
        "quantile_level": [result.quantile_level for result in experiment[1].training_results],
        "mean_quantile_loss" : [result.performance_metrics.get('mean_quantile_loss') for result in experiment[1].training_results],
        "hp_type" : experiment[2]
    })
    results_dfs.append(results_df)


experiment_results = pl.concat(results_dfs, how="vertical")

#optinally save results as csv
#experiment_results.write_csv("my_experiment.csv", separator=",", include_header=True)


## Analyze and viz the runs on different HPs

### Analyse first

In [7]:
#read int he experiment data from csv or use the one just created above
experiment_results = pl.read_csv("xgb_quantile_hp_all.csv", separator=",") 
print("shape: ", experiment_results.shape) 
print(experiment_results.head())

shape:  (152640, 5)
shape: (5, 5)
┌───────────┬─────────┬────────────────┬────────────────────┬─────────┐
│ productID ┆ storeID ┆ quantile_level ┆ mean_quantile_loss ┆ hp_type │
│ ---       ┆ ---     ┆ ---            ┆ ---                ┆ ---     │
│ i64       ┆ i64     ┆ f64            ┆ f64                ┆ i64     │
╞═══════════╪═════════╪════════════════╪════════════════════╪═════════╡
│ 1889      ┆ 7       ┆ 0.5            ┆ 0.055944           ┆ 0       │
│ 1889      ┆ 7       ┆ 0.7            ┆ 0.092308           ┆ 0       │
│ 1889      ┆ 7       ┆ 0.9            ┆ 0.069231           ┆ 0       │
│ 1889      ┆ 7       ┆ 0.95           ┆ 0.051399           ┆ 0       │
│ 1889      ┆ 7       ┆ 0.99           ┆ 0.015874           ┆ 0       │
└───────────┴─────────┴────────────────┴────────────────────┴─────────┘


In [8]:
def df_to_markdown(df):
    """Convert Polars DataFrame to markdown table"""
    # Get column names
    cols = df.columns
    
    # Create header
    header = "| " + " | ".join(cols) + " |"
    separator = "| " + " | ".join(["---"] * len(cols)) + " |"
    
    # Create rows
    rows = []
    for row in df.iter_rows():
        row_str = "| " + " | ".join(str(val) for val in row) + " |"
        rows.append(row_str)
    
    # Combine all parts
    markdown = "\n".join([header, separator] + rows)
    return markdown

In [9]:
df_agg = (experiment_results
 .group_by("hp_type","quantile_level")
 .agg(
    pl.col("mean_quantile_loss").mean().alias("avg"),
    pl.col("mean_quantile_loss").std().alias("std")
))

# Pivot the data
df = (df_agg
 .with_columns(
    (pl.col("avg").round(3).cast(str) + " ± " + pl.col("std").round(3).cast(str)).alias("value")
)
.pivot(
    index="hp_type",
    on="quantile_level",
    values="value"
)
.with_columns(
    pl.col("hp_type").cast(pl.Int32).alias("sort_order")
)
.sort("sort_order")
.drop("sort_order")
.select(["hp_type", "0.5", "0.7", "0.9", "0.95", "0.99"]))

# Find minimum avg value per quantile and add bold formatting
df_min = (df_agg
 .group_by("quantile_level")
 .agg(pl.col("avg").min().alias("min_avg")))

# Join and format with bold for minimum
df_formatted = (df_agg
 .join(df_min, on="quantile_level")
 .with_columns(
    pl.when(pl.col("avg") == pl.col("min_avg"))
    .then("**" + pl.col("avg").round(3).cast(str) + " ± " + pl.col("std").round(3).cast(str) + "**")
    .otherwise(pl.col("avg").round(3).cast(str) + " ± " + pl.col("std").round(3).cast(str))
    .alias("value")
)
.pivot(
    index="hp_type",
    on="quantile_level",
    values="value"
)
.with_columns(
    pl.col("hp_type").cast(pl.Int32).alias("sort_order")
)
.sort("sort_order")
.drop("sort_order")
.select(["hp_type", "0.5", "0.7", "0.9", "0.95", "0.99"]))

markdown_table = df_to_markdown(df_formatted)
display(Markdown(markdown_table))

| hp_type | 0.5 | 0.7 | 0.9 | 0.95 | 0.99 |
| --- | --- | --- | --- | --- | --- |
| 0 | 0.574 ± 1.294 | 0.682 ± 1.719 | 0.666 ± 2.109 | 0.637 ± 2.22 | 0.604 ± 2.31 |
| 100 | **0.477 ± 0.616** | **0.498 ± 0.596** | 0.324 ± 0.446 | 0.226 ± 0.378 | 0.108 ± 0.318 |
| 500 | 0.48 ± 0.667 | 0.507 ± 0.647 | 0.331 ± 0.51 | 0.233 ± 0.448 | 0.119 ± 0.396 |
| 1000 | 0.494 ± 0.607 | 0.525 ± 0.584 | **0.311 ± 0.421** | **0.213 ± 0.354** | **0.094 ± 0.293** |

### Viz 

In [10]:
# Convert quantile_level to string for proper categorical handling
results_df_plot = experiment_results.with_columns(
    pl.col('quantile_level').cast(pl.Utf8).alias('quantile_level'),
    pl.col('hp_type').cast(str).alias('hp_type')
)

ggplot(results_df_plot, aes(x='quantile_level', y='mean_quantile_loss', fill='hp_type')) + \
    geom_boxplot(alpha=0.7, outlier_size=0.5, position='dodge') + \
    labs(
        title='Mean Quantile Loss Distribution by Quantile Level and HP Type',
        x='Quantile Level',
        y='Mean Quantile Loss',
        fill='HP Type'
    ) + \
    theme_minimal() + \
    theme(plot_title=element_text(size=14, face='bold')) + \
    ggsize(2000, 750)

In [43]:
#Log scale with y-axis limits
results_df_plot = experiment_results.with_columns(
    pl.col('quantile_level').cast(pl.Utf8).alias('quantile_level'),
    pl.col('hp_type').cast(str).alias('hp_type')
)

ggplot(results_df_plot, aes(x='quantile_level', y='mean_quantile_loss', fill='hp_type')) + \
    geom_boxplot(alpha=0.7, outlier_size=0.5, position='dodge') + \
    scale_y_log10() + \
    coord_cartesian(ylim=[0.005, None]) + \
    labs(
        title='Mean Quantile Loss Distribution by Quantile Level and HP Type (Log Scale)',
        x='Quantile Level',
        y='Mean Quantile Loss (log scale)',
        fill='HP Type'
    ) + \
    theme_minimal() + \
    theme(plot_title=element_text(size=14, face='bold')) + \
    ggsize(1700, 800)

In [45]:
# Exclude multiple hp_types
results_df_plot = experiment_results.with_columns(
    pl.col('quantile_level').cast(pl.Utf8).alias('quantile_level'),
    pl.col('hp_type').cast(str).alias('hp_type')
).filter(
    pl.col('hp_type').is_in(["100", "500", "1000"])  # Only include these
)

ggplot(results_df_plot, aes(x='quantile_level', y='mean_quantile_loss', fill='hp_type')) + \
    geom_boxplot(alpha=0.7, outlier_size=0.5, position='dodge') + \
    scale_y_log10() + \
    labs(
        title='Mean Quantile Loss Distribution by Quantile Level and HP Type',
        x='Quantile Level',
        y='Mean Quantile Loss',
        fill='HP Type'
    ) + \
    theme_minimal() + \
    theme(plot_title=element_text(size=14, face='bold')) + \
    ggsize(2000, 750)

In [12]:
# Exclude multiple hp_types
results_df_plot = experiment_results.with_columns(
    pl.col('quantile_level').cast(pl.Utf8).alias('quantile_level'),
    pl.col('hp_type').cast(str).alias('hp_type')
).filter(
    pl.col('hp_type').is_in(["100"])  # Only include these
)

ggplot(results_df_plot, aes(x='quantile_level', y='mean_quantile_loss')) + \
    geom_boxplot(alpha=0.7, outlier_size=0.5, position='dodge') + \
    scale_y_log10() + \
    labs(
        title='Mean Quantile Loss Distribution by Quantile Level for HP Type 1000',
        x='Quantile Level',
        y='Mean Quantile Loss',
        fill='HP Type'
    ) + \
    theme_minimal() + \
    theme(plot_title=element_text(size=14, face='bold')) + \
    ggsize(1700, 750)

## Get number of observations,meandemand for train/test per sku link xgb predictions to this.

In [13]:
#check the outliers:
results_df_plot

productID,storeID,quantile_level,mean_quantile_loss,hp_type
i64,i64,str,f64,str
1889,7,"""0.5""",0.136364,"""100"""
1889,7,"""0.7""",0.123776,"""100"""
1889,7,"""0.9""",0.095804,"""100"""
1889,7,"""0.95""",0.062587,"""100"""
1889,7,"""0.99""",0.021469,"""100"""
…,…,…,…,…
2923,10,"""0.5""",0.164336,"""100"""
2923,10,"""0.7""",0.236364,"""100"""
2923,10,"""0.9""",0.102098,"""100"""
2923,10,"""0.95""",0.083566,"""100"""


In [25]:
top_outliers = (experiment_results
 .with_columns(
    pl.col('quantile_level').cast(pl.Utf8).alias('quantile_level'),
    pl.col('hp_type').cast(str).alias('hp_type')
)
.sort(['quantile_level', 'hp_type', 'mean_quantile_loss'], descending=[False, False, True])
.group_by(['quantile_level', 'hp_type'])
.head(25)
.filter(pl.col("hp_type") != "0" )
.sort('mean_quantile_loss', descending=True))


top_outliers

quantile_level,hp_type,productID,storeID,mean_quantile_loss
str,str,i64,i64,f64
"""0.9""","""500""",688,9,18.496503
"""0.7""","""500""",688,9,18.406294
"""0.95""","""500""",688,9,18.046503
"""0.99""","""500""",688,9,17.554895
"""0.7""","""100""",688,9,17.444755
…,…,…,…,…
"""0.99""","""1000""",2845,1,1.934685
"""0.99""","""1000""",1214,7,1.86972
"""0.99""","""1000""",999,1,1.866014
"""0.99""","""1000""",311,1,1.777972


In [30]:
split_date = date(2016, 1, 1)
train_summary = (df_clean
 .group_by("storeID", "productID")
 .agg([
     pl.col("date").min().alias("date_first"),
     pl.col("date").filter(pl.col("date") < split_date).count().alias("obs_train"),
     pl.col("date").filter(pl.col("date") >= split_date).count().alias("obs_test"),
     pl.col("target_lag_1").filter(pl.col("date") < split_date).mean().alias("mean_train"),
     pl.col("target_lag_1").filter(pl.col("date") >= split_date).mean().alias("mean_test")
 ])
 .sort("storeID", "productID"))

In [31]:
train_summary.filter(pl.col("date_first") >= split_date)

storeID,productID,date_first,obs_train,obs_test,mean_train,mean_test
i64,i64,date,u32,u32,f64,f64
1,2258,2016-02-13,0,100,,1.23
1,2991,2016-01-16,0,128,,0.546875


In [56]:
train_summary.filter(pl.col("obs_train") < 572).sort("obs_train", descending=False)

storeID,productID,date_first,obs_train,obs_test,mean_train,mean_test
i64,i64,date,u32,u32,f64,f64
1,2258,2016-02-13,0,100,,1.23
1,2991,2016-01-16,0,128,,0.546875
1,1020,2015-12-26,6,143,0.0,0.832168
1,1790,2015-12-26,6,143,0.0,0.636364
2,1723,2015-12-26,6,143,0.0,0.426573
…,…,…,…,…,…,…
7,2300,2014-06-14,566,143,0.911661,1.538462
7,2338,2014-06-14,566,143,1.25265,1.636364
7,2993,2014-06-14,566,143,1.667845,3.048951
8,2338,2014-06-14,566,143,1.097173,1.608392


In [None]:
(top_outliers.join(
    train_summary,
    on=["storeID", "productID"],
    how="left")
.sort("mean_quantile_loss", descending=True))



quantile_level,hp_type,productID,storeID,mean_quantile_loss,date_first,obs_train,obs_test,mean_train,mean_test
str,str,i64,i64,f64,date,u32,u32,f64,f64
"""0.9""","""500""",688,9,18.496503,2012-12-29,1098,143,20.288707,40.923077
"""0.7""","""500""",688,9,18.406294,2012-12-29,1098,143,20.288707,40.923077
"""0.95""","""500""",688,9,18.046503,2012-12-29,1098,143,20.288707,40.923077
"""0.99""","""500""",688,9,17.554895,2012-12-29,1098,143,20.288707,40.923077
"""0.7""","""100""",688,9,17.444755,2012-12-29,1098,143,20.288707,40.923077
…,…,…,…,…,…,…,…,…,…
"""0.99""","""1000""",2845,1,1.934685,2013-06-22,923,143,4.488624,6.727273
"""0.99""","""1000""",1214,7,1.86972,2012-03-10,1392,143,9.95977,25.13986
"""0.99""","""1000""",999,1,1.866014,2011-05-28,1679,143,7.233472,15.020979
"""0.99""","""1000""",311,1,1.777972,2012-07-07,1273,143,6.449332,7.167832


In [52]:
(top_outliers.join(
    train_summary,
    on=["storeID", "productID"],
    how="left")
.sort("mean_quantile_loss", descending=True)
.with_columns(
    (((pl.col("mean_test")/pl.col("mean_train"))-1)*100 ).alias("rel_mean_diff"))
).sort("obs_train",descending=False)


quantile_level,hp_type,productID,storeID,mean_quantile_loss,date_first,obs_train,obs_test,mean_train,mean_test,rel_mean_diff
str,str,i64,i64,f64,date,u32,u32,f64,f64,f64
"""0.9""","""500""",1671,1,3.397902,2013-09-14,839,143,4.307509,7.685315,78.416686
"""0.9""","""100""",1671,1,3.172727,2013-09-14,839,143,4.307509,7.685315,78.416686
"""0.9""","""1000""",1671,1,3.078322,2013-09-14,839,143,4.307509,7.685315,78.416686
"""0.95""","""100""",1671,1,2.782867,2013-09-14,839,143,4.307509,7.685315,78.416686
"""0.95""","""1000""",1671,1,2.718531,2013-09-14,839,143,4.307509,7.685315,78.416686
…,…,…,…,…,…,…,…,…,…,…
"""0.99""","""100""",466,7,2.153706,2011-01-29,1798,143,9.37507,28.601399,205.079321
"""0.99""","""1000""",2427,7,2.065734,2011-01-29,1798,143,1.851976,4.342657,134.487838
"""0.99""","""1000""",890,1,2.045455,2011-01-29,1798,143,9.761269,11.104895,13.764874
"""0.99""","""1000""",1080,1,2.013427,2011-01-29,1798,143,14.012799,24.265734,73.168359


## Baseline

In [None]:
data_config = DataConfig(
    mapping_path = 'data/feature_mapping_train.pkl',
    features_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/processed/train_data_features.feather",
    target_path = "/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/data/db_snapshot_offsite/train_data/train_data/train_data_target.feather",
    split_date="2016-01-01",
)



quantiles = [0.5, 0.7, 0.9, 0.95, 0.99]

pipeline = BenchmarkPipeline(data_config)

In [19]:
results = pipeline.run_experiment(
    sku_tuples= sku_tuples_complete,
    modeling_strategy=ModelingStrategy.INDIVIDUAL,
    model_type="empirical_quantile",
    quantile_alphas=quantiles,
    hyperparameters = {},
    experiment_name="empirical_quantile_test",
    evaluate_on_test=True
    )

100%|██████████| 7632/7632 [07:09<00:00, 17.78it/s]
Training models: 100%|██████████| 7632/7632 [00:15<00:00, 492.81it/s]


In [33]:
results_df = pl.DataFrame({
    "productID": [result.sku_tuples[0][0] for result in results.training_results],
    "storeID": [result.sku_tuples[0][1] for result in results.training_results],
    "quantile_level": [result.quantile_level for result in results.training_results],
    "mean_quantile_loss": [result.performance_metrics.get('mean_quantile_loss') for result in results.training_results],
    "model_type": "empirical_quantile" 
})

results_df.write_csv("empirical_quantile_all.csv", separator=",", include_header=True)

### Load Results

In [50]:
experiment_xgb = pl.read_csv("/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/Results/XGB/xgb_quantile_hp_all_magnus_params.csv", separator=",",schema_overrides={"hp_type": pl.Utf8}) 
experiment_baseline = pl.read_csv("/Users/ivn/Documents/PhD/Transformer Research/Code/Benchmarking/Results/BL/empirical_quantile_all.csv", separator=",") 

experiment_xgb = (experiment_xgb
                  .filter(pl.col("hp_type") == "100_new_2")
                  .with_columns(
                      pl.col('quantile_level').cast(pl.Utf8).alias('quantile_level'),
                      pl.col('hp_type').cast(str).alias('hp_type'))
                  .rename({"hp_type": "model_type"})
                  .with_columns(pl.lit("xgb").alias("model_type"))) 

experiment_baseline = (experiment_baseline
                       .with_columns(
                           pl.col('quantile_level').cast(pl.Utf8).alias('quantile_level'))
                       )


In [51]:
experiment_xgb.sort(["productID", "storeID", "quantile_level"])

productID,storeID,quantile_level,mean_quantile_loss,model_type
i64,i64,str,f64,str
1,1,"""0.5""",0.255245,"""xgb"""
1,1,"""0.7""",0.305594,"""xgb"""
1,1,"""0.9""",0.154545,"""xgb"""
1,1,"""0.95""",0.074476,"""xgb"""
1,1,"""0.99""",0.024895,"""xgb"""
…,…,…,…,…
3049,9,"""0.5""",0.601399,"""xgb"""
3049,9,"""0.7""",0.605594,"""xgb"""
3049,9,"""0.9""",0.395105,"""xgb"""
3049,9,"""0.95""",0.235664,"""xgb"""


In [52]:
experiment_baseline.sort(["productID", "storeID", "quantile_level"])

productID,storeID,quantile_level,mean_quantile_loss,model_type
i64,i64,str,f64,str
1,1,"""0.5""",0.255245,"""empirical_quantile"""
1,1,"""0.7""",0.251748,"""empirical_quantile"""
1,1,"""0.9""",0.153846,"""empirical_quantile"""
1,1,"""0.95""",0.074476,"""empirical_quantile"""
1,1,"""0.99""",0.024895,"""empirical_quantile"""
…,…,…,…,…
3049,9,"""0.5""",0.660839,"""empirical_quantile"""
3049,9,"""0.7""",0.669231,"""empirical_quantile"""
3049,9,"""0.9""",0.446154,"""empirical_quantile"""
3049,9,"""0.95""",0.366084,"""empirical_quantile"""


In [53]:
baseline_check = experiment_xgb.join(
    experiment_baseline.select(pl.exclude("model_type")),
    on=["productID", "storeID", "quantile_level"],
    how="inner",
    suffix="_baseline"
)

In [54]:
baseline_check.sort(["productID", "storeID", "quantile_level"]).with_columns(
    (pl.col("mean_quantile_loss") <= pl.col("mean_quantile_loss_baseline")).alias("xgb_better")
).filter(pl.col("xgb_better") == False)

productID,storeID,quantile_level,mean_quantile_loss,model_type,mean_quantile_loss_baseline,xgb_better
i64,i64,str,f64,str,f64,bool
1,1,"""0.7""",0.305594,"""xgb""",0.251748,false
1,1,"""0.9""",0.154545,"""xgb""",0.153846,false
3,7,"""0.5""",0.384615,"""xgb""",0.377622,false
3,7,"""0.9""",0.227972,"""xgb""",0.225874,false
3,7,"""0.99""",0.047692,"""xgb""",0.044685,false
…,…,…,…,…,…,…
3045,7,"""0.7""",0.32028,"""xgb""",0.316783,false
3048,1,"""0.5""",0.335664,"""xgb""",0.321678,false
3048,1,"""0.7""",0.304895,"""xgb""",0.274825,false
3048,7,"""0.9""",0.209091,"""xgb""",0.207692,false


In [55]:
plot_df = (baseline_check
           .sort(["productID", "storeID", "quantile_level"])
           .with_columns((pl.col("mean_quantile_loss")-pl.col("mean_quantile_loss_baseline")).alias("abs_diff"))
           
)

In [56]:
plot_df = (baseline_check
    .sort(["productID", "storeID", "quantile_level"])
    .with_columns([
        (pl.col("mean_quantile_loss") - pl.col("mean_quantile_loss_baseline")).alias("abs_diff"),
        # Relative percentage change with special cases
        pl.when(
            (pl.col("mean_quantile_loss") == 0) & (pl.col("mean_quantile_loss_baseline") == 0)
        )
        .then(0)  # 0/0 = 0% change
        .when(pl.col("mean_quantile_loss_baseline") == 0)
        .then(100)  # x/0 = 100% change
        .otherwise(
            ((pl.col("mean_quantile_loss") / pl.col("mean_quantile_loss_baseline")) - 1) * 100
        )
        .alias("rel_diff_pct")
    ])
)

In [61]:
plot_df.sort("abs_diff", descending=True)

productID,storeID,quantile_level,mean_quantile_loss,model_type,mean_quantile_loss_baseline,abs_diff,rel_diff_pct
i64,i64,str,f64,str,f64,f64,f64
379,6,"""0.7""",5.534965,"""xgb""",2.431469,3.103497,127.638769
2659,6,"""0.9""",3.503497,"""xgb""",1.536364,1.967133,128.038234
2659,6,"""0.95""",2.618881,"""xgb""",0.962587,1.656294,172.066836
2427,7,"""0.9""",2.475524,"""xgb""",0.875524,1.6,182.747604
1929,7,"""0.95""",1.960839,"""xgb""",0.490909,1.46993,299.430199
…,…,…,…,…,…,…,…
1194,1,"""0.7""",1.531469,"""xgb""",9.744755,-8.213287,-84.284177
2232,1,"""0.5""",2.311189,"""xgb""",11.748252,-9.437063,-80.327381
2284,9,"""0.95""",2.023077,"""xgb""",11.890559,-9.867483,-82.985856
2284,9,"""0.7""",4.090909,"""xgb""",14.881119,-10.79021,-72.509398


In [62]:
# Calculate mean per quantile level
plot_df_filtered = plot_df.filter(pl.col("rel_diff_pct").is_between(-100,100))
mean_per_quantile = (plot_df_filtered
    .group_by("quantile_level")
    .agg(pl.col("rel_diff_pct").mean().alias("mean_rel_diff"))
    .with_columns(
        pl.format("{}%", pl.col("mean_rel_diff").round(2)).alias("label")
    )
)

# Create the plot with mean points per quantile
ggplot(plot_df_filtered, aes(x='quantile_level', y='rel_diff_pct')) + \
    geom_boxplot(alpha=0.7, outlier_size=1) + \
    geom_point(data=mean_per_quantile, 
               mapping=aes(x='quantile_level', y='mean_rel_diff'), 
               color='red', size=4, shape=19) + \
    geom_text(data=mean_per_quantile,
              mapping=aes(x='quantile_level', y='mean_rel_diff', label='label'),
              color='red', size=9, vjust=2, hjust=1) + \
    geom_hline(yintercept=0, linetype='dashed', color='grey', size=1) + \
    labs(
        title='Relative Difference (XGB/Baseline) by Quantile Level',
        x='Quantile Level',
        y='Relative Difference (%)'
    ) + \
    theme_minimal() + \
    theme(plot_title=element_text(size=14, face='bold')) + \
    ggsize(1500, 600)

In [60]:
# Calculate mean per quantile level
plot_df_filtered = plot_df.filter(pl.col("rel_diff_pct").is_between(-100,100))
overall_mean_value = plot_df_filtered.select(pl.col("rel_diff_pct").mean())[0, 0]

# Create single-row DataFrame for the point
overall_mean_df = pl.DataFrame({
    'x': [0],  # Center position
    'y': [overall_mean_value]
})

# Create the plot with mean points per quantile
ggplot(plot_df_filtered, aes(y='rel_diff_pct')) + \
    geom_boxplot(alpha=0.7, outlier_size=1) + \
    geom_point(data=overall_mean_df, 
               mapping=aes(x='x', y='y'), 
               color='red', size=4) + \
    geom_hline(yintercept=0, linetype='dashed', color='grey', size=1) + \
    labs(
        title='Relative Difference (XGB/Baseline) by Quantile Level',
        x='Quantile Level',
        y='Relative Difference (%)'
    ) + \
    theme_minimal() + \
    theme(plot_title=element_text(size=14, face='bold')) + \
    ggsize(700, 600)

In [24]:
overall_mean_value

-4.0140928552202375

In [27]:
print("Overall mean relative difference (%): ", overall_mean_value)

Overall mean relative difference (%):  -4.0140928552202375


In [None]:
"xgboost_quantile_q0.7_tuned10_trials10_20251106_121746.csv"

In [None]:
from src.utils import save_hp_tuning_results