In [4]:
import sys
from pathlib import Path
import numpy as np

# Put the notebook's working directory at the front of the import path so the
# `from src import ...` that follows can resolve against it.
current_dir = Path.cwd()
if not any(entry == str(current_dir) for entry in sys.path):
    sys.path.insert(0, str(current_dir))

from src import (
    DataConfig, DataLoader,ModelingStrategy, BenchmarkPipeline, create_config
)

# Seed NumPy's global RNG so NumPy-based randomness in this session is repeatable.
# NOTE(review): only NumPy is seeded here; whether the individual models are
# seeded through their `random_state` hyperparameter is not visible in this
# notebook — confirm against the pipeline code.
np.random.seed(42)

In [5]:
# SKU selection reused by the experiment cells further down.
# NOTE(review): the meaning of the two integers in each tuple is defined by the
# project code and is not shown in this notebook.
sku_tuples = [(81054, 1334)]

# File locations handed to both the data loader and the benchmark pipeline.
data_config = DataConfig(
    mapping_path='data/feature_mapping_train.pkl',
    features_path='data/processed/train_data_features.feather',
    target_path='data/train_data_target.feather',
)
pipeline = BenchmarkPipeline(data_config)

In [6]:
# Build the modeling dataset for the same SKU selection the experiment cells
# use, then report the shapes of the training features and targets.
data_loader = DataLoader(data_config)
dataset = data_loader.prepare_modeling_dataset(
    # Reuse the shared `sku_tuples` instead of re-typing the literal, so the
    # inspected data cannot drift away from what the experiments train on.
    sku_tuples=sku_tuples,
    modeling_strategy=ModelingStrategy.INDIVIDUAL,
)
print(f"Training data shape: {dataset.X_train.shape}")
print(f"Training target shape: {dataset.y_train.shape}")


Training data shape: (1552, 140)
Training target shape: (1552, 2)


In [5]:
# Display the column names of the training feature table.
# NOTE(review): this cell's execution count is lower than that of the cell
# which builds `dataset`, so the listing saved with the notebook may come from
# an earlier run — re-run on a fresh kernel to confirm.
dataset.X_train.columns

['bdID',
 'date',
 'feature_0000',
 'feature_0001',
 'feature_0002',
 'feature_0003',
 'feature_0004',
 'feature_0005',
 'feature_0006',
 'feature_0007',
 'feature_0008',
 'feature_0009',
 'feature_0010',
 'feature_0011',
 'feature_0012',
 'feature_0013',
 'feature_0014',
 'feature_0015',
 'feature_0016',
 'feature_0017',
 'feature_0018',
 'feature_0019',
 'feature_0020',
 'feature_0021',
 'feature_0022',
 'feature_0023',
 'feature_0024',
 'feature_0025',
 'feature_0026',
 'feature_0027',
 'feature_0028',
 'feature_0029',
 'feature_0030',
 'feature_0031',
 'feature_0032',
 'feature_0033',
 'feature_0034',
 'feature_0035',
 'feature_0036',
 'feature_0037',
 'feature_0038_lag_1',
 'feature_0038_lag_2',
 'feature_0038_lag_3',
 'feature_0038_lag_4',
 'feature_0038_lag_5',
 'feature_0038_lag_6',
 'feature_0038_lag_7',
 'feature_0039']

In [None]:
# Standard XGBoost experiment for the selected SKU.
#
# This cell previously carried neural-network settings (hidden_size,
# num_layers, dropout, max_epochs, batch_size, deterministic), a Lightning
# experiment name and a "Torch Lightning" print message, although it trains
# `model_type="xgboost_standard"`. The hyperparameters now use tree-booster
# settings that mirror the XGBoost quantile experiment in this notebook, so the
# two XGBoost runs are directly comparable.
results_xgb_std = pipeline.run_experiment(
    sku_tuples=sku_tuples,
    modeling_strategy=ModelingStrategy.INDIVIDUAL,
    model_type="xgboost_standard",
    hyperparameters={
        "n_estimators": 100,
        "max_depth": 6,
        "learning_rate": 0.3,
        "random_state": 42,
    },
    experiment_name="xgb_standard_test",
)

print(f"Trained {results_xgb_std.num_models} XGBoost standard model(s)")

Trained 1 XGBoost standard model(s)


In [4]:
# `xgboost_quantile` experiment for the selected SKU, requesting the
# 0.5 / 0.7 / 0.9 quantile levels.
results_xgb_quantile = pipeline.run_experiment(
    sku_tuples=sku_tuples,
    modeling_strategy=ModelingStrategy.INDIVIDUAL,
    model_type="xgboost_quantile",
    hyperparameters=dict(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.3,
        random_state=42,
    ),
    quantile_alphas=[0.5, 0.7, 0.9],
    experiment_name="xgb_quantile_test",
)

print(f"Trained {results_xgb_quantile.num_models} XGBoost quantile model(s)")

Trained 3 XGBoost quantile model(s)


In [7]:
# `lightning_standard` experiment for the selected SKU.
#
# NOTE(review): the output saved with this cell ends in
#   "ValueError: Input contains NaN."
# so the run did not complete as recorded — check the inputs for missing values.
# NOTE(review): the learning rate is keyed "lr" here, while the
# `lightning_quantile` cell in this notebook uses "learning_rate" — confirm
# which key the model actually reads.
# NOTE(review): `input_size` is hard-coded to 132; verify that it matches the
# number of feature columns the model really receives.
results_lightning_std = pipeline.run_experiment(
    sku_tuples=sku_tuples,
    modeling_strategy=ModelingStrategy.INDIVIDUAL,
    model_type="lightning_standard",
    hyperparameters=dict(
        input_size=132,
        hidden_size=128,
        lr=0.001,
        num_layers=2,
        dropout=0.2,
        max_epochs=50,
        batch_size=64,
        random_state=42,
    ),
    experiment_name="lightning_standard_test",
)

print(f"Trained {results_lightning_std.num_models} Lightning standard model(s)")

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/miniconda3/envs/ml_env/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 14.3 K | train
---------------------------------------------
14.3 K    Trainable params
0         Non-trainable params
14.3 K    Total params
0.057     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
/opt/miniconda3/envs/ml_env/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
`Train

ValueError: Input contains NaN.

In [None]:
# TODO(cleanup): scratch cell. It held only the unfinished expression
# `pipeline.` (a SyntaxError that stops "Restart & Run All"); delete this cell.

In [None]:
# Summarise the first training result of the standard Lightning experiment.
sample_result = results_lightning_std.training_results[0]
print(
    f"Model type: {sample_result.model_type}",
    f"Strategy: {sample_result.modeling_strategy.value}",
    f"SKU tuples: {sample_result.sku_tuples}",
    f"Quantile level: {sample_result.quantile_level}",
    f"Training loss: {sample_result.training_loss}",
    sep="\n",
)
# Only report RMSE when the result actually carries performance metrics.
if sample_result.performance_metrics:
    print(f"RMSE: {sample_result.performance_metrics.get('rmse', 'N/A')}")

Model type: lightning_standard
Strategy: individual
SKU tuples: [(81054, 1334)]
Quantile level: None
Training loss: None
RMSE: 95.67386493644231


In [5]:
# `lightning_quantile` experiment for the selected SKU, requesting the
# 0.7 / 0.8 / 0.9 quantile levels.
results_lightning_quantile = pipeline.run_experiment(
    sku_tuples=sku_tuples,
    modeling_strategy=ModelingStrategy.INDIVIDUAL,
    model_type="lightning_quantile",
    hyperparameters=dict(
        hidden_size=64,
        learning_rate=0.001,
        num_layers=2,
        dropout=0.2,
        max_epochs=10,
        batch_size=32,
        random_state=42,
    ),
    quantile_alphas=[0.7, 0.8, 0.9],
    experiment_name="lightning_quantile_test",
)

print(f"Trained {results_lightning_quantile.num_models} Lightning quantile model(s)")

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 5.1 K  | train
---------------------------------------------
5.1 K     Trainable params
0         Non-trainable params
5.1 K     Total params
0.020     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
/opt/miniconda3/envs/ml_env/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/opt/miniconda3/envs/ml_env/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider 

Trained 3 Lightning quantile model(s)


  y_pred = np.clip(np.round(y_pred).astype(int), 0, None)


In [None]:
# `statquant` experiment for the selected SKU at the single 0.7 quantile level.
# NOTE(review): how "method", "max_iter" and "p_tol" are interpreted depends on
# the project's statquant wrapper, which is not shown in this notebook.
results_statquant = pipeline.run_experiment(
    sku_tuples=sku_tuples,
    modeling_strategy=ModelingStrategy.INDIVIDUAL,
    model_type="statquant",
    hyperparameters=dict(
        method="interior-point",
        max_iter=1000,
        p_tol=1e-6,
        random_state=42,
    ),
    quantile_alphas=[0.7],
    experiment_name="statquant_test",
)

print(f"Trained {results_statquant.num_models} statistical quantile model(s)")

In [None]:
# COMBINED-strategy run: two SKU tuples are passed to a single
# `xgboost_standard` experiment.
multi_sku_tuples = [(81054, 1334), (80558, 1334)]

results_combined = pipeline.run_experiment(
    sku_tuples=multi_sku_tuples,
    modeling_strategy=ModelingStrategy.COMBINED,
    model_type="xgboost_standard",
    hyperparameters=dict(
        n_estimators=50,
        max_depth=4,
        learning_rate=0.1,
        random_state=42,
    ),
    experiment_name="combined_strategy_test",
)

print(f"Trained {results_combined.num_models} combined model(s) for {len(multi_sku_tuples)} SKUs")

In [None]:
# Summarise the first training result of the XGBoost quantile experiment.
# Note that this rebinds `sample_result`, which an earlier cell also assigns.
sample_result = results_xgb_quantile.training_results[0]
print(
    f"Model type: {sample_result.model_type}",
    f"Strategy: {sample_result.modeling_strategy.value}",
    f"SKU tuples: {sample_result.sku_tuples}",
    f"Quantile level: {sample_result.quantile_level}",
    f"Training loss: {sample_result.training_loss}",
    sep="\n",
)
# Only report metrics when the result actually carries them.
if sample_result.performance_metrics:
    print(
        f"RMSE: {sample_result.performance_metrics.get('rmse', 'N/A')}",
        f"Coverage: {sample_result.performance_metrics.get('coverage_probability', 'N/A')}",
        sep="\n",
    )

In [None]:
# Print the quantile level and RMSE of the Lightning quantile results.
# NOTE(review): the `[:1]` slice limits the report to the first result only;
# drop it to list every trained quantile model.
for i, result in enumerate(results_lightning_quantile.training_results[:1]):
    # The other summary cells guard against missing metrics before calling
    # `.get`; do the same here so a result without metrics prints "N/A"
    # instead of raising AttributeError.
    metrics = result.performance_metrics or {}
    rmse = metrics.get('rmse', 'N/A')
    # `np.number` also covers NumPy scalars (e.g. float32) that are not
    # subclasses of Python's int/float and would otherwise print unformatted.
    rmse_str = f"{rmse:.4f}" if isinstance(rmse, (int, float, np.number)) else rmse
    print(f"Model {i+1}: α={result.quantile_level}, RMSE={rmse_str}")