# XGBoost Prototype Replication via Pipeline

Replicates xgb_prototype.ipynb functionality using the current M5 benchmarking framework.

In [1]:
import sys
from pathlib import Path
import numpy as np

# Put the PARENT of the working directory on sys.path so that the `src`
# package imported below can be resolved (assumes the notebook is run from a
# subdirectory of the project root — TODO confirm)
current_dir = Path.cwd()
parent_dir = str(current_dir.parent)
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from src import (
    DataConfig, TrainingConfig, ModelingStrategy,
    BenchmarkPipeline
)

# Seed NumPy's legacy global random number generator with a fixed value.
# NOTE(review): whether the pipeline actually draws from this global RNG is
# not visible here — confirm before relying on it for reproducibility.
np.random.seed(42)

In [2]:
# Input locations, kept identical to the paths used by xgb_prototype
data_config = DataConfig(
    features_path="../data/processed/train_data_features.feather",
    target_path="../data/train_data_target.feather",
    mapping_path="../data/feature_mapping_train.pkl",
    validation_split=0.2,  # Now set on DataConfig (moved from TrainingConfig)
)

# Optimized XGBoost hyperparameters carried over from xgb_prototype
xgb_hyperparameters = dict(
    n_estimators=78,
    max_depth=3,
    learning_rate=0.06356393066232492,
    subsample=0.8136751464901273,
    colsample_bytree=0.820105725620293,
    reg_alpha=4.979378780027597,
    reg_lambda=6.663822635873432,
    random_state=42,
)

# Training configuration; the hyperparameters above are registered under the
# "xgboost_standard" model type
training_config = TrainingConfig(random_state=42)
training_config.add_model_config(
    model_type="xgboost_standard",
    hyperparameters=xgb_hyperparameters,
)

# Build the pipeline from both configs, then load and prepare the data
pipeline = BenchmarkPipeline(
    data_config=data_config,
    training_config=training_config,
    output_dir=Path("xgb_prototype_results"),
)

pipeline.load_and_prepare_data()

In [3]:
# SKUs to model, given as (productID, storeID) pairs. The first pair
# (productID=80558, storeID=1331) is the one used in xgb_prototype.
sku_tuples = [
    (80558, 1331),
    (81054, 1334),
    (81054, 1335),
]

# Run the experiment with the INDIVIDUAL modeling strategy.
# NOTE(review): presumably this yields one model per SKU tuple — the cells
# that follow index models[0], models[1] and models[2]; confirm against
# BenchmarkPipeline.run_experiment.
models = pipeline.run_experiment(
    sku_tuples=sku_tuples,
    modeling_strategy=ModelingStrategy.INDIVIDUAL,
    experiment_name="xgb_prototype_replication",
)

# Start with the first returned model
model = models[0]

In [4]:
# Print the current model's stored performance metrics in the one-line
# format used by xgb_prototype
metrics = model.metadata.performance_metrics
mse, rmse, r2 = (metrics[name] for name in ('mse', 'rmse', 'r2'))
print(f"Test Metrics - MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

Test Metrics - MSE: 1332.9290, RMSE: 36.5093, R²: 0.6167


In [5]:
# Switch to the second returned model and print its stored metrics
model = models[1]
metrics = model.metadata.performance_metrics
mse, rmse, r2 = (metrics[name] for name in ('mse', 'rmse', 'r2'))
print(f"Test Metrics - MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

Test Metrics - MSE: 280.1838, RMSE: 16.7387, R²: 0.6036


In [6]:
# Switch to the third returned model and print its stored metrics
model = models[2]
metrics = model.metadata.performance_metrics
mse, rmse, r2 = (metrics[name] for name in ('mse', 'rmse', 'r2'))
print(f"Test Metrics - MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

Test Metrics - MSE: 224.3976, RMSE: 14.9799, R²: 0.4630
