# 04 - Model Training

Train baseline and XGBoost models for revenue/quantity forecasting.

**Models trained:**

- Naive (last value)
- Moving Average (4 weeks)
- Exponential Smoothing
- XGBoost with lag features

## Setup

In [None]:
# --- Standard library ---
import warnings
from pathlib import Path

# --- Third-party ---
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Blanket suppression: every warning category is silenced from here on.
warnings.filterwarnings('ignore')

# Project folders, resolved relative to the current working directory.
BASE_PATH = Path('..')
FEATURES_PATH = BASE_PATH / 'features_v2'      # input feature tables
MODEL_PATH = BASE_PATH / 'model_evaluation'    # output artefacts
MODEL_PATH.mkdir(exist_ok=True)                # no error if it already exists

print("Libraries loaded!")

## Load Training Data (H1)

In [None]:
# Read the two half-year feature tables: H1 is used for training, H2 for testing.
df_h1 = pd.read_csv(FEATURES_PATH / 'forecast_sku_weekly_H1.csv')
df_h2 = pd.read_csv(FEATURES_PATH / 'forecast_sku_weekly_H2.csv')

# One summary line per table: row count and number of distinct SKUs.
for label, frame in (("H1 (training)", df_h1), ("H2 (testing)", df_h2)):
    n_rows, n_skus = len(frame), frame['sku'].nunique()
    print(f"{label}: {n_rows:,} rows, {n_skus:,} SKUs")

# Column every model in this notebook is trained and scored on.
TARGET_COL = 'weekly_revenue'  # Change to 'weekly_quantity' if needed

df_h1.head()

## Baseline Models

In [None]:
def evaluate_predictions(actual, predicted):
    """Score a forecast with MAE, RMSE and WMAPE.

    Args:
        actual: Observed values (array-like).
        predicted: Forecast values, positionally matched to ``actual``.

    Returns:
        dict with keys ``'MAE'``, ``'RMSE'`` and ``'WMAPE'``. WMAPE is
        ``100 * sum(|actual - predicted|) / sum(|actual|)``; when the
        denominator is not positive the sentinel ``999`` is returned instead.
    """
    # Coerce to plain float arrays so the subtraction below is positional.
    # Two pandas Series would otherwise be aligned on their index labels.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))

    # WMAPE
    total_actual = np.sum(np.abs(actual))
    if total_actual > 0:
        wmape = 100 * np.sum(np.abs(actual - predicted)) / total_actual
    else:
        wmape = 999  # sentinel: WMAPE is undefined when all actuals are zero

    return {'MAE': mae, 'RMSE': rmse, 'WMAPE': wmape}


def _broadcast_to_test(per_id_values, test, id_col):
    """Look up one forecast per test row by id; ids without a value get 0."""
    return test[id_col].map(per_id_values.to_dict()).fillna(0).values


def model_naive_last(train, test, id_col, value_col):
    """Predict, for every test row, the last training value of its id.

    "Last" follows the row order of ``train`` (nothing here sorts it), and
    ``GroupBy.last`` returns the last non-null value in each group.

    Returns:
        numpy array with one prediction per row of ``test``; ids that do not
        occur in ``train`` are predicted as 0.
    """
    last_values = train.groupby(id_col)[value_col].last()
    return _broadcast_to_test(last_values, test, id_col)


def model_moving_average(train, test, id_col, value_col, window=4):
    """Predict the mean of the last ``window`` training rows of each id.

    The mean ignores NaN entries inside the window. Row order of ``train``
    determines which rows count as "last".

    Returns:
        numpy array with one prediction per row of ``test``; ids that do not
        occur in ``train`` are predicted as 0.
    """
    ma_values = train.groupby(id_col)[value_col].apply(
        lambda s: s.tail(window).mean()
    )
    return _broadcast_to_test(ma_values, test, id_col)


def model_exp_smoothing(train, test, id_col, value_col, alpha=0.3):
    """Predict the final level of simple exponential smoothing per id.

    The level starts at the first observation and is updated as
    ``level = alpha * value + (1 - alpha) * level`` in the row order of
    ``train``. Missing observations are skipped, matching the other
    baselines; otherwise a single NaN would propagate through the recursion
    and the id would end up forecast as 0.

    Returns:
        numpy array with one prediction per row of ``test``; ids that do not
        occur in ``train`` (or have no non-null observation) are predicted
        as 0.
    """
    def exp_smooth(series):
        observed = series.dropna().to_numpy()
        if observed.size == 0:
            return np.nan  # nothing to smooth; becomes 0 via fillna below
        level = observed[0]
        for value in observed[1:]:
            level = alpha * value + (1 - alpha) * level
        return level

    es_values = train.groupby(id_col)[value_col].apply(exp_smooth)
    return _broadcast_to_test(es_values, test, id_col)


print("Baseline functions defined")

## Run Baseline Models

In [None]:
# Work on copies so the frames loaded from disk are left untouched.
train = df_h1.copy()
test = df_h2.copy()

# Baseline forecasts, one value per row of `test`.
pred_naive = model_naive_last(train, test, 'sku', TARGET_COL)
pred_ma4 = model_moving_average(train, test, 'sku', TARGET_COL, window=4)
pred_es = model_exp_smoothing(train, test, 'sku', TARGET_COL, alpha=0.3)

# Score each baseline against the observed H2 target.
actual_h2 = test[TARGET_COL].values
results = {
    'Naive_Last': evaluate_predictions(actual_h2, pred_naive),
    'MA_4Week': evaluate_predictions(actual_h2, pred_ma4),
    'ExpSmooth': evaluate_predictions(actual_h2, pred_es),
}

print("=== Baseline Results ===")
for name, scores in results.items():
    print(f"{name}: WMAPE={scores['WMAPE']:.1f}%, MAE={scores['MAE']:,.0f}")

## XGBoost Model

In [None]:
# Candidate model inputs; any that are absent from the data are skipped.
feature_cols = ['week_num', 'lag1_quantity', 'lag2_quantity', 'lag4_quantity', 'rolling_avg_4w']

# A feature is usable only if BOTH frames carry it; a column missing from
# `test` would otherwise raise KeyError in the dropna call further down.
available_features = [
    c for c in feature_cols if c in train.columns and c in test.columns
]
print(f"Available features: {available_features}")
missing_features = [c for c in feature_cols if c not in available_features]
if missing_features:
    print(f"Skipped (not present in both train and test): {missing_features}")

# Encode SKU as an integer id learned from the training set.
le_sku = LabelEncoder()
train['sku_encoded'] = le_sku.fit_transform(train['sku'].astype(str))

# Encode test SKUs with a single dict lookup per row. SKUs never seen in
# training get -1 and are filtered out below. This replaces a per-row
# `le_sku.transform([x])` call plus a linear scan of `classes_`.
sku_to_code = {sku: code for code, sku in enumerate(le_sku.classes_)}
test['sku_encoded'] = (
    test['sku'].astype(str).map(sku_to_code).fillna(-1).astype('int64')
)

# Add sku_encoded to features
available_features.append('sku_encoded')

# Drop rows with a missing feature or a missing target: NaN labels cannot be
# used for fitting or for scoring.
required_cols = available_features + [TARGET_COL]
train_clean = train.dropna(subset=required_cols)
test_clean = test[test['sku_encoded'] >= 0].dropna(subset=required_cols)

print(f"Training samples: {len(train_clean):,}")
print(f"Test samples: {len(test_clean):,}")

In [None]:
# Design matrices and targets from the cleaned frames.
X_train = train_clean[available_features]
y_train = train_clean[TARGET_COL]
X_test = test_clean[available_features]
y_test = test_clean[TARGET_COL]

model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,   # fixed seed for the row/column subsampling
    n_jobs=-1,
)
model.fit(X_train, y_train, verbose=False)
print("✓ XGBoost trained")

# Predict
pred_xgb = model.predict(X_test)
results['XGBoost'] = evaluate_predictions(y_test.values, pred_xgb)
print(f"\nXGBoost: WMAPE={results['XGBoost']['WMAPE']:.1f}%, MAE={results['XGBoost']['MAE']:,.0f}")

# XGBoost is scored on `test_clean`, which excludes unseen SKUs and rows with
# missing features, while the baselines were scored on every row of `test`.
# Re-score the baselines on exactly the same rows so that the metrics in
# `results` are comparable across models.
assert test.index.is_unique, "row matching below relies on unique index labels"
scored_rows = test.index.isin(test_clean.index)
if not scored_rows.all():
    for name, baseline_pred in (
        ('Naive_Last', pred_naive),
        ('MA_4Week', pred_ma4),
        ('ExpSmooth', pred_es),
    ):
        # Baseline arrays are positionally aligned with `test`, so the boolean
        # mask selects the same rows, in the same order, as `y_test`.
        results[name] = evaluate_predictions(y_test.values, baseline_pred[scored_rows])
    print(
        f"Baselines re-scored on the same {scored_rows.sum():,} of "
        f"{len(test):,} test rows used for XGBoost"
    )

## Model Comparison

In [None]:
# One row per model, one column per metric, lowest WMAPE first.
comparison = pd.DataFrame(results).T.sort_values('WMAPE')

print("\n=== Model Comparison (sorted by WMAPE) ===")
print(comparison.to_string())

# Persist the table next to the other evaluation artefacts.
comparison_file = MODEL_PATH / 'model_comparison.csv'
comparison.to_csv(comparison_file)
print("\n✓ Saved model_comparison.csv")

## Feature Importance

In [None]:
import matplotlib.pyplot as plt

# Pair each feature name with the fitted model's importance score and sort
# ascending, so the largest bar is drawn last (at the top of the chart).
importance = (
    pd.DataFrame({
        'feature': available_features,
        'importance': model.feature_importances_,
    })
    .sort_values('importance', ascending=True)
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance['feature'], importance['importance'])
ax.set_xlabel('Importance')
ax.set_title('XGBoost Feature Importance')
fig.tight_layout()
plt.show()

## Save Model & Predictions

In [None]:
import joblib

# Persist the fitted regressor and the SKU encoder it was trained with.
model.save_model(str(MODEL_PATH / 'xgboost_model.json'))
joblib.dump(le_sku, MODEL_PATH / 'label_encoder_sku.joblib')
print("✓ Model saved")

# Attach forecast, observed value and signed error (actual minus predicted)
# to the scored test rows.
test_clean = test_clean.assign(predicted=pred_xgb, actual=y_test)
test_clean = test_clean.assign(error=test_clean['actual'] - test_clean['predicted'])

# Keep only the identifying columns plus the three result columns.
output_cols = ['sku', 'year_week', 'actual', 'predicted', 'error']
predictions_df = test_clean[output_cols]
predictions_df.to_csv(MODEL_PATH / 'predictions_h2.csv', index=False)
print(f"✓ Saved predictions_h2.csv: {len(predictions_df):,} rows")