In [7]:
# ════════════════════════════════════════════════════════════════════════════
# XGBoost hyperparameter search + exogenous vars – evaluate over last-2-weeks hold-out
# Results ➜ Machine_Learning.results.results_xgboost_products_w_no_data
# ════════════════════════════════════════════════════════════════════════════
from pyspark.sql import functions as F, Window
from pyspark.sql.types import (StructType, StructField, IntegerType,
                               DateType, DoubleType, StringType,ArrayType)
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# XGBoost and sklearn
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

StatementMeta(, 4df40c9b-9753-4c60-98ab-dfde7e10d27a, 9, Finished, Available, Finished)

In [8]:
# ─── 0. Inputs: exogenous regressors and the weekly features table ──────────
# Compact set of exogenous columns fed to the model alongside the lag features.
exog_cols = [
    "promo_flag",
    "log_price",
    "month_sin",
    "month_cos",
]

# Weekly feature rows, read lazily as a Spark DataFrame.
features = spark.table(
    "Machine_Learning.features.weekly_features_combos_with_no_data"
)

StatementMeta(, 4df40c9b-9753-4c60-98ab-dfde7e10d27a, 10, Finished, Available, Finished)

In [9]:
# # ─── pick a random (store_key, product_key) combo ───────────────────────────
# random_pair = (features
#                .select("store_key", "product_key")
#                .distinct()
#                .orderBy(F.rand())
#                .limit(1)
#                .collect()[0])

# rand_store   = random_pair["store_key"]
# rand_product = random_pair["product_key"]

# print(f"▶︎  Using random combo  store_key = {rand_store},  product_key = {rand_product}")

# # keep only that combo, but retain the variable name `features`
# features = features.filter(
#     (F.col("store_key") == rand_store) &
#     (F.col("product_key") == rand_product)
# )

StatementMeta(, 4df40c9b-9753-4c60-98ab-dfde7e10d27a, 11, Finished, Available, Finished)

In [10]:
# Output schema of xgb_eval: one row per (store_key, product_key) group.
# Each entry is (column name, Spark type, nullable); keys and split dates are
# required, while metrics, parameters and the raw series may be null.
xgb_result_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in (
        ("store_key",       IntegerType(),           False),
        ("product_key",     IntegerType(),           False),
        ("train_end_date",  DateType(),              False),
        ("test_start_date", DateType(),              False),
        ("test_end_date",   DateType(),              False),
        ("r2",              DoubleType(),            True),
        ("rmse",            DoubleType(),            True),
        ("mae",             DoubleType(),            True),
        ("mape_error",      DoubleType(),            True),
        ("accuracy",        DoubleType(),            True),
        ("model_params",    StringType(),            True),
        ("y_true",          ArrayType(DoubleType()), True),
        ("y_pred",          ArrayType(DoubleType()), True),
    )
])

StatementMeta(, 4df40c9b-9753-4c60-98ab-dfde7e10d27a, 12, Finished, Available, Finished)

In [11]:
def xgb_eval(pdf: pd.DataFrame) -> pd.DataFrame:
    """Tune and score an XGBoost regressor for a single group of weekly rows.

    The rows are ordered by ``week_start``, two lag features of
    ``weekly_quantities_sold`` are added, and the last two usable rows are held
    out as the test set. Hyperparameters are chosen with a randomized search
    over a 3-fold ``TimeSeriesSplit`` on the remaining rows, and the best
    estimator is scored on the hold-out.

    Relies on the module-level names ``exog_cols`` (exogenous feature columns)
    and ``xgb_result_schema`` (output column names).

    Args:
        pdf: Rows of one group. Must contain ``store_key``, ``product_key``,
            ``week_start``, ``weekly_quantities_sold`` and every column listed
            in ``exog_cols``.

    Returns:
        A one-row DataFrame with the columns of ``xgb_result_schema``, or an
        empty DataFrame with those columns when the group has too few usable
        rows for the train/test split plus cross-validation.
    """
    target_col = "weekly_quantities_sold"
    feat_cols = ["lag_1", "lag_2"] + exog_cols
    empty_result = pd.DataFrame([], columns=[f.name for f in xgb_result_schema.fields])

    # sort_values returns a new frame, so the caller's data is not mutated.
    df_feat = pdf.sort_values("week_start")
    # Lags are positional (previous rows), not calendar-aware.
    # NOTE(review): assumes one row per consecutive week — confirm upstream.
    df_feat["lag_1"] = df_feat[target_col].shift(1)
    df_feat["lag_2"] = df_feat[target_col].shift(2)

    # Drop only rows that are unusable for modelling. A bare dropna() would
    # also discard rows with nulls in columns the model never reads.
    df_feat = df_feat.dropna(subset=feat_cols + [target_col, "week_start"])

    # Need at least one training row in addition to the two hold-out rows.
    if df_feat.shape[0] < 3:
        return empty_result

    # Split: last 2 rows for test, rest for train.
    train_df = df_feat.iloc[:-2]
    test_df  = df_feat.iloc[-2:]

    tscv = TimeSeriesSplit(n_splits=3)
    min_train_rows = tscv.get_n_splits() + 1  # TimeSeriesSplit needs n_splits + 1 samples

    if train_df.shape[0] < min_train_rows:
        # Not enough data for cross-validation.
        print(f"Skipping evaluation for this combo: Not enough samples for cross-validation. Need at least {min_train_rows} samples for training, but have {train_df.shape[0]}.")
        return empty_result

    # Capture split dates
    train_end_date  = train_df["week_start"].iloc[-1]
    test_start_date = test_df["week_start"].iloc[0]
    test_end_date   = test_df["week_start"].iloc[-1]

    # Prepare arrays
    X_train = train_df[feat_cols].astype(float).values
    y_train = train_df[target_col].astype(float).values
    X_test  = test_df[feat_cols].astype(float).values
    y_true  = test_df[target_col].astype(float).values

    # Hyperparameter tuning via RandomizedSearchCV
    param_dist = {
        "n_estimators": [50, 100, 200, 300],
        "max_depth": [3, 5, 7, 9],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0]
    }
    model = XGBRegressor(objective="reg:squarederror", random_state=42)
    # NOTE(review): n_jobs=-1 fans out to every core from inside each Spark
    # task; consider n_jobs=1 if executors show CPU oversubscription.
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=20,
        scoring="neg_root_mean_squared_error",
        cv=tscv,
        random_state=42,
        n_jobs=-1
    )
    search.fit(X_train, y_train)
    best_params = search.best_params_
    best_model  = search.best_estimator_

    # Predict on test set
    y_pred = best_model.predict(X_test)

    # Metrics
    r2   = r2_score(y_true, y_pred)
    # sqrt(MSE) instead of squared=False, which newer scikit-learn removed.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae  = mean_absolute_error(y_true, y_pred)
    # The 1e-9 floor only prevents division by zero; a zero actual still
    # yields a very large MAPE (and a very negative accuracy).
    # NOTE(review): confirm this is the intended treatment of zero-sales weeks.
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1e-9)))
    acc  = 1.0 - mape

    # Build result row
    res = pd.DataFrame([{
        "store_key"      : pdf["store_key"].iloc[0],
        "product_key"    : pdf["product_key"].iloc[0],
        "train_end_date" : train_end_date,
        "test_start_date": test_start_date,
        "test_end_date"  : test_end_date,
        "r2"             : r2,
        "rmse"           : rmse,
        "mae"            : mae,
        "mape_error"     : mape,
        "accuracy"       : acc,
        "model_params"   : str(best_params),
        "y_true"         : y_true.tolist(),
        "y_pred"         : y_pred.tolist()
    }])
    return res

StatementMeta(, 4df40c9b-9753-4c60-98ab-dfde7e10d27a, 13, Finished, Available, Finished)

In [12]:
# Run xgb_eval once per (store_key, product_key) group; Spark assembles the
# returned rows into a DataFrame shaped by xgb_result_schema.
results = features.groupBy("store_key", "product_key").applyInPandas(
    xgb_eval, schema=xgb_result_schema
)

# Persist the metrics, replacing any previous contents of the table.
(
    results.write
    .mode("overwrite")
    .saveAsTable("Machine_Learning.results.results_xgboost_products_w_no_data")
)

print("✅  XGBoost evaluation metrics saved to Machine_Learning.results.results_xgboost_products_w_no_data")

StatementMeta(, 4df40c9b-9753-4c60-98ab-dfde7e10d27a, 14, Finished, Available, Finished)

✅  XGBoost evaluation metrics saved to Machine_Learning.results.results_xgboost_products_w_no_data
