In [None]:
# ════════════════════════════════════════════════════════════════════════════
# ARIMA hyperparameter tuning + exogenous vars – evaluate over a last-2-weeks hold-out
# Results ➜ Machine_Learning.results.results_arima
# ════════════════════════════════════════════════════════════════════════════
from pyspark.sql import functions as F, Window
from pyspark.sql.types import (StructType, StructField, IntegerType,
                               DateType, DoubleType, StringType)
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ─── 0. Load the weekly features table ──────────────────────────────────────
# Source rows for every (store_key, product_key) series evaluated below.
features = spark.table(
    "Machine_Learning.features.weekly_features_combos_with_data"
)

# Compact set of exogenous regressors handed to SARIMAX as `exog`.
exog_cols = [
    "promo_flag",
    "log_price",
    "month_sin",
    "month_cos",
]

StatementMeta(, a9d91e31-dc88-4d54-b73b-190700cea9c8, 3, Finished, Available, Finished)

In [None]:
# # ─── pick a random (store_key, product_key) combo ───────────────────────────
# random_pair = (features
#                .select("store_key", "product_key")
#                .distinct()
#                .orderBy(F.rand())
#                .limit(1)
#                .collect()[0])

# rand_store   = random_pair["store_key"]
# rand_product = random_pair["product_key"]

# print(f"▶︎  Using random combo  store_key = {rand_store},  product_key = {rand_product}")

# # keep only that combo, but retain the variable name `features`
# features = features.filter(
#     (F.col("store_key") == rand_store) &
#     (F.col("product_key") == rand_product)
# )

StatementMeta(, a9d91e31-dc88-4d54-b73b-190700cea9c8, 4, Finished, Available, Finished)

In [None]:
from pyspark.sql.types import ArrayType

# ─── 1. Output schema of the per-(store, product) evaluation UDF ─────────────
# One (column name, Spark type, nullable) triple per output column; keys and
# dates are required, metrics / model description / forecasts may be null.
result_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in (
        ("store_key",       IntegerType(),           False),
        ("product_key",     IntegerType(),           False),
        ("train_end_date",  DateType(),              False),
        ("test_start_date", DateType(),              False),
        ("test_end_date",   DateType(),              False),
        ("r2",              DoubleType(),            True),
        ("rmse",            DoubleType(),            True),
        ("mae",             DoubleType(),            True),
        ("mape_error",      DoubleType(),            True),
        ("accuracy",        DoubleType(),            True),
        ("model_order",     StringType(),            True),
        ("y_true",          ArrayType(DoubleType()), True),
        ("y_pred",          ArrayType(DoubleType()), True),
    )
])

StatementMeta(, a9d91e31-dc88-4d54-b73b-190700cea9c8, 5, Finished, Available, Finished)

In [None]:
import mlflow
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

# Attach once, outside the UDF: select the MLflow experiment by name at
# notebook level so it is not re-selected for every (store, product) group.
# NOTE(review): `arima_eval` is executed through applyInPandas; confirm that the
# worker processes pick up this experiment setting in your environment.
mlflow.set_experiment("AutoARIMA_ProductStore")

# ---------------------------------------------------------------------------
# updated helper
def best_sarimax_aic(y_train, exog_train, seasonal_period=52):
    """Grid-search a small SARIMAX space and keep the fit with the lowest AIC.

    Candidate space: ``p`` in (0, 1, 2), ``d`` in (0, 1), ``q`` in (0, 1)
    (skipping ``p == q == 0``), seasonal ``P`` and ``Q`` in (0, 1), with the
    seasonal differencing order fixed at 1.

    Parameters
    ----------
    y_train : array-like
        Endogenous training series, passed to ``SARIMAX`` unchanged.
    exog_train : array-like or None
        Exogenous regressors for the training window, passed as ``exog``.
    seasonal_period : int, default 52
        Season length ``m`` used in every seasonal order ``(P, 1, Q, m)``.
        The default keeps the previously hard-coded value of 52.

    Returns
    -------
    tuple
        ``((order, seasonal_order), fitted_model)`` for the lowest-AIC
        candidate.  ``(None, None)`` when every candidate raised during
        construction/fitting or none had an AIC below ``+inf`` (a NaN AIC is
        never selected) — callers must check for this before unpacking.
    """
    # Local import keeps this helper self-contained.
    from itertools import product

    best_aic = np.inf
    best_cfg = None
    best_mod = None

    # product() iterates in the same order as the equivalent nested loops, so
    # ties on AIC are still resolved in favour of the earliest candidate.
    for p, d, q, P, Q in product((0, 1, 2), (0, 1), (0, 1), (0, 1), (0, 1)):
        if p == 0 and q == 0:
            continue  # no AR and no MA term: not a candidate

        order = (p, d, q)
        sorder = (P, 1, Q, seasonal_period)
        try:
            mod = SARIMAX(
                y_train,
                exog=exog_train,
                order=order,
                seasonal_order=sorder,
                enforce_stationarity=False,
                enforce_invertibility=False,
            ).fit(disp=False)
            if mod.aic < best_aic:
                best_aic, best_cfg, best_mod = mod.aic, (order, sorder), mod
        except Exception:
            # Deliberately best-effort: a configuration that cannot be built or
            # fitted is skipped so the remaining candidates are still tried.
            continue

    return best_cfg, best_mod

StatementMeta(, a9d91e31-dc88-4d54-b73b-190700cea9c8, 6, Finished, Available, Finished)

2025/06/28 21:06:50 INFO mlflow.tracking.fluent: Experiment with name 'AutoARIMA_ProductStore' does not exist. Creating a new experiment.


In [None]:
# Column order of the frame returned by `arima_eval`; mirrors the field names
# of `result_schema` so an empty result still carries the expected columns.
_ARIMA_RESULT_COLUMNS = (
    "store_key", "product_key",
    "train_end_date", "test_start_date", "test_end_date",
    "r2", "rmse", "mae", "mape_error", "accuracy",
    "model_order", "y_true", "y_pred",
)


def _holdout_metrics(y_test, y_pred):
    """Return (r2, rmse, mae, mape, accuracy) for a forecast vs. the actuals."""
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    # Denominator is floored at 1e-9 so a zero actual does not divide by zero
    # (it produces a very large MAPE instead).
    mape = np.mean(np.abs((y_test - y_pred) / np.maximum(y_test, 1e-9)))
    return r2, rmse, mae, mape, 1.0 - mape


def arima_eval(pdf: pd.DataFrame) -> pd.DataFrame:
    """Tune, forecast and score one (store_key, product_key) weekly series.

    The rows are sorted by ``week_start``; the last two weeks are held out and
    everything before them is the training window.  The SARIMAX configuration
    is chosen by ``best_sarimax_aic`` and one MLflow run is opened per call.

    Parameters
    ----------
    pdf : pandas.DataFrame
        All rows of a single group.  Reads the columns ``store_key``,
        ``product_key``, ``week_start``, ``weekly_quantities_sold`` and every
        column named in the module-level ``exog_cols``.

    Returns
    -------
    pandas.DataFrame
        One row with keys, split dates, metrics, the chosen model description
        and the actual/predicted hold-out values.  Metrics are NaN and
        ``model_order`` is None when the training series has fewer than two
        distinct values, no model could be fitted, or forecasting/scoring
        raised.  An empty frame (same columns, zero rows) is returned when the
        group has fewer than 3 weeks, because there is then no training week
        preceding the two hold-out weeks.
    """
    pdf = pdf.sort_values("week_start")

    # A training end date (third row from the end) must exist; shorter groups
    # are dropped from the output instead of failing the whole job.
    if len(pdf) < 3:
        return pd.DataFrame(columns=list(_ARIMA_RESULT_COLUMNS))

    y    = pdf["weekly_quantities_sold"].astype(float).values
    exog = pdf[exog_cols].astype(float).values

    # Train-test split (last 2 weeks = test)
    y_train, y_test     = y[:-2], y[-2:]
    exog_train, exog_ts = exog[:-2], exog[-2:]

    # Defaults that are kept whenever no usable forecast is produced
    r2 = rmse = mae = mape = acc = np.nan
    order = seasonal_order = None
    y_pred = [np.nan, np.nan]
    fit_error = None

    # Skip degenerate (constant) training series
    if len(np.unique(y_train)) >= 2:
        try:
            best_cfg, model = best_sarimax_aic(y_train, exog_train)
            if best_cfg is None or model is None:
                fit_error = "no SARIMAX candidate could be fitted"
            else:
                order, seasonal_order = best_cfg
                y_pred = np.asarray(model.forecast(steps=2, exog=exog_ts), dtype=float)
                r2, rmse, mae, mape, acc = _holdout_metrics(y_test, y_pred)
        except Exception as exc:
            # Best-effort per series: keep the NaN defaults, but record why
            # instead of discarding the error silently.
            fit_error = f"{type(exc).__name__}: {exc}"

    # ── MLflow logging ────────────────────────────────────────────────────
    product_id = int(pdf["product_key"].iloc[0])
    store_id   = int(pdf["store_key"].iloc[0])
    # Season length of the selected model; 52 when no model was selected.
    season_m   = seasonal_order[3] if seasonal_order is not None else 52

    with mlflow.start_run(run_name=f"AutoARIMA_{product_id}_{store_id}"):
        mlflow.log_param("product_key",  product_id)
        mlflow.log_param("store_key",    store_id)
        mlflow.log_param("seasonality_m", season_m)

        if order is not None:
            for name, value in zip(("p", "d", "q"), order):
                mlflow.log_param(name, value)
            for name, value in zip(("P", "D", "Q"), seasonal_order[:3]):
                mlflow.log_param(name, value)

        # Only finite metrics are logged; a missing metric marks a failed run.
        for name, value in (("mae", mae), ("rmse", rmse), ("r2", r2), ("accuracy", acc)):
            if np.isfinite(value):
                mlflow.log_metric(name, float(value))

        if fit_error is not None:
            mlflow.set_tag("fit_error", fit_error[:500])

    # ── Return Spark-friendly row ─────────────────────────────────────────
    row = {
        "store_key"      : store_id,
        "product_key"    : product_id,
        "train_end_date" : pdf["week_start"].iloc[-3],
        "test_start_date": pdf["week_start"].iloc[-2],
        "test_end_date"  : pdf["week_start"].iloc[-1],
        "r2"             : r2,
        "rmse"           : rmse,
        "mae"            : mae,
        "mape_error"     : mape,
        "accuracy"       : acc,
        "model_order"    : f"{order}_{seasonal_order}" if order is not None else None,
        "y_true"         : y_test.tolist(),
        # Always a plain list of floats, whatever sequence type forecast() returned
        "y_pred"         : np.asarray(y_pred, dtype=float).tolist(),
    }
    return pd.DataFrame([row], columns=list(_ARIMA_RESULT_COLUMNS))

StatementMeta(, a9d91e31-dc88-4d54-b73b-190700cea9c8, 7, Finished, Available, Finished)

In [None]:
# Columns the UDF reads: grouping keys, the week, the target and the regressors.
model_input_cols = [
    "store_key", "product_key", "week_start",
    "weekly_quantities_sold", *exog_cols,
]

# One pandas group per (store_key, product_key); each group yields rows that
# follow `result_schema`.
series_by_combo = features.select(*model_input_cols).groupBy("store_key", "product_key")
results = series_by_combo.applyInPandas(arima_eval, schema=result_schema)

StatementMeta(, a9d91e31-dc88-4d54-b73b-190700cea9c8, 8, Finished, Available, Finished)

In [None]:
# ─── 3. Persist results table ───────────────────────────────────────────────
# Writing triggers the evaluation; any existing table contents are replaced.
results_writer = results.write.mode("overwrite")
results_writer.saveAsTable("Machine_Learning.results.results_arima")

print("✅  ARIMA evaluation metrics saved to Machine_Learning.results.results_arima")