In [8]:
# ════════════════════════════════════════════════════════════════════════════
# Baseline “mean-of-train” predictor – evaluate on last-2-weeks for *every*
# (store_key, product_key) pair and write results to
#   ➜  Machine_Learning.results.results_baseline_mean
# ════════════════════════════════════════════════════════════════════════════
from pyspark.sql import functions as F, Window
from pyspark.sql.types import (StructType, StructField, IntegerType, DateType,
                               DoubleType, StringType, ArrayType)
import pandas as pd
import numpy as np

# sklearn
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error



StatementMeta(, b5e3681d-f66e-4c4e-b695-464fd3d114a0, 10, Finished, Available, Finished)

In [9]:
# ---------------------------------------------------------------------------
# 1. Spark/Arrow settings
# ---------------------------------------------------------------------------
# Enable Arrow-based columnar transfer between Spark and pandas, and cap each
# Arrow record batch at 10,000 rows.
# NOTE(review): the PySpark Arrow guide states that maxRecordsPerBatch is not
# applied to grouped-map groups (a whole group is materialised at once) —
# confirm whether this setting matters for the grouped UDF used below.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "10000")

# ---------------------------------------------------------------------------
# 2. Source data
# ---------------------------------------------------------------------------
# Project the weekly feature table down to the grouping keys, the week column
# and the target column; no other column is read by the evaluation.
features = spark.table(
    "Machine_Learning.features.weekly_features_combos_with_no_data"
).select(
    "store_key",
    "product_key",
    "week_start",
    "weekly_quantities_sold",
)



StatementMeta(, b5e3681d-f66e-4c4e-b695-464fd3d114a0, 11, Finished, Available, Finished)

In [10]:
# # ─── pick a random (store_key, product_key) combo ───────────────────────────
# random_pair = (features
#                .select("store_key", "product_key")
#                .distinct()
#                .orderBy(F.rand())
#                .limit(1)
#                .collect()[0])

# rand_store   = random_pair["store_key"]
# rand_product = random_pair["product_key"]

# print(f"▶︎  Using random combo  store_key = {rand_store},  product_key = {rand_product}")

# # keep only that combo, but retain the variable name `features`
# features = features.filter(
#     (F.col("store_key") == rand_store) &
#     (F.col("product_key") == rand_product)
# )

StatementMeta(, b5e3681d-f66e-4c4e-b695-464fd3d114a0, 12, Finished, Available, Finished)

In [11]:
# ---------------------------------------------------------------------------
# 3. Result schema – one output row per (store_key, product_key) combo
# ---------------------------------------------------------------------------
# Each entry is (column name, Spark type, nullable). Only the two grouping keys
# are declared non-nullable; the dates and metrics may be NULL for combos with
# too few weeks.
result_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in (
        ("store_key",       IntegerType(),           False),
        ("product_key",     IntegerType(),           False),
        ("train_end_date",  DateType(),              True),
        ("test_start_date", DateType(),              True),
        ("test_end_date",   DateType(),              True),
        ("r2",              DoubleType(),            True),
        ("rmse",            DoubleType(),            True),
        ("mae",             DoubleType(),            True),
        ("mape_error",      DoubleType(),            True),
        ("accuracy",        DoubleType(),            True),
        ("model_params",    StringType(),            True),
        ("y_true",          ArrayType(DoubleType()), True),
        ("y_pred",          ArrayType(DoubleType()), True),
    )
])

StatementMeta(, b5e3681d-f66e-4c4e-b695-464fd3d114a0, 13, Finished, Available, Finished)

In [12]:
# ---------------------------------------------------------------------------
# 4. Grouped Pandas UDF – runs once per combo
# ---------------------------------------------------------------------------
def evaluate_baseline(pdf: pd.DataFrame) -> pd.DataFrame:
    """
    Score the "mean of the training weeks" baseline for one
    (store_key, product_key) group.

    The rows are ordered by ``week_start`` and split chronologically:

    * 3 or more weeks -> train = all but the last two, test = last two
    * 2 weeks         -> train = first week,           test = last week
    * 1 week          -> no training window (``train_end_date`` is None);
                         the prediction falls back to the mean of the
                         whole series and the test set is that single week

    Every test week is predicted with the same constant: the mean of the
    training quantities (NaN values are skipped by the pandas mean).

    Parameters
    ----------
    pdf : pd.DataFrame
        Rows of a single group with columns ``store_key``, ``product_key``,
        ``week_start`` and ``weekly_quantities_sold``.  Must contain at least
        one row, because the keys are read from the first row.

    Returns
    -------
    pd.DataFrame
        Exactly one row with the keys, the split dates, ``r2``, ``rmse``,
        ``mae``, ``mape_error``, ``accuracy`` (= 1 - mape_error),
        ``model_params`` (``"mean=<value>"``) and the ``y_true`` / ``y_pred``
        lists.  All metrics are None when the hold-out values or the mean
        are not finite; ``r2`` is additionally None when fewer than two
        test observations exist.
    """
    pdf = pdf.sort_values("week_start")

    # Cast to float so Decimal values coming from Spark do not leak into numpy.
    sold = pdf["weekly_quantities_sold"].astype(float)
    weeks = pdf["week_start"]
    n_weeks = len(pdf)

    # --------------------------------------------------------------------
    # Chronological split: hold out two weeks when at least one training
    # week remains, otherwise hold out a single week.
    # --------------------------------------------------------------------
    n_test = 2 if n_weeks >= 3 else 1
    n_train = max(n_weeks - n_test, 0)   # 0 for a one-week series

    train_vals = sold.iloc[:n_train]
    test_vals = sold.iloc[n_train:]

    # With no training rows (one-week series) use the mean of all weeks.
    mean_val = train_vals.mean() if len(train_vals) else sold.mean()

    y_true = test_vals.to_numpy(dtype=float)
    y_pred = np.full(y_true.shape, mean_val, dtype=float)

    # --------------------------------------------------------------------
    # Reference dates – None whenever the corresponding window is empty
    # --------------------------------------------------------------------
    train_end_date = weeks.iloc[n_train - 1] if n_train else None
    test_weeks = weeks.iloc[n_train:]
    test_start_date = test_weeks.iloc[0] if len(test_weeks) else None
    test_end_date = test_weeks.iloc[-1] if len(test_weeks) else None

    # --------------------------------------------------------------------
    # Metrics – plain numpy so the result does not depend on the installed
    # scikit-learn version, and a NaN/inf value yields NULL metrics instead
    # of raising and aborting the whole Spark job.
    # --------------------------------------------------------------------
    rmse = mae = mape_error = accuracy = r2 = None
    inputs_finite = bool(np.isfinite(y_true).all()) and bool(np.isfinite(mean_val))

    if y_true.size and inputs_finite:
        errors = y_true - y_pred
        abs_errors = np.abs(errors)

        rmse = float(np.sqrt(np.mean(errors ** 2)))
        mae = float(np.mean(abs_errors))
        # Floor the *magnitude* of the actuals: negative actuals keep their
        # size and exact zeros cannot cause a division by zero.
        mape_error = float(np.mean(abs_errors / np.maximum(np.abs(y_true), 1e-9)))
        accuracy = 1.0 - mape_error

        # r² needs at least 2 observations
        if y_true.size > 1:
            ss_res = float(np.sum(errors ** 2))
            ss_tot = float(np.sum((y_true - y_true.mean()) ** 2))
            if ss_tot > 0.0:
                r2 = 1.0 - ss_res / ss_tot
            else:
                # Constant actuals: same convention as sklearn's r2_score
                # (perfect fit -> 1.0, anything else -> 0.0).
                r2 = 1.0 if ss_res == 0.0 else 0.0

    # --------------------------------------------------------------------
    # Emit single-row DataFrame
    # --------------------------------------------------------------------
    return pd.DataFrame([{
        "store_key"      : int(pdf["store_key"].iloc[0]),
        "product_key"    : int(pdf["product_key"].iloc[0]),
        "train_end_date" : train_end_date,
        "test_start_date": test_start_date,
        "test_end_date"  : test_end_date,
        "r2"             : r2,
        "rmse"           : rmse,
        "mae"            : mae,
        "mape_error"     : mape_error,
        "accuracy"       : accuracy,
        "model_params"   : f"mean={mean_val}",
        "y_true"         : y_true.tolist(),
        "y_pred"         : y_pred.tolist()
    }])

StatementMeta(, b5e3681d-f66e-4c4e-b695-464fd3d114a0, 14, Finished, Available, Finished)

In [13]:
# Wrap evaluate_baseline as a grouped-map Pandas UDF whose output rows follow
# result_schema (direct call form instead of the decorator-factory form).
baseline_udf = F.pandas_udf(
    evaluate_baseline,
    returnType=result_schema,
    functionType=F.PandasUDFType.GROUPED_MAP,
)

StatementMeta(, b5e3681d-f66e-4c4e-b695-464fd3d114a0, 15, Finished, Available, Finished)

In [14]:
# ---------------------------------------------------------------------------
# 5. Run the evaluation in parallel
# ---------------------------------------------------------------------------
# evaluate_baseline is called once per (store_key, product_key) group and
# returns one row shaped like result_schema.  applyInPandas is the Spark 3
# replacement for the deprecated GroupedData.apply + GROUPED_MAP pandas_udf
# pairing; it takes the plain Python function and the schema directly.
results_df = (
    features
      .groupBy("store_key", "product_key")
      .applyInPandas(evaluate_baseline, schema=result_schema)
)


StatementMeta(, b5e3681d-f66e-4c4e-b695-464fd3d114a0, 16, Finished, Available, Finished)

In [15]:
# ---------------------------------------------------------------------------
# 6. Persist results – overwrite
# ---------------------------------------------------------------------------
# Replace whatever the results table currently holds with this run's output,
# then print a confirmation line.
results_df.write.saveAsTable(
    "Machine_Learning.results.results_baseline_mean",
    mode="overwrite",
)

print("✅  Baseline-mean evaluation written to Machine_Learning.results.results_baseline_mean")

StatementMeta(, b5e3681d-f66e-4c4e-b695-464fd3d114a0, 17, Finished, Available, Finished)

✅  Baseline-mean evaluation written to Machine_Learning.results.results_baseline_mean


In [16]:
%%sql
-- Preview the persisted baseline-mean results table (all columns, no filter).
SELECT * FROM Machine_Learning.results.results_baseline_mean

StatementMeta(, b5e3681d-f66e-4c4e-b695-464fd3d114a0, 18, Finished, Available, Finished)

<Spark SQL result set with 1000 rows and 13 fields>