In [1]:
# ════════════════════════════════════════════════════════════════════════════
# Linear Regression  – evaluate over last-2-weeks hold-out
# Results ➜ Machine_Learning.results.results_linear_regression_products_w_no_data
# ════════════════════════════════════════════════════════════════════════════
from pyspark.sql import functions as F, Window
from pyspark.sql.types import (StructType, StructField, IntegerType, DateType,
                               DoubleType, StringType, ArrayType)
import pandas as pd
import numpy as np

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error



StatementMeta(, b8dd1f89-3ff9-44ec-8a36-c1a6d2fda106, 3, Finished, Available, Finished)

In [2]:
# ─── 0. Load the weekly features table ──────────────────────────────────────
# Every later cell reads from `features`; the source table is named once here.
FEATURES_TABLE = "Machine_Learning.features.weekly_features_combos_with_no_data"
features = spark.table(FEATURES_TABLE)


StatementMeta(, b8dd1f89-3ff9-44ec-8a36-c1a6d2fda106, 4, Finished, Available, Finished)

In [3]:
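# NOTE: this debug cell is disabled (fully commented out); the "Using random combo"
# output displayed under it appears to be left over from an earlier run.
# # ─── pick a random (store_key, product_key) combo ───────────────────────────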
# random_pair = (features
#                .select("store_key", "product_key")
#                .distinct()
#                .orderBy(F.rand())
#                .limit(1)
#                .collect()[0])

# rand_store   = random_pair["store_key"]
# rand_product = random_pair["product_key"]

# print(f"▶︎  Using random combo  store_key = {rand_store},  product_key = {rand_product}")

# # keep only that combo, but retain the variable name `features`
# features = features.filter(
#     (F.col("store_key") == rand_store) &
#     (F.col("product_key") == rand_product)
# )

StatementMeta(, b8dd1f89-3ff9-44ec-8a36-c1a6d2fda106, 5, Finished, Available, Finished)

▶︎  Using random combo  store_key = 23,  product_key = 253


In [4]:
# Exogenous regressors fed to the model alongside the lag features
exog_cols = ["promo_flag", "log_price", "month_sin", "month_cos"]

# Output schema of the per-group evaluation: (column name, Spark type, nullable).
# Identifiers and dates are required; metrics and payload columns may be null.
lr_result_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in (
        ("store_key",       IntegerType(),           False),
        ("product_key",     IntegerType(),           False),
        ("train_end_date",  DateType(),              False),
        ("test_start_date", DateType(),              False),
        ("test_end_date",   DateType(),              False),
        ("r2",              DoubleType(),            True),
        ("rmse",            DoubleType(),            True),
        ("mae",             DoubleType(),            True),
        ("mape_error",      DoubleType(),            True),
        ("accuracy",        DoubleType(),            True),
        ("model_params",    StringType(),            True),
        ("y_true",          ArrayType(DoubleType()), True),
        ("y_pred",          ArrayType(DoubleType()), True),
    )
])


StatementMeta(, b8dd1f89-3ff9-44ec-8a36-c1a6d2fda106, 6, Finished, Available, Finished)

In [5]:
def lr_eval(pdf: pd.DataFrame) -> pd.DataFrame:
    """Fit and score a lag-based linear regression for one (store, product) series.

    Written as the per-group function for ``groupBy(...).applyInPandas``: it
    receives the weekly rows of a single group and returns at most one row of
    hold-out metrics whose columns follow ``lr_result_schema``.

    Features are the two previous rows of ``weekly_quantities_sold`` (``lag_1``,
    ``lag_2``) plus the module-level ``exog_cols``. The last two usable rows are
    held out as the test set; all earlier usable rows form the training set.

    Notes
    -----
    * Lags are positional row shifts, so the series is assumed to have one row
      per consecutive week — TODO confirm against the features table.
    * ``lag_1`` of the second test week is the *actual* value of the first test
      week, so the evaluation is one-step-ahead rather than a blind two-week
      forecast.

    Parameters
    ----------
    pdf : pd.DataFrame
        Rows of one group. Must contain ``week_start``, ``store_key``,
        ``product_key``, ``weekly_quantities_sold`` and every column listed in
        ``exog_cols``.

    Returns
    -------
    pd.DataFrame
        A single-row frame with the metrics, the hold-out actuals and the
        predictions, or an empty frame with the same columns when fewer than
        three usable rows remain after building the lag features.
    """
    target_col = "weekly_quantities_sold"
    feat_cols = ["lag_1", "lag_2"] + exog_cols

    # Lags are positional shifts, so rows must be in chronological order first.
    pdf = pdf.sort_values("week_start")

    # Build the lag features on a copy so the incoming group frame is untouched.
    df_feat = pdf.copy()
    df_feat["lag_1"] = df_feat[target_col].shift(1)
    df_feat["lag_2"] = df_feat[target_col].shift(2)
    # Drop only rows that miss a model input or the target. A blanket dropna()
    # would also discard weeks because of nulls in columns the model never
    # reads, silently shrinking (or emptying) the series.
    df_feat = df_feat.dropna(subset=feat_cols + [target_col])

    # Need two test rows plus at least one training row; otherwise emit nothing.
    if df_feat.shape[0] < 3:
        return pd.DataFrame([], columns=[f.name for f in lr_result_schema.fields])

    # Split: final two weeks for test, everything before for training.
    train_df = df_feat.iloc[:-2]
    test_df = df_feat.iloc[-2:]

    train_end_date = df_feat["week_start"].iloc[-3]
    test_start_date = df_feat["week_start"].iloc[-2]
    test_end_date = df_feat["week_start"].iloc[-1]

    X_train = train_df[feat_cols].astype(float).values
    y_train = train_df[target_col].astype(float).values
    X_test = test_df[feat_cols].astype(float).values
    y_true = test_df[target_col].astype(float).values

    # Fit Linear Regression
    model = LinearRegression()
    model.fit(X_train, y_train)
    # NOTE(review): get_params() returns the estimator's constructor settings,
    # not the fitted coefficients — confirm this is what model_params should hold.
    params = model.get_params()

    # Predict and evaluate
    y_pred = model.predict(X_test)
    r2 = r2_score(y_true, y_pred)
    # The `squared=False` keyword was deprecated in scikit-learn 1.4 and removed
    # in 1.6; taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    # NOTE(review): actuals <= 0 are replaced by 1e-9 in the denominator, so any
    # non-zero error on a zero-sales week yields a very large MAPE (and a very
    # negative accuracy). Kept as-is so stored metrics stay comparable.
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1e-9)))
    acc = 1.0 - mape

    # Build result row
    res = pd.DataFrame([{
        "store_key":       pdf["store_key"].iloc[0],
        "product_key":     pdf["product_key"].iloc[0],
        "train_end_date":  train_end_date,
        "test_start_date": test_start_date,
        "test_end_date":   test_end_date,
        "r2":              r2,
        "rmse":            rmse,
        "mae":             mae,
        "mape_error":      mape,
        "accuracy":        acc,
        "model_params":    str(params),
        "y_true":          y_true.tolist(),
        "y_pred":          y_pred.tolist()
    }])
    return res


StatementMeta(, b8dd1f89-3ff9-44ec-8a36-c1a6d2fda106, 7, Finished, Available, Finished)

In [6]:
# Score every (store_key, product_key) series: Spark passes each group's rows
# to lr_eval as a pandas DataFrame and collects the returned metric rows.
results = features.groupBy("store_key", "product_key").applyInPandas(
    lr_eval,
    schema=lr_result_schema,
)

StatementMeta(, b8dd1f89-3ff9-44ec-8a36-c1a6d2fda106, 8, Finished, Available, Finished)

In [7]:

# Write the metrics out; "overwrite" mode replaces whatever the table held before.
(
    results.write
    .mode("overwrite")
    .saveAsTable("Machine_Learning.results.results_linear_regression_products_w_no_data")
)

print("✅ Linear Regression metrics saved to Machine_Learning.results.results_linear_regression_products_w_no_data")

StatementMeta(, b8dd1f89-3ff9-44ec-8a36-c1a6d2fda106, 9, Finished, Available, Finished)

✅ Linear Regression metrics saved to Machine_Learning.results.results_linear_regression_products_w_no_data
