In [1]:
# ════════════════════════════════════════════════════════════════════════════
#  RE-ORDER FORECAST  –  use stored SARIMAX hyper-params to predict week t+1
# ════════════════════════════════════════════════════════════════════════════
from pyspark.sql import functions as F, Window
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType
import pandas as pd, numpy as np, ast
from statsmodels.tsa.statespace.sarimax import SARIMAX


StatementMeta(, 1de9309c-97c4-4553-a3f5-dccd4623c882, 3, Finished, Available, Finished)

In [2]:
# ---------------------------------------------------------------------------
# 1. Load features and latest hyper-params
# ---------------------------------------------------------------------------
# Weekly feature rows read from the `..._combos_with_data` feature table.
features_all = spark.table("Machine_Learning.features.weekly_features_combos_with_data")

# Within each (store_key, product_key), order stored runs newest-first by
# train_end_date so that row number 1 is the most recent run.
newest_run_first = (
    Window.partitionBy("store_key", "product_key")
          .orderBy(F.col("train_end_date").desc())
)
results_ranked = spark.table("Machine_Learning.results.results_arima").withColumn(
    "rn", F.row_number().over(newest_run_first)
)

# Keep only the top-ranked run per combo and the model_order string it stored.
results_latest = results_ranked.filter("rn = 1").select(
    "store_key", "product_key", "model_order"
)

# Inner join: combos that have no stored run are dropped from `features`.
features = features_all.join(
    results_latest, on=["store_key", "product_key"], how="inner"
)

# ---------------------------------------------------------------------------
# 2. Define output schema (no stock_on_order)
# ---------------------------------------------------------------------------
# Schema of the frame returned by the per-combo forecaster: two integer keys,
# the model label, and the forecast itself (the only nullable field).
schema_out = StructType([
    StructField(name="store_key", dataType=IntegerType(), nullable=False),
    StructField(name="product_key", dataType=IntegerType(), nullable=False),
    StructField(name="model", dataType=StringType(), nullable=False),
    StructField(name="reorder_predict", dataType=DoubleType(), nullable=True),
])

# ---------------------------------------------------------------------------
# 3. Pandas UDF: forecast next week per combo
# ---------------------------------------------------------------------------
exog_cols = ["promo_flag","log_price","month_sin","month_cos"]


def _parse_model_order(spec):
    """Split a stored ``"<order>_<seasonal_order>"`` string into two tuples.

    Returns ``(order, seasonal_order)`` when ``spec`` holds a 3-element
    sequence, an underscore, and a 4-element sequence. Returns
    ``(None, None)`` when ``spec`` is empty, is not a string, cannot be
    parsed as Python literals, or has the wrong number of elements.
    """
    if not spec:
        return None, None
    try:
        order_txt, seasonal_txt = spec.split("_")
        order = tuple(ast.literal_eval(order_txt))
        seasonal = tuple(ast.literal_eval(seasonal_txt))
    except (AttributeError, TypeError, ValueError, SyntaxError):
        # AttributeError: not a string; ValueError: wrong number of "_" parts
        # or a malformed literal; SyntaxError/TypeError: not a tuple literal.
        return None, None
    if len(order) != 3 or len(seasonal) != 4:
        return None, None
    return order, seasonal


def forecast_next(pdf: pd.DataFrame) -> pd.DataFrame:
    """Forecast the next week's quantity for one (store_key, product_key) group.

    Refits a SARIMAX model on the group's full history using the stored
    ``model_order`` hyper-parameters and predicts one step ahead.

    Parameters
    ----------
    pdf : pd.DataFrame
        All rows of one group. Must contain ``store_key``, ``product_key``,
        ``week_start``, ``weekly_quantities_sold``, ``model_order`` and every
        column listed in ``exog_cols``.

    Returns
    -------
    pd.DataFrame
        Exactly one row with ``store_key``, ``product_key``, ``model``
        (always ``"SARIMAX"``) and ``reorder_predict``. The prediction is
        ``None`` when the stored hyper-parameters are unusable, when fitting
        or forecasting raises, or when the forecast is not a finite number.
    """
    import logging  # local import keeps the function self-contained

    pdf = pdf.sort_values("week_start")
    store_key = int(pdf["store_key"].iloc[0])
    product_key = int(pdf["product_key"].iloc[0])

    y   = pdf["weekly_quantities_sold"].astype(float).values
    exg = pdf[exog_cols].astype(float).values

    order, seasonal = _parse_model_order(pdf["model_order"].iloc[0])

    # Exogenous row for the week after the last observed one: no promotion,
    # last known log price, and the month encoding of the new week.
    next_wk   = pdf["week_start"].iloc[-1] + pd.Timedelta(days=7)
    angle     = 2 * np.pi * next_wk.month / 12
    exog_next = np.array([[0.0, pdf["log_price"].iloc[-1], np.sin(angle), np.cos(angle)]])

    pred = np.nan
    if order is not None:
        try:
            fitted = SARIMAX(
                y, exog=exg,
                order=order,
                seasonal_order=seasonal,
                enforce_stationarity=False,
                enforce_invertibility=False
            ).fit(disp=False)
            pred = float(fitted.forecast(steps=1, exog=exog_next)[0])
        except Exception as exc:
            # Best effort: one combo that cannot be fitted must not abort the
            # whole job, but the failure is recorded instead of being hidden.
            # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit
            # propagate.
            logging.getLogger("reorder_forecast").warning(
                "SARIMAX forecast failed for store_key=%s product_key=%s: %r",
                store_key, product_key, exc,
            )
            pred = np.nan

    return pd.DataFrame([{
        "store_key"      : store_key,
        "product_key"    : product_key,
        "model"          : "SARIMAX",
        # inf/-inf are as unusable as NaN for a reorder quantity
        "reorder_predict": pred if np.isfinite(pred) else None
    }])

# ---------------------------------------------------------------------------
# 4. Run & save
# ---------------------------------------------------------------------------
# Only the columns the forecaster reads are shipped to the Python workers.
forecast_input_cols = [
    "store_key",
    "product_key",
    "week_start",
    "weekly_quantities_sold",
    *exog_cols,
    "model_order",
]

# One forecast_next call per (store_key, product_key) group.
reorder = (
    features.select(*forecast_input_cols)
            .groupBy("store_key", "product_key")
            .applyInPandas(forecast_next, schema=schema_out)
)




StatementMeta(, 1de9309c-97c4-4553-a3f5-dccd4623c882, 4, Finished, Available, Finished)

In [3]:
# Replace the contents of the predictions table with this run's forecasts.
reorder_writer = reorder.write.mode("overwrite")
reorder_writer.saveAsTable("Machine_Learning.predictions.reorder_forecast")

print("✅  Reorder forecasts saved to Machine_Learning.predictions.reorder_forecast")

StatementMeta(, 1de9309c-97c4-4553-a3f5-dccd4623c882, 5, Finished, Available, Finished)

✅  Reorder forecasts saved to Machine_Learning.predictions.reorder_forecast


## **Compute the predictions for the products with no data**

In [4]:
from pyspark.sql import functions as F, Window
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType
import pandas as pd, numpy as np, ast
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

# ─── 4. Load latest results for each model ───────────────────────────────────

# helper: keep the lowest-MAE result row per (store_key, product_key)
def load_latest(table_name, model_label):
    """Return the lowest-MAE result row per (store_key, product_key).

    Despite the name, rows are ranked by ``mae`` (ascending), not by date.

    Parameters
    ----------
    table_name : str
        Fully qualified results table; must expose ``store_key``,
        ``product_key``, ``model_params`` and ``mae``.
    model_label : str
        Literal written to the added ``model`` column of every returned row.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``store_key``, ``product_key``, ``model_params``, ``mae`` and
        ``model``, one row per (store_key, product_key).
    """
    lowest_mae_first = (
        Window.partitionBy("store_key", "product_key")
              # Spark's default ascending sort places NULLs first, which would
              # rank a run with a missing MAE as the best one; push them last.
              .orderBy(F.col("mae").asc_nulls_last())
    )
    return (
        spark.table(table_name)
             .withColumn("rn", F.row_number().over(lowest_mae_first))
             .filter("rn = 1")
             .select("store_key", "product_key", "model_params", "mae")
             .withColumn("model", F.lit(model_label))
    )

# Best stored run of each candidate model, labelled with the model name.
xgb_res = load_latest("Machine_Learning.results.results_xgboost_products_w_no_data", "XGBoost")
lr_res  = load_latest("Machine_Learning.results.results_linear_regression_products_w_no_data", "LinearRegression")
mean_res= load_latest("Machine_Learning.results.results_baseline_mean", "BaselineMean")

# Stack the candidates and keep one winner per (store_key, product_key).
all_res = xgb_res.unionByName(lr_res).unionByName(mean_res)
best_model_first = (
    Window.partitionBy("store_key", "product_key")
          .orderBy(
              # NULL MAEs sort first by default in Spark; force them last so an
              # unscored candidate never beats a scored one.
              F.col("mae").asc_nulls_last(),
              # Tie-break on the model label so equal MAEs always resolve the
              # same way between runs.
              F.col("model"),
          )
)
best = (
    all_res
      .withColumn("rank", F.row_number().over(best_model_first))
      .filter("rank = 1")
      .select("store_key","product_key","model","model_params")
)

# ─── 5. Prepare features for forecasting ─────────────────────────────────────
# Weekly feature rows read from the `..._combos_with_no_data` feature table.
all_features = spark.table("Machine_Learning.features.weekly_features_combos_with_no_data")

# Attach the winning model to every feature row; the inner join drops combos
# that have no row in `best`.
to_pred = all_features.join(best, on=["store_key", "product_key"], how="inner")

# Output schema: keys, the chosen model and its parameter string, and the
# forecast (the only nullable field).
schema_out = StructType([
    StructField(name="store_key", dataType=IntegerType(), nullable=False),
    StructField(name="product_key", dataType=IntegerType(), nullable=False),
    StructField(name="model", dataType=StringType(), nullable=False),
    StructField(name="model_params", dataType=StringType(), nullable=False),
    StructField(name="reorder_predict", dataType=DoubleType(), nullable=True),
])

StatementMeta(, 1de9309c-97c4-4553-a3f5-dccd4623c882, 6, Finished, Available, Finished)

In [5]:
# ─── 6. Forecast next week based on chosen model ─────────────────────────────
def _lagged_design(y, exog):
    """Build the training matrix ``[lag1, lag2, exog]`` and its targets.

    The first two rows are dropped from both outputs because their lag values
    do not exist.
    """
    history = pd.Series(y)
    # .bfill() replaces fillna(method="bfill"), which pandas has deprecated.
    lag1 = history.shift(1).bfill().to_numpy().reshape(-1, 1)
    lag2 = history.shift(2).bfill().to_numpy().reshape(-1, 1)
    return np.hstack([lag1, lag2, exog])[2:], y[2:]


def forecast_next(pdf: pd.DataFrame) -> pd.DataFrame:
    """Forecast next week's quantity for one group using its chosen model.

    Parameters
    ----------
    pdf : pd.DataFrame
        All rows of one group. Must contain ``store_key``, ``product_key``,
        ``week_start``, ``weekly_quantities_sold``, ``promo_flag``,
        ``log_price``, ``month_sin``, ``month_cos``, ``model`` and
        ``model_params``.

    Returns
    -------
    pd.DataFrame
        An empty frame (columns taken from the module-level ``schema_out``)
        when the group has fewer than 3 rows. Otherwise one row with
        ``store_key``, ``product_key``, ``model``, ``model_params`` and
        ``reorder_predict``.

    Notes
    -----
    ``"XGBoost"`` and ``"LinearRegression"`` are refitted on two lags plus the
    exogenous columns, with ``model_params`` parsed as a Python dict literal.
    Any other model name is treated as the mean baseline, whose prediction is
    the number after ``"="`` in ``model_params``. Parsing or fitting errors
    are not caught and propagate to the caller.
    """
    # require at least 3 weeks of history for lags and prediction
    if len(pdf) < 3:
        return pd.DataFrame([], columns=[f.name for f in schema_out.fields])

    pdf = pdf.sort_values("week_start")
    y = pdf["weekly_quantities_sold"].astype(float).values
    exog = pdf[["promo_flag","log_price","month_sin","month_cos"]].astype(float).values
    model_name = pdf["model"].iloc[0]
    params_txt = pdf["model_params"].iloc[0]

    if model_name in ("XGBoost", "LinearRegression"):
        params = ast.literal_eval(params_txt)
        if model_name == "XGBoost":
            mdl = XGBRegressor(objective="reg:squarederror", **params)
        else:
            mdl = LinearRegression(**params)

        X_train, y_train = _lagged_design(y, exog)
        mdl.fit(X_train, y_train)

        # Feature row for the week after the last observed one: the two most
        # recent quantities as lags, no promotion, last known log price, and
        # the month encoding of the new week.
        next_wk = pdf["week_start"].iloc[-1] + pd.Timedelta(days=7)
        angle = 2 * np.pi * next_wk.month / 12
        X_next = np.array([[y[-1], y[-2], 0.0, exog[-1, 1], np.sin(angle), np.cos(angle)]])
        pred = mdl.predict(X_next)[0]
    else:  # BaselineMean
        pred = float(params_txt.split("=")[1])

    return pd.DataFrame([{
        "store_key": int(pdf["store_key"].iloc[0]),
        "product_key": int(pdf["product_key"].iloc[0]),
        "model": model_name,
        "model_params": params_txt,
        "reorder_predict": float(pred)
    }])


StatementMeta(, 1de9309c-97c4-4553-a3f5-dccd4623c882, 7, Finished, Available, Finished)

In [6]:
# ─── 7. Generate and save forecasts ─────────────────────────────────────────
# model and model_params are part of the grouping key, so each call to
# forecast_next receives rows that share a single model choice.
combo_groups = to_pred.groupBy("store_key", "product_key", "model", "model_params")
reorder2 = combo_groups.applyInPandas(forecast_next, schema=schema_out)


StatementMeta(, 1de9309c-97c4-4553-a3f5-dccd4623c882, 8, Finished, Available, Finished)

In [7]:
# Replace the contents of the second predictions table with this run's forecasts.
(
    reorder2.write
            .mode("overwrite")
            .saveAsTable("Machine_Learning.predictions.reorder_forecast_2")
)

print("✅  Reorder forecasts saved to Machine_Learning.predictions.reorder_forecast_2")

StatementMeta(, 1de9309c-97c4-4553-a3f5-dccd4623c882, 9, Finished, Available, Finished)

✅  Reorder forecasts saved to Machine_Learning.predictions.reorder_forecast_2
