In [1]:
%pip install --quiet pmdarima scikit-learn statsmodels numpy pandas mlflow

StatementMeta(, e042e62f-599c-4423-a574-c96f03e04e95, 6, Cancelled, , Cancelled)

In [2]:
# ─────────────────────────────────────────────────────────────────────────────
# 0. LIBRARIES
# ─────────────────────────────────────────────────────────────────────────────
from pyspark.sql import functions as F, types as T
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType
import pandas as pd
import numpy as np
import warnings

from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error

import mlflow
import mlflow.pmdarima # Required for logging pmdarima models
from mlflow.models import infer_signature
from mlflow.pyfunc import PythonModel


StatementMeta(, 617ff04b-c95e-41ea-b8e0-1024a6f4188e, 9, Finished, Available, Finished)

In [None]:
# Reuse the notebook's active SparkSession when there is one (typical in a
# Fabric notebook); otherwise build a session under this application name.
spark = (
    SparkSession.builder
    .appName("ARIMAModelDeployment")
    .getOrCreate()
)

# Point MLflow at the tracking location named by the string "mlflow".
# The intent is for runs to be logged to the Fabric workspace.
# NOTE(review): "mlflow" looks like a relative local path rather than a
# workspace URI -- confirm this is the value Fabric expects, or whether the
# call is needed at all in this environment.
mlflow.set_tracking_uri("mlflow")

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 1. LOAD WEEKLY DATA
# ─────────────────────────────────────────────────────────────────────────────

# Full weekly demand table. Kept as `df` so the single-pair helper further
# down the notebook can be pointed at it as well.
df = spark.sql("""
    SELECT *
    FROM Machine_Learning.filtered_data.all_year_fact_sales_demand_w_data
""")

# Only these four columns are needed for modelling & evaluation.
# No global orderBy here: a sort applied before groupBy(...).apply(...) costs a
# full shuffle and its order is not carried into the groups. The grouped UDF
# sorts each product-store history by week_start itself before fitting.
df_model = df.select(
    "product_key",
    "store_key",
    "week_start",
    "weekly_quantities_sold",
)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 2. DEFINE GROUPED PANDAS-UDF FOR AUTO-ARIMA + METRICS + MLflow Logging
# ─────────────────────────────────────────────────────────────────────────────
# One entry per output column of the grouped UDF: (name, Spark type, nullable).
# The key columns and the observation count are always filled in; every
# model-derived column stays nullable because a series may be skipped or the
# fit may fail.
_RESULT_COLUMNS = [
    ("product_key",    IntegerType(), False),
    ("store_key",      IntegerType(), False),
    ("n_obs",          IntegerType(), False),
    ("aic",            DoubleType(),  True),
    ("bic",            DoubleType(),  True),
    ("arima_order",    StringType(),  True),
    ("seasonal_order", StringType(),  True),
    ("mae",            DoubleType(),  True),
    ("rmse",           DoubleType(),  True),
    ("mape",           DoubleType(),  True),
    ("mlflow_run_id",  StringType(),  True),  # MLflow run that produced the row
    ("model_uri",      StringType(),  True),  # URI of the logged model artifact
]

result_schema = StructType([
    StructField(col_name, col_type, is_nullable)
    for col_name, col_type, is_nullable in _RESULT_COLUMNS
])

MIN_SERIES_LEN   = 99  # shorter series are returned without a fitted model
FORECAST_HORIZON = 4   # last 4 weeks -> test set

In [None]:
@pandas_udf(result_schema, PandasUDFType.GROUPED_MAP)
def arima_per_pair(pdf: pd.DataFrame) -> pd.DataFrame:
    """
    Fit, evaluate and log one auto_arima model for a single product-store pair.

    Invoked once per (product_key, store_key) group. The group's history is
    sorted by week, missing quantities are treated as 0, the last
    FORECAST_HORIZON observations are held out as the test set, and the model
    is fitted on the remainder. Metrics, parameters and the fitted model are
    logged to an MLflow run created for the pair.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Rows of one group with columns 'product_key', 'store_key',
        'week_start' and 'weekly_quantities_sold'.

    Returns
    -------
    pandas.DataFrame
        Exactly one row matching `result_schema`. Model-derived columns are
        left as None when the series is shorter than
        max(MIN_SERIES_LEN, FORECAST_HORIZON + 2) or when fitting/logging
        raises. 'mape' is NaN when every held-out actual is 0.

    Notes
    -----
    MAPE is computed over the held-out weeks whose actual value is non-zero.
    Weeks with zero actual sales are excluded instead of turning the whole
    metric into NaN.
    """
    product_key = int(pdf["product_key"].iloc[0])
    store_key   = int(pdf["store_key"].iloc[0])

    # Ensure ordered by week; the incoming row order of a group is not relied on.
    pdf = pdf.sort_values("week_start")
    y   = pdf["weekly_quantities_sold"].fillna(0).astype(float).values
    n   = len(y)

    # Default output (in case model cannot be fit or series is too short)
    out = {
        "product_key"   : product_key,
        "store_key"     : store_key,
        "n_obs"         : n,
        "aic"           : None,
        "bic"           : None,
        "arima_order"   : None,
        "seasonal_order": None,
        "mae"           : None,
        "rmse"          : None,
        "mape"          : None,
        "mlflow_run_id" : None,
        "model_uri"     : None,
    }

    if n < max(MIN_SERIES_LEN, FORECAST_HORIZON + 2):
        warnings.warn(f"Series too short ({n} obs) for product_key={product_key}, store_key={store_key}. No ARIMA fitted.")
        return pd.DataFrame([out])

    train, test = y[:-FORECAST_HORIZON], y[-FORECAST_HORIZON:]

    # Start an MLflow run for each product-store pair
    with mlflow.start_run(nested=True, run_name=f"ARIMA_Product_{product_key}_Store_{store_key}") as run:
        try:
            model = auto_arima(
                train,
                seasonal=True,
                m=52,                   # weekly seasonality (change if needed)
                stepwise=True,
                suppress_warnings=True,
                error_action="ignore",
                max_p=3, max_q=3, max_P=1, max_Q=1,
                max_order=5
            )

            # Forecast once; the same array feeds the metrics and the signature.
            fcst = np.asarray(model.predict(n_periods=FORECAST_HORIZON), dtype=float)
            mae  = mean_absolute_error(test, fcst)
            rmse = np.sqrt(mean_squared_error(test, fcst))

            # Percentage error is undefined where the actual is 0, so average
            # only over non-zero actuals. A plain mean over an array containing
            # NaN would make the whole metric NaN after one zero-sales week.
            nonzero = test != 0
            if nonzero.any():
                mape = np.mean(np.abs((test[nonzero] - fcst[nonzero]) / test[nonzero])) * 100
            else:
                mape = np.nan

            aic = float(model.aic())
            bic = float(model.bic())
            arima_order    = str(model.order)
            seasonal_order = str(model.seasonal_order)

            # Log metrics to MLflow; an undefined MAPE is left out rather than
            # logged as NaN.
            metrics = {
                "aic": aic,
                "bic": bic,
                "mae": float(mae),
                "rmse": float(rmse),
            }
            if not np.isnan(mape):
                metrics["mape"] = float(mape)
            mlflow.log_metrics(metrics)

            # Log parameters to MLflow
            mlflow.log_params({
                "arima_order": arima_order,
                "seasonal_order": seasonal_order,
                "forecast_horizon": FORECAST_HORIZON,
                "min_series_len": MIN_SERIES_LEN,
                "product_key": product_key, # Log product_key and store_key as params
                "store_key": store_key,
                "n_obs_train": len(train),
            })

            # Log the pmdarima model
            signature = infer_signature(pd.Series(train), fcst)

            # Log the model under a specific artifact path
            mlflow.pmdarima.log_model(
                pmdarima_model=model,
                artifact_path="arima_model",
                signature=signature,
                input_example=pd.Series(train[-10:]), # Example input for inference
                registered_model_name=f"ARIMA_Product_{product_key}_Store_{store_key}_Model" # Register model with a specific name
            )

            # Only fill the output once everything above succeeded, so a row
            # never points at a run whose model failed to log.
            out.update({
                "aic"           : aic,
                "bic"           : bic,
                "arima_order"   : arima_order,
                "seasonal_order": seasonal_order,
                "mae"           : float(mae),
                "rmse"          : float(rmse),
                "mape"          : float(mape),
                "mlflow_run_id" : run.info.run_id,
                "model_uri"     : f"runs:/{run.info.run_id}/arima_model",
            })

        except Exception as e:
            warnings.warn(f"auto_arima failed for ({product_key},{store_key}): {e}. Check MLflow run details.")
            # Recording the failure is best-effort: the message is truncated
            # because MLflow limits param-value length, and any logging error
            # is downgraded to a warning so it cannot abort the whole job.
            try:
                mlflow.log_param("status", "failed") # Log failure status
                mlflow.log_param("error_message", str(e)[:250])
            except Exception as log_err:
                warnings.warn(f"Could not record failure in MLflow for ({product_key},{store_key}): {log_err}")
            # If anything goes wrong, keep the None defaults (already set)

    return pd.DataFrame([out])

StatementMeta(, 617ff04b-c95e-41ea-b8e0-1024a6f4188e, -1, Cancelled, , Cancelled)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 3. RUN THE UDF AND COLLECT METRICS AND MLFLOW INFO
# ─────────────────────────────────────────────────────────────────────────────
# Spark is lazy: groupBy(...).apply(...) only builds a plan. Training and
# MLflow logging run when an action is executed, and run again for every
# further action unless the result is kept. The result is therefore cached and
# materialised once with count() before "complete" is reported.
# Caching is best-effort; persist the result to a table if it must survive
# eviction or a session restart.
print("Starting ARIMA model training and MLflow logging for all product-store pairs...")
metrics_and_mlflow_info_df = (
    df_model
        .groupBy("product_key", "store_key")
        .apply(arima_per_pair)
        .cache()
)
metrics_and_mlflow_info_df.count()  # action: fits and logs every product-store pair

print("ARIMA model training and MLflow logging complete.")
print("Showing results and MLflow run IDs:")
# Best (lowest) MAPE first; rows without a MAPE (series skipped or fit failed)
# go to the end instead of filling the first page.
(
    metrics_and_mlflow_info_df
        .orderBy(F.col("mape").asc_nulls_last())
        .show(20, truncate=False)
)

StatementMeta(, 617ff04b-c95e-41ea-b8e0-1024a6f4188e, -1, Cancelled, , Cancelled)

# If needed, experiment with a single product-store combination

## **Single Run**

In [None]:
def arima_single_pair(
        spark_df,
        product_key: int,
        store_key: int,
        forecast_horizon: int = 4,
        seasonal: bool = True,
        seasonal_periods: int = 52,
        min_series_len: int = 10,
    ) -> pd.DataFrame:
    """
    Fit an auto_arima model for one product-store combination and
    return MAE/RMSE/MAPE + model orders.

    Parameters
    ----------
    spark_df : DataFrame
        Weekly-granular Spark DataFrame with columns
        ['product_key','store_key','week_start','weekly_quantities_sold'].
    product_key, store_key : int
        Keys identifying the series to analyse.
    forecast_horizon : int, default 4
        Number of latest observations to hold out for accuracy metrics.
    seasonal : bool, default True
        Whether to let auto_arima search seasonal terms.
    seasonal_periods : int, default 52
        'm' parameter in SARIMA (52 = annual seasonality on weekly data).
        Ignored (m=1) when `seasonal` is False.
    min_series_len : int, default 10
        Skip modelling if the series is shorter than
        max(min_series_len, forecast_horizon + 2).

    Returns
    -------
    pandas.DataFrame (one row)
        Columns: product_key, store_key, n_obs, aic, bic, arima_order,
        seasonal_order, mae, rmse, mape. Model-derived columns keep their
        NaN/None defaults when the series is too short or the fit fails.
        'mape' is NaN when every held-out actual is 0.

    Raises
    ------
    ValueError
        If `spark_df` has no rows for the requested pair.

    Notes
    -----
    MAPE is computed over the held-out weeks whose actual value is non-zero.
    Weeks with zero actual sales are excluded instead of turning the whole
    metric into NaN.
    """
    # ── 1. Pull history for that pair ───────────────────────────────────────
    pdf = (
        spark_df
        .filter(
            (F.col("product_key") == product_key) &
            (F.col("store_key")   == store_key)
        )
        .select("week_start", "weekly_quantities_sold")
        .orderBy("week_start")
        .toPandas()
    )

    n_obs = len(pdf)
    if n_obs == 0:
        raise ValueError(f"No data found for product_key={product_key}, store_key={store_key}")

    # Missing weekly quantities are treated as zero sales.
    pdf["weekly_quantities_sold"] = pdf["weekly_quantities_sold"].fillna(0).astype(float)
    y = pdf["weekly_quantities_sold"].values

    # ── 2. Default output skeleton ─────────────────────────────────────────
    result = {
        "product_key"   : product_key,
        "store_key"     : store_key,
        "n_obs"         : n_obs,
        "aic"           : np.nan,
        "bic"           : np.nan,
        "arima_order"   : None,
        "seasonal_order": None,
        "mae"           : np.nan,
        "rmse"          : np.nan,
        "mape"          : np.nan,
    }

    # ── 3. Skip very short series ──────────────────────────────────────────
    if n_obs < max(min_series_len, forecast_horizon + 2):
        warnings.warn(f"Series too short ({n_obs} obs). No ARIMA fitted.")
        return pd.DataFrame([result])

    # ── 4. Train / test split ──────────────────────────────────────────────
    train, test = y[:-forecast_horizon], y[-forecast_horizon:]

    try:
        # ── 5. Fit auto_arima ──────────────────────────────────────────────
        model = auto_arima(
            train,
            seasonal=seasonal,
            m=seasonal_periods if seasonal else 1,
            stepwise=True,
            suppress_warnings=True,
            error_action="ignore",
            max_p=3, max_q=3, max_P=1, max_Q=1,
            max_order=5
        )

        # Coerce to a plain float array so the boolean mask below applies.
        fcst = np.asarray(model.predict(n_periods=forecast_horizon), dtype=float)

        # ── 6. Metrics ─────────────────────────────────────────────────────
        mae  = mean_absolute_error(test, fcst)
        rmse = np.sqrt(mean_squared_error(test, fcst))

        # Percentage error is undefined where the actual is 0, so average only
        # over non-zero actuals. A plain mean over an array containing NaN
        # would make the whole metric NaN after one zero-sales week.
        nonzero = test != 0
        if nonzero.any():
            mape = np.mean(np.abs((test[nonzero] - fcst[nonzero]) / test[nonzero])) * 100
        else:
            mape = np.nan

        # ── 7. Populate result dict ───────────────────────────────────────
        result.update({
            "aic"           : float(model.aic()),
            "bic"           : float(model.bic()),
            "arima_order"   : str(model.order),
            "seasonal_order": str(model.seasonal_order),
            "mae"           : float(mae),
            "rmse"          : float(rmse),
            "mape"          : float(mape),
        })

    except Exception as e:
        # Best-effort: a failed fit is reported as a warning and the default
        # NaN/None row is returned.
        warnings.warn(f"auto_arima failed for ({product_key},{store_key}): {e}")

    return pd.DataFrame([result])

StatementMeta(, 617ff04b-c95e-41ea-b8e0-1024a6f4188e, -1, Cancelled, , Cancelled)

In [None]:
# df.show(5)

StatementMeta(, 617ff04b-c95e-41ea-b8e0-1024a6f4188e, -1, Cancelled, , Cancelled)

In [None]:
# sample_product = 468
# sample_store   = 46

# metrics_pdf = arima_single_pair(
#     df,                        # the Spark DataFrame you loaded
#     product_key=sample_product,
#     store_key=sample_store,
#     forecast_horizon=4
# )

# display(metrics_pdf) 

StatementMeta(, 617ff04b-c95e-41ea-b8e0-1024a6f4188e, -1, Cancelled, , Cancelled)