In [1]:
import mlflow
import pandas as pd
import numpy as np
from datetime import datetime
from pyspark.sql import functions as F, types as T
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
import mlflow



StatementMeta(, 0f95dcb3-19c8-4417-bb4b-5d3c272c224c, 3, Finished, Available, Finished)

In [2]:
# Notebook configuration — values read by the cells below.
# CSV of historical sales loaded by the Spark reader.
input_path = "Files/Sales/sales_history.csv"
# Folder under which the dated forecast CSV export is written.
output_path = "Files/Forecasts"
# Number of trailing weeks of each series excluded from model training.
holdout_weeks = 8
# Number of weekly steps to forecast past the end of the training window.
forecast_horizon = 12

StatementMeta(, 0f95dcb3-19c8-4417-bb4b-5d3c272c224c, 4, Finished, Available, Finished)

In [3]:
# Load the raw sales history; the header row supplies column names and Spark
# infers the column types. The "date" column is then converted to a date.
df = spark.read.csv(input_path, header=True, inferSchema=True).withColumn(
    "date", F.to_date(F.col("date"))
)

# Roll the rows up to one row per site / product / week, summing units sold.
# "week_start" is the sale date truncated to the start of its week.
weekly_df = (
    df.withColumn("week_start", F.date_trunc("week", "date"))
    .groupBy("site_id", "product_id", "week_start")
    .agg(F.sum(F.col("units_sold")).alias("units_sold"))
)

# Preview a handful of aggregated rows.
display(weekly_df.limit(10))


StatementMeta(, 0f95dcb3-19c8-4417-bb4b-5d3c272c224c, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, d57579ab-3178-4c5e-8a32-8080eac700b4)

In [4]:


def fit_and_forecast(pdf: pd.DataFrame) -> pd.DataFrame:
    """Fit a weekly model for one (site_id, product_id) series and forecast it.

    Written for ``groupBy(...).applyInPandas``: ``pdf`` holds the rows of a
    single group. Exponential smoothing (ETS) is tried first; if it raises or
    produces non-finite values, ARIMA(1,1,1) is used as a fallback. The model
    that succeeds is logged to MLflow inside a nested run named
    ``"{site}_{prod}"``.

    Reads the module-level settings ``holdout_weeks`` and ``forecast_horizon``.

    Parameters
    ----------
    pdf : pd.DataFrame
        Rows of one group with columns ``site_id``, ``product_id``,
        ``week_start`` (datetime-like, so it can be put on a 7-day grid) and
        ``units_sold``.

    Returns
    -------
    pd.DataFrame
        ``forecast_horizon`` rows with columns ``site_id``, ``product_id``,
        ``week_start``, ``forecast_units`` (non-negative integers), ``method``
        (``"ETS"``, ``"ETS_NoSeasonality"`` or ``"ARIMA"``) and
        ``mlflow_model_uri``.

    Raises
    ------
    RuntimeError
        If both the ETS attempt and the ARIMA fallback fail for this series.
    """
    site = pdf['site_id'].iloc[0]
    prod = pdf['product_id'].iloc[0]

    # Put the series on a regular 7-day grid; weeks with no rows count as zero
    # sales. No further gap filling is needed: after fillna(0) nothing is
    # missing, so the former interpolate / back-fill pass was a no-op (and it
    # relied on the deprecated ``fillna(method=...)`` argument).
    ts = pdf.set_index('week_start')['units_sold'].asfreq('7D').fillna(0)

    n = len(ts)
    # Keep the last ``holdout_weeks`` points out of training, but never ask for
    # fewer than 3 training points (slicing caps this at the series length).
    # NOTE(review): the forecast below starts right after the training window,
    # so its first weeks overlap the held-out actuals, which this function does
    # not use for evaluation — confirm that this is intended.
    train_len = max(3, n - holdout_weeks)
    train = ts.iloc[:train_len]

    forecast_horizon_local = forecast_horizon  # to use inside pandas UDF
    model_uri = None

    with mlflow.start_run(run_name=f"{site}_{prod}", nested=True) as run:
        mlflow.log_params({"site_id": site, "product_id": prod})

        # --- Try ETS first ---
        # Yearly (52-week) additive seasonality only with >= 2 years of data.
        seasonal_periods = 52 if len(train) >= 104 else None
        seasonal = 'add' if seasonal_periods else None

        try:
            ets_model = ExponentialSmoothing(
                train,
                trend='add',
                seasonal=seasonal,
                seasonal_periods=seasonal_periods
            ).fit(optimized=True)

            forecast = ets_model.forecast(forecast_horizon_local)
            # A NaN/inf forecast cannot be cast to int further down; treat it
            # as an ETS failure so the ARIMA fallback gets a chance.
            if not np.isfinite(forecast).all():
                raise ValueError("ETS produced a non-finite forecast")
            method = "ETS" if seasonal_periods else "ETS_NoSeasonality"

            # Log ETS model to MLflow
            model_path = f"models/{site}_{prod}_ETS"
            mlflow.statsmodels.log_model(ets_model, artifact_path=model_path)
            model_uri = f"runs:/{run.info.run_id}/{model_path}"

        except Exception as e_ets:
            print(f"ETS failed for {site}-{prod}: {e_ets}")
            # --- Fallback to ARIMA ---
            try:
                if len(train) < 5:
                    raise ValueError("Not enough data for ARIMA fallback")

                ar_model = ARIMA(train, order=(1,1,1)).fit()
                forecast = ar_model.get_forecast(forecast_horizon_local).predicted_mean
                if not np.isfinite(forecast).all():
                    raise ValueError("ARIMA produced a non-finite forecast")
                method = "ARIMA"

                # Log ARIMA model to MLflow
                model_path = f"models/{site}_{prod}_ARIMA"
                mlflow.statsmodels.log_model(ar_model, artifact_path=model_path)
                model_uri = f"runs:/{run.info.run_id}/{model_path}"

            except Exception as e_ar:
                # If both fail, raise a clear error and keep the ARIMA
                # traceback attached as the direct cause.
                raise RuntimeError(
                    f"Both ETS and ARIMA failed for {site}-{prod}: ETS({e_ets}) ARIMA({e_ar})"
                ) from e_ar

        # Post-process forecast:
        # ensure every value is at least 0 (negative values become 0), then
        # round to whole units.
        chosen = forecast.clip(lower=0).round().astype(int)

        result = pd.DataFrame({
            'site_id': [site] * forecast_horizon_local,
            'product_id': [prod] * forecast_horizon_local,
            # Forecast weeks continue the 7-day grid after the last training week.
            'week_start': pd.date_range(train.index[-1] + pd.Timedelta(7, 'd'),
                                        periods=forecast_horizon_local, freq='7D'),
            'forecast_units': chosen.values,
            'method': [method] * forecast_horizon_local,
            'mlflow_model_uri': [model_uri] * forecast_horizon_local
        })

        return result


StatementMeta(, 0f95dcb3-19c8-4417-bb4b-5d3c272c224c, 6, Finished, Available, Finished)

In [5]:
# Output schema handed to applyInPandas: the columns, in order, of the frame
# returned by fit_and_forecast. Built from (column name, Spark type) pairs.
schema = T.StructType([
    T.StructField(column_name, spark_type)
    for column_name, spark_type in [
        ("site_id", T.StringType()),
        ("product_id", T.StringType()),
        ("week_start", T.TimestampType()),
        ("forecast_units", T.IntegerType()),
        ("method", T.StringType()),
        ("mlflow_model_uri", T.StringType()),
    ]
])

# Run fit_and_forecast once per (site_id, product_id) group.
# The result is lazy and is consumed by several actions in this notebook (the
# preview below, the Delta table write and the CSV export). Without caching,
# each action re-executes the per-group model fitting and MLflow logging, and
# the outputs can end up referencing different MLflow runs. cache() lets
# partitions that were already computed be reused by later actions.
result_df = (
    weekly_df.groupBy("site_id", "product_id")
             .applyInPandas(fit_and_forecast, schema=schema)
             .cache()
)
display(result_df.limit(10))


StatementMeta(, 0f95dcb3-19c8-4417-bb4b-5d3c272c224c, 7, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 84d74f21-08a9-4fb4-b580-fb21dd05f0d1)

In [6]:
# Name of the Delta table that downstream consumers read.
table_name = "ForecastWeekly"  # your downstream table

# Save the forecast as a Delta table, replacing whatever the table held before.
(
    result_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(table_name)
)

print(f" Forecasts written to Fabric data table: {table_name}")

StatementMeta(, 0f95dcb3-19c8-4417-bb4b-5d3c272c224c, 8, Finished, Available, Finished)

 Forecasts written to Fabric data table: ForecastWeekly


In [7]:
# Date stamp (YYYYMMDD) for the export folder; a re-run on the same day
# targets the same folder, and overwrite mode replaces its contents.
timestamp = datetime.now().strftime("%Y%m%d")

# Export the forecast as comma-delimited CSV with a header row.
(
    result_df.write
    .option("header", True)
    .option("delimiter", ",")
    .mode("overwrite")
    .csv(f"{output_path}/forecasts_{timestamp}")
)
print(" Forecasts written to Lakehouse")


StatementMeta(, 0f95dcb3-19c8-4417-bb4b-5d3c272c224c, 9, Finished, Available, Finished)

 Forecasts written to Lakehouse
