## Parallelized Prophet Modeling with Ray

Prophet is a simple, yet powerful, additive forecasting model. It is simple because its implementation is intuitive and requires editing only a few parameters; it is powerful because it provides an algorithmically efficient way to identify time-related patterns in the data. These two aspects make Prophet an ideal starting point, and possibly end point, for a forecasting model.

However, in real-world production use-cases we must overcome scaling challenges in model training and inference. Specifically, in retail use-cases we'd like to generate forecasting models for every combination of store x SKU. This can lead to 100K+ models. Furthermore, business demands may require all these models be trained overnight on a weekly basis!

In [0]:
%pip install -qU mlflow ray[default]==2.44.1 ray[data]==2.44.1
# Restart the notebook's Python process following the package install above.
dbutils.library.restartPython()

In [0]:
# Input location: the forecasting cells read the table `{catalog}.{schema}.{table}`.
catalog = "main"
schema = "ray_gtm_examples"
table = "data_synthetic_timeseries_1000_groups"

# Output location: forecasts are written to `{catalog}.{schema}.{write_table}`.
write_table = "prophet_forecasts"

# NOTE(review): presumably the name of the target column; it is not referenced
# in the cells visible here -- confirm before relying on it.
label = "y"

## Optional: Generate a massive time-series dataset

In [0]:
%run ./00_generate_timeseries_data

In [0]:
# Persist the synthetic time-series data as a table (first run only)
import pandas as pd

source_table_name = f"{catalog}.{schema}.{table}"

# Write only when the table is absent, so an existing table is left untouched.
# NOTE(review): `id_sdf` is not defined in this cell -- presumably it comes from
# the `%run ./00_generate_timeseries_data` cell; confirm.
if not spark.catalog.tableExists(source_table_name):
    id_sdf.write.mode("overwrite").saveAsTable(source_table_name)
    print(f"... OK!")

## Ray Data with `map_groups`


In [0]:
import ray
from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster

# When True, tear down any previously started Ray-on-Spark cluster and local
# Ray session before a new cluster is created below.
restart = True
if restart:
    # Best-effort teardown: a failure here (e.g. nothing to shut down) must not
    # stop the notebook. Catch `Exception` rather than using a bare `except` so
    # KeyboardInterrupt / SystemExit still propagate, and report what was
    # skipped instead of swallowing it silently.
    try:
        shutdown_ray_cluster()
    except Exception as exc:
        print(f"Skipping shutdown_ray_cluster(): {exc}")
    try:
        ray.shutdown()
    except Exception as exc:
        print(f"Skipping ray.shutdown(): {exc}")

# The configuration below mirrors my Spark worker cluster setup. Change this to
# match your cluster configuration.
setup_ray_cluster(
    min_worker_nodes=6,
    max_worker_nodes=6,
    num_cpus_worker_node=16,
    num_gpus_worker_node=0,
    collect_log_to_path="/dbfs/Users/jon.cheung@databricks.com/ray_collected_logs"
)

In [0]:
import os

# Point the RAY_UC_VOLUMES_FUSE_TEMP_DIR environment variable at a path under
# /Volumes for the configured catalog and schema.
# NOTE(review): this only sets the variable; the volume itself is not created
# here and presumably has to exist already -- confirm.
os.environ.update(
    RAY_UC_VOLUMES_FUSE_TEMP_DIR=f'/Volumes/{catalog}/{schema}/ray_data_tmp_dir'
)

In [0]:
from prophet import Prophet
import pandas as pd
import mlflow
import os
from mlflow.utils.databricks_utils import get_databricks_env_vars
from ray.data import from_spark

# Workspace path of the MLflow experiment; it is made the active experiment
# for this notebook session.
experiment_name = '/Users/jon.cheung@databricks.com/ray_prophet_map_batches'
mlflow.set_experiment(experiment_name)

def train_and_inference_prophet(grouped_data: pd.DataFrame,
                                horizon: int,
                                ) -> pd.DataFrame:
    """Fit a Prophet model on one group's rows and forecast ``horizon`` periods.

    Applied once per group through Ray Data ``map_groups``.

    Args:
        grouped_data: Rows belonging to a single group. Must contain a
            ``group_name`` column; the frame is passed to ``Prophet.fit``
            unchanged, so it also has to satisfy Prophet's documented input
            format (``ds`` and ``y`` columns).
        horizon: Number of future periods to forecast.

    Returns:
        The last ``horizon`` rows of the Prophet forecast, with ``ds`` cast to
        ``str`` and a ``group_name`` column identifying the group.
    """
    # Positional lookup: does not depend on the frame having an index label 0.
    group_name = grouped_data["group_name"].iloc[0]

    # Fit the model and generate forecasts.
    m = Prophet(daily_seasonality=True)
    m.fit(grouped_data)
    future = m.make_future_dataframe(periods=horizon)
    forecast = m.predict(future)

    # tail() yields no rows for horizon == 0, whereas iloc[-0:] would return
    # the whole frame. copy() gives an independent frame so the column
    # assignments below are not chained assignments on a slice of `forecast`.
    to_write = forecast.tail(horizon).copy()
    to_write["ds"] = to_write["ds"].astype(str)
    to_write["group_name"] = group_name
    return to_write

# Load the source table into a Ray Dataset.
source_sdf = spark.read.table(f"{catalog}.{schema}.{table}")
ray_data = from_spark(source_sdf, use_spark_chunk_api=False)

# Apply the Prophet fit + forecast function to every `group_name` group,
# reserving one CPU per task and forecasting 14 periods ahead.
# (This variant runs without a parent MLflow run.)
grouped = ray_data.groupby("group_name")
results = grouped.map_groups(
    train_and_inference_prophet,
    num_cpus=1,
    fn_kwargs={"horizon": 14},
)

# Write the per-group forecasts to the output table, replacing its contents.
forecast_table_name = f"{catalog}.{schema}.{write_table}"
ray.data.Dataset.write_databricks_table(results, forecast_table_name, mode='overwrite')
  

In [0]:
from prophet import Prophet
import pandas as pd
import mlflow
import os
from mlflow.utils.databricks_utils import get_databricks_env_vars
from ray.data import from_spark

# Workspace path of the MLflow experiment; it is made the active experiment
# on the driver here and again inside each task by train_and_inference_prophet.
experiment_name = '/Users/jon.cheung@databricks.com/ray_prophet_map_batches'
mlflow.set_experiment(experiment_name)
# Databricks environment variables for MLflow, captured on the driver;
# train_and_inference_prophet copies them into os.environ inside each task.
mlflow_db_creds = get_databricks_env_vars("databricks")

def train_and_inference_prophet(grouped_data: pd.DataFrame,
                                horizon: int,
                                parent_run_id: str
                                ) -> pd.DataFrame:
    """Fit, forecast and MLflow-log a Prophet model for one group.

    Applied once per group through Ray Data ``map_groups``. Each call opens
    its own MLflow run, named after the group and attached to
    ``parent_run_id``, and logs the training data and the fitted model to it.

    Args:
        grouped_data: Rows belonging to a single group. Must contain a
            ``group_name`` column; the frame is passed to ``Prophet.fit``
            unchanged, so it also has to satisfy Prophet's documented input
            format (``ds`` and ``y`` columns).
        horizon: Number of future periods to forecast.
        parent_run_id: ID of the MLflow run the per-group run is nested under.

    Returns:
        The last ``horizon`` rows of the Prophet forecast, with ``ds`` cast to
        ``str`` and a ``group_name`` column identifying the group.
    """
    # Set MLflow credentials and the active MLflow experiment within each Ray
    # task; the module-level values were captured on the driver.
    os.environ.update(mlflow_db_creds)
    mlflow.set_experiment(experiment_name)

    # Positional lookup: does not depend on the frame having an index label 0.
    group_name = grouped_data["group_name"].iloc[0]

    # Create a nested child run named after the group.
    with mlflow.start_run(run_name=f"{group_name}",
                          parent_run_id=parent_run_id,
                          nested=True):

        # Fit the model and generate forecasts.
        m = Prophet(daily_seasonality=True)
        m.fit(grouped_data)
        future = m.make_future_dataframe(periods=horizon)
        forecast = m.predict(future)

        # Shape the forecast for the Delta table write. tail() yields no rows
        # for horizon == 0, whereas iloc[-0:] would return the whole frame.
        # copy() gives an independent frame so the column assignments below
        # are not chained assignments on a slice of `forecast`.
        to_write = forecast.tail(horizon).copy()
        to_write["ds"] = to_write["ds"].astype(str)
        to_write["group_name"] = group_name

        # MLflow logging: training data as an input dataset, plus the model.
        dataset = mlflow.data.from_pandas(grouped_data)
        mlflow.log_input(dataset)
        mlflow.prophet.log_model(pr_model=m,
                                 artifact_path="prophet_model")

    return to_write

# Load the source table into a Ray Dataset.
ray_data = from_spark(spark.read.table(f"{catalog}.{schema}.{table}"),
                      use_spark_chunk_api=False)

with mlflow.start_run(run_name="prophet_models_250403") as parent_run:
    # One task per `group_name` group, one CPU each, 14-period forecast; every
    # task logs a child run under this parent run.
    grouped = ray_data.groupby("group_name")
    results = grouped.map_groups(train_and_inference_prophet,
                                 num_cpus=1,
                                 fn_kwargs={"horizon": 14,
                                            "parent_run_id": parent_run.info.run_id})

    # Write grouped results to a Delta table. Ray Data transformations are
    # lazy: the per-group training (and its child MLflow runs) only executes
    # when the dataset is consumed by this write. Keeping the write inside the
    # `with` block means the parent run is still open while the children run,
    # and a failure during training marks the parent run as failed.
    ray.data.Dataset.write_databricks_table(results,
                                            f"{catalog}.{schema}.{write_table}",
                                            mode='overwrite')
  