In [0]:
%pip install -qU databricks-feature-engineering mlflow ray[default] ray[data] databricks-sql-connector

dbutils.library.restartPython()

In [0]:
# Three-level name (catalog.schema.table) of the time-series table that the
# rest of the notebook writes to and reads from.
catalog, schema, table = (
    "jon_cheung",
    "ray_gtm_examples",
    "data_synthetic_timeseries_mini",
)
# NOTE(review): presumably the name of the target column -- it is not used in
# the cells shown here; confirm against the data-generation notebook.
label = "y"

## Optional: Generate a massive time-series dataset

In [0]:
%run ./generate_timeseries_data

In [0]:
# Persist the synthetic data as a table, but only when it does not exist yet.
import pandas as pd

# `id_sdf` is not defined in this cell -- presumably it is produced by the
# `generate_timeseries_data` notebook executed above; verify.
if not spark.catalog.tableExists(f"{catalog}.{schema}.{table}"):
    # The table is missing, so write the generated DataFrame out under the
    # configured catalog.schema.table name.
    (
        id_sdf.write
        .mode('overwrite')
        .saveAsTable(f"{catalog}.{schema}.{table}")
    )
    print(f"... OK!")

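## Ray Data with `map_groups`

Group the time-series table by `group_name` and use Ray Data's `map_groups` to fit one Prophet model per group in parallel.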


In [0]:
import ray
from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster

# Start an autoscaling Ray-on-Spark cluster. Recommended configuration:
# - num_cpus_worker_node: set it to the CPU count of one Spark worker node, so
#   that each Spark worker node launches a single Ray worker node that fully
#   utilizes it. (`num_cpus_per_node` is the deprecated spelling of the same
#   argument; the new name matches the other `*_worker_node` arguments below.)
# - min_worker_nodes: the minimum number of Ray worker nodes to keep running.
# - max_worker_nodes: the maximum number of Ray worker nodes. Together with
#   `min_worker_nodes` this enables autoscaling between the two bounds.
setup_ray_cluster(
    min_worker_nodes=2,
    max_worker_nodes=8,
    num_cpus_worker_node=16,
    num_gpus_worker_node=0,
    # Ray logs are copied to this location.
    collect_log_to_path="/dbfs/Users/jon.cheung@databricks.com/ray_collected_logs",
    # NOTE(review): RAY_memory_monitor_refresh_ms is normally a Ray environment
    # setting (0 turns the memory monitor off) -- confirm that passing it as a
    # keyword argument here actually takes effect.
    RAY_memory_monitor_refresh_ms=0,
)

In [0]:
import os

# Tell Ray which Unity Catalog volume directory to use. This only sets the
# environment variable -- it does not create the volume, which must already
# exist. NOTE(review): presumably consumed by `ray.data.from_spark` as its
# temporary storage location; confirm the exact variable name the runtime reads.
os.environ.update(
    {'RAY_UC_VOLUMES_FUSE_TEMP_DIR': f'/Volumes/{catalog}/{schema}/ray_data'}
)

In [0]:
from prophet import Prophet
import pandas as pd
import mlflow
import os
from mlflow.utils.databricks_utils import get_databricks_env_vars
from ray.data import from_spark

# MLflow experiment that receives the parent run and every per-group child run.
experiment_name = "/Users/jon.cheung@databricks.com/ray_prophet_large"
mlflow.set_experiment(experiment_name=experiment_name)

# Databricks environment variables for MLflow, fetched once here so that the
# training function can re-apply them with `os.environ.update` inside each task.
mlflow_db_creds = get_databricks_env_vars("databricks")

def train_and_inference_prophet(grouped_data: pd.DataFrame,
                                horizon: int,
                                parent_run_id: str
                                ) -> pd.DataFrame:
    """Fit a Prophet model on one group and forecast `horizon` periods ahead.

    Intended to be passed to Ray Data's `map_groups`, which calls it once per
    group. The fit is logged as an MLflow child run nested under
    `parent_run_id` and named after the group.

    Args:
        grouped_data: Rows of a single group. Must contain a `group_name`
            column; it is passed to `Prophet.fit` as-is.
            NOTE(review): Prophet expects `ds` and `y` columns -- confirm the
            source table provides them.
        horizon: Number of future periods passed to
            `Prophet.make_future_dataframe`.
        parent_run_id: Run id of the MLflow run that the child run is nested
            under.

    Returns:
        The DataFrame produced by `Prophet.predict`, with an added
        `group_name` column identifying the group the forecast belongs to.
    """
    # Set MLflow credentials and the active experiment within each Ray task.
    os.environ.update(mlflow_db_creds)
    mlflow.set_experiment(experiment_name)

    # Positional lookup of the first row. The label-based `.loc[0, ...]` raises
    # KeyError whenever the group's index does not contain the label 0.
    group_name = grouped_data["group_name"].iloc[0]

    # Create a nested child run named after the group.
    with mlflow.start_run(run_name=f"{group_name}",
                          parent_run_id=parent_run_id):
        dataset = mlflow.data.from_pandas(grouped_data)
        mlflow.log_input(dataset)

        m = Prophet(daily_seasonality=True)
        m.fit(grouped_data)
        future = m.make_future_dataframe(periods=horizon)
        forecast = m.predict(future)

    # Tag every forecast row with its group so that forecasts from different
    # groups remain distinguishable once they are combined into one dataset.
    forecast["group_name"] = group_name
    return forecast

# Load the source table into a Ray Dataset.
ray_data = from_spark(spark.read.table(f"{catalog}.{schema}.{table}"),
                      use_spark_chunk_api=False)

with mlflow.start_run(run_name="prophet_models_250121_mini_autoscale") as parent_run:
    grouped = ray_data.groupby("group_name")
    # One call of the training function per group, each reserving 1 CPU.
    forecasts = grouped.map_groups(train_and_inference_prophet,
                                   num_cpus=1,
                                   fn_kwargs={"horizon": 14,
                                              "parent_run_id": parent_run.info.run_id})

    # map_groups() is lazy: nothing is trained until the resulting dataset is
    # consumed. Materialize it here, while the parent run is still open, so
    # that the per-group child runs are actually created under it.
    # materialize() is used rather than take_all() so the forecasts are not
    # all pulled onto the driver.
    forecasts = forecasts.materialize()