# Parallelized Bayesian Hyperparameter Tuning for XGBoost with Ray 

Boosting algorithms, like XGBoost, offer a simple yet powerful model for solving many regression and classification problems. However, they are prone to overfitting and require hyperparameter tuning with validation datasets to ensure they generalize to the real-world problems they are meant to solve. When it comes to hyperparameter tuning, traditional grid search is inefficient (i.e. unnecessarily time-consuming) and offers little benefit over more efficient methods like Bayesian search, especially when the search space is large. To expand on this, Bayesian search balances exploration and exploitation: it explores the search space and uses the results of earlier trials as a prior to decide which regions to search in more depth in later trials. 

This notebook outlines two powerful additions to XGBoost to improve (i.e. make more efficient) hyperparameter search. They are:
1. Ray for parallelized search -- specifically single-node multi-GPU
2. Optuna for Bayesian search

This notebook is also specific to using a single-node multi-GPU cluster. I'd suggest benchmarking on one trial to see if parallelizing with GPUs is worth it compared to CPUs. For example, I'm currently running this on a node with 4 GPUs and 48 CPUs. Barring any memory constraints, unless a GPU is 12 times faster than a CPU (48 CPUs / 4 GPUs), it's more efficient to use CPUs.  

In [0]:
%pip install optuna ray[data]==2.37.0 ray[train]==2.37.0 ray[tune]==2.37.0
dbutils.library.restartPython()

In [0]:
# Unity Catalog location of the synthetic dataset and the shape it was generated with.
catalog = "main"
schema = "ray_gtm_examples"
num_labels = 5
num_training_rows = 25_000_000
num_training_columns = 100

# Every table name starts with the row/column counts; multi-class tables
# (more than two labels) additionally carry a "_<n>_labels" suffix.
table_path = f"synthetic_data_{num_training_rows}_rows_{num_training_columns}_columns"
if num_labels > 2:
  table_path += f"_{num_labels}_labels"

# Parquet files live in a Volume under <catalog>/<schema>/synthetic_data/.
parquet_path = f"/Volumes/{catalog}/{schema}/synthetic_data/{table_path}"
print(f"Parquet path: {parquet_path}")

In [0]:
import ray
from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster
import os

# Set to False to skip the teardown below and go straight to ray.init().
restart = True
if restart:
  # Best-effort teardown of anything left over from a previous run. Failures
  # are reported rather than raised so the cell can still start a fresh
  # instance. `except Exception` (not a bare `except`) keeps KeyboardInterrupt
  # and SystemExit working.
  try:
    shutdown_ray_cluster()
  except Exception as exc:
    print(f"shutdown_ray_cluster() skipped: {exc}")
  try:
    ray.shutdown()
  except Exception as exc:
    print(f"ray.shutdown() skipped: {exc}")


# Start a local Ray instance on this node with the dashboard enabled.
# Port 9999 must match the port used when building the driver-proxy URL.
context = ray.init(
  include_dashboard=True,
  dashboard_host="0.0.0.0",
  dashboard_port=9999
  )
  

def get_dashboard_url(spark=spark, dbutils=dbutils, dashboard_port='9999'):
  """Build the Databricks driver-proxy URL for the Ray dashboard.

  The URL has the form
  ``https://<workspaceUrl>/driver-proxy/o/<orgId>/<clusterId>/<port>/``,
  with the three identifiers read from the Spark configuration.

  Args:
    spark: Spark session whose ``conf`` exposes the Databricks workspace URL,
      org ID and cluster ID.
    dbutils: Unused. Kept only so existing callers that pass it keep working.
      Earlier versions fetched a notebook API token through it but never used
      the value, so that call was removed.
    dashboard_port: Port the dashboard listens on; a ``str`` or an ``int``.

  Returns:
    The dashboard URL as a string, ending in a trailing slash.
  """
  base_url = 'https://' + spark.conf.get("spark.databricks.workspaceUrl")
  workspace_id = spark.conf.get("spark.databricks.clusterUsageTags.orgId")
  cluster_id = spark.conf.get("spark.databricks.clusterUsageTags.clusterId")

  # The f-string formats the port, so integer ports work as well as strings.
  pathname_prefix = f'/driver-proxy/o/{workspace_id}/{cluster_id}/{dashboard_port}/'

  return base_url + pathname_prefix
  
print(get_dashboard_url())
# Ray dashboard at URL like: https://dbc-dp-1444828305810485.cloud.databricks.com/driver-proxy/o/1444828305810485/0325-215413-crpqybob/9999/

In [0]:
# import xgboost as xgb
# import numpy as np
# from sklearn.datasets import make_classification
# from sklearn.model_selection import train_test_split

# # 1. Set up sample data
# X, y = make_classification(n_samples=1_000_000, n_features=20, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Convert to DMatrix format
# dtrain = xgb.DMatrix(X_train, label=y_train)
# dtest = xgb.DMatrix(X_test, label=y_test)

In [0]:
from typing import Tuple
import ray
from ray.data import Dataset

def prepare_data(test_size: float = 0.3, seed: int = 42) -> Tuple[Dataset, Dataset]:
    """Read the parquet dataset and split it into training and validation sets.

    The data is read from the module-level ``parquet_path``.

    Args:
        test_size: Value forwarded to ``Dataset.train_test_split`` as
            ``test_size``. Defaults to 0.3.
        seed: Seed forwarded to ``Dataset.train_test_split``.

    Returns:
        A ``(train_dataset, valid_dataset)`` pair of Ray Datasets.
    """
    full_dataset = ray.data.read_parquet(parquet_path)
    # NOTE(review): no `shuffle` argument is passed here; check the Ray Data
    # docs for whether `seed` has any effect without it.
    train_split, valid_split = full_dataset.train_test_split(test_size=test_size, seed=seed)

    return train_split, valid_split
  
train_dataset, valid_dataset = prepare_data()

In [0]:
import time
import xgboost as xgb

# Devices to benchmark, and the wall-clock training time (seconds) for each.
devices = ['cpu', 'cuda']
run_time = {}

# Build the DMatrix inputs once. They do not depend on the device, so redoing
# the pandas conversion for every device only adds time and memory churn.
train_df, test_df = train_dataset.to_pandas(), valid_dataset.to_pandas()
X_train, y_train = train_df.drop("target", axis=1), train_df["target"]
X_test, y_test = test_df.drop("target", axis=1), test_df["target"]
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

for device in devices:
  params = {
      "objective": "multi:softmax",
      'eval_metric': 'mlogloss', 
      "num_class": num_labels, 
      'eta': 0.1,
      'max_depth': 6,
      'tree_method': 'hist',    # Histogram-based tree construction
      'device': device,         # 'cpu' or 'cuda', taken from `devices`
  }

  # Train one model on the current device. Only xgb.train itself is timed.
  start_time = time.perf_counter()
  evals_result = {}
  model = xgb.train(
      params,
      dtrain,
      num_boost_round=1000,
      evals=[(dtrain, 'train'), (dtest, 'test')],
      evals_result=evals_result,
      verbose_eval=10
  )

  final_time = (time.perf_counter() - start_time)
  run_time[device] = final_time


In [0]:
# Display the training time in seconds recorded for each benchmarked device.
run_time

In [0]:
# https://optuna.readthedocs.io/en/stable/faq.html#how-can-i-use-two-gpus-for-evaluating-two-trials-simultaneously
# create one main.py 
    # params: train dataset, test dataset, target_column, optuna_study_name, CUDA_VISIBLE_DEVICES 
# run each main.py with a different CUDA_VISIBLE_DEVICES
# Multi-task job. One task per GPU available on single-node.

# or use ray in a single notebook

In [0]:
import os
import numpy as np
import mlflow
from mlflow.utils.databricks_utils import get_databricks_env_vars
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from ray import train, tune
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.tune.search.optuna import OptunaSearch
import xgboost as xgb

# Number of hyperparameter configurations (trials) handed to Ray Tune.
num_samples = 12

# MLflow experiment that receives the parent run and every child trial run.
# NOTE(review): the path is tied to one user's workspace folder; change it
# before running under a different account.
experiment_name = '/Users/jon.cheung@databricks.com/ray-xgb-gpu'
mlflow.set_experiment(experiment_name)
# Captured on the driver and passed to each trial, which applies the values
# with os.environ.update() before logging to MLflow.
mlflow_db_creds = get_databricks_env_vars("databricks")

# Define a training function to parallelize
def train_classifier(config: dict,
                    experiment_name: str,
                    parent_run_id: str,
                    mlflow_credentials: dict,
                    ):
    """
    Train one XGBoost model for a sampled hyperparameter configuration.

    Nothing is returned. The final validation metric is logged to a nested
    MLflow run and reported to Ray Tune via ``train.report``, which feeds the
    search algorithm.

    config: dict, the sampled hyperparameters. Must contain ``eval_metric``.
        Two keys control the training loop and are not booster parameters:
        ``n_estimators`` sets the number of boosting rounds (1000 if absent),
        and ``early_stopping_rounds`` sets early stopping (disabled if absent).
        All other keys are passed to ``xgb.train`` as booster parameters.
    **The below three parameters are used for nesting each HPO run as a child run**
    experiment_name: str, the name of the mlflow experiment to log to. This is inherited from the driver node that initiates the mlflow parent run.
    parent_run_id: str, the ID of the parent run. This is inherited from the driver node that initiates the mlflow parent run.
    mlflow_credentials: dict, environment variables holding the credentials for logging to mlflow. This is inherited from the driver node.
    """
    # Set mlflow credentials and the active MLflow experiment within each Ray task.
    os.environ.update(mlflow_credentials)
    mlflow.set_experiment(experiment_name)

    # Separate the training-loop controls from the booster parameters.
    # Previously `n_estimators` was sampled but never used (the round count
    # was fixed at 1000), and both keys were passed on as booster params.
    loop_keys = ("n_estimators", "early_stopping_rounds")
    booster_params = {key: value for key, value in config.items() if key not in loop_keys}
    num_boost_round = config.get("n_estimators", 1000)
    early_stopping_rounds = config.get("early_stopping_rounds")
    eval_metric = config["eval_metric"]

    # Convert to DMatrix format. The Ray Datasets are module-level globals.
    train_df, test_df = train_dataset.to_pandas(), valid_dataset.to_pandas()
    X_train, y_train = train_df.drop("target", axis=1), train_df["target"]
    X_test, y_test = test_df.drop("target", axis=1), test_df["target"]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    with mlflow.start_run(run_name='xgb_model_hpo_250421', 
                          nested=True,
                          parent_run_id=parent_run_id):

        # Dictionary that xgb.train fills with per-round evaluation metrics.
        evals_result = {}
        # Train the classifier. 'test' is listed last in `evals`.
        bst = xgb.train(
            booster_params,
            dtrain,
            early_stopping_rounds=early_stopping_rounds,
            num_boost_round=num_boost_round,
            evals=[(dtrain, 'train'), (dtest, 'test')],
            evals_result=evals_result,
            verbose_eval=10
            )

        # Last recorded value of the evaluation metric on the 'test' set.
        final_eval_metric = evals_result['test'][eval_metric][-1]
        
        # Write mlflow params and metrics. The full sampled config is logged.
        mlflow.log_params(config)
        mlflow.log_metrics({f'validation_{eval_metric}': final_eval_metric})

    # Return evaluation results back to driver node
    train.report({eval_metric: final_eval_metric, "done": True})

# By default, Ray Tune uses 1 CPU/trial. Since we want to explicitly use one GPU/Optuna trial we'll set that here. With 25M rows x 50 columns, we can train a model with one GPU in about 15 mins. 
trainable_with_resources = tune.with_resources(train_classifier, 
                                               {"gpu": 1})

# Define the hyperparameter search space. Entries built with tune.* are
# sampled per trial; plain values are constant across trials.
param_space = {
    "objective": "multi:softmax",
    'eval_metric': 'mlogloss', 
    # Use the dataset's label count rather than a second hard-coded copy.
    "num_class": num_labels,
    "learning_rate": tune.uniform(0.01, 0.3),
    "n_estimators": tune.randint(100, 1000),
    "early_stopping_rounds": tune.randint(3, 20),
    "random_state": 42,
    'tree_method': 'hist',    
    'device': 'cuda',    # Use GPU acceleration
}

# Set up the search algorithm. Here we use Optuna with its default Bayesian sampler (i.e. TPE).
# The metric name comes from the search space, so it matches the key that
# train_classifier reports.
optuna = OptunaSearch(metric=param_space['eval_metric'], 
                      mode="min")

# The parent MLflow run groups all trials; each trial logs as a nested child run.
with mlflow.start_run(run_name ='gpu-run-250422') as parent_run:
    tuner = tune.Tuner(
        ray.tune.with_parameters(
            trainable_with_resources,
            experiment_name=experiment_name,
            parent_run_id = parent_run.info.run_id,
            mlflow_credentials=mlflow_db_creds),
        tune_config=tune.TuneConfig(num_samples=num_samples,
                                    search_alg=optuna),
        param_space=param_space
        )
    results = tuner.fit()

# Display the hyperparameters of the trial with the lowest reported metric.
results.get_best_result(metric=param_space['eval_metric'], 
                        mode="min").config