In [0]:
%pip install optuna optuna-integration # Integration with MLflow
# Restart the Python process after the install; presumably required so the newly
# installed packages are picked up by the notebook — confirm against the runtime docs.
dbutils.library.restartPython()

In [0]:
def _widget_value(name, default):
    """Register a text widget labelled with its own name and return its current value."""
    dbutils.widgets.text(name, default, name)
    return dbutils.widgets.get(name)


# MLflow experiment name; in this cell it is only read and echoed.
experiment_name = _widget_value("experiment_name", "spark_vs_ray_xgboost")
print(f"Logging to MLflow Experiment {experiment_name}")

# Dimensions of the synthetic dataset (widget values arrive as strings, hence int()).
num_training_rows = int(_widget_value("num_training_rows", "100"))
print(f"Generating {num_training_rows} synthetic rows")

num_training_columns = int(_widget_value("num_training_columns", "1000"))
print(f"Generating {num_training_columns} synthetic columns")

num_labels = int(_widget_value("num_labels", "2"))
print(f"Generating {num_labels} synthetic labels")

# XGBoost hyperparameters exposed as widgets.
max_depth = int(_widget_value("max_depth", "5"))
print(f"XGBoost max_depth: {max_depth}")

n_estimators = int(_widget_value("n_estimators", "100"))
print(f"XGBoost n_estimators: {n_estimators}")

# Spark's default parallelism; the message below describes it as the worker core count.
concurrency = sc.defaultParallelism
print(f"Setting Spark.XGBoost num_workers to {concurrency} = num cores on workers in cluster")

In [0]:

# Catalog and schema that hold the synthetic dataset.
catalog, schema = "main", "jon_cheung"

# Only datasets with more than two labels carry the label count in their name.
table_path = (
    f"synthetic_data_{num_training_rows}_rows_{num_training_columns}_columns_{num_labels}_labels"
    if num_labels > 2
    else f"synthetic_data_{num_training_rows}_rows_{num_training_columns}_columns"
)

# Volume location that follows the same naming scheme; echoed for reference.
parquet_path = f"/Volumes/{catalog}/{schema}/synthetic_data/{table_path}"
print(f"Parquet path: {parquet_path}")

In [0]:
# Load the synthetic table selected by the widget values.
source_table = f'{catalog}.{schema}.{table_path}'
sdf = spark.read.table(source_table)

# Every column except `id` and `target` is treated as a model feature.
# list.remove raises ValueError if either column is missing from the table.
feature_names = sdf.columns
for non_feature in ('id', 'target'):
    feature_names.remove(non_feature)

In [0]:
from xgboost.spark import SparkXGBClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import optuna
# Fixed XGBoost settings for the baseline run. The tree depth and number of
# boosting rounds come from the notebook widgets so that the values echoed in
# the configuration cell are the ones actually used for training.
# No `objective` is passed: SparkXGBClassifier does not accept a custom one.
param = {
    'num_class': num_labels,
    'eval_metric': 'mlogloss',
    'max_depth': max_depth,
    'n_estimators': n_estimators,
}

# Distributed XGBoost classifier; one worker per unit of `concurrency`, matching
# the value announced in the configuration cell.
# NOTE(review): `use_gpu` appears to be deprecated in favour of `device` since
# XGBoost 2.0 — confirm the cluster's version before dropping it.
sxgbc = SparkXGBClassifier(
    label_col="target",
    features_col=feature_names,
    num_workers=concurrency,
    verbosity=1,
    device='cuda',
    use_gpu=True,
    **param,
)

# Score with multi-class log-loss under 5-fold cross-validation. The parameter
# grid is empty, so exactly one configuration (the estimator as built) is scored.
evaluator = MulticlassClassificationEvaluator(
    labelCol="target",
    predictionCol="prediction",
    metricName="logLoss",
)
crossval = CrossValidator(
    estimator=sxgbc,
    estimatorParamMaps=ParamGridBuilder().build(),
    evaluator=evaluator,
    numFolds=5,
)

cv_model = crossval.fit(sdf)

# CrossValidator refits the best parameter map on the full dataset, so a second
# explicit `sxgbc.fit(sdf)` is unnecessary; the fitted model is kept here.
best_model = cv_model.bestModel

# Average log-loss across folds. Lower is better, hence min() rather than max()
# (with a single parameter map both give the same number).
avg_logloss = min(cv_model.avgMetrics)
avg_logloss

In [0]:
# Standalone fit on the full table, outside cross-validation, reusing the
# shared `param` settings.
estimator_kwargs = dict(
    label_col="target",
    features_col=feature_names,
    num_workers=sc.defaultParallelism,
    verbosity=1,
    device='cuda',
)
sxgbc = SparkXGBClassifier(**estimator_kwargs, **param)

sxgbc.fit(sdf)

In [0]:
from xgboost.spark import SparkXGBClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import optuna

def tuning_action_propensity_class_sparkxgb(df, n_trials=150, n_folds=5, num_workers=None):
    """Tune a SparkXGBClassifier with Optuna, scoring trials by cross-validated log-loss.

    Each trial samples ``n_estimators``, fits the classifier under k-fold
    cross-validation on ``df`` and reports the average log-loss, which the
    study minimizes. The best trial is printed at the end.

    Args:
        df: Spark DataFrame holding a ``target`` label column and the columns
            listed in the notebook-level ``feature_names``.
        n_trials: Maximum number of Optuna trials.
        n_folds: Number of cross-validation folds per trial.
        num_workers: Number of XGBoost workers per fit. Defaults to
            ``sc.defaultParallelism``, as in the other training cells.

    Returns:
        The finished ``optuna.study.Study``.

    Raises:
        ValueError: If no trial completed successfully (trial exceptions are
            caught and logged by Optuna, so this is where total failure shows up).
    """
    if num_workers is None:
        num_workers = sc.defaultParallelism

    def objective(trial):
        """Return the cross-validated average log-loss for one sampled configuration."""
        # No 'objective' entry: SparkXGBClassifier rejects a user-supplied
        # objective, which previously made every trial fail.
        param = {
            'num_class': num_labels,
            'eval_metric': 'mlogloss',
            # Additional search dimensions, currently disabled:
            # 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            # 'max_depth': trial.suggest_int('max_depth', 3, 30),
            # 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            # 'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            # 'max_delta_step': trial.suggest_int('max_delta_step', 0, 10)
        }

        sxgbc = SparkXGBClassifier(
            label_col="target",
            features_col=feature_names,
            num_workers=num_workers,
            verbosity=1,
            **param,
        )

        # The evaluator must read the same label column the estimator trains on.
        evaluator = MulticlassClassificationEvaluator(
            labelCol="target",
            predictionCol="prediction",
            metricName="logLoss",
        )
        crossval = CrossValidator(
            estimator=sxgbc,
            estimatorParamMaps=ParamGridBuilder().build(),
            evaluator=evaluator,
            numFolds=n_folds,
        )

        cv_model = crossval.fit(df)

        # Average log-loss across folds; lower is better, so take the minimum
        # (a single value here because the parameter grid is empty).
        return min(cv_model.avgMetrics)

    # Minimize log-loss. Exceptions raised inside a trial are caught so one bad
    # configuration does not abort the whole search.
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, n_jobs=-1, timeout=600, catch=(Exception,), show_progress_bar=True)

    # Plotting Parameter Importance
    # param_importance_plot = vis.plot_param_importances(study)
    # param_importance_plot.show()

    # Because trial exceptions are swallowed above, make a fully failed search explicit.
    completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    if not completed_trials:
        raise ValueError(
            f"None of the {len(study.trials)} Optuna trials completed; "
            "check the trial logs for the underlying errors."
        )

    # Print the best trial parameters
    print(f'Best trial number: {study.best_trial.number}')
    print(f'Best trial value: {study.best_trial.value}')
    print(f'Best trial parameters: {study.best_trial.params}')

    return study

# Short search over the loaded table: 5 Optuna trials, each scored with 5-fold cross-validation.
tuning_action_propensity_class_sparkxgb(sdf, n_trials=5, n_folds=5)