# The content of this notebook will be moved to a script eventually

### Python benchmarking

In [2]:
import pandas as pd
import pickle
import optuna
import optuna.visualization as viz
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from estimator.xgbclassifier import XGBClassifier

%matplotlib inline

# Raise Optuna's log threshold to WARNING so routine per-trial messages
# do not flood the cell outputs.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [3]:
# Single seed reused for data generation, the train/validation split and
# the model itself.
SEED = 123

# Hyper-parameters held constant for every fit in the Python benchmark.
FIXED_PARAM = dict(
    early_stopping_rounds=25,
    n_estimators=500,
    n_class=3,
    objective='multi:softprob',
    eval_metric='auc',
    verbosity=0,
    tree_method='auto',
    n_jobs=8,
    seed=SEED,
)

In [4]:
# Synthetic 3-class classification problem, seeded for reproducibility.
x, y = make_classification(
    n_samples=10_000,
    n_classes=3,
    n_informative=5,
    class_sep=0.5,
    random_state=SEED,
)

# Wrap the feature matrix in a DataFrame whose columns are named
# feature_0, feature_1, ... (one per generated feature).
x = pd.DataFrame(x).add_prefix('feature_')

# 70/30 train/validation split, seeded with the same SEED.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=SEED,
)

In [11]:
%%time

# Repeatedly build and fit the Python-backend classifier so the surrounding
# %%time magic reports the total cost of all runs.
# The best validation score of every run is kept (it used to be evaluated
# and thrown away) so the timing can be read alongside model quality.
N_BENCHMARK_RUNS = 10
python_best_scores = []
for i in range(N_BENCHMARK_RUNS):
    python_clf = XGBClassifier.make(**FIXED_PARAM)
    python_best_scores.append(
        python_clf.fit(
            x_train,
            y_train,
            eval_set=[(x_valid, y_valid)],
            verbose=False,
        ).best_score
    )

CPU times: user 1min 58s, sys: 234 ms, total: 1min 58s
Wall time: 14.8 s


### Spark benchmarking

In [1]:
import pandas as pd
import optuna
import optuna.visualization as viz
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from estimator.xgbclassifier import XGBClassifier

%matplotlib inline

# Raise Optuna's log threshold to WARNING so routine per-trial messages
# do not flood the cell outputs.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [70]:
# Seed passed to the model through FIXED_PARAM['seed'].
SEED = 123

# Hyper-parameters held constant for every fit in the Spark benchmark.
# NOTE(review): the run log below shows XGBoostSpark warning that
# train_test_ratio is deprecated in favour of eval_sets — confirm whether
# the wrapper still needs it before dropping the key.
FIXED_PARAM = dict(
    early_stopping_rounds=25,
    n_estimators=500,
    n_class=3,
    objective='multi:softprob',
    eval_metric='auc',
    verbosity=0,
    tree_method='auto',
    n_jobs=1,  # Setting to 1 speeds up the process quite significantly
    train_test_ratio=1.0,
    seed=SEED,
)

# Get (or create) the Spark session for the benchmark, registering the Scala
# helper jar and the XGBoost4J / XGBoost4J-Spark jars through `spark.jars`.
spark = (
    SparkSession.builder
    .appName('my_test')
    .config('spark.jars', '../jar/scala-util.jar,../jar/xgboost4j_2.12-1.6.1.jar,../jar/xgboost4j-spark_2.12-1.6.1.jar')
    .getOrCreate()
)

In [71]:
# Synthetic 3-class classification problem. The generator and the split are
# seeded with SEED (previously a hard-coded 123) so that changing the seed in
# the configuration cell affects data, split and model consistently.
x, y = make_classification(
    n_samples=10_000,
    n_classes=3,
    n_informative=5,
    random_state=SEED,
    class_sep=0.5
)

x = pd.DataFrame(data=x, columns=[f'feature_{i}' for i in range(x.shape[1])])

# Append the target as the last column: the benchmark cell selects the
# feature columns with `xy_train.columns[:-1]`.
xy = x.assign(label=y)

# 70/30 split performed in pandas; each part is then converted to a Spark
# DataFrame.
xy_train, xy_valid = map(
    spark.createDataFrame,
    train_test_split(xy, train_size=0.7, random_state=SEED),
)

# The validation frame and the verbosity flag travel with the other fixed
# parameters, which are splatted into XGBClassifier.make(...) at fit time.
FIXED_PARAM['eval_sets'] = {'val': xy_valid}
FIXED_PARAM['verbose'] = False

In [200]:
%%time
# Repeatedly build, vectorise and fit the Spark/Scala-backend classifier so
# the surrounding %%time magic reports the total cost of all runs.
# The best validation score of every run is kept (it used to be converted to
# float and thrown away) so the timing can be read alongside model quality.
N_BENCHMARK_RUNS = 10
spark_best_scores = []
for i in range(N_BENCHMARK_RUNS):
    spark_clf = XGBClassifier \
        .make(backend='scala', spark=spark, **FIXED_PARAM)
    # Every column except the last one ('label') is a feature.
    vectorized_df = spark_clf.transform(xy_train, xy_train.columns[:-1])
    # best_score is read from the native booster's attributes and converted
    # with float().
    spark_best_scores.append(
        float(spark_clf.fit(vectorized_df)._model.nativeBooster().getAttr('best_score'))
    )

22/07/25 11:55:35 WARN XGBoostSpark: train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly pass a training and multiple evaluation datasets by passing 'eval_sets' and 'eval_set_names'
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=192.168.1.155, DMLC_TRACKER_PORT=62083, DMLC_NUM_WORKER=1}


[11:55:36] task 0 got new rank 0
                                                                                

22/07/25 11:55:40 WARN XGBoostSpark: train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly pass a training and multiple evaluation datasets by passing 'eval_sets' and 'eval_set_names'
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=192.168.1.155, DMLC_TRACKER_PORT=62159, DMLC_NUM_WORKER=1}


[11:55:41] task 0 got new rank 0
                                                                                

22/07/25 11:55:45 WARN XGBoostSpark: train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly pass a training and multiple evaluation datasets by passing 'eval_sets' and 'eval_set_names'
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=192.168.1.155, DMLC_TRACKER_PORT=62235, DMLC_NUM_WORKER=1}


[11:55:45] task 0 got new rank 0
                                                                                

22/07/25 11:55:49 WARN XGBoostSpark: train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly pass a training and multiple evaluation datasets by passing 'eval_sets' and 'eval_set_names'
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=192.168.1.155, DMLC_TRACKER_PORT=62311, DMLC_NUM_WORKER=1}


[11:55:50] task 0 got new rank 0
                                                                                

22/07/25 11:55:53 WARN XGBoostSpark: train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly pass a training and multiple evaluation datasets by passing 'eval_sets' and 'eval_set_names'
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=192.168.1.155, DMLC_TRACKER_PORT=62387, DMLC_NUM_WORKER=1}


[11:55:54] task 0 got new rank 0
                                                                                

22/07/25 11:55:58 WARN XGBoostSpark: train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly pass a training and multiple evaluation datasets by passing 'eval_sets' and 'eval_set_names'
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=192.168.1.155, DMLC_TRACKER_PORT=62463, DMLC_NUM_WORKER=1}


[11:55:58] task 0 got new rank 0
                                                                                

22/07/25 11:56:02 WARN XGBoostSpark: train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly pass a training and multiple evaluation datasets by passing 'eval_sets' and 'eval_set_names'
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=192.168.1.155, DMLC_TRACKER_PORT=62539, DMLC_NUM_WORKER=1}


[11:56:02] task 0 got new rank 0
                                                                                

22/07/25 11:56:06 WARN XGBoostSpark: train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly pass a training and multiple evaluation datasets by passing 'eval_sets' and 'eval_set_names'
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=192.168.1.155, DMLC_TRACKER_PORT=62615, DMLC_NUM_WORKER=1}


[11:56:07] task 0 got new rank 0
                                                                                

22/07/25 11:56:10 WARN XGBoostSpark: train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly pass a training and multiple evaluation datasets by passing 'eval_sets' and 'eval_set_names'
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=192.168.1.155, DMLC_TRACKER_PORT=62692, DMLC_NUM_WORKER=1}


[11:56:11] task 0 got new rank 0
                                                                                

22/07/25 11:56:15 WARN XGBoostSpark: train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly pass a training and multiple evaluation datasets by passing 'eval_sets' and 'eval_set_names'
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=192.168.1.155, DMLC_TRACKER_PORT=62778, DMLC_NUM_WORKER=1}


[11:56:16] task 0 got new rank 0
[Stage 207:>                                                        (0 + 1) / 1]

CPU times: user 125 ms, sys: 93.8 ms, total: 219 ms
Wall time: 44.6 s


                                                                                