In [8]:
import warnings

# Ignore every warning for the rest of the session (presumably to keep cell
# output readable).
# NOTE(review): this also hides deprecation and numerical warnings from every
# library imported below; consider filtering by category or module instead.
warnings.filterwarnings("ignore") 

In [9]:
import sys
sys.path.append("../")

import os
import cudf
from numpy import arange
from joblib import dump

from C_scrub.c_CUDA_RAPIDS_data_engineering import scrub_feature_engineering

from sklearn.metrics import make_scorer
from cuml.metrics.accuracy import accuracy_score
from cuml.model_selection import train_test_split

import xgboost as xgb

import dask_ml.model_selection as dcv
from dask.distributed import Client
from dask_cuda import LocalCUDACluster


# System set-up

In [10]:
# All locations are derived from the parent of the current working directory,
# so the notebook must be launched from a direct sub-folder of the project.
BASE_DIR = os.path.split(os.getcwd())[0]

# Input CSV files
DATA_DIR = os.path.join(BASE_DIR, 'A_data')

# Where the fitted model is written
RESOURCES_DIR = os.path.join(os.path.join(BASE_DIR, 'B_resources'), 'c_CUDA_RAPIDS')

In [11]:
def get_cluster(device_memory_limit='10GB', jit_unspill=True, **cluster_kwargs):
    """
    Start a LocalCUDACluster and return a Dask client connected to it.

    Params:
    - device_memory_limit: Forwarded to LocalCUDACluster. The default of
      '10GB' is the value this notebook was originally run with; lower it
      on GPUs with less memory.
    - jit_unspill: Forwarded to LocalCUDACluster.
    - cluster_kwargs: Any further keyword arguments, forwarded unchanged
      to LocalCUDACluster.

    Returns the dask.distributed Client. The cluster object itself is not
    returned separately.
    """
    cluster = LocalCUDACluster(
        device_memory_limit=device_memory_limit,
        jit_unspill=jit_unspill,
        **cluster_kwargs,
    )
    return Client(cluster)
# Start the cluster once and keep the client as a notebook-level handle
client = get_cluster()

# Count the workers the scheduler currently reports
worker_registry = client.scheduler_info()["workers"]
n_workers = len(worker_registry)

# Bare last expression so the notebook renders the client summary
client

0,1
Connection method: Cluster object,Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://127.0.0.1:45257/status,

0,1
Dashboard: http://127.0.0.1:45257/status,Workers: 1
Total threads: 1,Total memory: 251.77 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:43021,Workers: 1
Dashboard: http://127.0.0.1:45257/status,Total threads: 1
Started: Just now,Total memory: 251.77 GiB

0,1
Comm: tcp://127.0.0.1:46213,Total threads: 1
Dashboard: http://127.0.0.1:35983/status,Memory: 251.77 GiB
Nanny: tcp://127.0.0.1:44301,
Local directory: /tmp/dask-scratch-space/worker-3zw_vpzt,Local directory: /tmp/dask-scratch-space/worker-3zw_vpzt
GPU: NVIDIA GeForce RTX 3060,GPU memory: 12.00 GiB


In [13]:
import time
from contextlib import contextmanager

# Helping time blocks of code
@contextmanager
def timed(txt):
    """
    Context manager that prints how long the enclosed block took to run.

    Params:
    - txt: Label printed (right-aligned to 32 characters) before the elapsed seconds

    The elapsed time is printed even when the enclosed block raises; the
    exception then propagates unchanged.
    """
    # perf_counter is monotonic, so the measurement is not affected by
    # system clock adjustments during a long search.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        t1 = time.perf_counter()
        print("%32s time:  %8.5f" % (txt, t1 - t0))

# Collect data

In [14]:
# Competition training set, without its 'id' column
data_compet = cudf.read_csv(os.path.join(DATA_DIR, "train.csv")).drop(columns=['id'])

# Public data set that is appended to the competition data
data_orig = cudf.read_csv(os.path.join(DATA_DIR, 'horse.csv'))

# Stack both sources, keep only rows with a known outcome, remove exact
# duplicates and renumber the rows from zero.
data = (
    cudf.concat([data_compet, data_orig], ignore_index=True)
    .dropna(subset=['outcome'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# The two source frames are no longer needed
del data_compet, data_orig

# Prepare data

In [15]:
# Run the project's scrub/feature-engineering helper with train=True.
# NOTE(review): `data` is overwritten with its own transformation, so
# re-running this cell alone feeds already-processed data back into the
# helper; rerun from "Collect data" unless the helper is known to be idempotent.
data = scrub_feature_engineering(data, train=True)

# Parameters definition

In [16]:
# Random seed
random_seed = 5000

# Fraction of the rows held out as the test split
split_size = 0.2

# Number of cross-validation folds
n_splits = 4

# Number of parameter settings sampled by RandomizedSearchCV (3**4)
XGB_grid_n_iter_search = 81

# Undersampling

Undersampling did not improve the model, so the cell below is kept commented out.

In [None]:
# data = data.groupby('outcome').apply(lambda x: x.sample(data['outcome'].value_counts().min(), random_state=random_seed))

# Split data

In [19]:
# Hold out `split_size` of the rows for testing, stratified on the target.
# random_state pins the shuffle so the split, and every score computed from
# it, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(data.drop('outcome', axis=1),
                                                    data['outcome'],
                                                    test_size=split_size,
                                                    stratify=data['outcome'],
                                                    random_state=random_seed)

In [20]:
# Host-side copies of both splits: features as pandas frames, labels as NumPy arrays
X_cpu, X_test_cpu = X_train.to_pandas(), X_test.to_pandas()
y_cpu, y_test_cpu = y_train.to_numpy(), y_test.to_numpy()

# Define metrics functions

In [21]:
def accuracy_score_wrapper(y, y_hat):
    """
    Cast the labels to float32 and score the predictions with accuracy_score.

    Params:
    - y: The true labels; cast to float32 before scoring
    - y_hat: The predictions made by the model

    Returns the value produced by accuracy_score.
    """
    # Per the original author's note, cuML RandomForest needs float32 labels
    labels_f32 = y.astype("float32")
    score = accuracy_score(labels_f32, y_hat, convert_dtype=True)
    return score

# Scorer built on accuracy_score_wrapper (labels cast to float32 first).
# NOTE(review): not referenced by any later cell visible in this notebook.
accuracy_wrapper_scorer = make_scorer(accuracy_score_wrapper)
# Scorer built directly on cuML's accuracy_score; convert_dtype=True is handed
# to make_scorer as an extra keyword argument for the metric.
cuml_accuracy_scorer = make_scorer(accuracy_score, convert_dtype=True)

# Define Hyperparameters Optimization functions

In [22]:
def do_HPO(model, gridsearch_params, scorer, X, y, mode="gpu-grid", n_iter=10, random_state=None):
    """
    Perform HPO based on the mode specified

    Params:
    - model: Estimator to tune
    - gridsearch_params: Mapping of parameter name to candidate values
    - scorer: Scoring callable passed to the search as `scoring`
    - X, y: Training features and labels passed to the search's fit()
    - mode: default gpu-grid, matched case-insensitively. The possible options are:
        1. gpu-grid: dask-ml GridSearchCV
        2. gpu-random: dask-ml RandomizedSearchCV
    - n_iter: specified with Random option for number of parameter settings sampled
    - random_state: specified with Random option to make the sampled
      settings reproducible; None keeps them unseeded

    The number of folds comes from the notebook-level `n_splits`.

    Returns the best estimator and the fitted search object, or
    (None, None) when the mode is not recognised.
    """
    # Normalise so that spellings such as "gpu-Grid" or "GPU-RANDOM" are accepted
    mode_key = str(mode).strip().lower()

    if mode_key == "gpu-grid":
        print("gpu-grid selected")
        clf = dcv.GridSearchCV(model, gridsearch_params, cv=n_splits, scoring=scorer)
    elif mode_key == "gpu-random":
        print("gpu-random selected")
        clf = dcv.RandomizedSearchCV(
            model,
            gridsearch_params,
            cv=n_splits,
            scoring=scorer,
            n_iter=n_iter,
            random_state=random_state,
        )
    else:
        print("Unknown Option, please choose one of [gpu-grid, gpu-random]")
        return None, None

    models_fitted = clf.fit(X, y)

    print(
        "Best clf and score {} {}\n---\n".format(models_fitted.best_estimator_, models_fitted.best_score_)
    )
    return models_fitted.best_estimator_, models_fitted

In [23]:
def last_fit(model, X_train, y_train, X_test, y_test, mode_str="Default"):
    """
    Trains a model on the train data provided, and prints the accuracy of the trained model.

    Params:
    - model: Estimator exposing fit() and predict(); it is fitted in place
    - X_train, y_train: Training features and labels
    - X_test, y_test: Held-out features and labels; y_test is cast to float32 before scoring
    - mode_str: User specifies what model it is to print the value

    Returns the accuracy that was printed.
    """
    y_pred = model.fit(X_train, y_train).predict(X_test)
    # accuracy_score takes the ground truth first and the predictions second
    score = accuracy_score(y_test.astype("float32"), y_pred, convert_dtype=True)
    print("{} model accuracy: {}".format(mode_str, score))
    return score

# Define hyperparameters search grid

In [None]:
# # For xgb_model
# model_gpu_xgb = xgb.XGBClassifier(tree_method="gpu_hist")
# 
# # More range
# params_xgb = {
#     "max_depth": arange(start=1, stop=21, step=1),
#     "alpha": logspace(start=-5, stop=-1, num=5, endpoint=True),  # default = 0
#     "learning_rate": arange(start=0.05, stop=0.5, step=0.05),
#     "min_child_weight": arange(start=2, stop=10, step=3),  # default = 1
#     "n_estimators": arange(start=5, stop=310, step=5)
# }

In [25]:
# Base estimator for the search; its settings come from the search space below
model_gpu_xgb = xgb.XGBClassifier()

# Search space: arrays hold the candidate values, one-element lists pin a setting
params_xgb = dict(
    max_depth=arange(3, 11, 1),
    learning_rate=arange(0.05, 0.5, 0.05),
    min_child_weight=arange(1, 10, 1),  # default = 1
    n_estimators=[100],
    gamma=arange(0, 0.5, 0.05),
    subsample=arange(0.6, 1.0, 0.05),
    colsample_bytree=arange(0.6, 1.0, 0.05),
    objective=['multi:softmax'],
    num_class=[3],
    eval_metric=["mlogloss"],
    tree_method=["gpu_hist"],
    predictor=['gpu_predictor'],
)

# Fit models

## Default parameters accuracy

In [26]:
# Baseline: XGBoost with default hyperparameters and only the GPU tree method set
model_gpu_xgb_ = xgb.XGBClassifier(tree_method="gpu_hist")
last_fit(
    model=model_gpu_xgb_,
    X_train=X_train,
    y_train=y_cpu,
    X_test=X_test,
    y_test=y_test_cpu,
)

Default model accuracy: 0.7385621070861816


## HPO XGBoost

In [27]:
mode = "gpu-random"

# Run the randomized search under the timer; the label names the search mode
with timed("XGB-" + mode):
    best_model, models_fitted = do_HPO(
        model=model_gpu_xgb,
        gridsearch_params=params_xgb,
        scorer=cuml_accuracy_scorer,
        X=X_train,
        y=y_cpu,
        mode=mode,
        n_iter=XGB_grid_n_iter_search,
    )

# One mean test score is stored per evaluated parameter setting
print(
    "Searched over {} parameters".format(
        len(models_fitted.cv_results_["mean_test_score"])
    )
)

gpu-random selected
Best clf and score XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7000000000000001, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=0.0, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.2, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=2, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_class=3, num_parallel_tree=None, objective='multi:softmax', ...) 0.7528548068070684
---

                  XGB-gpu-random time:  266.15023
Searched over 81 parameters


In [28]:
# Refit the best estimator on the training split and score it on the held-out split
last_fit(
    model=best_model,
    X_train=X_train,
    y_train=y_cpu,
    X_test=X_test_cpu,
    y_test=y_test_cpu,
    mode_str=mode,
)

gpu-random model accuracy: 0.7352941036224365


# Save model

In [29]:
# Persist the tuned estimator with joblib under the resources directory
model_path = os.path.join(RESOURCES_DIR, "model_xgboost_v0.1.joblib")
dump(best_model, model_path)

['/home/jgsolar/Documentos/SHARED/Projetos Pessoais/multiplatform_classifier/B_resources/c_CUDA_RAPIDS/model_xgboost_v0.1.joblib']