In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [17]:
import sys

# Put the parent directory first on the module search path so that packages
# living one level above this notebook can be imported.
sys.path.insert(0, '..')

In [18]:

from utils import dasker
from pprint import pprint


import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb

import optuna
import joblib
from dask.distributed import Client

import dask_optuna
import pandas as pd


# Restrict Optuna's logger to WARNING level and above in this process.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [20]:

def get_dataset(
    path='/mnt/c/Users/rwmas/GitHub/xai/xai_api/app/test/data/20220319_covid_merge_processed.csv',
    target='y',
):
    """Read a comma-separated file and split it into features and target.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the file this notebook was
        originally written against, so ``get_dataset()`` keeps working.
    target : str, optional
        Name of the label column. Defaults to ``'y'``.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except ``target``.
    y : pandas.DataFrame
        Single-column frame holding ``target`` (deliberately a DataFrame,
        not a Series, to preserve the original return shape).

    Raises
    ------
    KeyError
        If ``target`` is not a column of the file. Without this check the
        function would silently return an empty label frame.
    """
    df = pd.read_csv(path, sep=",")

    if target not in df.columns:
        raise KeyError(f"target column {target!r} not found in {path!r}")

    # Boolean mask over the column index; selecting with it keeps the
    # original column order in both returned frames.
    is_target = df.columns == target
    X = df.loc[:, ~is_target]
    y = df.loc[:, is_target]

    return X, y


In [24]:

def objective(trial):
    """Optuna objective: train one XGBoost regressor and score it on a hold-out set.

    A booster type and its regularisation / tree / DART hyper-parameters are
    sampled from ``trial``, a model is trained on 75% of the data returned by
    ``get_dataset()``, and the mean squared error on the remaining 25% is
    returned for Optuna to minimise.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample hyper-parameters.

    Returns
    -------
    float
        Mean squared error of the (rounded) predictions on the hold-out split.
    """
    df_X, df_y = get_dataset()

    # A fixed seed makes every trial train and evaluate on the same split, so
    # differences in the returned error reflect the hyper-parameters rather
    # than which rows happened to land in the hold-out set.
    X_train, X_test, y_train, y_test = train_test_split(
        df_X, df_y, test_size=0.25, random_state=42
    )
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Parameters shared by every booster type.
    param = {
        "objective": "reg:squarederror",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }

    # Tree-specific parameters apply to both tree-based boosters.
    if param["booster"] in ("gbtree", "dart"):
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        )

    # Dropout-related parameters only exist for DART.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical(
            "sample_type", ["uniform", "weighted"]
        )
        param["normalize_type"] = trial.suggest_categorical(
            "normalize_type", ["tree", "forest"]
        )
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Early stopping needs at least one evaluation set to monitor; without
    # `evals` xgboost refuses to train. `verbose_eval=False` keeps the
    # per-round metric lines out of the notebook output.
    # NOTE(review): the number of boosting rounds is left at xgb.train's
    # default; raise `num_boost_round` if early stopping is meant to matter.
    bst = xgb.train(
        param,
        dtrain,
        evals=[(dtest, "validation")],
        early_stopping_rounds=10,
        verbose_eval=False,
    )

    preds = bst.predict(dtest)
    # NOTE(review): predictions are rounded to the nearest integer before
    # scoring. Confirm the target is integer-valued; otherwise score `preds`
    # directly.
    pred_labels = np.rint(preds)
    err = sklearn.metrics.mean_squared_error(y_test, pred_labels)
    return err


In [25]:
# Obtain the Dask client from the project's `dasker` helper (rather than
# opening a `Client()` context inline) and show where its dashboard is served.
client = dasker.get_dask_client()
print(f"Dask dashboard is available at {client.dashboard_link}")

Dask dashboard is available at http://127.0.0.1:8787/status


In [27]:

# Back the study with Dask storage so trials executed through the Dask
# joblib backend share one study state.
storage = dask_optuna.DaskStorage()
study = optuna.create_study(storage=storage, direction="minimize")

# Run the trials in parallel through joblib's Dask backend.
with joblib.parallel_backend("dask"):
    study.optimize(objective, n_trials=200, n_jobs=-1)

# --- Report -----------------------------------------------------------------
print("Best params:")
pprint(study.best_params)

print(f"Number of trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial  # kept at notebook scope; later cells read `trial.params`

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2022-09-15 13:14:26,935][0m Trial 10 finished with value: 615207.4997389587 and parameters: {'booster': 'gblinear', 'lambda': 0.00104855570854328, 'alpha': 4.9025149186449506e-06}. Best is trial 5 with value: 423368.5477608335.[0m
[32m[I 2022-09-15 13:14:26,964][0m Trial 5 finished with value: 423368.5477608335 and parameters: {'booster': 'gblinear', 'lambda': 4.368355234116619e-08, 'alpha': 0.007367237520576357}. Best is trial 5 with value: 423368.5477608335.[0m
[32m[I 2022-09-15 13:14:27,091][0m Trial 0 finished with value: 812267.4909879995 and parameters: {'booster': 'gbtree', 'lambda': 0.025379095239566062, 'alpha': 0.004144018270939389, 'max_depth': 2, 'eta': 0.8632451022642735, 'gamma': 0.02857007842106117, 'grow_policy': 'lossguide'}. Best is trial 5 with value: 423368.5477608335.[0m
[32m[I 2022-09-15 13:14:27,098][0m Trial 4 finished with value: 2836083.95470717 and parameters: {'booster': 'gbtree', 'lambda': 3.210515180274675e-07, 'alpha': 0.8335358971260013

Best params:
{'alpha': 0.006987201024959492,
 'booster': 'gblinear',
 'lambda': 2.225680879103565e-08}
Number of trials: 200
Best trial:
  Value: 231688.12228577095
  Params: 
    booster: gblinear
    lambda: 2.225680879103565e-08
    alpha: 0.006987201024959492


[32m[I 2022-09-15 13:14:39,228][0m Trial 198 finished with value: 481640.66197064536 and parameters: {'booster': 'gblinear', 'lambda': 4.969476236081079e-08, 'alpha': 0.06821736274704404}. Best is trial 182 with value: 231688.12228577095.[0m
[32m[I 2022-09-15 13:14:39,252][0m Trial 197 finished with value: 874355.5276672913 and parameters: {'booster': 'gblinear', 'lambda': 5.193105520003926e-08, 'alpha': 0.021676470320667534}. Best is trial 182 with value: 231688.12228577095.[0m
[32m[I 2022-09-15 13:14:39,295][0m Trial 199 finished with value: 510373.38936560444 and parameters: {'booster': 'gblinear', 'lambda': 1.1795243683681536e-07, 'alpha': 9.971792634913102e-05}. Best is trial 182 with value: 231688.12228577095.[0m


In [46]:
# Load the features and target at notebook scope; they are passed to
# `build_final_model` further down.
df_X, df_y = get_dataset()

In [44]:

# Path for the saved model. NOTE(review): presumably where the final model is
# persisted; it is not referenced in the visible cells — confirm the consumer.
MODEL_SAVE_PATH = "/mnt/c/Users/rwmas/GitHub/xai/xai_api/app/ml/models/saved/base/brisk_xgboost"












In [45]:
# Build the final model from the best trial's hyper-parameters and the full dataset.
# NOTE(review): `build_final_model` is not defined in the visible cells — make
# sure its definition is present so the notebook survives Restart & Run All.
mdl = build_final_model(trial.params, df_X, df_y)

{'objective': 'reg:squarederror', 'booster': 'gblinear', 'lambda': 2.225680879103565e-08, 'alpha': 0.006987201024959492}
1746654.878016419
