# XGBoost - Without XGBoost's Dask interface

**Using Optuna for hyper-parameter search  to predict TPSA from Pharmacophores**

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
%load_ext autoreload  
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [2]:
# Models
import os
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import pickle
import itertools
from collections.abc import Iterator
from   datetime import datetime
from pprint import PrettyPrinter
import joblib

from utils import *
from utils_ml import model_selection
# from multiprocessing import Pool, process

import dask.dataframe as dd 
pp = PrettyPrinter(indent=4)
np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan')
pd.options.display.width = 170



In [3]:
os.environ["WANDB_NOTEBOOK_NAME"] = "Adashare_Train.ipynb"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

#### xgboost and dask imports 

In [4]:
import joblib
# from dask_cuda import LocalCUDACluster
# from sklearn.model_selection import GridSearchCV
import optuna

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import dask
import dask.array as da
import dask.dataframe as dd
from dask.distributed import Client
from dask.distributed import LocalCluster
import dask_ml.model_selection as dcv
from dask_ml.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV, IncrementalSearchCV, HyperbandSearchCV
from dask_ml.metrics import mean_squared_error

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
# time.strftime(' %x%X')
# datetime.now().strftime('%X.%f')
# time.strftime('%X %x %Z')
print(datetime.now().strftime('%D-%X.%f'))
time_fmt = '%Y-%M-%d %H:%m:%S.%f'
print(datetime.now().strftime(time_fmt))

01/04/24-14:56:44.475938
2024-56-04 14:01:44.476080


In [7]:
import logging
logLevel = os.environ.get('LOG_LEVEL', 'INFO').upper()
FORMAT = '%(asctime)s - %(levelname)s: - %(message)s'
logging.basicConfig(level="INFO", format= FORMAT)
logging.getLogger("imported_module").setLevel(logging.CRITICAL)
logging.info(f" 1/7- engine connected")
logging.warning(f" 1/7- engine connected")
logging.error(f" 1/7- engine connected")
logging.critical(f" 1/7- engine connected")

2024-01-04 14:56:44,500 - INFO: -  1/7- engine connected
2024-01-04 14:56:44,502 - ERROR: -  1/7- engine connected
2024-01-04 14:56:44,502 - CRITICAL: -  1/7- engine connected


In [8]:
def result_model_selection(results, name):
    df_results = pd.DataFrame({'model'     : [name] * len(results.cv_results_['params']),
                               'params'    : results.cv_results_['params'],
                               'mean score': results.cv_results_['mean_test_score'],
                               'std score' : results.cv_results_['std_test_score'],
                               'rank'      : results.cv_results_['rank_test_score']
                              })
    return df_results

### Create dask cluster and client 

In [9]:
# cluster = LocalCluster("Kevins_Cluster", n_workers=2, threads_per_worker=2)
# cluster = LocalCluster()

# client = Client(cluster.scheduler_address)
# client = Client("tcp://127.0.0.1:37937")
# client = Client(processes = False)
 

In [10]:
try:
    client.close()
    del client
except Exception as e:
    print("Client close failed")

Client close failed


In [11]:
try:
    cluster.close()
    del cluster
except Exception as e:
    print("Cluster close failed")


Cluster close failed


In [12]:
# cluster = LocalCluster()
# client = Client("tcp://127.0.0.1:37937")
# client = Client(processes = False)
# cluster = LocalCluster("Kevins_Cluster", n_workers=2, threads_per_worker=2)
# client = Client(cluster.scheduler_address)

In [13]:
# cluster

# client

In [14]:
# cluster.workers
# cluster.scale(2)
# cluster.close()
# client.close()
# del cluster

# cluster.name
# print(cluster)
# cluster.dashboard_link
# cluster.scheduler_address
# cluster.scheduler_spec
# cluster.workers

# cluster.scheduler.stop()
# cluster.scheduler.close()

# client 
# client.status
# client.connection_args
# del client

# with open("./metadata/parquet_columns.pkl",'rb') as f:
#     ParquetColumns = pickle.load(f)

# for k,v in ParquetColumns.items():
#     print(f" {k:20s}   items: {len(v)}")

# type(ParquetColumns['Cells']['Cells_AreaShape_Area'])
# ParquetColumns['Cells']
# del ParquetColumns


### Datasets

In [15]:
prefix = '' ### Target-2' , 'MOA'
input_path ="./input/"
output_path ="./output_11102023/"
prefix_lc = prefix.lower().replace('-', '_')

CompoundExtendedMetadata2SampleFile = f"{output_path}{prefix_lc}compound_extended_metadata_2samples.csv"
CompoundProfiles2SampleFileCSV      = f"{output_path}{prefix_lc}compound_profiles_2samples.csv"
CompoundExtendedMetadataSampleFile  = f"{output_path}{prefix_lc}compound_extended_metadata_samples.csv"
featureSelectionFile                = f"{output_path}feature_selection_columns.pkl"

In [16]:
print()
print(f" Compound Extended Metadata 2 SampleFile  : {CompoundExtendedMetadata2SampleFile }")
print(f" Compound Profiles 2 Samples File CSV     : {CompoundProfiles2SampleFileCSV}")
print(f" ")
print(f" featureSelectionFile                     : {featureSelectionFile}")


 Compound Extended Metadata 2 SampleFile  : ./output_11102023/compound_extended_metadata_2samples.csv
 Compound Profiles 2 Samples File CSV     : ./output_11102023/compound_profiles_2samples.csv
 
 featureSelectionFile                     : ./output_11102023/feature_selection_columns.pkl


### Read column metadata file

In [17]:
with open("./metadata/feature_selection_columns.pkl", 'rb') as f: 
    x = pickle.load(f)
for i in x:
    print(f" {i:20s}    {len(x[i])} ")

X_columns = x['selected']
y_columns = [ "Metadata_log10TPSA"]

all_columns = ["Metadata_log10TPSA"]
all_columns.extend(x['selected'])

x_columns_drop = ["Metadata_Source", "Metadata_Batch", "Metadata_Plate", "Metadata_Well", "Metadata_TPSA", "Metadata_lnTPSA", "Metadata_log10TPSA"]
# x_columns_drop.extend(["Metadata_JCP2022"])

x_columns_dtype = {x: np.dtype('float32') for x in X_columns}
y_columns_dtype = {x: np.dtype('float32') for x in y_columns} ## "Metadata_log10TPSA":np.dtype('float64')}
all_columns_dtype = {x: np.dtype('float32') for x in all_columns}

print(f" len(x_columms)    : {len(X_columns)}")
print(f" len(y_columms)    : {len(y_columns)}")
print(f" len(all_columms)  : {len(all_columns)}")

 selected                1477 
 dropped_correlation     2193 
 dropped_variance        0 
 len(x_columms)    : 1477
 len(y_columms)    : 1
 len(all_columms)  : 1478


### Read compound profiles

In [18]:
# Apply feature selection
profilesFile = CompoundProfiles2SampleFileCSV ## +'.'+ type_bz2

print(f" Profiles file       :  {profilesFile}")
print(f" Features select file:  {featureSelectionFile}")

 Profiles file       :  ./output_11102023/compound_profiles_2samples.csv
 Features select file:  ./output_11102023/feature_selection_columns.pkl


In [19]:
df_profiles = dd.read_csv(profilesFile, usecols=all_columns, dtype= all_columns_dtype)

# df_profiles.info()
# df_profiles.head(6)
# del df_X
# del df_y

In [20]:
df_profiles.head(3)

Unnamed: 0,Metadata_log10TPSA,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,Cells_AreaShape_MajorAxisLength,Cells_AreaShape_MedianRadius,...,Nuclei_Texture_SumAverage_DNA_10_01_256,Nuclei_Texture_SumAverage_ER_10_01_256,Nuclei_Texture_SumAverage_Mito_10_01_256,Nuclei_Texture_SumAverage_RNA_10_01_256,Nuclei_Texture_SumEntropy_DNA_10_03_256,Nuclei_Texture_SumVariance_AGP_10_03_256,Nuclei_Texture_SumVariance_DNA_10_03_256,Nuclei_Texture_SumVariance_ER_10_01_256,Nuclei_Texture_SumVariance_Mito_10_03_256,Nuclei_Texture_SumVariance_RNA_10_01_256
0,1.803116,-0.377545,-0.294115,-1.370007,-0.010496,-0.296029,-0.134166,-0.207722,-0.156127,-0.230863,...,0.151205,0.016566,0.591573,0.950152,0.110363,-0.151072,-0.267783,-0.319627,-0.135347,0.033476
1,1.771293,-0.939467,-0.850871,-1.398116,-0.045341,-0.525316,0.146076,-0.51008,-0.222982,-0.28602,...,0.551443,-0.421324,0.020442,0.349053,0.372093,-0.150682,-0.108719,-0.561259,-0.33011,-0.246885
2,1.771293,-0.939467,-0.850871,-1.398116,-0.045341,-0.525316,0.146076,-0.51008,-0.222982,-0.28602,...,0.551443,-0.421324,0.020442,0.349053,0.372093,-0.150682,-0.108719,-0.561259,-0.33011,-0.246885


In [None]:
# df_X = dd.read_csv(profilesFile, blocksize="100MB", usecols=X_columns, dtype= x_columns_dtype)  ##, index_col = 'CASRN')
# df_y = dd.read_csv(profilesFile, blocksize="100MB", usecols=y_columns, dtype=y_columns_dtype)  ##, index_col = 'CASRN')

# df_X.info()
# df_X.head()
# df_X.shape

# df_y_array.info()
# df_y_array.head()
# df_y_array.shape

# df_X_array = df_X.to_dask_array(lengths = True)

# df_X_array = df_X_array.rechunk(chunks=(10000,-1))
# df_X_array.to_zarr('df_X_array.zarr' ) 

# df_y_array = df_y.to_dask_array(lengths = True)

# df_y_array = df_y_array.rechunk(chunks=(10000,-1))
# df_y_array.to_zarr('df_y_array.zarr' ) 

# df_X_array.to_hdf5('df_X_array.hdf5' , '/x')  
# df_y_array.to_hdf5('df_y_array.hdf5' , '/x')  

# del df_X, df_y, df_X_array, df_y_array

# df_y = df_profiles[y_columns].compute()
# df_X = df_profiles[list(x['selected'])] ## .drop(labels=x_columns_drop, axis =1)

# df_X_array = dask.array.from_zarr('df_X_array.zarr' )

# df_y_array = dask.array.from_zarr('df_y_array.zarr' )

# XGBoost - Using Optuna for hyper-parameter search

In [25]:
def make_cv_splits(n_folds: int = 5,) -> Iterator[tuple[dd.DataFrame, dd.DataFrame]]:
    frac = [1 / n_folds] * n_folds
    print(frac, n_folds)
    splits = df_profiles.random_split(frac, shuffle=True)
    # print(f"splits: {type(splits)} ")
    for i in range(n_folds):
        print(type(splits[i]))
        train = [splits[j] for j in range(n_folds) if j != i]
        train = dd.concat(train)
        test = splits[i] 
        # print(type(train), type(test))
        yield train, test

In [120]:
def train_model(**study_params):
    scores = []
    client = None
    
    for i, (train, test) in enumerate(make_cv_splits()):
        logging.info(f"Training/Test split #{i}")
        y_train = train[y_columns]
        X_train = train.drop(columns=y_columns)
        y_test = test[y_columns]
        X_test = test.drop(columns=y_columns)

        logging.info(f"Building DMatrix...")
        d_train = xgb.DMatrix(
            # client, 
            X_train, y_train,
        )
        d_test = xgb.DMatrix(X_test)
        logging.info(f"Building DMatrix...Completed")

        logging.info(f"Training model...")
        model = xgb.train(
            # client,
            {"tree_method": "hist", **study_params},
            d_train,
            num_boost_round=24,
            evals=[(d_train, "train")],
        )
        logging.info(f"Training model...Completed")

        logging.info(f"Running model on test data...")
        predictions = model.predict(# client, 
                                    # model, 
                                    # X_test,
                                    d_test
                                   )
        logging.info(f"Running model on test data...Completed")

        logging.info(f"Measuring accuracy of model vs. ground truth...")
        score = mean_squared_error(
            y_true = y_test.to_dask_array(),
            y_pred = predictions,
            # y_true = y_test,
            # y_pred = predictions.to_dask_array(),
            squared=False,
            compute=False,
        )
        logging.info(f"Measuring accuracy of model vs. ground truth...Completed")
 
        # Compute predictions and mean squared error for this iteration
        # while we start the next one
        scores.append(score.reshape(1).persist())
        del d_train, d_test, y_test, X_test,  model, predictions, score
        print("-" * 80)

    scores = da.concatenate(scores).compute()
    print(f"RSME={scores.mean()} +/- {scores.std()}")
    return scores.mean()
    print("-" * 80)

In [121]:
def objective(trial):
    params = {
        "n_estimators"     : trial.suggest_int("n_estimators", 75, 125),
        "learning_rate"    : trial.suggest_float("learning_rate", 0.5, 0.7),
        # "colsample_bytree" : trial.suggest_float("colsample_bytree", 0.5, 1),
        # "colsample_bynode" : trial.suggest_float("colsample_bynode", 0.5, 1),
        # "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1),
        # "reg_lambda"       : trial.suggest_float("reg_lambda", 0, 1),
        "max_depth"        : trial.suggest_int("max_depth", 1, 6),
        "max_leaves"       : trial.suggest_int("max_leaves", 0, 2),
        # "max_cat_to_onehot": trial.suggest_int("max_cat_to_onehot", 1, 10),
    }
    
    print(f"Training model (trial #{trial.number}) - Parameters:")
    for k, v in params.items():
        print(f"  {k}={v}")
    return train_model(**params)
    # return params

In [122]:
start = datetime.now()
try:
    optuna.delete_study(storage="sqlite:///example.db", study_name="kevin-study-1")
except Exception as e:
    print("delete failed")

In [123]:
study = optuna.create_study(storage="sqlite:///example.db",
                            study_name="kevin-study-1",
                            direction="maximize", load_if_exists=True)

[I 2024-01-03 16:35:00,849] A new study created in RDB with name: kevin-study-1


In [124]:
print(f"Total time:  {datetime.now() - start}")

study.optimize(objective, n_trials=10, timeout=600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")

Total time:  0:00:00.597776
Training model (trial #0) - Parameters:
  n_estimators=96
  learning_rate=0.6772057433643197
  max_depth=6
  max_leaves=2
[0.2, 0.2, 0.2, 0.2, 0.2] 5
splits: <class 'list'> 
<class 'dask.dataframe.core.DataFrame'>


2024-01-03 16:35:03,127 - INFO: - Building DMatrix...


<class 'dask.dataframe.core.DataFrame'> <class 'dask.dataframe.core.DataFrame'>
Training/Test split #0


2024-01-03 16:44:31,050 - INFO: - Building DMatrix...Completed
2024-01-03 16:44:31,055 - INFO: - Training model...


Parameters: { "n_estimators" } are not used.

[0]	train-rmse:0.46213
[1]	train-rmse:0.21765
[2]	train-rmse:0.17330
[3]	train-rmse:0.16795


2024-01-03 16:44:46,909 - INFO: - Training model...Completed
2024-01-03 16:44:46,911 - INFO: - Running model on test data...
2024-01-03 16:44:46,930 - INFO: - Running model on test data...Completed
2024-01-03 16:44:46,933 - INFO: - Measuring accuracy of model vs. ground truth...
2024-01-03 16:44:46,949 - INFO: - Measuring accuracy of model vs. ground truth...Completed


()
()
--------------------------------------------------------------------------------
<class 'dask.dataframe.core.DataFrame'>


2024-01-03 16:48:00,625 - INFO: - Building DMatrix...


<class 'dask.dataframe.core.DataFrame'> <class 'dask.dataframe.core.DataFrame'>
Training/Test split #1


2024-01-03 16:57:42,437 - INFO: - Building DMatrix...Completed
2024-01-03 16:57:42,441 - INFO: - Training model...


Parameters: { "n_estimators" } are not used.

[0]	train-rmse:0.46187
[1]	train-rmse:0.21742
[2]	train-rmse:0.17306
[3]	train-rmse:0.16771


2024-01-03 16:57:56,799 - INFO: - Training model...Completed
2024-01-03 16:57:56,801 - INFO: - Running model on test data...
2024-01-03 16:57:56,831 - INFO: - Running model on test data...Completed
2024-01-03 16:57:56,834 - INFO: - Measuring accuracy of model vs. ground truth...
2024-01-03 16:57:56,848 - INFO: - Measuring accuracy of model vs. ground truth...Completed


()
()
--------------------------------------------------------------------------------
<class 'dask.dataframe.core.DataFrame'>
<class 'dask.dataframe.core.DataFrame'> <class 'dask.dataframe.core.DataFrame'>
Training/Test split #2


2024-01-03 17:01:10,748 - INFO: - Building DMatrix...
2024-01-03 17:10:49,847 - INFO: - Building DMatrix...Completed
2024-01-03 17:10:49,850 - INFO: - Training model...


Parameters: { "n_estimators" } are not used.

[0]	train-rmse:0.46210
[1]	train-rmse:0.21781
[2]	train-rmse:0.17353
[3]	train-rmse:0.16818


2024-01-03 17:11:02,552 - INFO: - Training model...Completed
2024-01-03 17:11:02,564 - INFO: - Running model on test data...
2024-01-03 17:11:02,581 - INFO: - Running model on test data...Completed
2024-01-03 17:11:02,583 - INFO: - Measuring accuracy of model vs. ground truth...
2024-01-03 17:11:02,593 - INFO: - Measuring accuracy of model vs. ground truth...Completed


()
()
--------------------------------------------------------------------------------
<class 'dask.dataframe.core.DataFrame'>


2024-01-03 17:14:10,214 - INFO: - Building DMatrix...


<class 'dask.dataframe.core.DataFrame'> <class 'dask.dataframe.core.DataFrame'>
Training/Test split #3


2024-01-03 17:23:19,367 - INFO: - Building DMatrix...Completed
2024-01-03 17:23:19,375 - INFO: - Training model...


Parameters: { "n_estimators" } are not used.

[0]	train-rmse:0.46200
[1]	train-rmse:0.21747
[2]	train-rmse:0.17307
[3]	train-rmse:0.16771


2024-01-03 17:23:32,391 - INFO: - Training model...Completed
2024-01-03 17:23:32,393 - INFO: - Running model on test data...
2024-01-03 17:23:32,419 - INFO: - Running model on test data...Completed
2024-01-03 17:23:32,420 - INFO: - Measuring accuracy of model vs. ground truth...
2024-01-03 17:23:32,444 - INFO: - Measuring accuracy of model vs. ground truth...Completed


()
()
--------------------------------------------------------------------------------
<class 'dask.dataframe.core.DataFrame'>


2024-01-03 17:26:46,540 - INFO: - Building DMatrix...


<class 'dask.dataframe.core.DataFrame'> <class 'dask.dataframe.core.DataFrame'>
Training/Test split #4


2024-01-03 17:36:21,250 - INFO: - Building DMatrix...Completed
2024-01-03 17:36:21,260 - INFO: - Training model...


Parameters: { "n_estimators" } are not used.

[0]	train-rmse:0.46201
[1]	train-rmse:0.21757
[2]	train-rmse:0.17322
[3]	train-rmse:0.16785


2024-01-03 17:36:34,027 - INFO: - Training model...Completed
2024-01-03 17:36:34,028 - INFO: - Running model on test data...
2024-01-03 17:36:34,048 - INFO: - Running model on test data...Completed
2024-01-03 17:36:34,050 - INFO: - Measuring accuracy of model vs. ground truth...
2024-01-03 17:36:34,073 - INFO: - Measuring accuracy of model vs. ground truth...Completed


()
()


[I 2024-01-03 17:39:44,920] Trial 0 finished with value: 0.1684536188840866 and parameters: {'n_estimators': 96, 'learning_rate': 0.6772057433643197, 'max_depth': 6, 'max_leaves': 2}. Best is trial 0 with value: 0.1684536188840866.


--------------------------------------------------------------------------------
RSME=0.1684536188840866 +/- 0.0007142742979340255
Number of finished trials:  1
Best trial:


In [None]:
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

print("  Number of estimators: {}".format(trial.user_attrs["n_estimators"]))

In [10]:
# xgb_grid_parameters = {
#     'learning_rate': [0.1, 0.01],
#     'max_depth': [12, 10 ,8], 
#     'max_depth': [15,10,5],
#     'min_child_weight':[5,3,2,1], 
#     'min_child_weight':[2,3], 
#     'gamma':[1,5, 10],  
#     'gamma':[2.5, 3, 3.5, 4],  
#     'subsample':[i/10.0 for i in range(6,11)],
#     'subsample':[0.5],
#     'colsample_bytree':[i/10.0 for i in range(5,11)], 
#     'colsample_bytree':[0.5], 
#     'n_estimators': [1250, 1000, 750, 500, 200]}
# xgb_reg.set_params(**xgb_grid_parameters)

In [None]:
# client.get_versions(check=True)

# XGBoost - Training using XGBoost native interface

**`xgboost.train`** `(params, dtrain, num_boost_round=10, *, `\
`evals=None, obj=None, feval=None, maximize=None, early_stopping_rounds=None, `\
`evals_result=None, verbose_eval=True, xgb_model=None, callbacks=None, custom_metric=None)`

**Parameters** 

**param** `(Dic[str, Any])`  Booster params

**tree_method** string [default= auto] - The tree construction algorithm used in XGBoost. See description in the reference paper and Tree Methods. \
Choices: `auto, exact, approx, hist` - this is a combination of commonly used updaters. For other updaters like refresh, set the parameter updater directly.\
    `auto:` Same as the hist tree method.\
    `exact:` Exact greedy algorithm. Enumerates all split candidates.\
    `approx:` Approximate greedy algorithm using quantile sketch and gradient histogram.\
    `hist:` Faster histogram optimized approximate greedy algorithm.y algorithm.

**Returns:** Booster: a trained booster model

In [None]:
# del output, dtrain

In [None]:
 
dtrain = xgb.dask.DaskDMatrix(client, train_X, train_y)

dval = xgb.dask.DaskDMatrix(client, val_X, val_y)

In [None]:
if __name__ == "__main__":
 
    # X and y must be Dask dataframes or arrays
    # num_obs = 1e5
    # num_features = 20
    # X = da.random.random(size=(num_obs, num_features), chunks=(1000, num_features))
    # y = da.random.random(size=(num_obs, 1), chunks=(1000, 1))
    # dtrain = xgb.dask.DaskDMatrix(client, X, y)
    # or
    # dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y)
    
    early_stopping_rounds=20
    es = xgb.callback.EarlyStopping(rounds=early_stopping_rounds, save_best=True)
    
    output = xgb.dask.train(
        client,
        {"verbosity": 2, "tree_method": "hist", "objective": "reg:squarederror"},
        dtrain,
        num_boost_round=200,
        evals=[(dtrain, "train"), (dval, "val")],
        # xgb_model= output['booster'],
        callbacks = [es],
    )

In [None]:

type(output['booster'])
# output
output['booster'][133]
output['booster'].best_ntree_limit
# output['history']['train']['rmse']
# output['history']['val']['rmse']
# prev_history = output['history']



In [None]:
plt.plot(output['history']['train']['rmse']);
plt.plot(output['history']['val']['rmse']);

In [None]:
import matplotlib.pylab as pylab
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'x-large',
         'ytick.labelsize':'x-large'}
pylab.rcParams.update(params)

In [None]:
fig = plt.figure(figsize=(20, 20))
plt.yticks(fontsize = 12)
ax = fig.add_subplot()
ax.set_xlim(10,50)
ax = xgb.plot_importance(output['booster'], max_num_features= 30, ax = ax)
# for label in ( ax.get_xticklabels() + ax.get_yticklabels()):
#     label.set_fontsize(22)
ax.get_yticklabels()
# ax.autoscale(enable=None, axis="y", tight=True)


In [None]:
output['booster'][133].save_model('./save_20231218_233500_model.json')

In [None]:
config = output['booster'][133].save_config()
type(config)
print(config)

In [None]:
output['booster'][133].save_model('./save_20231218_233500_model.json')

# Training using XGBoost Scikit-Learn Interface

**XGBRegressor**

Implementation of the scikit-learn API for XGBoost regression. See Using the Scikit-Learn Estimator Interface for more information.

- **Gamma:** Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.

- **max_depth:** Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 indicates no limit on depth. \
  Beware that XGBoost aggressively consumes memory when training a deep tree. exact tree method requires non-zero value.

- **min_child_weight:** Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of \
  instance weight less than min_child_weight, then the building process will give up further partitioning.

- **subsample:** Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. and this will prevent overfitting.\
   Subsampling will occur once in every boosting iteration.
  

- **colsample_bytree:** is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.


- **lambda:** L2 regularization term on weights. Increasing this value will make model more conservative.

In [None]:
from sklearn.metrics import get_scorer_names
# pp.pprint(get_scorer_names())
# for i in get_scorer_names() :
#     if "error" in i:
#         print(i)

In [None]:
early_stop = xgb.callback.EarlyStopping(rounds=10, 
                                        metric_name='rmse', 
                                        # data_name='Validation_0', 
                                        save_best=True, 
                                        maximize = False,
)

In [None]:
early_stop.best_scores
early_stop.metric_name
early_stop.current_rounds
# early_stop.stopping_history
# len(early_stop.best_scores['validation_']['rmse'])

In [None]:
xgb_reg = xgb.XGBRegressor(random_state =123, 
                           n_jobs=None,
                           objective ='reg:squarederror', 
                           eval_metric = "rmse",
                           tree_method='hist', 
                           # early_stopping_rounds= 4,
                           booster = 'gbtree', 
                           # device = "cuda",
                           # gpu_id = 0,
                           # client = client, 
                           verbosity=2,
                           # subsample = 1,
                           # sampling_method="uniform",
                           # callbacks = [early_stop],
                          )
# xgb.XGBRFRegressor 
# xgb.XGBModel

In [None]:

# xgb_reg.set_params(tree_method="hist", device = "cuda")
# xgb_reg.set_params(early_stopping_rounds= 3)
# xgb_reg.set_params(gpu_id = 0)
# xgb.client = client

In [None]:
print('Default Parameters :\n')
pp.pprint(xgb_reg.get_params())

In [None]:
# xgb_pcfp = model_selection(xgb_reg, 
                           # xgb_grid_parameters, train_X, train_y, 
                           # scoring =  'neg_mean_squared_error', cv=5, 
                           # GridSearch = True, 
                           # n_iter=30, 
                           # n_jobs=6, 
                           # verbose= 3)

In [None]:
# xgb_reg.fit(train_X, train_y, verbose = 3, eval_set=[(train_X, train_y), (val_X, val_y)])


In [None]:
# model_train = GridSearchCV(xgb_reg, xgb_grid_parameters, 
#                                # cv=3, 
#                                n_jobs = 1,
#                                scoring = None,
#                                # refit = True,
#                           )
model_train = IncrementalSearchCV(xgb_reg, 
                                  xgb_grid_parameters, 
                                  n_initial_parameters=1,
                                  # cv=3, 
                                  # n_jobs = 1,
                                    patience = 5,
                                    random_state = 1234, 
                               scoring = None,
                               # refit = True,
                          )

In [None]:
with joblib.parallel_backend('dask'):
    # model_train.fit(train_X, train_y, eval_set=[(train_X, train_y),(val_X, val_y)],  verbose = 3 )
    model_train.fit(train_X, train_y, eval_set=[(train_X, train_y),(val_X, val_y)], verbose = 3 )

In [None]:
bst = xgb_reg.get_booster()
history = xgb_reg.evals_result()

In [None]:
# history
xgb_reg.best_iteration
xgb_reg.base_score
xgb_reg.best_ntree_limit
# xgb_reg.best_score
xgb_reg.callbacks
xgb_reg.eval_metric
type(bst)

In [None]:
# len(xgb_reg.feature_importances_[xgb_reg.feature_importances_ < 1e-06])
xgb_reg.feature_importances_ 
xgb_reg.feature_names_in_
xgb_reg.n_features_in_
 

In [None]:
train_score = xgb_reg.score(train_X, train_y)

val_score = xgb_reg.score(val_X, val_y)

test_score = xgb_reg.score(test_X, test_y)

In [None]:
print(f" R2 score - Training   : {train_score:0.6f}")
print(f" R2 score - Validation : {val_score:0.6f}")
print(f" R2 score - Test data  : {test_score:0.6f}")

In [None]:
fig = plt.figure(figsize=(20, 20))
plt.yticks(fontsize = 12)
ax = fig.add_subplot()
ax.set_xlim(10,50)
ax = xgb.plot_importance(xgb_reg, max_num_features= 30, ax = ax)
# for label in ( ax.get_xticklabels() + ax.get_yticklabels()):
#     label.set_fontsize(22)
ax.get_yticklabels()
# ax.autoscale(enable=None, axis="y", tight=True)

## Comments

In [None]:
# idx = my_enumerator()
# fn = model_selectiona
# args = (1,xgb_reg, xgb_grid_parameters, train_X, train_y)

# kwargs = dict({'scoring':'neg_mean_squared_error', 'cv':5, 'GridSearch':True, 'n_iter':30, 'n_jobs':6, 'verbose': 4}) 
 
# start_time = time.perf_counter()
# print(f" {datetime.now().strftime('%X.%f')} | Started ")   
# pool = Pool(processes=2)

# # result = pool.starmap_async(get_pharmacophores, enumerate(df_iterator)) 
# # results = starmap_with_kwargs_async(pool, fn, args, kwargs, processes = 1)
# results = pool.apply_async(fn,args, kwargs)

# print(f" {datetime.now().strftime('%X.%f')} | starmap_with_kwargs_async() | close pool. . .  ") 
# pool.close()

# print(f" {datetime.now().strftime('%X.%f')} | starmap_with_kwargs_async() | Waiting for results. . .  ")    

In [None]:
# %%time

## returns: a: labeled_feature, b: unlabeled_feature, c: labeled_Y, d: df_labeled.index, e: df_unlabled.index
# a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'logLD50_mmolkg', encoder = None)

## Calls 
# xgb_pcfp = model_selection(xgb_reg, 
                           # xgb_grid_parameters, train_X, train_y, 
                           # scoring =  'neg_mean_squared_error', cv=5, 
                           # GridSearch = True, 
                           # n_iter=30, 
                           # n_jobs=6, 
                           # verbose= 3)

In [None]:
#  verbose is 3
# Fitting 5 folds for each of 8 candidates, totalling 40 fits
# Best parameters set found on development set: {'colsample_bytree': 0.5, 'gamma': 2.5, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.5}
# Best score: -0.06345690434364994
# Grid scores on development set:

# -0.06349 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 2.5, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 2, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06346 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 2.5, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06505 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 3, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 2, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06508 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 3, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06638 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 3.5, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 2, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06638 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 3.5, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06788 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 4, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 2, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06788 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 4, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.5}
# CPU times: user 6.46 s, sys: 286 ms, total: 6.74 s
# Wall time: 3min 7s
    
#     	model	params	mean score	std score	rank
# 0	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 2.5, 'learn...	-0.063492	0.001178	2
# 1	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 2.5, 'learn...	-0.063457	0.001010	1
# 2	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 3, 'learnin...	-0.065046	0.000987	3
# 3	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 3, 'learnin...	-0.065080	0.000957	4
# 4	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 3.5, 'learn...	-0.066377	0.001204	5
# 5	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 3.5, 'learn...	-0.066377	0.001204	5
# 6	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 4, 'learnin...	-0.067878	0.001242	7
# 7	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 4, 'learnin...	-0.067878	0.001242	7

In [None]:
# df_results = result_model_selection(results = xgb_pcfp, name='xgb_pcfp');
# df_results

In [None]:
# df_results.sort_values('rank')

In [None]:
# df_results.loc[1].params - 
# {'colsample_bytree': 0.5,
#  'gamma': 2.5,
#  'learning_rate': 0.1,
#  'max_depth': 15,
#  'min_child_weight': 3,
#  'n_estimators': 1500,
#  'subsample': 0.5} 


# df_results.loc[2].params = 
# {'colsample_bytree': 0.5,
#  'gamma': 2.5,
#  'learning_rate': 0.1,
#  'max_depth': 15,
#  'min_child_weight': 3,
#  'n_estimators': 1500,
#  'subsample': 0.5}

# df_results.loc[12].params

In [None]:
ALL_RESULTS = []
ALL_RESULTS.append(df_results)

# XGBoost - Cancer Dataset (Classification example)

In [None]:
import os
import shutil
import optuna
import sklearn.datasets
import sklearn.metrics
import xgboost as xgb

SEED = 108
N_FOLDS = 3
CV_RESULT_DIR = "./xgboost_cv_results"

In [None]:
"""
Optuna example that optimizes a classifier configuration for cancer dataset using XGBoost.

In this example, we optimize the accuracy of cancer detection using the XGBoost. The accuracy is
estimated by cross-validation. We optimize both the choice of booster model and its
hyperparameters.
"""
def propose_parameters(trial):
    param = {
        "verbosity": 0,
        "objective"  :  "reg:squarederror",
        "eval_metric":  "rmse",
        "booster"    :  "gbtree",   ## trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    return param
    
def objective(trial):
    # (data, target) = sklearn.datasets.load_breast_cancer(return_X_y=True)
    df_X = dd.read_csv(profilesFile, usecols=X_columns, dtype=x_columns_dtype)
    df_y = dd.read_csv(profilesFile, usecols=y_columns, dtype=y_columns_dtype)
    
    dtrain = xgb.DMatrix(df_X, label=df_y)

    param = propose_parameters(trial)
     
    xgb_cv_results = xgb.cv(
        params=param,
        dtrain=dtrain,
        num_boost_round=10000,
        nfold=N_FOLDS,
        stratified=True,
        early_stopping_rounds=100,
        seed=SEED,
        verbose_eval=False,
    )

    # Set n_estimators as a trial attribute; Accessible via study.trials_dataframe().
    trial.set_user_attr("n_estimators", len(xgb_cv_results))

    # Save cross-validation results.
    filepath = os.path.join(CV_RESULT_DIR, "{}.csv".format(trial.number))
    xgb_cv_results.to_csv(filepath, index=False)

    # Extract the best score.
    best_score = xgb_cv_results["test-auc-mean"].values[-1]
    return best_score

In [None]:
if __name__ == "__main__":
    if not os.path.exists(CV_RESULT_DIR):
        os.mkdir(CV_RESULT_DIR)

    study = optuna.create_study(direction="maximize")
    
    study.optimize(objective, n_trials=20, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  Number of estimators: {}".format(trial.user_attrs["n_estimators"]))

    shutil.rmtree(CV_RESULT_DIR)

In [None]:
start = datetime.now()
study = optuna.delete_study(storage="sqlite:///example.db",
                            study_name="kevin-study-1")