# XGBoost - WithDask

**Using Optuna for hyper-parameter search  to predict TPSA from Pharmacophores**

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
%load_ext autoreload  
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [2]:
# Models
import os
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import pickle
import itertools
from collections.abc import Iterator
from   datetime import datetime
from pprint import PrettyPrinter
import joblib

from utils import *
from utils_ml import model_selection
# from multiprocessing import Pool, process

pp = PrettyPrinter(indent=4)
np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan')
pd.options.display.width = 170

#### xgboost and dask imports 

In [3]:
import joblib
# from dask_cuda import LocalCUDACluster
# from sklearn.model_selection import GridSearchCV
import optuna

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import dask
import dask.array as da
import dask.dataframe as dd
from dask import delayed
from dask.distributed import Client
from dask.distributed import LocalCluster
import dask_ml.model_selection as dcv
from dask_ml.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV, IncrementalSearchCV, HyperbandSearchCV
from dask_ml.metrics import mean_squared_error, r2_score, mean_squared_log_error
import sklearn.metrics as skm



In [4]:
os.environ["WANDB_NOTEBOOK_NAME"] = "Adashare_Train.ipynb"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
# time.strftime(' %x%X')
# datetime.now().strftime('%X.%f')
# time.strftime('%X %x %Z')
print(datetime.now().strftime('%D-%X.%f'))
time_fmt = '%Y-%M-%d %H:%m:%S.%f'
print(datetime.now().strftime(time_fmt))

01/10/24-11:51:40.334060
2024-51-10 11:01:40.334201


In [7]:
import logging
logLevel = os.environ.get('LOG_LEVEL', 'INFO').upper()
FORMAT = '%(asctime)s - %(levelname)s: - %(message)s'
logging.basicConfig(level="INFO", format= FORMAT)
logging.getLogger("imported_module").setLevel(logging.CRITICAL)
logging.info(f" 1/7- engine connected")
logging.warning(f" 1/7- engine connected")
logging.error(f" 1/7- engine connected")
logging.critical(f" 1/7- engine connected")

2024-01-10 11:51:40,384 - INFO: -  1/7- engine connected
2024-01-10 11:51:40,386 - ERROR: -  1/7- engine connected
2024-01-10 11:51:40,387 - CRITICAL: -  1/7- engine connected


In [8]:
print(logging.NOTSET, logging.DEBUG,  logging.INFO, logging.WARN, logging.WARNING, logging.ERROR, logging.CRITICAL,  logging.FATAL)

0 10 20 30 30 40 50 50


In [9]:
def result_model_selection(results, name):
    df_results = pd.DataFrame({'model'     : [name] * len(results.cv_results_['params']),
                               'params'    : results.cv_results_['params'],
                               'mean score': results.cv_results_['mean_test_score'],
                               'std score' : results.cv_results_['std_test_score'],
                               'rank'      : results.cv_results_['rank_test_score']
                              })
    return df_results

#### Datasets

In [10]:
prefix = '' ### Target-2' , 'MOA'
input_path ="./input/"
output_path ="./output_11102023/"
prefix_lc = prefix.lower().replace('-', '_')

CompoundExtendedMetadata2SampleFile = f"{output_path}{prefix_lc}compound_extended_metadata_2samples.csv"
CompoundProfiles2SampleFileCSV      = f"{output_path}{prefix_lc}compound_profiles_2samples.csv"
CompoundExtendedMetadataSampleFile  = f"{output_path}{prefix_lc}compound_extended_metadata_samples.csv"
featureSelectionFile                = f"./metadata/feature_selection_columns.pkl"
trainingMetrics                     = f"{output_path}{prefix_lc}training_metrics.pkl"
print()
print(f" Compound Extended Metadata 2 SampleFile  : {CompoundExtendedMetadata2SampleFile }")
print(f" Compound Profiles 2 Samples File CSV     : {CompoundProfiles2SampleFileCSV}")
print(f" ")
print(f" featureSelectionFile                     : {featureSelectionFile}")
print(f" training metrics                         : {trainingMetrics}")


 Compound Extended Metadata 2 SampleFile  : ./output_11102023/compound_extended_metadata_2samples.csv
 Compound Profiles 2 Samples File CSV     : ./output_11102023/compound_profiles_2samples.csv
 
 featureSelectionFile                     : ./metadata/feature_selection_columns.pkl
 training metrics                         : ./output_11102023/training_metrics.pkl


#### Create dask cluster and client 

In [11]:
try:
    client.close()
    del client
except Exception as e:
    print("Client close failed")

Client close failed


In [12]:
try:
    cluster.close()
    del cluster
except Exception as e:
    print("Cluster close failed")


Cluster close failed


In [13]:
n_workers = 6
n_threads = 2
cluster = LocalCluster("Kevins_Cluster", n_workers=n_workers, threads_per_worker= n_threads, silence_logs = logging.WARNING)
# cluster = LocalCluster("Kevins_Cluster", n_workers=2, threads_per_worker=2)
# cluster = LocalCluster()

In [14]:
print(cluster)
# cluster
cluster.dashboard_link
cluster.worker_spec
# cluster.
# cluster.workers[0].status
# cluster.scale(5)
# cluster.close()
# del cluster

Kevins_Cluster(Kevins_Cluster, 'tcp://127.0.0.1:37969', workers=6, threads=12, memory=64.00 GiB)


'http://127.0.0.1:8787/status'

{0: {'cls': distributed.nanny.Nanny,
  'options': {'memory_limit': 11453246122,
   'host': '127.0.0.1',
   'nthreads': 2,
   'services': {},
   'dashboard_address': None,
   'dashboard': False,
   'interface': None,
   'protocol': 'tcp://',
   'security': Security(require_encryption=False, tls_min_version=771),
   'silence_logs': 30}},
 1: {'cls': distributed.nanny.Nanny,
  'options': {'memory_limit': 11453246122,
   'host': '127.0.0.1',
   'nthreads': 2,
   'services': {},
   'dashboard_address': None,
   'dashboard': False,
   'interface': None,
   'protocol': 'tcp://',
   'security': Security(require_encryption=False, tls_min_version=771),
   'silence_logs': 30}},
 2: {'cls': distributed.nanny.Nanny,
  'options': {'memory_limit': 11453246122,
   'host': '127.0.0.1',
   'nthreads': 2,
   'services': {},
   'dashboard_address': None,
   'dashboard': False,
   'interface': None,
   'protocol': 'tcp://',
   'security': Security(require_encryption=False, tls_min_version=771),
   'silence

In [15]:
client = Client(cluster)

# client = Client(cluster.scheduler_address)
# client = Client("tcp://127.0.0.1:37937")
# client = Client(processes = False)
# client = Client("tcp://127.0.0.1:37937")
# client = Client(processes = False)
# client = Client(cluster.scheduler_address)

In [16]:
client.status
# client.start

# client.status
# client.close()
# client.status
# del client

'running'

# Data Load and Prep

In [17]:
# with open("./metadata/parquet_columns.pkl",'rb') as f:
#     ParquetColumns = pickle.load(f)

# for k,v in ParquetColumns.items():
#     print(f" {k:20s}   items: {len(v)}")

# type(ParquetColumns['Cells']['Cells_AreaShape_Area'])
# ParquetColumns['Cells']
# del ParquetColumns

In [18]:
# df_X = dd.read_csv(profilesFile, blocksize="100MB", usecols=X_columns, dtype= x_columns_dtype)  ##, index_col = 'CASRN')
# df_y = dd.read_csv(profilesFile, blocksize="100MB", usecols=y_columns, dtype=y_columns_dtype)  ##, index_col = 'CASRN')

# df_X_array = df_X_array.rechunk(chunks=(10000,-1))
# df_y_array = df_y.to_dask_array(lengths = True)
# df_X_array.to_zarr('df_X_array.zarr' ) 
# df_y_array.to_zarr('df_y_array.zarr' ) 

# df_X_array.to_hdf5('df_X_array.hdf5' , '/x')  
# df_y_array.to_hdf5('df_y_array.hdf5' , '/x')  

# df_y = df_profiles[y_columns].compute()
# df_X = df_profiles[list(x['selected'])] ## .drop(labels=x_columns_drop, axis =1)

# df_X_array = dask.array.from_zarr('df_X_array.zarr' )
# df_y_array = dask.array.from_zarr('df_y_array.zarr' )

In [19]:
# def read_profiles(n_rows = 50000):
#     return    pd.read_csv(CompoundProfiles2SampleFileCSV, usecols=all_columns, dtype= all_columns_dtype, nrows = 50000)     

# read_profiles = delayed(read_profiles)
# read_profiles

### Features metadata file

In [20]:
print(f" Features select file:  {featureSelectionFile}")

with open(featureSelectionFile, 'rb') as f: 
    x = pickle.load(f)
for i in x:
    print(f" {i:20s}    {len(x[i])} ")

X_columns = x['selected']
y_columns = ["Metadata_log10TPSA"]

all_columns = ["Metadata_log10TPSA"]
all_columns.extend(x['selected'])

x_columns_drop = ["Metadata_Source", "Metadata_Batch", "Metadata_Plate", "Metadata_Well", "Metadata_TPSA", "Metadata_lnTPSA", "Metadata_log10TPSA"]
# x_columns_drop.extend(["Metadata_JCP2022"])

x_columns_dtype = {x: np.dtype('float32') for x in X_columns}
y_columns_dtype = {x: np.dtype('float32') for x in y_columns} ## "Metadata_log10TPSA":np.dtype('float64')}
all_columns_dtype = {x: np.dtype('float32') for x in all_columns}

print(f" len(x_columms)    : {len(X_columns)}")
print(f" len(y_columms)    : {len(y_columns)}")
print(f" len(all_columms)  : {len(all_columns)}")

 Features select file:  ./metadata/feature_selection_columns.pkl
 selected                1477 
 dropped_correlation     2193 
 dropped_variance        0 
 len(x_columms)    : 1477
 len(y_columms)    : 1
 len(all_columms)  : 1478


### Read compound profiles

In [21]:
print(f" Profiles file       :  {CompoundProfiles2SampleFileCSV}")
print()

df_profiles = dd.read_csv(CompoundProfiles2SampleFileCSV, usecols=all_columns, dtype= all_columns_dtype)   
# df_profiles.head(3)
df_profiles.shape
df_profiles.npartitions
df_profiles.get_partition(1).shape

df_profiles = df_profiles.head(npartitions = df_profiles.npartitions, n=50000)
# type(df_profiles)
# df_profiles.shape
# df_profiles.head()

df_profiles = dd.from_pandas(df_profiles, npartitions = 100)
type(df_profiles)
df_profiles.shape
# df_profiles.head()

 Profiles file       :  ./output_11102023/compound_profiles_2samples.csv



(Delayed('int-4a0a3090-00f0-40f8-b14f-94f4549b31b3'), 1478)

261

(Delayed('int-f2cf00d6-4559-478d-ba05-da1b66160a6e'), 1478)



dask.dataframe.core.DataFrame

(Delayed('int-4af9f9d0-5f4f-4a28-9f7c-e93bea7601fe'), 1478)

# XGBoost with Dask - Development 

- Need to convert Dask data frames to dask arrays in order to use data in parm grid search

In [None]:
# df_X_array = df_X_array.rechunk(chunks=(10000,-1))
# df_y_array = df_y_array.rechunk(chunks=(10000,-1))
# df_X_array.chunks
# df_y_array.chunks
# df_y_array.shape

# train_X, test_X,  train_y, test_y = train_test_split(df_X_array, df_y_array, 
#                                                      train_size= 0.8, 
#                                                      test_size=0.2, 
#                                                      random_state= 1234, 
#                                                      shuffle=True)
#
# print(f" Training data         : {train_X.shape} \t training labels: {train_y.shape}")
# print(f" Test & Validation data: {test_X.shape}  \t test labels    : {test_y.shape}")

# val_X ,  test_X,  val_y  , test_y = train_test_split(test_X, test_y, 
#                                                      train_size= 0.5, 
#                                                      test_size=0.5, 
#                                                      random_state= 1234, 
#                                                      shuffle=True)
# print()
# print(f" Training data   : {train_X.shape} \t Training labels: {train_y.shape}")
# print(f" Validation data : {test_X.shape} \t Val labels     : {test_y.shape}")
# print(f" Test set data   : {test_X.shape} \t Test labels    : {test_y.shape}")

In [22]:
def make_cv_splits(n_folds: int = 5,) -> Iterator[tuple[dd.DataFrame, dd.DataFrame]]:
    frac = [1 / n_folds] * n_folds
    print(frac, n_folds)
    splits = df_profiles.random_split(frac, shuffle=True)
    # print(type(splits[i]))
    # print(f"splits: {type(splits)} ")
    for i in range(n_folds):
        print(f"Training/Test split #{i}")
        train = [splits[j] for j in range(n_folds) if j != i]
        train = dd.concat(train)
        test = splits[i] 
        y_train = train[y_columns]
        X_train = train.drop(columns=y_columns)
        y_test = test[y_columns]
        X_test = test.drop(columns=y_columns)        
        yield (X_train, y_train), (X_test, y_test)
        # yield train, test

In [23]:
study_params = {  'learning_rate': 0.255, 'max_depth': 4, 'max_leaves': 10} 
mse_scores = []
R2_scores = []
print(study_params)
iter_files = make_cv_splits()

{'learning_rate': 0.255, 'max_depth': 4, 'max_leaves': 10}


### Beginning of loop

In [25]:
(X_train,y_train), (X_test, y_test) = next(iter_files)

[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


In [26]:
logging.info(f"Building TRAINING DMatrix...")
d_train = xgb.dask.DaskDMatrix(client, X_train, y_train )
logging.info(f"Building TRAINING DMatrix...Completed")

2024-01-10 11:32:56,688 - INFO: - Building TRAINING DMatrix...
2024-01-10 11:33:41,838 - INFO: - Building TRAINING DMatrix...Completed


In [27]:
logging.info(f"Building TEST DMatrix...")
d_test = xgb.dask.DaskDMatrix(client, X_test, y_test )
logging.info(f"Building TEST DMatrix...Completed")

2024-01-10 11:33:41,937 - INFO: - Building TEST DMatrix...
2024-01-10 11:34:00,287 - INFO: - Building TEST DMatrix...Completed


In [28]:
X_train.shape
y_train.shape

X_test.shape
y_test.shape

(Delayed('int-4c45f529-9352-476c-b37a-e3ce71157deb'), 1477)

(Delayed('int-25c32419-db62-49d6-b783-1acbda4abdce'), 1)

(Delayed('int-b873c60e-649a-492d-a428-970ff4bf55eb'), 1477)

(Delayed('int-9300455a-14f8-4c23-9069-cf3bb3bd737a'), 1)

In [29]:
logging.info(f"Training model...")
model = xgb.dask.train(
    None,
    {"verbosity": 0, "tree_method": "hist", **study_params},
    d_train,
    num_boost_round=100,
    evals=[(d_train, "train"), (d_test, "test")],
    verbose_eval=False
)
logging.info(f"Training model...Completed")

2024-01-10 11:34:00,401 - INFO: - Training model...
2024-01-10 11:34:00,476 - INFO: - start listen on 10.91.28.16:38809
[11:34:00] task [xgboost.dask-0]:tcp://127.0.0.1:33361 got new rank 0
[11:34:01] task [xgboost.dask-1]:tcp://127.0.0.1:41587 got new rank 1
[11:34:01] task [xgboost.dask-2]:tcp://127.0.0.1:42645 got new rank 2
[11:34:01] task [xgboost.dask-3]:tcp://127.0.0.1:42945 got new rank 3
[11:34:01] task [xgboost.dask-4]:tcp://127.0.0.1:45569 got new rank 4
[11:34:01] task [xgboost.dask-5]:tcp://127.0.0.1:46015 got new rank 5
2024-01-10 11:34:01,006 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 11:34:46,403 - INFO: - @tracker All nodes finishes job
2024-01-10 11:34:46,561 - INFO: - Training model...Completed


In [62]:
logging.info(f"Running model on test data...")
predictions = xgb.dask.predict(client, model,d_test).compute()
logging.info(f"Running model on test data...Completed")

2024-01-10 11:43:10,572 - INFO: - Running model on test data...
2024-01-10 11:43:13,191 - INFO: - Running model on test data...Completed


In [60]:
logging.info(f"Reshape y_test data...")
# y_test_c = y_test.to_dask_array(lengths=True)
y_test_c = y_test.compute().to_numpy().squeeze()
logging.info(f"Reshape y_test data...Complete")

2024-01-10 11:42:53,954 - INFO: - Reshape y_test data...
2024-01-10 11:43:10,418 - INFO: - Reshape y_test data...Complete


In [64]:
print(f" y_test     : {type(y_test)} \t shape: {y_test.shape}") 
print(f" y_test_c   : {type(y_test_c)} \t\t\t shape: {y_test_c.shape}")
print(f" Predictions: {type(predictions)} \t\t\t shape: {predictions.shape}")

 y_test     : <class 'dask.dataframe.core.DataFrame'> 	 shape: (Delayed('int-f3e0b477-4527-4589-ab98-e8211d5268b9'), 1)
 y_test_c   : <class 'numpy.ndarray'> 			 shape: (10108,)
 Predictions: <class 'numpy.ndarray'> 			 shape: (10108,)


In [None]:
logging.info(f"Calculating R2 and MSE metrics...")
R2_score = skm.r2_score(
    y_true = y_test_c,
    y_pred = predictions,
)

mse_score = skm.mean_squared_error(
    y_true = y_test_c,
    y_pred = predictions,
    squared=True,
)
logging.info(f"Calculating R2 and MSE metrics...Completed")

In [None]:
logging.info(f"Measuring accuracy of model vs. ground truth...")
mse_score = mean_squared_error(
    y_true = y_test_c,
    y_pred = predictions,
    squared=True,
    # compute=True,
)
 
R2_score = r2_score(
    # y_true = d_test.get_label(),
    # y_true = y_test.to_dask_array(),
    y_true = y_test_c,
    y_pred = predictions,
    compute=True,
)
logging.info(f"Measuring accuracy of model vs. ground truth...Completed")

In [72]:
# print(score.shape, type(score))
print(y_test_c)
print(predictions)
print("mse_score: " , mse_score)
print("R2_score : " , R2_score)


mse_scores
R2_scores

[1.5931753 1.5901729 1.7077404 ... 1.9105177 1.93465   1.9674076]
[1.8370492 1.8152739 1.8682554 ... 1.8270347 1.7794722 1.8226285]
mse_score:  0.02311318
R2_score :  0.03680282529484136


[0.02311318, 0.02311318]

[0.03680282529484136, 0.03680282529484136]

In [44]:
del d_train, d_test
del X_train, y_train, X_test, y_test, y_test_c,
del model, predictions, mse_score, R2_score

### End of loop

In [40]:
# for x in mse_scores:
#     print(x.compute())
mse_scores_np = np.array(mse_scores, dtype =np.float64)
mse_scores_np
R2_scores_np = np.array(R2_scores, dtype =np.float64)
R2_scores_np

array([0.023117])

array([0.03237749])

In [None]:
print(f"RSME : {mse_scores_np.mean():.5f} +/- {mse_scores_np.std():.5f}")
# return mse_scores.mean()
print("-" * 80)
print(f"R^2  : {R2_scores_np.mean():.5f} +/- {R2_scores_np.std():.5f}")
# return mse_scores.mean()


In [None]:
# type(model)
data = { 'mse_scores' : mse_scores_np,
         'R2_scores' : R2_scores_np}
    
with open(trainingMetrics, 'wb') as f: 
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

In [None]:
with open(trainingMetrics, 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    data = pickle.load(f)
data

# XGBoost - Using Optuna for hyper-parameter search

In [22]:
def make_cv_splits(n_folds: int = 5,) -> Iterator[tuple[dd.DataFrame, dd.DataFrame]]:
    frac = [1 / n_folds] * n_folds
    print(frac, n_folds)
    splits = df_profiles.random_split(frac, shuffle=True)
    # print(type(splits[i]))
    # print(f"splits: {type(splits)} ")
    for i in range(n_folds):
        print(f"Training/Test split #{i}")
        train = [splits[j] for j in range(n_folds) if j != i]
        train = dd.concat(train)
        test = splits[i] 
        y_train = train[y_columns]
        X_train = train.drop(columns=y_columns)
        y_test = test[y_columns]
        X_test = test.drop(columns=y_columns)        
        yield (X_train, y_train), (X_test, y_test)
        # yield train, test

In [23]:
def train_model(iter_files, **study_params):
    mse_scores = []
    R2_scores = []
 
    
    # for i, (train, test) in enumerate(make_cv_splits()):
    for i, ((X_train,y_train), (X_test, y_test)) in enumerate(iter_files):
 

        logging.info(f"Building training DMatrix...")
        d_train = xgb.dask.DaskDMatrix( client, X_train, y_train)
        logging.info(f"Building training DMatrix...Completed")

        logging.info(f"Building test DMatrix...")
        d_test = xgb.dask.DaskDMatrix(client, X_test, y_test)
        logging.info(f"Building DMatrix...Completed")

        logging.info(f"Training model...")
        model = xgb.dask.train(
            None, 
            {"verbosity":0, "tree_method": "hist", **study_params},
            d_train,
            num_boost_round=50,
            evals=[(d_train, "train"), (d_test, "test")],
            verbose_eval=False
        )
        logging.info(f"Training model...Completed")
        
        logging.info(f"Running model on test data...")
        predictions = xgb.dask.predict(client, model, d_test).compute()
        logging.info(f"Running model on test data...Completed")

        logging.info(f"Reshape y_test data...")
        # y_test_c = y_test.to_dask_array(lengths=True).compute().squeeze()
        y_test_c = y_test.compute().to_numpy().squeeze()
        logging.info(f"Reshape y_test data...Complete")

        print(f" y_test_c: {type(y_test_c)}   Shape:  {y_test_c.shape}       Predictions: {type(predictions)}  shape: {predictions.shape}")
        
        logging.info(f"Compute R2 & MSE scores...")
        R2_score = skm.r2_score(
            y_true = y_test_c,
            y_pred = predictions,
            # compute=True,
        )
        
        mse_score = skm.mean_squared_error(
            y_true = y_test_c,
            y_pred = predictions,
            squared=True,
            # compute=True,
        )
        logging.info(f"Compute R2 & MSE scores...Completed")

        # print(y_test_c)
        # print(predictions)
        print(f"MSE Score : {mse_score:.6f}")
        print(f" R2 Score : {R2_score:.6f}")
        
        mse_scores.append(mse_score)
        R2_scores.append(R2_score)        
        
        # Compute predictions and mean squared error for this iteration
        # while we start the next one
        # scores.append(score.reshape(1).persist())
        del d_train, d_test
        del X_train, y_train, X_test, y_test, y_test_c,
        del model, predictions, mse_score, R2_score        
        print("-" * 80)

    # scores = da.concatenate(scores).compute()
    # print(f"RSME={scores.mean()} +/- {scores.std()}")
    mse_scores_np = np.array(mse_scores, dtype =np.float64)
    R2_scores_np = np.array(R2_scores, dtype =np.float64)
    mse_scores_np
    R2_scores_np    

    print(f"RSME : {mse_scores_np.mean():.5f} +/- {mse_scores_np.std():.5f}")
    # return mse_scores.mean()
    print("-" * 80)
    print(f"R^2  : {R2_scores_np.mean():.5f} +/- {R2_scores_np.std():.5f}")
    print("-" * 80)
    return mse_scores_np.mean()    


In [24]:
def objective(trial):
    iter_files = make_cv_splits()
    params = {
        # "n_estimators"     : trial.suggest_int("n_estimators", 75, 125),
        "learning_rate"    : trial.suggest_float("learning_rate", 0.2, 0.7),
        # "colsample_bytree" : trial.suggest_float("colsample_bytree", 0.5, 1),
        # "colsample_bynode" : trial.suggest_float("colsample_bynode", 0.5, 1),
        # "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1),
        # "reg_lambda"       : trial.suggest_float("reg_lambda", 0, 1),
        "max_depth"        : trial.suggest_int("max_depth", 1, 6),
        "max_leaves"       : trial.suggest_int("max_leaves", 0, 2),
        # "max_cat_to_onehot": trial.suggest_int("max_cat_to_onehot", 1, 10),
    }
    
    print(f"Training model (trial #{trial.number}) - Parameters:")
    for k, v in params.items():
        print(f"  {k}={v}")
    return train_model(iter_files, **params)
    # return params

In [32]:
# try:
#     optuna.delete_study(storage="sqlite:///example.db", study_name="kevin-study-1")
# except Exception as e:
#     print("delete failed")

In [25]:
start = datetime.now()
study = optuna.create_study(storage="sqlite:///example.db",
                            study_name="kevin-study-1",
                            direction="minimize", load_if_exists=True)

print(f"Total time:  {datetime.now() - start}")

[I 2024-01-10 11:52:33,415] Using an existing study with name 'kevin-study-1' instead of creating a new one.


Total time:  0:00:00.193502


In [26]:
start = datetime.now()
study.optimize(objective, n_trials=10, timeout=60000)
print(f"Total time:  {datetime.now() - start}")

Training model (trial #6) - Parameters:
  learning_rate=0.5546995150921215
  max_depth=1
  max_leaves=0
[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


2024-01-10 11:55:24,974 - INFO: - Building training DMatrix...
2024-01-10 11:56:13,889 - INFO: - Building training DMatrix...Completed
2024-01-10 11:56:13,890 - INFO: - Building test DMatrix...
2024-01-10 11:56:33,992 - INFO: - Building DMatrix...Completed
2024-01-10 11:56:33,998 - INFO: - Training model...
2024-01-10 11:56:34,073 - INFO: - start listen on 10.91.28.16:55629
[11:56:34] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[11:56:34] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[11:56:34] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[11:56:34] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[11:56:34] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[11:56:34] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 11:56:34,624 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 11:56:46,445 - INFO: - @tracker All nodes finishes job
2024-01-10 11:56:46,558 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10119,)       Predictions: <class 'numpy.ndarray'>  shape: (10119,)
MSE Score : 0.022843
 R2 Score : 0.018665
--------------------------------------------------------------------------------
Training/Test split #1


2024-01-10 11:57:06,548 - INFO: - Building training DMatrix...
2024-01-10 11:57:59,236 - INFO: - Building training DMatrix...Completed
2024-01-10 11:57:59,237 - INFO: - Building test DMatrix...
2024-01-10 11:58:19,696 - INFO: - Building DMatrix...Completed
2024-01-10 11:58:19,702 - INFO: - Training model...
2024-01-10 11:58:19,711 - INFO: - start listen on 10.91.28.16:51805
[11:58:19] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[11:58:19] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[11:58:19] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[11:58:19] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[11:58:19] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[11:58:19] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 11:58:19,780 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 11:58:31,893 - INFO: - @tracker All nodes finishes job
2024-01-10 11:58:32,041 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9931,)       Predictions: <class 'numpy.ndarray'>  shape: (9931,)
MSE Score : 0.023839
 R2 Score : 0.012547
--------------------------------------------------------------------------------
Training/Test split #2


2024-01-10 11:58:52,340 - INFO: - Building training DMatrix...
2024-01-10 11:59:48,622 - INFO: - Building training DMatrix...Completed
2024-01-10 11:59:48,624 - INFO: - Building test DMatrix...
2024-01-10 12:00:10,256 - INFO: - Building DMatrix...Completed
2024-01-10 12:00:10,262 - INFO: - Training model...
2024-01-10 12:00:10,269 - INFO: - start listen on 10.91.28.16:39923
[12:00:10] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:00:10] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:00:10] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:00:10] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:00:10] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:00:10] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:00:10,343 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:00:21,752 - INFO: - @tracker All nodes finishes job
2024-01-10 12:00:21,874 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10043,)       Predictions: <class 'numpy.ndarray'>  shape: (10043,)
MSE Score : 0.024377
 R2 Score : 0.016114
--------------------------------------------------------------------------------
Training/Test split #3


2024-01-10 12:00:42,125 - INFO: - Building training DMatrix...
2024-01-10 12:01:36,114 - INFO: - Building training DMatrix...Completed
2024-01-10 12:01:36,134 - INFO: - Building test DMatrix...
2024-01-10 12:01:57,572 - INFO: - Building DMatrix...Completed
2024-01-10 12:01:57,579 - INFO: - Training model...
2024-01-10 12:01:57,585 - INFO: - start listen on 10.91.28.16:41293
[12:01:57] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:01:57] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:01:57] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:01:57] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:01:57] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:01:57] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:01:57,664 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:02:08,779 - INFO: - @tracker All nodes finishes job
2024-01-10 12:02:08,871 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9984,)       Predictions: <class 'numpy.ndarray'>  shape: (9984,)
MSE Score : 0.023065
 R2 Score : 0.017077
--------------------------------------------------------------------------------
Training/Test split #4


2024-01-10 12:02:30,657 - INFO: - Building training DMatrix...
2024-01-10 12:03:31,419 - INFO: - Building training DMatrix...Completed
2024-01-10 12:03:31,420 - INFO: - Building test DMatrix...
2024-01-10 12:03:53,225 - INFO: - Building DMatrix...Completed
2024-01-10 12:03:53,232 - INFO: - Training model...
2024-01-10 12:03:53,237 - INFO: - start listen on 10.91.28.16:60103
[12:03:53] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:03:53] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:03:53] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:03:53] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:03:53] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:03:53] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:03:53,313 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:04:06,200 - INFO: - @tracker All nodes finishes job
2024-01-10 12:04:06,322 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9923,)       Predictions: <class 'numpy.ndarray'>  shape: (9923,)
MSE Score : 0.022888
 R2 Score : 0.013232
--------------------------------------------------------------------------------
RSME : 0.02340 +/- 0.00061
--------------------------------------------------------------------------------
R^2  : 0.01553 +/- 0.00231
--------------------------------------------------------------------------------
Training model (trial #7) - Parameters:
  learning_rate=0.46971462253500823
  max_depth=5
  max_leaves=1
[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


2024-01-10 12:04:28,455 - INFO: - Building training DMatrix...
2024-01-10 12:05:28,711 - INFO: - Building training DMatrix...Completed
2024-01-10 12:05:28,712 - INFO: - Building test DMatrix...
2024-01-10 12:05:51,292 - INFO: - Building DMatrix...Completed
2024-01-10 12:05:51,299 - INFO: - Training model...
2024-01-10 12:05:51,305 - INFO: - start listen on 10.91.28.16:59061
[12:05:51] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:05:51] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:05:51] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:05:51] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:05:51] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:05:51] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:05:51,376 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:06:02,817 - INFO: - @tracker All nodes finishes job
2024-01-10 12:06:02,902 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9993,)       Predictions: <class 'numpy.ndarray'>  shape: (9993,)
MSE Score : 0.023751
 R2 Score : -0.000329
--------------------------------------------------------------------------------
Training/Test split #1


2024-01-10 12:06:25,283 - INFO: - Building training DMatrix...
2024-01-10 12:07:20,815 - INFO: - Building training DMatrix...Completed
2024-01-10 12:07:20,817 - INFO: - Building test DMatrix...
2024-01-10 12:07:42,869 - INFO: - Building DMatrix...Completed
2024-01-10 12:07:42,872 - INFO: - Training model...
2024-01-10 12:07:42,883 - INFO: - start listen on 10.91.28.16:52399
[12:07:42] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:07:42] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:07:42] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:07:42] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:07:42] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:07:42] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:07:42,972 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:07:54,549 - INFO: - @tracker All nodes finishes job
2024-01-10 12:07:54,654 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10029,)       Predictions: <class 'numpy.ndarray'>  shape: (10029,)
MSE Score : 0.022806
 R2 Score : -0.000005
--------------------------------------------------------------------------------
Training/Test split #2


2024-01-10 12:08:16,352 - INFO: - Building training DMatrix...
2024-01-10 12:09:18,280 - INFO: - Building training DMatrix...Completed
2024-01-10 12:09:18,282 - INFO: - Building test DMatrix...
2024-01-10 12:09:41,760 - INFO: - Building DMatrix...Completed
2024-01-10 12:09:41,762 - INFO: - Training model...
2024-01-10 12:09:41,773 - INFO: - start listen on 10.91.28.16:34457
[12:09:41] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:09:41] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:09:41] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:09:41] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:09:41] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:09:41] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:09:41,846 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:09:52,839 - INFO: - @tracker All nodes finishes job
2024-01-10 12:09:53,011 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9986,)       Predictions: <class 'numpy.ndarray'>  shape: (9986,)
MSE Score : 0.024371
 R2 Score : -0.000015
--------------------------------------------------------------------------------
Training/Test split #3


2024-01-10 12:10:15,512 - INFO: - Building training DMatrix...
2024-01-10 12:11:13,057 - INFO: - Building training DMatrix...Completed
2024-01-10 12:11:13,076 - INFO: - Building test DMatrix...
2024-01-10 12:11:35,850 - INFO: - Building DMatrix...Completed
2024-01-10 12:11:35,858 - INFO: - Training model...
2024-01-10 12:11:35,864 - INFO: - start listen on 10.91.28.16:37515
[12:11:35] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:11:35] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:11:35] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:11:35] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:11:35] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:11:35] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:11:35,945 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:11:47,407 - INFO: - @tracker All nodes finishes job
2024-01-10 12:11:47,512 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9905,)       Predictions: <class 'numpy.ndarray'>  shape: (9905,)
MSE Score : 0.024052
 R2 Score : -0.000277
--------------------------------------------------------------------------------
Training/Test split #4


2024-01-10 12:12:09,432 - INFO: - Building training DMatrix...
2024-01-10 12:13:08,620 - INFO: - Building training DMatrix...Completed
2024-01-10 12:13:08,623 - INFO: - Building test DMatrix...
2024-01-10 12:13:32,218 - INFO: - Building DMatrix...Completed
2024-01-10 12:13:32,219 - INFO: - Training model...
2024-01-10 12:13:32,228 - INFO: - start listen on 10.91.28.16:53979
[12:13:32] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:13:32] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:13:32] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:13:32] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:13:32] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:13:32] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:13:32,305 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:13:43,584 - INFO: - @tracker All nodes finishes job
2024-01-10 12:13:43,752 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10087,)       Predictions: <class 'numpy.ndarray'>  shape: (10087,)
MSE Score : 0.023893
 R2 Score : -0.000010
--------------------------------------------------------------------------------
RSME : 0.02377 +/- 0.00053
--------------------------------------------------------------------------------
R^2  : -0.00013 +/- 0.00014
--------------------------------------------------------------------------------
Training model (trial #8) - Parameters:
  learning_rate=0.4347161529731449
  max_depth=4
  max_leaves=2
[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


2024-01-10 12:14:07,243 - INFO: - Building training DMatrix...
2024-01-10 12:15:10,493 - INFO: - Building training DMatrix...Completed
2024-01-10 12:15:10,496 - INFO: - Building test DMatrix...
2024-01-10 12:15:34,012 - INFO: - Building DMatrix...Completed
2024-01-10 12:15:34,014 - INFO: - Training model...
2024-01-10 12:15:34,024 - INFO: - start listen on 10.91.28.16:41155
[12:15:34] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:15:34] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:15:34] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:15:34] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:15:34] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:15:34] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:15:34,101 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:15:45,143 - INFO: - @tracker All nodes finishes job
2024-01-10 12:15:45,243 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9883,)       Predictions: <class 'numpy.ndarray'>  shape: (9883,)
MSE Score : 0.023382
 R2 Score : 0.013657
--------------------------------------------------------------------------------
Training/Test split #1


2024-01-10 12:16:08,449 - INFO: - Building training DMatrix...
2024-01-10 12:17:11,293 - INFO: - Building training DMatrix...Completed
2024-01-10 12:17:11,294 - INFO: - Building test DMatrix...
2024-01-10 12:17:35,360 - INFO: - Building DMatrix...Completed
2024-01-10 12:17:35,368 - INFO: - Training model...
2024-01-10 12:17:35,376 - INFO: - start listen on 10.91.28.16:59367
[12:17:35] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:17:35] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:17:35] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:17:35] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:17:35] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:17:35] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:17:35,452 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:17:46,641 - INFO: - @tracker All nodes finishes job
2024-01-10 12:17:46,731 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9943,)       Predictions: <class 'numpy.ndarray'>  shape: (9943,)
MSE Score : 0.023046
 R2 Score : 0.014626
--------------------------------------------------------------------------------
Training/Test split #2


2024-01-10 12:18:09,942 - INFO: - Building training DMatrix...
2024-01-10 12:19:12,824 - INFO: - Building training DMatrix...Completed
2024-01-10 12:19:12,826 - INFO: - Building test DMatrix...
2024-01-10 12:19:36,134 - INFO: - Building DMatrix...Completed
2024-01-10 12:19:36,136 - INFO: - Training model...
2024-01-10 12:19:36,153 - INFO: - start listen on 10.91.28.16:56945
[12:19:36] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:19:36] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:19:36] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:19:36] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:19:36] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:19:36] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:19:36,231 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:19:47,320 - INFO: - @tracker All nodes finishes job
2024-01-10 12:19:47,411 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10084,)       Predictions: <class 'numpy.ndarray'>  shape: (10084,)
MSE Score : 0.022993
 R2 Score : 0.016230
--------------------------------------------------------------------------------
Training/Test split #3


2024-01-10 12:20:10,463 - INFO: - Building training DMatrix...
2024-01-10 12:21:17,426 - INFO: - Building training DMatrix...Completed
2024-01-10 12:21:17,428 - INFO: - Building test DMatrix...
2024-01-10 12:21:41,458 - INFO: - Building DMatrix...Completed
2024-01-10 12:21:41,461 - INFO: - Training model...
2024-01-10 12:21:41,471 - INFO: - start listen on 10.91.28.16:39045
[12:21:41] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:21:41] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:21:41] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:21:41] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:21:41] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:21:41] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:21:41,555 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:21:52,520 - INFO: - @tracker All nodes finishes job
2024-01-10 12:21:52,647 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9915,)       Predictions: <class 'numpy.ndarray'>  shape: (9915,)
MSE Score : 0.022683
 R2 Score : 0.015659
--------------------------------------------------------------------------------
Training/Test split #4


2024-01-10 12:22:15,564 - INFO: - Building training DMatrix...
2024-01-10 12:23:18,215 - INFO: - Building training DMatrix...Completed
2024-01-10 12:23:18,217 - INFO: - Building test DMatrix...
2024-01-10 12:23:43,589 - INFO: - Building DMatrix...Completed
2024-01-10 12:23:43,590 - INFO: - Training model...
2024-01-10 12:23:43,601 - INFO: - start listen on 10.91.28.16:44461
[12:23:43] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:23:43] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:23:43] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:23:43] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:23:43] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:23:43] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:23:43,678 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:23:54,824 - INFO: - @tracker All nodes finishes job
2024-01-10 12:23:54,955 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10175,)       Predictions: <class 'numpy.ndarray'>  shape: (10175,)
MSE Score : 0.025008
 R2 Score : 0.011602
--------------------------------------------------------------------------------
RSME : 0.02342 +/- 0.00082
--------------------------------------------------------------------------------
R^2  : 0.01435 +/- 0.00163
--------------------------------------------------------------------------------
Training model (trial #9) - Parameters:
  learning_rate=0.3847533718023459
  max_depth=4
  max_leaves=0
[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


2024-01-10 12:24:18,465 - INFO: - Building training DMatrix...
2024-01-10 12:25:23,483 - INFO: - Building training DMatrix...Completed
2024-01-10 12:25:23,484 - INFO: - Building test DMatrix...
2024-01-10 12:25:47,443 - INFO: - Building DMatrix...Completed
2024-01-10 12:25:47,447 - INFO: - Training model...
2024-01-10 12:25:47,457 - INFO: - start listen on 10.91.28.16:57481
[12:25:47] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:25:47] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:25:47] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:25:47] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:25:47] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:25:47] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:25:47,538 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:26:12,018 - INFO: - @tracker All nodes finishes job
2024-01-10 12:26:12,114 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9889,)       Predictions: <class 'numpy.ndarray'>  shape: (9889,)
MSE Score : 0.023646
 R2 Score : 0.022409
--------------------------------------------------------------------------------
Training/Test split #1


2024-01-10 12:26:34,818 - INFO: - Building training DMatrix...
2024-01-10 12:27:38,479 - INFO: - Building training DMatrix...Completed
2024-01-10 12:27:38,483 - INFO: - Building test DMatrix...
2024-01-10 12:28:02,262 - INFO: - Building DMatrix...Completed
2024-01-10 12:28:02,271 - INFO: - Training model...
2024-01-10 12:28:02,279 - INFO: - start listen on 10.91.28.16:60435
[12:28:02] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:28:02] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:28:02] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:28:02] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:28:02] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:28:02] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:28:02,354 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:28:27,202 - INFO: - @tracker All nodes finishes job
2024-01-10 12:28:27,350 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9918,)       Predictions: <class 'numpy.ndarray'>  shape: (9918,)
MSE Score : 0.023422
 R2 Score : 0.026079
--------------------------------------------------------------------------------
Training/Test split #2


2024-01-10 12:28:50,480 - INFO: - Building training DMatrix...
2024-01-10 12:29:52,472 - INFO: - Building training DMatrix...Completed
2024-01-10 12:29:52,497 - INFO: - Building test DMatrix...
2024-01-10 12:30:16,447 - INFO: - Building DMatrix...Completed
2024-01-10 12:30:16,450 - INFO: - Training model...
2024-01-10 12:30:16,461 - INFO: - start listen on 10.91.28.16:53537
[12:30:16] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:30:16] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:30:16] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:30:16] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:30:16] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:30:16] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:30:16,541 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:30:43,555 - INFO: - @tracker All nodes finishes job
2024-01-10 12:30:43,686 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9955,)       Predictions: <class 'numpy.ndarray'>  shape: (9955,)
MSE Score : 0.022085
 R2 Score : 0.023280
--------------------------------------------------------------------------------
Training/Test split #3


2024-01-10 12:31:07,033 - INFO: - Building training DMatrix...
2024-01-10 12:32:14,175 - INFO: - Building training DMatrix...Completed
2024-01-10 12:32:14,177 - INFO: - Building test DMatrix...
2024-01-10 12:32:39,449 - INFO: - Building DMatrix...Completed
2024-01-10 12:32:39,456 - INFO: - Training model...
2024-01-10 12:32:39,461 - INFO: - start listen on 10.91.28.16:35079
[12:32:39] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:32:39] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:32:39] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:32:39] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:32:39] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:32:39] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:32:39,541 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:33:06,582 - INFO: - @tracker All nodes finishes job
2024-01-10 12:33:06,702 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10120,)       Predictions: <class 'numpy.ndarray'>  shape: (10120,)
MSE Score : 0.023833
 R2 Score : 0.019968
--------------------------------------------------------------------------------
Training/Test split #4


2024-01-10 12:33:29,490 - INFO: - Building training DMatrix...
2024-01-10 12:34:34,937 - INFO: - Building training DMatrix...Completed
2024-01-10 12:34:34,941 - INFO: - Building test DMatrix...
2024-01-10 12:34:58,844 - INFO: - Building DMatrix...Completed
2024-01-10 12:34:58,845 - INFO: - Training model...
2024-01-10 12:34:58,859 - INFO: - start listen on 10.91.28.16:35597
[12:34:58] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:34:58] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:34:58] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:34:58] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:34:58] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:34:58] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:34:58,931 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:35:28,580 - INFO: - @tracker All nodes finishes job
2024-01-10 12:35:28,675 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10118,)       Predictions: <class 'numpy.ndarray'>  shape: (10118,)
MSE Score : 0.023070
 R2 Score : 0.025948
--------------------------------------------------------------------------------
RSME : 0.02321 +/- 0.00062
--------------------------------------------------------------------------------
R^2  : 0.02354 +/- 0.00230
--------------------------------------------------------------------------------
Training model (trial #10) - Parameters:
  learning_rate=0.25635898003977226
  max_depth=1
  max_leaves=1
[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


2024-01-10 12:35:52,079 - INFO: - Building training DMatrix...
2024-01-10 12:36:59,254 - INFO: - Building training DMatrix...Completed
2024-01-10 12:36:59,256 - INFO: - Building test DMatrix...
2024-01-10 12:37:23,923 - INFO: - Building DMatrix...Completed
2024-01-10 12:37:23,924 - INFO: - Training model...
2024-01-10 12:37:23,940 - INFO: - start listen on 10.91.28.16:46737
[12:37:24] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:37:24] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:37:24] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:37:24] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:37:24] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:37:24] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:37:24,024 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:37:35,305 - INFO: - @tracker All nodes finishes job
2024-01-10 12:37:35,402 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10194,)       Predictions: <class 'numpy.ndarray'>  shape: (10194,)
MSE Score : 0.023781
 R2 Score : -0.000048
--------------------------------------------------------------------------------
Training/Test split #1


2024-01-10 12:37:59,400 - INFO: - Building training DMatrix...
2024-01-10 12:39:06,353 - INFO: - Building training DMatrix...Completed
2024-01-10 12:39:06,356 - INFO: - Building test DMatrix...
2024-01-10 12:39:31,250 - INFO: - Building DMatrix...Completed
2024-01-10 12:39:31,253 - INFO: - Training model...
2024-01-10 12:39:31,266 - INFO: - start listen on 10.91.28.16:37069
[12:39:31] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:39:31] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:39:31] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:39:31] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:39:31] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:39:31] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:39:31,343 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:39:42,964 - INFO: - @tracker All nodes finishes job
2024-01-10 12:39:43,093 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9756,)       Predictions: <class 'numpy.ndarray'>  shape: (9756,)
MSE Score : 0.024387
 R2 Score : -0.000046
--------------------------------------------------------------------------------
Training/Test split #2


2024-01-10 12:40:07,483 - INFO: - Building training DMatrix...
2024-01-10 12:41:15,007 - INFO: - Building training DMatrix...Completed
2024-01-10 12:41:15,009 - INFO: - Building test DMatrix...
2024-01-10 12:41:40,953 - INFO: - Building DMatrix...Completed
2024-01-10 12:41:40,959 - INFO: - Training model...
2024-01-10 12:41:40,964 - INFO: - start listen on 10.91.28.16:43901
[12:41:41] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:41:41] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:41:41] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:41:41] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:41:41] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:41:41] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:41:41,045 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:41:52,299 - INFO: - @tracker All nodes finishes job
2024-01-10 12:41:52,430 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10010,)       Predictions: <class 'numpy.ndarray'>  shape: (10010,)
MSE Score : 0.022729
 R2 Score : -0.000003
--------------------------------------------------------------------------------
Training/Test split #3


2024-01-10 12:42:15,802 - INFO: - Building training DMatrix...
2024-01-10 12:43:21,331 - INFO: - Building training DMatrix...Completed
2024-01-10 12:43:21,333 - INFO: - Building test DMatrix...
2024-01-10 12:43:45,906 - INFO: - Building DMatrix...Completed
2024-01-10 12:43:45,908 - INFO: - Training model...
2024-01-10 12:43:45,919 - INFO: - start listen on 10.91.28.16:49359
[12:43:45] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:43:45] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:43:45] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:43:45] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:43:45] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:43:46] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:43:46,007 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:43:57,082 - INFO: - @tracker All nodes finishes job
2024-01-10 12:43:57,229 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9962,)       Predictions: <class 'numpy.ndarray'>  shape: (9962,)
MSE Score : 0.023322
 R2 Score : -0.000015
--------------------------------------------------------------------------------
Training/Test split #4


2024-01-10 12:44:21,121 - INFO: - Building training DMatrix...
2024-01-10 12:45:28,990 - INFO: - Building training DMatrix...Completed
2024-01-10 12:45:28,991 - INFO: - Building test DMatrix...
2024-01-10 12:45:53,772 - INFO: - Building DMatrix...Completed
2024-01-10 12:45:53,775 - INFO: - Training model...
2024-01-10 12:45:53,785 - INFO: - start listen on 10.91.28.16:42941
[12:45:53] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:45:53] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:45:53] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:45:53] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:45:53] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:45:53] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:45:53,866 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:46:05,357 - INFO: - @tracker All nodes finishes job
2024-01-10 12:46:05,545 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10078,)       Predictions: <class 'numpy.ndarray'>  shape: (10078,)
MSE Score : 0.024654
 R2 Score : -0.000348
--------------------------------------------------------------------------------
RSME : 0.02377 +/- 0.00070
--------------------------------------------------------------------------------
R^2  : -0.00009 +/- 0.00013
--------------------------------------------------------------------------------


[I 2024-01-10 12:46:26,794] Trial 10 finished with value: 0.02377457357943058 and parameters: {'learning_rate': 0.25635898003977226, 'max_depth': 1, 'max_leaves': 1}. Best is trial 9 with value: 0.02321113310754299.


Training model (trial #11) - Parameters:
  learning_rate=0.6904343985485115
  max_depth=1
  max_leaves=0
[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


2024-01-10 12:46:29,466 - INFO: - Building training DMatrix...
2024-01-10 12:47:34,105 - INFO: - Building training DMatrix...Completed
2024-01-10 12:47:34,107 - INFO: - Building test DMatrix...
2024-01-10 12:47:58,656 - INFO: - Building DMatrix...Completed
2024-01-10 12:47:58,665 - INFO: - Training model...
2024-01-10 12:47:58,671 - INFO: - start listen on 10.91.28.16:40327
[12:47:58] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:47:58] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:47:58] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:47:58] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:47:58] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:47:58] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:47:58,750 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:48:10,167 - INFO: - @tracker All nodes finishes job
2024-01-10 12:48:10,268 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10194,)       Predictions: <class 'numpy.ndarray'>  shape: (10194,)
MSE Score : 0.023409
 R2 Score : 0.015612
--------------------------------------------------------------------------------
Training/Test split #1


2024-01-10 12:48:33,934 - INFO: - Building training DMatrix...
2024-01-10 12:49:46,711 - INFO: - Building training DMatrix...Completed
2024-01-10 12:49:46,713 - INFO: - Building test DMatrix...
2024-01-10 12:50:13,700 - INFO: - Building DMatrix...Completed
2024-01-10 12:50:13,707 - INFO: - Training model...
2024-01-10 12:50:13,715 - INFO: - start listen on 10.91.28.16:47857
[12:50:13] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:50:13] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:50:13] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:50:13] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:50:13] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:50:13] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:50:13,803 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:50:25,264 - INFO: - @tracker All nodes finishes job
2024-01-10 12:50:25,337 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9943,)       Predictions: <class 'numpy.ndarray'>  shape: (9943,)
MSE Score : 0.023556
 R2 Score : 0.015730
--------------------------------------------------------------------------------
Training/Test split #2


2024-01-10 12:50:50,193 - INFO: - Building training DMatrix...
2024-01-10 12:51:58,245 - INFO: - Building training DMatrix...Completed
2024-01-10 12:51:58,246 - INFO: - Building test DMatrix...
2024-01-10 12:52:23,564 - INFO: - Building DMatrix...Completed
2024-01-10 12:52:23,572 - INFO: - Training model...
2024-01-10 12:52:23,577 - INFO: - start listen on 10.91.28.16:54395
[12:52:23] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:52:23] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:52:23] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:52:23] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:52:23] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:52:23] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:52:23,655 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:52:34,412 - INFO: - @tracker All nodes finishes job
2024-01-10 12:52:34,524 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9903,)       Predictions: <class 'numpy.ndarray'>  shape: (9903,)
MSE Score : 0.023668
 R2 Score : 0.017148
--------------------------------------------------------------------------------
Training/Test split #3


2024-01-10 12:52:57,677 - INFO: - Building training DMatrix...
2024-01-10 12:54:03,204 - INFO: - Building training DMatrix...Completed
2024-01-10 12:54:03,206 - INFO: - Building test DMatrix...
2024-01-10 12:54:27,640 - INFO: - Building DMatrix...Completed
2024-01-10 12:54:27,642 - INFO: - Training model...
2024-01-10 12:54:27,653 - INFO: - start listen on 10.91.28.16:46853
[12:54:27] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:54:27] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:54:27] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:54:27] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:54:27] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:54:27] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:54:27,732 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:54:38,965 - INFO: - @tracker All nodes finishes job
2024-01-10 12:54:39,065 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10022,)       Predictions: <class 'numpy.ndarray'>  shape: (10022,)
MSE Score : 0.023723
 R2 Score : 0.014293
--------------------------------------------------------------------------------
Training/Test split #4


2024-01-10 12:55:02,825 - INFO: - Building training DMatrix...
2024-01-10 12:56:09,935 - INFO: - Building training DMatrix...Completed
2024-01-10 12:56:09,939 - INFO: - Building test DMatrix...
2024-01-10 12:56:33,920 - INFO: - Building DMatrix...Completed
2024-01-10 12:56:33,925 - INFO: - Training model...
2024-01-10 12:56:33,931 - INFO: - start listen on 10.91.28.16:36151
[12:56:34] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:56:34] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:56:34] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:56:34] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:56:34] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:56:34] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:56:34,017 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:56:45,627 - INFO: - @tracker All nodes finishes job
2024-01-10 12:56:45,726 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10159,)       Predictions: <class 'numpy.ndarray'>  shape: (10159,)
MSE Score : 0.022713
 R2 Score : 0.018851
--------------------------------------------------------------------------------
RSME : 0.02341 +/- 0.00037
--------------------------------------------------------------------------------
R^2  : 0.01633 +/- 0.00155
--------------------------------------------------------------------------------
Training model (trial #12) - Parameters:
  learning_rate=0.3436517064417405
  max_depth=3
  max_leaves=0
[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


2024-01-10 12:57:09,915 - INFO: - Building training DMatrix...
2024-01-10 12:58:18,356 - INFO: - Building training DMatrix...Completed
2024-01-10 12:58:18,363 - INFO: - Building test DMatrix...
2024-01-10 12:58:43,368 - INFO: - Building DMatrix...Completed
2024-01-10 12:58:43,369 - INFO: - Training model...
2024-01-10 12:58:43,380 - INFO: - start listen on 10.91.28.16:59217
[12:58:43] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[12:58:43] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[12:58:43] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[12:58:43] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[12:58:43] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[12:58:43] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 12:58:43,456 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 12:59:02,107 - INFO: - @tracker All nodes finishes job
2024-01-10 12:59:02,248 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10137,)       Predictions: <class 'numpy.ndarray'>  shape: (10137,)
MSE Score : 0.023724
 R2 Score : 0.024002
--------------------------------------------------------------------------------
Training/Test split #1


2024-01-10 12:59:25,789 - INFO: - Building training DMatrix...
2024-01-10 13:00:31,112 - INFO: - Building training DMatrix...Completed
2024-01-10 13:00:31,114 - INFO: - Building test DMatrix...
2024-01-10 13:00:55,674 - INFO: - Building DMatrix...Completed
2024-01-10 13:00:55,680 - INFO: - Training model...
2024-01-10 13:00:55,685 - INFO: - start listen on 10.91.28.16:52603
[13:00:55] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:00:55] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:00:55] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:00:55] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:00:55] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:00:55] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:00:55,762 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:01:13,720 - INFO: - @tracker All nodes finishes job
2024-01-10 13:01:13,838 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9768,)       Predictions: <class 'numpy.ndarray'>  shape: (9768,)
MSE Score : 0.022491
 R2 Score : 0.026290
--------------------------------------------------------------------------------
Training/Test split #2


2024-01-10 13:01:37,325 - INFO: - Building training DMatrix...
2024-01-10 13:02:43,910 - INFO: - Building training DMatrix...Completed
2024-01-10 13:02:43,914 - INFO: - Building test DMatrix...
2024-01-10 13:03:07,796 - INFO: - Building DMatrix...Completed
2024-01-10 13:03:07,798 - INFO: - Training model...
2024-01-10 13:03:07,812 - INFO: - start listen on 10.91.28.16:37225
[13:03:07] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:03:07] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:03:07] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:03:07] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:03:07] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:03:07] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:03:07,889 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:03:26,072 - INFO: - @tracker All nodes finishes job
2024-01-10 13:03:26,175 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10065,)       Predictions: <class 'numpy.ndarray'>  shape: (10065,)
MSE Score : 0.023163
 R2 Score : 0.028942
--------------------------------------------------------------------------------
Training/Test split #3


2024-01-10 13:03:49,934 - INFO: - Building training DMatrix...
2024-01-10 13:04:59,061 - INFO: - Building training DMatrix...Completed
2024-01-10 13:04:59,069 - INFO: - Building test DMatrix...
2024-01-10 13:05:23,422 - INFO: - Building DMatrix...Completed
2024-01-10 13:05:23,423 - INFO: - Training model...
2024-01-10 13:05:23,445 - INFO: - start listen on 10.91.28.16:55427
[13:05:23] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:05:23] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:05:23] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:05:23] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:05:23] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:05:23] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:05:23,526 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:05:43,174 - INFO: - @tracker All nodes finishes job
2024-01-10 13:05:43,306 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10019,)       Predictions: <class 'numpy.ndarray'>  shape: (10019,)
MSE Score : 0.022503
 R2 Score : 0.021940
--------------------------------------------------------------------------------
Training/Test split #4


2024-01-10 13:06:07,776 - INFO: - Building training DMatrix...
2024-01-10 13:07:12,902 - INFO: - Building training DMatrix...Completed
2024-01-10 13:07:12,903 - INFO: - Building test DMatrix...
2024-01-10 13:07:37,591 - INFO: - Building DMatrix...Completed
2024-01-10 13:07:37,593 - INFO: - Training model...
2024-01-10 13:07:37,603 - INFO: - start listen on 10.91.28.16:48079
[13:07:37] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:07:37] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:07:37] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:07:37] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:07:37] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:07:37] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:07:37,680 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:07:57,806 - INFO: - @tracker All nodes finishes job
2024-01-10 13:07:57,903 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10011,)       Predictions: <class 'numpy.ndarray'>  shape: (10011,)
MSE Score : 0.023797
 R2 Score : 0.030771
--------------------------------------------------------------------------------
RSME : 0.02314 +/- 0.00057
--------------------------------------------------------------------------------
R^2  : 0.02639 +/- 0.00320
--------------------------------------------------------------------------------
Training model (trial #13) - Parameters:
  learning_rate=0.3606332473342941
  max_depth=3
  max_leaves=0
[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


2024-01-10 13:08:22,307 - INFO: - Building training DMatrix...
2024-01-10 13:09:32,271 - INFO: - Building training DMatrix...Completed
2024-01-10 13:09:32,272 - INFO: - Building test DMatrix...
2024-01-10 13:09:56,456 - INFO: - Building DMatrix...Completed
2024-01-10 13:09:56,463 - INFO: - Training model...
2024-01-10 13:09:56,472 - INFO: - start listen on 10.91.28.16:36653
[13:09:56] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:09:56] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:09:56] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:09:56] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:09:56] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:09:56] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:09:56,558 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:10:15,391 - INFO: - @tracker All nodes finishes job
2024-01-10 13:10:15,482 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9961,)       Predictions: <class 'numpy.ndarray'>  shape: (9961,)
MSE Score : 0.022585
 R2 Score : 0.033100
--------------------------------------------------------------------------------
Training/Test split #1


2024-01-10 13:10:38,717 - INFO: - Building training DMatrix...
2024-01-10 13:11:45,562 - INFO: - Building training DMatrix...Completed
2024-01-10 13:11:45,583 - INFO: - Building test DMatrix...
2024-01-10 13:12:09,636 - INFO: - Building DMatrix...Completed
2024-01-10 13:12:09,640 - INFO: - Training model...
2024-01-10 13:12:09,650 - INFO: - start listen on 10.91.28.16:59299
[13:12:09] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:12:09] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:12:09] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:12:09] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:12:09] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:12:09] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:12:09,731 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:12:29,743 - INFO: - @tracker All nodes finishes job
2024-01-10 13:12:29,829 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9960,)       Predictions: <class 'numpy.ndarray'>  shape: (9960,)
MSE Score : 0.022749
 R2 Score : 0.031333
--------------------------------------------------------------------------------
Training/Test split #2


2024-01-10 13:12:53,151 - INFO: - Building training DMatrix...
2024-01-10 13:14:02,129 - INFO: - Building training DMatrix...Completed
2024-01-10 13:14:02,132 - INFO: - Building test DMatrix...
2024-01-10 13:14:28,234 - INFO: - Building DMatrix...Completed
2024-01-10 13:14:28,235 - INFO: - Training model...
2024-01-10 13:14:28,246 - INFO: - start listen on 10.91.28.16:44341
[13:14:28] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:14:28] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:14:28] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:14:28] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:14:28] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:14:28] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:14:28,323 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:14:48,238 - INFO: - @tracker All nodes finishes job
2024-01-10 13:14:48,318 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10121,)       Predictions: <class 'numpy.ndarray'>  shape: (10121,)
MSE Score : 0.022791
 R2 Score : 0.029305
--------------------------------------------------------------------------------
Training/Test split #3


2024-01-10 13:15:11,223 - INFO: - Building training DMatrix...
2024-01-10 13:16:18,384 - INFO: - Building training DMatrix...Completed
2024-01-10 13:16:18,386 - INFO: - Building test DMatrix...
2024-01-10 13:16:43,749 - INFO: - Building DMatrix...Completed
2024-01-10 13:16:43,756 - INFO: - Training model...
2024-01-10 13:16:43,765 - INFO: - start listen on 10.91.28.16:42691
[13:16:43] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:16:43] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:16:43] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:16:43] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:16:43] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:16:43] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:16:43,838 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:17:01,570 - INFO: - @tracker All nodes finishes job
2024-01-10 13:17:01,688 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10044,)       Predictions: <class 'numpy.ndarray'>  shape: (10044,)
MSE Score : 0.023484
 R2 Score : 0.024623
--------------------------------------------------------------------------------
Training/Test split #4


2024-01-10 13:17:24,840 - INFO: - Building training DMatrix...
2024-01-10 13:18:31,454 - INFO: - Building training DMatrix...Completed
2024-01-10 13:18:31,456 - INFO: - Building test DMatrix...
2024-01-10 13:18:56,516 - INFO: - Building DMatrix...Completed
2024-01-10 13:18:56,522 - INFO: - Training model...
2024-01-10 13:18:56,530 - INFO: - start listen on 10.91.28.16:38275
[13:18:56] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:18:56] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:18:56] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:18:56] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:18:56] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:18:56] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:18:56,610 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:19:15,321 - INFO: - @tracker All nodes finishes job
2024-01-10 13:19:15,415 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9914,)       Predictions: <class 'numpy.ndarray'>  shape: (9914,)
MSE Score : 0.023839
 R2 Score : 0.025556
--------------------------------------------------------------------------------
RSME : 0.02309 +/- 0.00049
--------------------------------------------------------------------------------
R^2  : 0.02878 +/- 0.00326
--------------------------------------------------------------------------------
Training model (trial #14) - Parameters:
  learning_rate=0.31351624754624685
  max_depth=3
  max_leaves=0
[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


2024-01-10 13:19:39,757 - INFO: - Building training DMatrix...
2024-01-10 13:20:49,731 - INFO: - Building training DMatrix...Completed
2024-01-10 13:20:49,734 - INFO: - Building test DMatrix...
2024-01-10 13:21:14,117 - INFO: - Building DMatrix...Completed
2024-01-10 13:21:14,126 - INFO: - Training model...
2024-01-10 13:21:14,134 - INFO: - start listen on 10.91.28.16:48421
[13:21:14] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:21:14] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:21:14] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:21:14] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:21:14] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:21:14] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:21:14,206 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:21:32,679 - INFO: - @tracker All nodes finishes job
2024-01-10 13:21:32,770 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9922,)       Predictions: <class 'numpy.ndarray'>  shape: (9922,)
MSE Score : 0.022764
 R2 Score : 0.025618
--------------------------------------------------------------------------------
Training/Test split #1


2024-01-10 13:21:56,704 - INFO: - Building training DMatrix...
2024-01-10 13:23:06,525 - INFO: - Building training DMatrix...Completed
2024-01-10 13:23:06,546 - INFO: - Building test DMatrix...
2024-01-10 13:23:32,681 - INFO: - Building DMatrix...Completed
2024-01-10 13:23:32,688 - INFO: - Training model...
2024-01-10 13:23:32,698 - INFO: - start listen on 10.91.28.16:57473
[13:23:32] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:23:32] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:23:32] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:23:32] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:23:32] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:23:32] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:23:32,771 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:23:50,783 - INFO: - @tracker All nodes finishes job
2024-01-10 13:23:50,873 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9991,)       Predictions: <class 'numpy.ndarray'>  shape: (9991,)
MSE Score : 0.022760
 R2 Score : 0.033806
--------------------------------------------------------------------------------
Training/Test split #2


2024-01-10 13:24:14,510 - INFO: - Building training DMatrix...
2024-01-10 13:25:21,135 - INFO: - Building training DMatrix...Completed
2024-01-10 13:25:21,136 - INFO: - Building test DMatrix...
2024-01-10 13:25:45,901 - INFO: - Building DMatrix...Completed
2024-01-10 13:25:45,902 - INFO: - Training model...
2024-01-10 13:25:45,914 - INFO: - start listen on 10.91.28.16:54499
[13:25:45] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:25:45] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:25:45] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:25:45] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:25:45] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:25:45] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:25:46,000 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:26:03,992 - INFO: - @tracker All nodes finishes job
2024-01-10 13:26:04,096 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9979,)       Predictions: <class 'numpy.ndarray'>  shape: (9979,)
MSE Score : 0.023043
 R2 Score : 0.032916
--------------------------------------------------------------------------------
Training/Test split #3


2024-01-10 13:26:28,222 - INFO: - Building training DMatrix...
2024-01-10 13:27:34,026 - INFO: - Building training DMatrix...Completed
2024-01-10 13:27:34,028 - INFO: - Building test DMatrix...
2024-01-10 13:27:58,455 - INFO: - Building DMatrix...Completed
2024-01-10 13:27:58,461 - INFO: - Training model...
2024-01-10 13:27:58,469 - INFO: - start listen on 10.91.28.16:37947
[13:27:58] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:27:58] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:27:58] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:27:58] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:27:58] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:27:58] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:27:58,552 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:28:19,383 - INFO: - @tracker All nodes finishes job
2024-01-10 13:28:19,494 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9898,)       Predictions: <class 'numpy.ndarray'>  shape: (9898,)
MSE Score : 0.022971
 R2 Score : 0.025895
--------------------------------------------------------------------------------
Training/Test split #4


2024-01-10 13:28:43,385 - INFO: - Building training DMatrix...
2024-01-10 13:29:49,617 - INFO: - Building training DMatrix...Completed
2024-01-10 13:29:49,622 - INFO: - Building test DMatrix...
2024-01-10 13:30:14,948 - INFO: - Building DMatrix...Completed
2024-01-10 13:30:14,949 - INFO: - Training model...
2024-01-10 13:30:14,961 - INFO: - start listen on 10.91.28.16:43457
[13:30:15] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:30:15] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:30:15] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:30:15] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:30:15] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:30:15] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:30:15,045 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:30:34,498 - INFO: - @tracker All nodes finishes job
2024-01-10 13:30:34,662 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10210,)       Predictions: <class 'numpy.ndarray'>  shape: (10210,)
MSE Score : 0.023944
 R2 Score : 0.022386
--------------------------------------------------------------------------------
RSME : 0.02310 +/- 0.00044
--------------------------------------------------------------------------------
R^2  : 0.02812 +/- 0.00446
--------------------------------------------------------------------------------
Training model (trial #15) - Parameters:
  learning_rate=0.3222032820191526
  max_depth=3
  max_leaves=0
[0.2, 0.2, 0.2, 0.2, 0.2] 5
Training/Test split #0


2024-01-10 13:30:59,245 - INFO: - Building training DMatrix...
2024-01-10 13:32:09,340 - INFO: - Building training DMatrix...Completed
2024-01-10 13:32:09,341 - INFO: - Building test DMatrix...
2024-01-10 13:32:34,502 - INFO: - Building DMatrix...Completed
2024-01-10 13:32:34,508 - INFO: - Training model...
2024-01-10 13:32:34,514 - INFO: - start listen on 10.91.28.16:44653
[13:32:34] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:32:34] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:32:34] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:32:34] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:32:34] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:32:34] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:32:34,588 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:32:52,247 - INFO: - @tracker All nodes finishes job
2024-01-10 13:32:52,429 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10038,)       Predictions: <class 'numpy.ndarray'>  shape: (10038,)
MSE Score : 0.022638
 R2 Score : 0.033260
--------------------------------------------------------------------------------
Training/Test split #1


2024-01-10 13:33:15,582 - INFO: - Building training DMatrix...
2024-01-10 13:34:22,267 - INFO: - Building training DMatrix...Completed
2024-01-10 13:34:22,269 - INFO: - Building test DMatrix...
2024-01-10 13:34:47,062 - INFO: - Building DMatrix...Completed
2024-01-10 13:34:47,063 - INFO: - Training model...
2024-01-10 13:34:47,073 - INFO: - start listen on 10.91.28.16:47723
[13:34:47] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:34:47] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:34:47] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:34:47] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:34:47] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:34:47] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:34:47,150 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:35:06,190 - INFO: - @tracker All nodes finishes job
2024-01-10 13:35:06,330 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9924,)       Predictions: <class 'numpy.ndarray'>  shape: (9924,)
MSE Score : 0.023859
 R2 Score : 0.025985
--------------------------------------------------------------------------------
Training/Test split #2


2024-01-10 13:35:29,372 - INFO: - Building training DMatrix...
2024-01-10 13:36:36,181 - INFO: - Building training DMatrix...Completed
2024-01-10 13:36:36,183 - INFO: - Building test DMatrix...
2024-01-10 13:37:01,150 - INFO: - Building DMatrix...Completed
2024-01-10 13:37:01,156 - INFO: - Training model...
2024-01-10 13:37:01,162 - INFO: - start listen on 10.91.28.16:46999
[13:37:01] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:37:01] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:37:01] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:37:01] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:37:01] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:37:01] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:37:01,243 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:37:20,584 - INFO: - @tracker All nodes finishes job
2024-01-10 13:37:20,686 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (9879,)       Predictions: <class 'numpy.ndarray'>  shape: (9879,)
MSE Score : 0.023345
 R2 Score : 0.024229
--------------------------------------------------------------------------------
Training/Test split #3


2024-01-10 13:37:44,602 - INFO: - Building training DMatrix...
2024-01-10 13:38:52,825 - INFO: - Building training DMatrix...Completed
2024-01-10 13:38:52,828 - INFO: - Building test DMatrix...
2024-01-10 13:39:17,741 - INFO: - Building DMatrix...Completed
2024-01-10 13:39:17,747 - INFO: - Training model...
2024-01-10 13:39:17,755 - INFO: - start listen on 10.91.28.16:53801
[13:39:17] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:39:17] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:39:17] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:39:17] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:39:17] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:39:17] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:39:17,822 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:39:37,272 - INFO: - @tracker All nodes finishes job
2024-01-10 13:39:37,362 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10144,)       Predictions: <class 'numpy.ndarray'>  shape: (10144,)
MSE Score : 0.022778
 R2 Score : 0.028149
--------------------------------------------------------------------------------
Training/Test split #4


2024-01-10 13:40:00,934 - INFO: - Building training DMatrix...
2024-01-10 13:41:05,812 - INFO: - Building training DMatrix...Completed
2024-01-10 13:41:05,833 - INFO: - Building test DMatrix...
2024-01-10 13:41:31,000 - INFO: - Building DMatrix...Completed
2024-01-10 13:41:31,002 - INFO: - Training model...
2024-01-10 13:41:31,020 - INFO: - start listen on 10.91.28.16:43659
[13:41:31] task [xgboost.dask-0]:tcp://127.0.0.1:32979 got new rank 0
[13:41:31] task [xgboost.dask-1]:tcp://127.0.0.1:45541 got new rank 1
[13:41:31] task [xgboost.dask-2]:tcp://127.0.0.1:36753 got new rank 2
[13:41:31] task [xgboost.dask-3]:tcp://127.0.0.1:34831 got new rank 3
[13:41:31] task [xgboost.dask-4]:tcp://127.0.0.1:42505 got new rank 4
[13:41:31] task [xgboost.dask-5]:tcp://127.0.0.1:38213 got new rank 5
2024-01-10 13:41:31,093 - INFO: - @tracker All of 6 nodes getting started
2024-01-10 13:41:50,564 - INFO: - @tracker All nodes finishes job
2024-01-10 13:41:50,719 - INFO: - Training model...Completed
20

 y_test_c: <class 'numpy.ndarray'>   Shape:  (10015,)       Predictions: <class 'numpy.ndarray'>  shape: (10015,)
MSE Score : 0.022944
 R2 Score : 0.027651
--------------------------------------------------------------------------------
RSME : 0.02311 +/- 0.00044
--------------------------------------------------------------------------------
R^2  : 0.02785 +/- 0.00303
--------------------------------------------------------------------------------
Total time:  1:46:50.343012


In [27]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")

trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# print("  Number of estimators: {}".format(trial.user_attrs["n_estimators"]))

Number of finished trials:  16
Best trial:
  Value: 0.023089442029595376
  Params: 
    learning_rate: 0.3606332473342941
    max_depth: 3
    max_leaves: 0


In [None]:
# xgb_grid_parameters = {
#     'learning_rate': [0.1, 0.01],
#     'max_depth': [12, 10 ,8], 
#     'max_depth': [15,10,5],
#     'min_child_weight':[5,3,2,1], 
#     'min_child_weight':[2,3], 
#     'gamma':[1,5, 10],  
#     'gamma':[2.5, 3, 3.5, 4],  
#     'subsample':[i/10.0 for i in range(6,11)],
#     'subsample':[0.5],
#     'colsample_bytree':[i/10.0 for i in range(5,11)], 
#     'colsample_bytree':[0.5], 
#     'n_estimators': [1250, 1000, 750, 500, 200]}
# xgb_reg.set_params(**xgb_grid_parameters)

In [None]:
# client.get_versions(check=True)

# XGBoost - TPSA (Regression)

In [None]:
import os
import shutil
import optuna
import sklearn.datasets
import sklearn.metrics
import xgboost as xgb

SEED = 108
N_FOLDS = 3
CV_RESULT_DIR = "./xgboost_cv_results"

In [None]:
"""
Optuna example that optimizes a classifier configuration for cancer dataset using XGBoost.

In this example, we optimize the accuracy of cancer detection using the XGBoost. The accuracy is
estimated by cross-validation. We optimize both the choice of booster model and its
hyperparameters.
"""

def objective(trial):
    # (data, target) = sklearn.datasets.load_breast_cancer(return_X_y=True)
    df_X = dd.read_csv(profilesFile, usecols=X_columns, dtype=x_columns_dtype)
    df_y = dd.read_csv(profilesFile, usecols=y_columns, dtype=y_columns_dtype)
    
    dtrain = xgb.DMatrix(df_X, label=df_y)

    param = {
        "verbosity": 0,
        "objective"  :  "reg:squarederror",
        "eval_metric":  "rmse",
        "booster"    :  "gbtree",   ## trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    xgb_cv_results = xgb.cv(
        params=param,
        dtrain=dtrain,
        num_boost_round=10000,
        nfold=N_FOLDS,
        stratified=True,
        early_stopping_rounds=100,
        seed=SEED,
        verbose_eval=False,
    )

    # Set n_estimators as a trial attribute; Accessible via study.trials_dataframe().
    trial.set_user_attr("n_estimators", len(xgb_cv_results))

    # Save cross-validation results.
    filepath = os.path.join(CV_RESULT_DIR, "{}.csv".format(trial.number))
    xgb_cv_results.to_csv(filepath, index=False)

    # Extract the best score.
    best_score = xgb_cv_results["test-auc-mean"].values[-1]
    return best_score

In [None]:
if __name__ == "__main__":
    if not os.path.exists(CV_RESULT_DIR):
        os.mkdir(CV_RESULT_DIR)

    study = optuna.create_study(direction="maximize")
    
    study.optimize(objective, n_trials=20, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  Number of estimators: {}".format(trial.user_attrs["n_estimators"]))

    shutil.rmtree(CV_RESULT_DIR)

In [None]:
start = datetime.now()
study = optuna.delete_study(storage="sqlite:///example.db",
                            study_name="kevin-study-1")

# XGBoost - Training using XGBoost native interface

**`xgboost.train`** `(params, dtrain, num_boost_round=10, *, `\
`evals=None, obj=None, feval=None, maximize=None, early_stopping_rounds=None, `\
`evals_result=None, verbose_eval=True, xgb_model=None, callbacks=None, custom_metric=None)`

**Parameters** 

**param** `(Dic[str, Any])`  Booster params

**tree_method** string [default= auto] - The tree construction algorithm used in XGBoost. See description in the reference paper and Tree Methods. \
Choices: `auto, exact, approx, hist` - this is a combination of commonly used updaters. For other updaters like refresh, set the parameter updater directly.\
    `auto:` Same as the hist tree method.\
    `exact:` Exact greedy algorithm. Enumerates all split candidates.\
    `approx:` Approximate greedy algorithm using quantile sketch and gradient histogram.\
    `hist:` Faster histogram optimized approximate greedy algorithm.y algorithm.

**Returns:** Booster: a trained booster model

In [None]:
# del output, dtrain

In [None]:
 
dtrain = xgb.dask.DaskDMatrix(client, train_X, train_y)

dval = xgb.dask.DaskDMatrix(client, val_X, val_y)

In [None]:
if __name__ == "__main__":
 
    # X and y must be Dask dataframes or arrays
    # num_obs = 1e5
    # num_features = 20
    # X = da.random.random(size=(num_obs, num_features), chunks=(1000, num_features))
    # y = da.random.random(size=(num_obs, 1), chunks=(1000, 1))
    # dtrain = xgb.dask.DaskDMatrix(client, X, y)
    # or
    # dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y)
    
    early_stopping_rounds=20
    es = xgb.callback.EarlyStopping(rounds=early_stopping_rounds, save_best=True)
    
    output = xgb.dask.train(
        client,
        {"verbosity": 2, "tree_method": "hist", "objective": "reg:squarederror"},
        dtrain,
        num_boost_round=200,
        evals=[(dtrain, "train"), (dval, "val")],
        # xgb_model= output['booster'],
        callbacks = [es],
    )

In [None]:

type(output['booster'])
# output
output['booster'][133]
output['booster'].best_ntree_limit
# output['history']['train']['rmse']
# output['history']['val']['rmse']
# prev_history = output['history']



In [None]:
plt.plot(output['history']['train']['rmse']);
plt.plot(output['history']['val']['rmse']);

In [None]:
import matplotlib.pylab as pylab
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'x-large',
         'ytick.labelsize':'x-large'}
pylab.rcParams.update(params)

In [None]:
fig = plt.figure(figsize=(20, 20))
plt.yticks(fontsize = 12)
ax = fig.add_subplot()
ax.set_xlim(10,50)
ax = xgb.plot_importance(output['booster'], max_num_features= 30, ax = ax)
# for label in ( ax.get_xticklabels() + ax.get_yticklabels()):
#     label.set_fontsize(22)
ax.get_yticklabels()
# ax.autoscale(enable=None, axis="y", tight=True)


In [None]:
output['booster'][133].save_model('./save_20231218_233500_model.json')

In [None]:
config = output['booster'][133].save_config()
type(config)
print(config)

In [None]:
output['booster'][133].save_model('./save_20231218_233500_model.json')

# Training using XGBoost Scikit-Learn Interface

**XGBRegressor**

Implementation of the scikit-learn API for XGBoost regression. See Using the Scikit-Learn Estimator Interface for more information.

- **Gamma:** Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.

- **max_depth:** Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit. 0 indicates no limit on depth. \
  Beware that XGBoost aggressively consumes memory when training a deep tree. exact tree method requires non-zero value.

- **min_child_weight:** Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of \
  instance weight less than min_child_weight, then the building process will give up further partitioning.

- **subsample:** Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. and this will prevent overfitting.\
   Subsampling will occur once in every boosting iteration.
  

- **colsample_bytree:** is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.


- **lambda:** L2 regularization term on weights. Increasing this value will make model more conservative.

In [None]:
from sklearn.metrics import get_scorer_names
# pp.pprint(get_scorer_names())
# for i in get_scorer_names() :
#     if "error" in i:
#         print(i)

In [None]:
early_stop = xgb.callback.EarlyStopping(rounds=10, 
                                        metric_name='rmse', 
                                        # data_name='Validation_0', 
                                        save_best=True, 
                                        maximize = False,
)

In [None]:
early_stop.best_scores
early_stop.metric_name
early_stop.current_rounds
# early_stop.stopping_history
# len(early_stop.best_scores['validation_']['rmse'])

In [None]:
xgb_reg = xgb.XGBRegressor(random_state =123, 
                           n_jobs=None,
                           objective ='reg:squarederror', 
                           eval_metric = "rmse",
                           tree_method='hist', 
                           # early_stopping_rounds= 4,
                           booster = 'gbtree', 
                           # device = "cuda",
                           # gpu_id = 0,
                           # client = client, 
                           verbosity=2,
                           # subsample = 1,
                           # sampling_method="uniform",
                           # callbacks = [early_stop],
                          )
# xgb.XGBRFRegressor 
# xgb.XGBModel

In [None]:

# xgb_reg.set_params(tree_method="hist", device = "cuda")
# xgb_reg.set_params(early_stopping_rounds= 3)
# xgb_reg.set_params(gpu_id = 0)
# xgb.client = client

In [None]:
print('Default Parameters :\n')
pp.pprint(xgb_reg.get_params())

In [None]:
# xgb_pcfp = model_selection(xgb_reg, 
                           # xgb_grid_parameters, train_X, train_y, 
                           # scoring =  'neg_mean_squared_error', cv=5, 
                           # GridSearch = True, 
                           # n_iter=30, 
                           # n_jobs=6, 
                           # verbose= 3)

In [None]:
# xgb_reg.fit(train_X, train_y, verbose = 3, eval_set=[(train_X, train_y), (val_X, val_y)])


In [None]:
# model_train = GridSearchCV(xgb_reg, xgb_grid_parameters, 
#                                # cv=3, 
#                                n_jobs = 1,
#                                scoring = None,
#                                # refit = True,
#                           )
model_train = IncrementalSearchCV(xgb_reg, 
                                  xgb_grid_parameters, 
                                  n_initial_parameters=1,
                                  # cv=3, 
                                  # n_jobs = 1,
                                    patience = 5,
                                    random_state = 1234, 
                               scoring = None,
                               # refit = True,
                          )

In [None]:
with joblib.parallel_backend('dask'):
    # model_train.fit(train_X, train_y, eval_set=[(train_X, train_y),(val_X, val_y)],  verbose = 3 )
    model_train.fit(train_X, train_y, eval_set=[(train_X, train_y),(val_X, val_y)], verbose = 3 )

In [None]:
bst = xgb_reg.get_booster()
history = xgb_reg.evals_result()

In [None]:
# history
xgb_reg.best_iteration
xgb_reg.base_score
xgb_reg.best_ntree_limit
# xgb_reg.best_score
xgb_reg.callbacks
xgb_reg.eval_metric
type(bst)

In [None]:
# len(xgb_reg.feature_importances_[xgb_reg.feature_importances_ < 1e-06])
xgb_reg.feature_importances_ 
xgb_reg.feature_names_in_
xgb_reg.n_features_in_
 

In [None]:
train_score = xgb_reg.score(train_X, train_y)

val_score = xgb_reg.score(val_X, val_y)

test_score = xgb_reg.score(test_X, test_y)

In [None]:
print(f" R2 score - Training   : {train_score:0.6f}")
print(f" R2 score - Validation : {val_score:0.6f}")
print(f" R2 score - Test data  : {test_score:0.6f}")

In [None]:
fig = plt.figure(figsize=(20, 20))
plt.yticks(fontsize = 12)
ax = fig.add_subplot()
ax.set_xlim(10,50)
ax = xgb.plot_importance(xgb_reg, max_num_features= 30, ax = ax)
# for label in ( ax.get_xticklabels() + ax.get_yticklabels()):
#     label.set_fontsize(22)
ax.get_yticklabels()
# ax.autoscale(enable=None, axis="y", tight=True)

## Comments

In [None]:
# idx = my_enumerator()
# fn = model_selectiona
# args = (1,xgb_reg, xgb_grid_parameters, train_X, train_y)

# kwargs = dict({'scoring':'neg_mean_squared_error', 'cv':5, 'GridSearch':True, 'n_iter':30, 'n_jobs':6, 'verbose': 4}) 
 
# start_time = time.perf_counter()
# print(f" {datetime.now().strftime('%X.%f')} | Started ")   
# pool = Pool(processes=2)

# # result = pool.starmap_async(get_pharmacophores, enumerate(df_iterator)) 
# # results = starmap_with_kwargs_async(pool, fn, args, kwargs, processes = 1)
# results = pool.apply_async(fn,args, kwargs)

# print(f" {datetime.now().strftime('%X.%f')} | starmap_with_kwargs_async() | close pool. . .  ") 
# pool.close()

# print(f" {datetime.now().strftime('%X.%f')} | starmap_with_kwargs_async() | Waiting for results. . .  ")    

In [None]:
# %%time

## returns: a: labeled_feature, b: unlabeled_feature, c: labeled_Y, d: df_labeled.index, e: df_unlabled.index
# a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'logLD50_mmolkg', encoder = None)

## Calls 
# xgb_pcfp = model_selection(xgb_reg, 
                           # xgb_grid_parameters, train_X, train_y, 
                           # scoring =  'neg_mean_squared_error', cv=5, 
                           # GridSearch = True, 
                           # n_iter=30, 
                           # n_jobs=6, 
                           # verbose= 3)

In [None]:
#  verbose is 3
# Fitting 5 folds for each of 8 candidates, totalling 40 fits
# Best parameters set found on development set: {'colsample_bytree': 0.5, 'gamma': 2.5, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.5}
# Best score: -0.06345690434364994
# Grid scores on development set:

# -0.06349 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 2.5, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 2, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06346 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 2.5, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06505 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 3, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 2, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06508 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 3, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06638 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 3.5, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 2, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06638 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 3.5, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06788 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 4, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 2, 'n_estimators': 1500, 'subsample': 0.5}
# -0.06788 (+/-0.002) for {'colsample_bytree': 0.5, 'gamma': 4, 'learning_rate': 0.1, 'max_depth': 15, 'min_child_weight': 3, 'n_estimators': 1500, 'subsample': 0.5}
# CPU times: user 6.46 s, sys: 286 ms, total: 6.74 s
# Wall time: 3min 7s
    
#     	model	params	mean score	std score	rank
# 0	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 2.5, 'learn...	-0.063492	0.001178	2
# 1	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 2.5, 'learn...	-0.063457	0.001010	1
# 2	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 3, 'learnin...	-0.065046	0.000987	3
# 3	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 3, 'learnin...	-0.065080	0.000957	4
# 4	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 3.5, 'learn...	-0.066377	0.001204	5
# 5	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 3.5, 'learn...	-0.066377	0.001204	5
# 6	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 4, 'learnin...	-0.067878	0.001242	7
# 7	xgb_pcfp	{'colsample_bytree': 0.5, 'gamma': 4, 'learnin...	-0.067878	0.001242	7

In [None]:
# df_results = result_model_selection(results = xgb_pcfp, name='xgb_pcfp');
# df_results

In [None]:
# df_results.sort_values('rank')

In [None]:
# df_results.loc[1].params - 
# {'colsample_bytree': 0.5,
#  'gamma': 2.5,
#  'learning_rate': 0.1,
#  'max_depth': 15,
#  'min_child_weight': 3,
#  'n_estimators': 1500,
#  'subsample': 0.5} 


# df_results.loc[2].params = 
# {'colsample_bytree': 0.5,
#  'gamma': 2.5,
#  'learning_rate': 0.1,
#  'max_depth': 15,
#  'min_child_weight': 3,
#  'n_estimators': 1500,
#  'subsample': 0.5}

# df_results.loc[12].params

In [None]:
ALL_RESULTS = []
ALL_RESULTS.append(df_results)