 # XGBoost - Classification

**Using Optuna for hyper-parameter search  to predict TPSA from morphology profiles**

# Initialization

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
%load_ext autoreload  
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

## imports 

In [2]:
# Models
import os, sys
import math
import pickle
import itertools
import copy 
import joblib
import logging 
# from multiprocessing import Pool, process
from datetime import datetime, time
from collections import defaultdict
from collections.abc import Iterator

if './src' not in sys.path:
    print(f"insert ./src")
    sys.path.insert(0, './src')
print(sys.path)

import numpy as np
np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan')

import pandas as pd
pd.options.display.width = 170

import scipy.stats as sps 
import matplotlib.pyplot as plt

from pprint import PrettyPrinter
pp = PrettyPrinter(indent=4)

import warnings
warnings.filterwarnings('ignore')

from utils import display_gpu_info, display_gpu_device_info
from utils_cellpainting import *
from utils_ml import model_selection 

# (initialize, init_dataloaders, init_environment, init_wandb, training_initializations, model_initializations, 
#  check_for_resume_training, disp_dataloader_info, disp_training_parms, warmup_phase, weight_policy_training, 
#  display_gpu_info, display_gpu_device_info, init_dataloaders_by_fold_id, print_separator, print_heading, 
#  timestring, print_loss, get_command_line_args, load_from_pickle ) 

insert ./src
['./src', '/home/kevin/WSL-shared/Cellpainting/cj-datasets', '/home/kevin/miniforge3/envs/cp311/lib/python311.zip', '/home/kevin/miniforge3/envs/cp311/lib/python3.11', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/lib-dynload', '', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/site-packages']


In [3]:
import optuna

import dask
import dask.array as da
import dask.dataframe as dd
from dask import delayed
from dask.distributed import Client
from dask.distributed import LocalCluster
from dask_cuda import LocalCUDACluster
import dask_ml.model_selection as dcv

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import sklearn.metrics as skm

# from dask_ml.model_selection import train_test_split
# from dask_ml.model_selection import GridSearchCV, IncrementalSearchCV, HyperbandSearchCV
# from dask_ml.metrics import mean_squared_error, r2_score, mean_squared_log_error

In [4]:
# os.environ["WANDB_NOTEBOOK_NAME"] = "Adashare_Train.ipynb"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [5]:
setup_logging()
logging.info(f" 1/7- engine connected")
# logging.warning(f" 1/7- engine connected")
# logging.error(f" 1/7- engine connected")
# logging.critical(f" 1/7- engine connected")
# print(logging.NOTSET, logging.DEBUG,  logging.INFO, logging.WARN, logging.WARNING, logging.ERROR, logging.CRITICAL,  logging.FATAL)

2024-02-14 12:27:34,895 - INFO: -  1/7- engine connected


In [6]:
print()
for time_fmt in ['%x %X', '%X %x %Z', '%X.%f', '%D-%X.%f', '%Y-%m-%d %H:%M:%S.%f']:
    cmd_string = f"datetime.now().strftime('{time_fmt}')"
    print(f" {cmd_string:50s}  : {datetime.now().strftime(time_fmt)}")


 datetime.now().strftime('%x %X')                    : 02/14/24 12:27:34
 datetime.now().strftime('%X %x %Z')                 : 12:27:34 02/14/24 
 datetime.now().strftime('%X.%f')                    : 12:27:34.991606
 datetime.now().strftime('%D-%X.%f')                 : 02/14/24-12:27:34.991619
 datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')     : 2024-02-14 12:27:34.991629


In [7]:
# display_gpu_info()

## Helper Functions

In [56]:
def train_model(iter_files, metric_keys = None, study_name = "", trial_num = 0, save = False, 
                **model_params):
    metrics_hist = defaultdict(list)
    my_model = None
    
    for i, ((X_train,y_train), (X_val, y_val)) in enumerate(iter_files):
        logging.info(f"CrossValidation split #{i}")
        logging.info(f"Building training & validation DMatrix...")
        
        d_train = xgb.dask.DaskDMatrix( client, X_train, y_train)      
        d_val = xgb.dask.DaskDMatrix(client, X_val, y_val)
        
        logging.info(f"Training model...")
        # booster_model = my_model['booster'] if my_model is not None else None
        my_model = xgb.dask.train(
            None,  
            model_params, 
            d_train,
            # xgb_model = booster_model,
            num_boost_round=1000,
            evals=[(d_train, "train"), (d_val, "test")],
            verbose_eval=False,
            early_stopping_rounds = 50
        )
        # logging.info(f"Training model...Completed")
        
        logging.info(f"Running model on test data...")
        y_pred = xgb.dask.predict(client, my_model, d_val).compute()      
        y_true = y_val.compute().to_numpy().squeeze()

        # Compute predictions and mean squared error for this iteration
        logging.info(f"Calculating metrics...")
        iter_metrics = compute_classification_metrics(my_model, y_true, y_pred)
        logging.info(f"Calculating metrics...Completed")
        
        print("-" * 80)            
        for key in metric_keys:
            metrics_hist[key].append(iter_metrics[key])
            print(f" {key:20s}: {iter_metrics[key]:9.5f}")
        print("-" * 80)            
        
        del d_train, d_val, X_train, y_train, X_val, y_val, y_true, y_pred
        if save:
            # save_as_filename = ".\saved_models\{1}_trial_{0:03d}.json".format(study.study_name,trial.number)
            save_as_filename = "./saved_models/{0}_trial_{1:03d}.json".format(study_name,trial_num)
            print(f" Save model to : {save_as_filename}")
            model['booster'].save_model(save_as_filename)
    ## End of loop  
    return my_model, metrics_hist

def objective(trial, disp_params = True, save = True):
    metric_keys = ["train_auc","train_logloss", "val_auc", "val_logloss", "roc_auc", "logloss",
                    "accuracy","bal_acc","top_k_acc","F1_score","map","pearson_corr"]    
    
    study_params = propose_parameters(trial)   
    study_params = {"booster"      : "gbtree",
                    "device"       : "gpu",
                    "objective"    :  "binary:logistic",
                    "eval_metric"  :  ["auc", "logloss"],
                    "verbosity"    : 0, 
                    "disable_default_eval_metric" : False,
                    ** study_params}
    print('-'*80)
    print(f" Training model (trial #{trial.number}) ")
    print('-'*80)

    if disp_params:
        print(f" Parameters:")
        for k, v in study_params.items():
            print(f"  {k:30s} {v}")
            
    iter_files = make_cv_splits_2(input_file_list, n_folds=5, y_columns=y_columns)
    model, metrics =  train_model(iter_files, metric_keys = metric_keys, 
                                  study_name = study.study_name,
                                  trial_num = trial.number,
                                  save = True,
                                  ** study_params)

    print_metric_hist(metrics)
    print(f" model best score    :  {model['booster'].best_score}")
    print(f" model best iteration:  {model['booster'].best_iteration}")
    

    r1 = np.array(metrics['val_auc']).mean()
    r2 = np.array(metrics['val_logloss']).mean()
    del model, metrics 
    return r1,r2 

## Datasets

In [9]:
prefix = '' ### Target-2' , 'MOA'
input_path ="./metadata/"
output_path ="./output_11102023"
prefix_lc = prefix.lower().replace('-', '_')
CSV = '.csv'
x = 999

CompoundExtendedMetadata2SampleFile = f"{output_path}{prefix_lc}/compound_extended_metadata_2samples.csv"
CompoundProfiles2SampleFile         = f"{output_path}{prefix_lc}/profiles/compound_profiles_2samples"
CompoundExtendedMetadataSampleFile  = f"{output_path}{prefix_lc}/compound_extended_metadata_samples.csv"
profileInputFiles                   = "./output_11102023/profiles/compound_profiles_2samples_{0:03d}.csv"
binnedProfileFiles                  = "./output_11102023/binned_profiles/binned_2sample_profiles_{0:03d}.csv"
trainingMetrics                     = f"{output_path}{prefix_lc}/training_metrics.pkl"
profileMetadataFile                 = f"{input_path}profile_metadata.pkl"

print()
print(f" Compound Extended Metadata 2 SampleFile  : {CompoundExtendedMetadata2SampleFile }")
print(f" Compound Profiles 2 Samples File CSV     : {CompoundProfiles2SampleFile}")
print(f" ")
print(f" profiles Metadata File                   : {profileMetadataFile}")
print(f" ")
print(f" profileInputFiles                        : {profileInputFiles}")
print(f" binnedProfileFiles                       : {binnedProfileFiles}")
print(f" training metrics                         : {trainingMetrics}")


 Compound Extended Metadata 2 SampleFile  : ./output_11102023/compound_extended_metadata_2samples.csv
 Compound Profiles 2 Samples File CSV     : ./output_11102023/profiles/compound_profiles_2samples
 
 profiles Metadata File                   : ./metadata/profile_metadata.pkl
 
 profileInputFiles                        : ./output_11102023/profiles/compound_profiles_2samples_{0:03d}.csv
 binnedProfileFiles                       : ./output_11102023/binned_profiles/binned_2sample_profiles_{0:03d}.csv
 training metrics                         : ./output_11102023/training_metrics.pkl


## Read Features metadata file

In [10]:
print(f" Features select file:  {profileMetadataFile}")

with open(profileMetadataFile, 'rb') as f: 
    pickle_data = pickle.load(f)

COMPOUND_PROFILE_COLUMNS = pickle_data['all_profile_columns']

metadata_categorical_columns = set(pickle_data['metadata_columns']['MetadataCategoricalColumns'])
metadata_categorical_columns_dtype = pickle_data['metadata_columns']['MetadataCategoricalColumns']

metadata_float_columns =  set(pickle_data['metadata_columns']['MetadataFloatColumns'])
metadata_float_columns_dtype =  pickle_data['metadata_columns']['MetadataFloatColumns']

metadata_integer_columns =  set(pickle_data['metadata_columns']['MetadataIntegerColumns'])
metadata_integer_columns_dtype =  pickle_data['metadata_columns']['MetadataIntegerColumns']

metadata_numeric_columns =  set(pickle_data['metadata_columns']['MetadataNumericColumns'])
metadata_numeric_columns_dtype =  pickle_data['metadata_columns']['MetadataNumericColumns']


X_columns = pickle_data['selected_columns']['selected']
X_columns_dtype = {x: np.float32 for x in X_columns}

y_columns = set(["Metadata_Permiation"])
y_columns_dtype = {x: np.int64 for x in y_columns} ## "Metadata_log10TPSA":np.dtype('float64')}

Xy_columns = list(y_columns | X_columns)
Xy_columns_dtype = y_columns_dtype | X_columns_dtype

 Features select file:  ./metadata/profile_metadata.pkl


### Display metadata info and constants used 

In [17]:
for k in pickle_data['metadata_columns'].keys():
    print("-"*80)
    print(f" {k}  - length({len(pickle_data['metadata_columns'][k])} )")
    print("-"*80)
    if isinstance(pickle_data['metadata_columns'][k], list):
        for v in pickle_data['metadata_columns'][k]:
            print(f" \t : list item : {v}")

    elif isinstance(pickle_data['metadata_columns'][k], dict):    
        for i,v in pickle_data['metadata_columns'][k].items():
            print(f" \t : key :  {i:25s}     item: {v}")
    print()

print("\n all_profile_columns")
print("-"*80)
print(f" Length            : {len(COMPOUND_PROFILE_COLUMNS)}")
print(f" profile cols [:5] : {COMPOUND_PROFILE_COLUMNS[:5]}")
print(f" profile cols [:10]: {COMPOUND_PROFILE_COLUMNS[5:10]}")
print(f" profile cols [:14]: {COMPOUND_PROFILE_COLUMNS[10:14]}")
print(f" profile cols [:18]: {COMPOUND_PROFILE_COLUMNS[14:18]}")

print()
print("\n metadata_columns")
print("-"*80)
for i in pickle_data['metadata_columns'].keys():
    print(f" {i:28s} ({len(pickle_data['metadata_columns'][i]):4d}) {list(pickle_data['metadata_columns'][i])[:5]}")
 
print("\n selected_columns")
print("-"*80)
ttl = 0 
for i in pickle_data['selected_columns'].keys():
    ttl += len(pickle_data['selected_columns'][i])
    print(f" Feature columns -/{i:25s}/   {len(pickle_data['selected_columns'][i]):5d}")
print(f" {'total':47s}{ttl:5d}     ")     
 
print('\n')
print(f" metadata_categorical_columms  ({len(metadata_categorical_columns):4d}) {metadata_categorical_columns} ")
print(f" metadata_float_columms        ({  len(metadata_float_columns):4d}) {metadata_float_columns}   ")
print(f" metadata_integer_columms      ({len(metadata_integer_columns):4d}) {metadata_integer_columns} ")
print(f" metadata_numeric_columms      ({len(metadata_numeric_columns):4d}) {metadata_numeric_columns} ")
print(f" COMPOUND_PROFILE_COLUMNS      ({len(COMPOUND_PROFILE_COLUMNS):4d}) {COMPOUND_PROFILE_COLUMNS[:7]}")
print()
print(f" len(X_columms)                ({len(X_columns):4d})")
print(f" len(y_columms)                ({len(y_columns):4d}) {y_columns}")
print(f" len(Xy_columms)               ({len(Xy_columns)}) {list(Xy_columns)[:3]}")
print(f" len(Xy_columms_dtype)         ({len(Xy_columns)}) {list(Xy_columns_dtype)[:3]}")

print("\n Conversion Dictionaries")
print("-"*80)
print(f" categorical_columns_dtype     ({len(metadata_categorical_columns_dtype):4d}) {metadata_categorical_columns_dtype} ")
print(f" float_columns_dtype           ({len(metadata_float_columns_dtype):4d}) {metadata_float_columns_dtype} ")
print(f" integer_columns_dtype         ({len(metadata_integer_columns_dtype):4d}) {metadata_integer_columns_dtype} ")
print('\n')
# print(f" len(label_columms_dtype)     : {len(label_columns_dtype)} ")
# print(f" len(X_columms_dtype)         : {len(X_columns_dtype)}")
# print(f" len(X_columms_dtype)           : {len(X_columns_dtype)}   {X_columns_dtype}")
# print(f" len(y_columms_dtype)         : {len(y_columns_dtype)}")
# print(f" len(Xy_columms_dtype)        : {len(Xy_columns_dtype)}")
# print(f" len(all_columms_dtype)       : {len(all_columns_dtype)} ")
# del all_columns_dtype
# for k  in sorted(all_columns_dtype.keys()):
#     print(f" {k:60s}  {all_columns_dtype[k]}")
# Xy_columns_dtype

--------------------------------------------------------------------------------
 MetadataOriginalNames  - length(10 )
--------------------------------------------------------------------------------
 	 : list item : Metadata_Source
 	 : list item : Metadata_Batch
 	 : list item : Metadata_Plate
 	 : list item : Metadata_Well
 	 : list item : Metadata_JCP2022
 	 : list item : Metadata_Hash
 	 : list item : TPSA
 	 : list item : lnTPSA
 	 : list item : log10TPSA
 	 : list item : permiation

--------------------------------------------------------------------------------
 MetadataNames  - length(11 )
--------------------------------------------------------------------------------
 	 : list item : Metadata_Source
 	 : list item : Metadata_Batch
 	 : list item : Metadata_Plate
 	 : list item : Metadata_Well
 	 : list item : Metadata_JCP2022
 	 : list item : Metadata_Hash
 	 : list item : Metadata_Bin
 	 : list item : Metadata_TPSA
 	 : list item : Metadata_lnTPSA
 	 : list item : Metadata_

In [None]:
# profile_file= "./output_11102023/profiles/compound_profiles_2samples_002.csv"
# df_profiles = pd.read_csv(profile_file_001, header=0, names = all_columns, usecols=Xy_columns, dtype= Xy_columns_dtype, nrows =5)   
# df_profiles = pd.read_csv(profile_file, header=0, names = profileColumnNames, usecols=Xy_columns, dtype= Xy_columns_dtype, nrows =5)   

In [None]:
# all_columns == col_names
# len(all_columns), len(col_names)
# for i, (c1,c2 ) in enumerate(zip(all_columns, col_names)):
#     if  not(c1 == c2):
#         print(f" item {i}   {c1:50s}   {c2:50s}")

In [None]:
# len(col_names_1), len(col_names_2)
# for i, (c1,c2 ) in enumerate(zip(col_names_1, col_names_2)):
#     if  not(c1 == c2):
#         print(f" item {i}   {c1:50s}   {c2:50s}")

## Create dask cluster and client 

In [49]:
!nvidia-smi

Wed Feb 14 19:02:15 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN Xp     Off  | 00000000:09:00.0 Off |                  N/A |
| 38%   62C    P2    95W / 250W |   3478MiB / 12196MiB |     44%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Quadro GV100        Off  | 00000000:83:00.0 Off |                  Off |
| 45%   59C    P2    42W / 250W |   1524MiB / 32508MiB |      0%      Default |
|       

In [34]:
try:
    cluster.close()
    del cluster
except Exception as e:
    print("Cluster close failed")
else:
    print("Cluster close succeeded")



Cluster close succeeded


In [35]:
n_workers = 3
n_threads = 2
cuda_visible_devices = [0,1,2]
cluster = LocalCUDACluster( CUDA_VISIBLE_DEVICES=[0,1,2], n_workers=n_workers, threads_per_worker= n_threads, memory_limit='6GB', silence_logs = logging.WARNING)
# cluster = LocalCUDACluster( CUDA_VISIBLE_DEVICES=cuda_visible_devices, n_workers=n_workers, threads_per_worker= n_threads, silence_logs = logging.WARNING)
# cluster = LocalCluster("Kevins_Cluster", n_workers=2, threads_per_worker=2)
print(f" Cluster dashboard link : {cluster.dashboard_link}")

 self.scheduler_spec: {'cls': <class 'distributed.scheduler.Scheduler'>, 'options': {'host': '127.0.0.1', 'services': {}, 'service_kwargs': None, 'security': Security(require_encryption=False, tls_min_version=771), 'port': 0, 'interface': None, 'protocol': 'tcp://', 'dashboard': True, 'dashboard_address': ':8787', 'blocked_handlers': None}}




 Cluster dashboard link : http://127.0.0.1:8787/status


In [52]:
# print(cluster)
# print(f" Cluster dashboard link : {cluster.dashboard_link}")
cluster.workers[2].restart
cluster.start_worker
# cluster.worker_spec
# cluster.scale(3)

<bound method Nanny.restart of <Nanny: tcp://127.0.0.1:38937, threads: 2>>

<bound method LocalCluster.start_worker of LocalCUDACluster(3feb3d7d, 'tcp://127.0.0.1:41689', workers=3, threads=6, memory=16.76 GiB)>

In [40]:
try:
    client.close()
    del client
except Exception as e:
    print("Client close failed")
else:
    print("Client close succeeded")
finally:
    print(" Client finally")

Client close succeeded
 Client finally


In [54]:
    client = Client(cluster)    
    print(f"Client created - {client.status}")

Client created - running


In [55]:
client
    
# client = Client(cluster.scheduler_address)
# client = Client(processes = False)
# client = Client("tcp://127.0.0.1:37937")

0,1
Connection method: Cluster object,Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 3
Total threads: 6,Total memory: 16.76 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:41689,Workers: 3
Dashboard: http://127.0.0.1:8787/status,Total threads: 6
Started: 35 minutes ago,Total memory: 16.76 GiB

0,1
Comm: tcp://127.0.0.1:43953,Total threads: 2
Dashboard: http://127.0.0.1:44811/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:33755,
Local directory: /tmp/dask-scratch-space/worker-oqvzzy28,Local directory: /tmp/dask-scratch-space/worker-oqvzzy28

0,1
Comm: tcp://127.0.0.1:35161,Total threads: 2
Dashboard: http://127.0.0.1:40049/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:41777,
Local directory: /tmp/dask-scratch-space/worker-oean24jn,Local directory: /tmp/dask-scratch-space/worker-oean24jn

0,1
Comm: tcp://127.0.0.1:38937,Total threads: 2
Dashboard: http://127.0.0.1:41605/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:34195,
Local directory: /tmp/dask-scratch-space/worker-7kx5ei1e,Local directory: /tmp/dask-scratch-space/worker-7kx5ei1e


# Use Optuna for hyper-parameter search

### Setup ML env

In [18]:
xgb.__version__
xgb.config_context(device = "gpu", booster = "gbtree", verbosity = 0, validate_parameters = True ) 

'2.0.3'

<contextlib._GeneratorContextManager at 0x7f1279f73ed0>

In [26]:
# df_profiles = read_cell_profiles(input_files[i], names = COMPOUND_PROFILE_COLUMNS, usecols = Xy_columns, dtype = Xy_columns_dtype)
# df_profiles = read_cell_profiles(training_files, names = COMPOUND_PROFILE_COLUMNS, usecols = Xy_columns, dtype = Xy_columns_dtype)
# file_idxs = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}

TRAINING_FILES = {0, 1, 2, 4, 5, 6, 7, 8, 9, 11}
TEST_FILES = {3, 10}
input_file_list = read_binned_profile_files(file_idxs = TRAINING_FILES, 
                                            filename = binnedProfileFiles,
                                            names = COMPOUND_PROFILE_COLUMNS, 
                                            usecols = Xy_columns, 
                                            dtype = Xy_columns_dtype)

2024-02-14 12:36:55,432 - INFO: -  Read profiles file ...


 Reading cell profiles file :  ./output_11102023/binned_profiles/binned_2sample_profiles_000.csv
 Reading ALL  rows - Number of partitions:  21   

 Reading cell profiles file :  ./output_11102023/binned_profiles/binned_2sample_profiles_001.csv
 Reading ALL  rows - Number of partitions:  20   

 Reading cell profiles file :  ./output_11102023/binned_profiles/binned_2sample_profiles_002.csv
 Reading ALL  rows - Number of partitions:  21   

 Reading cell profiles file :  ./output_11102023/binned_profiles/binned_2sample_profiles_004.csv
 Reading ALL  rows - Number of partitions:  21   

 Reading cell profiles file :  ./output_11102023/binned_profiles/binned_2sample_profiles_005.csv
 Reading ALL  rows - Number of partitions:  21   

 Reading cell profiles file :  ./output_11102023/binned_profiles/binned_2sample_profiles_006.csv
 Reading ALL  rows - Number of partitions:  21   

 Reading cell profiles file :  ./output_11102023/binned_profiles/binned_2sample_profiles_007.csv
 Reading ALL  r

2024-02-14 12:36:57,601 - INFO: -  Read profiles file ... complete


 Reading ALL  rows - Number of partitions:  21   



### Load study

  #### Address for optuna dashboard repository:  `sqlite:////home/kevin/WSL-shared/Cellpainting/cj-datasets/optuna_data.db`
  #### Address for optuna dashboard repository:  `sqlite:////home/kevin/WSL-shared/Cellpainting/cj-datasets/example.db`

In [27]:
# storage = "sqlite:///example.db"
storage_copy = "sqlite:///example_copy.db"
study_name_clone="classification-study-1-clone"
# study_name="classification-study-1"

## https://optuna.readthedocs.io/en/stable/reference/generated/optuna.storages.RetryFailedTrialCallback.html

from optuna.storages import RetryFailedTrialCallback

storage_instance = optuna.storages.RDBStorage(
    url=storage_copy,
    heartbeat_interval=60,
    grace_period=120,
    failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
)
storage_instance


<optuna.storages._rdb.storage.RDBStorage at 0x7f124ac8a350>

In [43]:
# resp = input(f" Delete study {study_name} ?")
# if resp.lower() in ['yes' ,'y']:
#     print(' You responsed yes')
#     try:
#         optuna.delete_study(storage=storage_instance, study_name=study_name)
#         print("delete successful")
#     except Exception as e:
#         print("delete failed")
# else:
#     print(f" {study_name} will be kept")

# study = optuna.create_study(storage=storage_instance_copy,
#                             study_name=study_name_copy,
#                             directions=["maximize","minimize"], 
#                             load_if_exists=True)
# study.set_metric_names(["roc_auc", "logloss"])

In [28]:
start = datetime.now()
study  = optuna.load_study(study_name= study_name_clone, storage=storage_instance)
print(f"Total time:  {datetime.now() - start}")

Total time:  0:00:00.010578


In [58]:
disp_study_history(study)

 classification-study-1-clone  study history

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-11   15:31:07 - 16:38:29    1      0.62593        0.35270    
Trial #: 1    2024-02-11   21:50:47 - 22:14:47    1      0.60966        0.35607    
Trial #: 2    2024-02-11   23:21:11 - 00:01:20    1      0.61561        0.36063    
Trial #: 3    2024-02-12   00:01:21 - 00:39:17    1      0.60933        0.35866    
Trial #: 4    2024-02-12   00:39:18 - 00:56:58    1      0.59683        0.36285    
Trial #: 5    2024-02-12   01:55:54 - 02:12:44    1      0.55990        0.56057    
Trial #: 6    2024-02-12   02:12:45 - 02:29:49    1      0.59509        0.36394    
Trial #: 7    2024-02-12   02:29:50 - 03:59:12    1      0.61147        0.37472    
Trial #: 8    2024-02-12   03:59:13 - 04:4

### Run studies

In [57]:
start = datetime.now()

study.optimize(objective, n_trials=1, timeout=3600*7, gc_after_trial=True, show_progress_bar=False)

print(f"Total time:  {datetime.now() - start}")

--------------------------------------------------------------------------------
 Training model (trial #48) 
--------------------------------------------------------------------------------
 Parameters:
  booster                        gbtree
  device                         gpu
  objective                      binary:logistic
  eval_metric                    ['auc', 'logloss']
  verbosity                      0
  disable_default_eval_metric    False
  tree_method                    auto
  learning_rate                  8.83247837937422e-06
  gamma                          5.960817626998998
  max_depth                      13
  min_child_weight               7.9484145607586045
  max_delta_step                 8.715884892658252
  subsample                      0.817346268641036
  sampling_method                uniform
  colsample_bytree               1.0
  colsample_bylevel              0.8740197655399355
  colsample_bynode               0.7375294776118874
  lambda                     

2024-02-14 19:10:44,910 - INFO: - CrossValidation split #0
2024-02-14 19:10:44,912 - INFO: - Building training & validation DMatrix...
2024-02-14 19:12:17,172 - INFO: - Training model...
2024-02-14 19:12:17,181 - INFO: - start listen on 10.91.28.16:34493
[19:12:17] task [xgboost.dask-0]:tcp://127.0.0.1:43953 got new rank 0
[19:12:17] task [xgboost.dask-1]:tcp://127.0.0.1:35161 got new rank 1
[19:12:17] task [xgboost.dask-2]:tcp://127.0.0.1:38937 got new rank 2
2024-02-14 19:12:17,222 - INFO: - @tracker All of 3 nodes getting started
2024-02-14 19:25:52,893 - INFO: - @tracker All nodes finishes job
2024-02-14 19:25:53,443 - INFO: - Running model on test data...
2024-02-14 19:26:13,064 - INFO: - Calculating metrics...
2024-02-14 19:26:13,124 - INFO: - Calculating metrics...Completed
[W 2024-02-14 19:26:13,207] Trial 48 failed with parameters: {'learning_rate': 8.83247837937422e-06, 'min_split_loss': 5.960817626998998, 'max_depth': 13, 'min_child_weight': 7.9484145607586045, 'max_delta_st

--------------------------------------------------------------------------------
 train_auc           :   0.86940
 train_logloss       :   0.37798
 val_auc             :   0.61136
 val_logloss         :   0.38035
 roc_auc             :   0.61302
 logloss             :   0.38035
 accuracy            :   0.87953
 bal_acc             :   0.00000
 top_k_acc           :   1.00000
 F1_score            :   0.00000
 map                 :   0.19587
 pearson_corr        :   0.16075
--------------------------------------------------------------------------------
 Save model to : .\saved_models\classification-study-1-clone_trial_048.json


NameError: name 'model' is not defined

### Display study results

In [251]:
# print(" Number of finished trials: ", len(study.trials))
# print(" Best trials: ", [x.number for x in study.best_trials])

In [252]:
disp_study_history(study)

 classification-study-1  study history

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-04   00:48:09 - 00:54:05    3          *** FAILED ***        
Trial #: 1    2024-02-04   01:15:51 - 01:40:09    1      0.92354        0.20999    
Trial #: 2    2024-02-04   10:36:45 - 12:28:29    1      0.82900        0.34993    
Trial #: 3    2024-02-04   14:50:22 - 15:11:49    1      0.77843        0.32556    
Trial #: 4    2024-02-04   15:11:50 - 16:04:43    1      0.71748        0.36995    
Trial #: 5    2024-02-04   16:04:44 - 16:25:30    1      0.84788        0.29725    
Trial #: 6    2024-02-04   16:25:30 - 16:55:59    1      0.62604        0.36119    
Trial #: 7    2024-02-04   18:06:45 - 18:29:19    3          *** FAILED ***        
Trial #: 8    2024-02-04   23:09:09 - 23:29:46  

In [177]:
# study.best_trials[0].values
# study.best_trials[0].params
# study.best_trials[0].intermediate_values
# study.best_trials[0].system_attrs
# study.best_trials[0].user_attrs
# study.best_trials[0].distributions
# study.best_trials[0].datetime_complete
# study_names = optuna.study.get_all_study_names(storage="sqlite:///example.db")
# study_summaries = optuna.study.get_all_study_summaries(storage=storage_copy)

In [224]:

study.trials_dataframe()

Unnamed: 0,number,values_logloss,values_roc_auc,datetime_start,datetime_complete,duration,params_colsample_bylevel,params_colsample_bynode,params_learning_rate,params_max_delta_step,params_max_depth,params_min_child_weight,params_min_split_loss,params_subsample,user_attrs_memo,system_attrs_fixed_params,system_attrs_nsga2:generation,state
0,0,,,2024-02-04 00:48:09.793433,2024-02-04 00:54:05.488762,0 days 00:05:55.695329,0.585618,0.582087,0.143276,0.074656,11.0,9.683349,0.013665,0.986915,,,0.0,FAIL
1,1,0.209986,0.923542,2024-02-04 01:15:51.892559,2024-02-04 01:40:09.121813,0 days 00:24:17.229254,0.947399,0.659437,0.034551,1.081959,13.0,4.008506,4.749165,0.719159,,,0.0,COMPLETE
2,2,0.349931,0.828998,2024-02-04 10:36:45.174090,2024-02-04 12:28:29.964490,0 days 01:51:44.790400,0.615872,0.718371,0.000242,8.546141,14.0,7.313969,4.537306,0.769055,,,0.0,COMPLETE
3,3,0.32556,0.778433,2024-02-04 14:50:22.133099,2024-02-04 15:11:49.830438,0 days 00:21:27.697339,0.708804,0.868822,0.037708,6.3319,4.0,8.058972,0.101454,0.690761,,,0.0,COMPLETE
4,4,0.369949,0.717477,2024-02-04 15:11:50.222464,2024-02-04 16:04:43.940043,0 days 00:52:53.717579,0.589903,0.625624,0.000115,2.824893,11.0,5.67236,7.394496,0.61697,,,0.0,COMPLETE
5,5,0.297251,0.847876,2024-02-04 16:04:44.416577,2024-02-04 16:25:30.425923,0 days 00:20:46.009346,0.560386,0.811272,0.134297,1.754813,8.0,6.75223,9.105621,0.95664,,,0.0,COMPLETE
6,6,0.361188,0.626037,2024-02-04 16:25:30.920333,2024-02-04 16:55:59.415587,0 days 00:30:28.495254,0.846711,0.615957,0.001634,0.168275,5.0,6.905918,0.882063,0.411585,,,0.0,COMPLETE
7,7,,,2024-02-04 18:06:45.959407,2024-02-04 18:29:19.630616,0 days 00:22:33.671209,0.933313,0.607458,0.00973,5.846445,13.0,7.403698,7.19797,0.748567,,,0.0,FAIL
8,8,0.227714,0.879901,2024-02-04 23:09:09.154656,2024-02-04 23:29:46.918035,0 days 00:20:37.763379,0.576115,0.582256,0.205679,2.778248,8.0,4.007208,0.047983,0.651714,,,0.0,COMPLETE
9,9,0.290389,0.81107,2024-02-04 23:29:47.321242,2024-02-04 23:50:19.758571,0 days 00:20:32.437329,0.725259,0.989566,0.394348,2.710428,6.0,3.515444,9.489928,0.822929,,,0.0,COMPLETE


### Rerun a previous trial

In [231]:
# study.trials[1].params
# study.trials[2].params

In [249]:
# for i in [3,4,5,6]:
# for i in [11]:
#     study.enqueue_trial(study.trials[i].params, user_attrs={"memo": f"rerun of trial {i}"})
 

In [250]:
start = datetime.now()
study.optimize(objective, n_trials=4, timeout=3600*4, gc_after_trial=True, show_progress_bar=False)
print(f"Total time:  {datetime.now() - start}")

--------------------------------------------------------------------------------
 Training model (trial #43) 
--------------------------------------------------------------------------------
 Parameters:
  booster                        gbtree
  device                         gpu
  objective                      binary:logistic
  eval_metric                    ['auc', 'logloss']
  verbosity                      0
  disable_default_eval_metric    False
  tree_method                    auto
  learning_rate                  0.8586535510091051
  gamma                          3.6051285358958864
  max_depth                      10
  min_child_weight               9.583542252493253
  max_delta_step                 2.117239838558076
  subsample                      0.8834610960429072
  sampling_method                uniform
  colsample_bytree               1.0
  colsample_bylevel              0.6119722382203123
  colsample_bynode               0.8598034905841013
  lambda                      

2024-02-13 04:23:05,789 - INFO: - CrossValidation split #0
2024-02-13 04:23:05,790 - INFO: - Building training & validation DMatrix...
2024-02-13 04:24:32,217 - INFO: - Training model...
2024-02-13 04:24:32,225 - INFO: - start listen on 10.91.28.16:58805
[04:24:32] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:24:32] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:24:32] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:24:32,637 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:24:55,979 - INFO: - @tracker All nodes finishes job
2024-02-13 04:24:56,171 - INFO: - Running model on test data...
2024-02-13 04:25:14,455 - INFO: - Calculating metrics...
2024-02-13 04:25:14,512 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.99860
 train_logloss       :   0.07333
 val_auc             :   0.56001
 val_logloss         :   0.57511
 roc_auc             :   0.55996
 logloss             :   0.57511
 accuracy            :   0.84386
 bal_acc             :   0.03786
 top_k_acc           :   1.00000
 F1_score            :   0.12297
 map                 :   0.15024
 pearson_corr        :   0.06910
--------------------------------------------------------------------------------
 CV Split 1 -  Training files: [0, 1, 4, 5, 6, 7, 8, 9]   Validation files: [2, 3]  


2024-02-13 04:25:21,127 - INFO: - CrossValidation split #1
2024-02-13 04:25:21,128 - INFO: - Building training & validation DMatrix...
2024-02-13 04:26:51,259 - INFO: - Training model...
2024-02-13 04:26:51,267 - INFO: - start listen on 10.91.28.16:54153
[04:26:51] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:26:51] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:26:51] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:26:51,312 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:27:13,274 - INFO: - @tracker All nodes finishes job
2024-02-13 04:27:13,483 - INFO: - Running model on test data...
2024-02-13 04:27:32,355 - INFO: - Calculating metrics...
2024-02-13 04:27:32,411 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.99873
 train_logloss       :   0.07299
 val_auc             :   0.55798
 val_logloss         :   0.58456
 roc_auc             :   0.55796
 logloss             :   0.58456
 accuracy            :   0.84035
 bal_acc             :   0.02844
 top_k_acc           :   1.00000
 F1_score            :   0.11232
 map                 :   0.14786
 pearson_corr        :   0.06006
--------------------------------------------------------------------------------
 CV Split 2 -  Training files: [0, 1, 2, 3, 6, 7, 8, 9]   Validation files: [4, 5]  


2024-02-13 04:27:38,317 - INFO: - CrossValidation split #2
2024-02-13 04:27:38,320 - INFO: - Building training & validation DMatrix...
2024-02-13 04:29:12,544 - INFO: - Training model...
2024-02-13 04:29:12,554 - INFO: - start listen on 10.91.28.16:54373
[04:29:12] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:29:12] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:29:12] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:29:12,600 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:29:35,019 - INFO: - @tracker All nodes finishes job
2024-02-13 04:29:35,208 - INFO: - Running model on test data...
2024-02-13 04:29:54,252 - INFO: - Calculating metrics...
2024-02-13 04:29:54,306 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.99892
 train_logloss       :   0.07157
 val_auc             :   0.54819
 val_logloss         :   0.56691
 roc_auc             :   0.55117
 logloss             :   0.56688
 accuracy            :   0.84680
 bal_acc             :   0.02859
 top_k_acc           :   1.00000
 F1_score            :   0.10935
 map                 :   0.13995
 pearson_corr        :   0.05550
--------------------------------------------------------------------------------
 CV Split 3 -  Training files: [0, 1, 2, 3, 4, 5, 8, 9]   Validation files: [6, 7]  


2024-02-13 04:30:00,106 - INFO: - CrossValidation split #3
2024-02-13 04:30:00,107 - INFO: - Building training & validation DMatrix...
2024-02-13 04:31:32,798 - INFO: - Training model...
2024-02-13 04:31:32,808 - INFO: - start listen on 10.91.28.16:60613
[04:31:32] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:31:32] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:31:32] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:31:32,853 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:31:55,619 - INFO: - @tracker All nodes finishes job
2024-02-13 04:31:55,809 - INFO: - Running model on test data...
2024-02-13 04:32:14,850 - INFO: - Calculating metrics...
2024-02-13 04:32:14,903 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.99879
 train_logloss       :   0.07237
 val_auc             :   0.55173
 val_logloss         :   0.57346
 roc_auc             :   0.55165
 logloss             :   0.57346
 accuracy            :   0.84524
 bal_acc             :   0.03064
 top_k_acc           :   1.00000
 F1_score            :   0.11275
 map                 :   0.14261
 pearson_corr        :   0.05601
--------------------------------------------------------------------------------
 CV Split 4 -  Training files: [0, 1, 2, 3, 4, 5, 6, 7]   Validation files: [8, 9]  


2024-02-13 04:32:20,368 - INFO: - CrossValidation split #4
2024-02-13 04:32:20,370 - INFO: - Building training & validation DMatrix...
2024-02-13 04:33:54,489 - INFO: - Training model...
2024-02-13 04:33:54,498 - INFO: - start listen on 10.91.28.16:54121
[04:33:54] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:33:54] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:33:54] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:33:54,540 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:34:17,222 - INFO: - @tracker All nodes finishes job
2024-02-13 04:34:17,470 - INFO: - Running model on test data...
2024-02-13 04:34:36,680 - INFO: - Calculating metrics...
2024-02-13 04:34:36,739 - INFO: - Calculating metrics...Completed
[I 2024-02-13 04:34:36,831] Trial 43 finished with values: {'roc_auc': 0.555849008982151, 'logloss': 0.573688003040822} and parameters: {'learning_rate': 0.8586535510091051, 'min_split_loss': 3.60512853589588

--------------------------------------------------------------------------------
 train_auc           :   0.99852
 train_logloss       :   0.07406
 val_auc             :   0.56134
 val_logloss         :   0.56839
 roc_auc             :   0.56138
 logloss             :   0.56839
 accuracy            :   0.84580
 bal_acc             :   0.03725
 top_k_acc           :   1.00000
 F1_score            :   0.12097
 map                 :   0.14835
 pearson_corr        :   0.06881
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 train_auc                 0.99871 +/- 0.00014
 train_logloss             0.07286 +/- 0.00085
 val_auc                   0.55585 +/- 0.00505
 val_logloss               0.57369 +/- 0.00623
 roc_auc                   0.55642 +/- 0.00424
 logloss                   0.57368 +/- 0.00624
 accuracy                  0.84441 +/- 0.00224
 bal_acc                   0.032

2024-02-13 04:34:43,406 - INFO: - CrossValidation split #0
2024-02-13 04:34:43,407 - INFO: - Building training & validation DMatrix...
2024-02-13 04:36:17,336 - INFO: - Training model...
2024-02-13 04:36:17,342 - INFO: - start listen on 10.91.28.16:49531
[04:36:17] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:36:17] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:36:17] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:36:17,390 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:37:09,022 - INFO: - @tracker All nodes finishes job
2024-02-13 04:37:09,254 - INFO: - Running model on test data...
2024-02-13 04:37:27,885 - INFO: - Calculating metrics...
2024-02-13 04:37:27,939 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.97137
 train_logloss       :   0.25510
 val_auc             :   0.61085
 val_logloss         :   0.35795
 roc_auc             :   0.61193
 logloss             :   0.35795
 accuracy            :   0.87948
 bal_acc             :   0.00257
 top_k_acc           :   1.00000
 F1_score            :   0.00606
 map                 :   0.19608
 pearson_corr        :   0.16040
--------------------------------------------------------------------------------
 CV Split 1 -  Training files: [0, 1, 4, 5, 6, 7, 8, 9]   Validation files: [2, 3]  


2024-02-13 04:37:33,670 - INFO: - CrossValidation split #1
2024-02-13 04:37:33,671 - INFO: - Building training & validation DMatrix...
2024-02-13 04:39:07,099 - INFO: - Training model...
2024-02-13 04:39:07,106 - INFO: - start listen on 10.91.28.16:36497
[04:39:07] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:39:07] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:39:07] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:39:07,146 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:40:16,396 - INFO: - @tracker All nodes finishes job
2024-02-13 04:40:16,630 - INFO: - Running model on test data...
2024-02-13 04:40:35,478 - INFO: - Calculating metrics...
2024-02-13 04:40:35,529 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.98837
 train_logloss       :   0.22325
 val_auc             :   0.61967
 val_logloss         :   0.36053
 roc_auc             :   0.62045
 logloss             :   0.36053
 accuracy            :   0.87763
 bal_acc             :   0.00169
 top_k_acc           :   1.00000
 F1_score            :   0.00466
 map                 :   0.20368
 pearson_corr        :   0.16977
--------------------------------------------------------------------------------
 CV Split 2 -  Training files: [0, 1, 2, 3, 6, 7, 8, 9]   Validation files: [4, 5]  


2024-02-13 04:40:41,295 - INFO: - CrossValidation split #2
2024-02-13 04:40:41,296 - INFO: - Building training & validation DMatrix...
2024-02-13 04:42:15,568 - INFO: - Training model...
2024-02-13 04:42:15,579 - INFO: - start listen on 10.91.28.16:46125
[04:42:15] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:42:15] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:42:15] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:42:15,618 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:43:17,369 - INFO: - @tracker All nodes finishes job
2024-02-13 04:43:17,603 - INFO: - Running model on test data...
2024-02-13 04:43:36,901 - INFO: - Calculating metrics...
2024-02-13 04:43:36,953 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.98251
 train_logloss       :   0.23952
 val_auc             :   0.61842
 val_logloss         :   0.34885
 roc_auc             :   0.61877
 logloss             :   0.34885
 accuracy            :   0.88354
 bal_acc             :   0.00461
 top_k_acc           :   1.00000
 F1_score            :   0.01007
 map                 :   0.20155
 pearson_corr        :   0.17282
--------------------------------------------------------------------------------
 CV Split 3 -  Training files: [0, 1, 2, 3, 4, 5, 8, 9]   Validation files: [6, 7]  


2024-02-13 04:43:43,300 - INFO: - CrossValidation split #3
2024-02-13 04:43:43,302 - INFO: - Building training & validation DMatrix...
2024-02-13 04:45:17,789 - INFO: - Training model...
2024-02-13 04:45:17,794 - INFO: - start listen on 10.91.28.16:56291
[04:45:17] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:45:17] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:45:17] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:45:17,846 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:46:27,202 - INFO: - @tracker All nodes finishes job
2024-02-13 04:46:27,430 - INFO: - Running model on test data...
2024-02-13 04:46:46,713 - INFO: - Calculating metrics...
2024-02-13 04:46:46,767 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.98711
 train_logloss       :   0.22657
 val_auc             :   0.60829
 val_logloss         :   0.35529
 roc_auc             :   0.60897
 logloss             :   0.35529
 accuracy            :   0.88175
 bal_acc             :   0.00543
 top_k_acc           :   1.00000
 F1_score            :   0.01214
 map                 :   0.19131
 pearson_corr        :   0.15468
--------------------------------------------------------------------------------
 CV Split 4 -  Training files: [0, 1, 2, 3, 4, 5, 6, 7]   Validation files: [8, 9]  


2024-02-13 04:46:52,336 - INFO: - CrossValidation split #4
2024-02-13 04:46:52,338 - INFO: - Building training & validation DMatrix...
2024-02-13 04:48:26,376 - INFO: - Training model...
2024-02-13 04:48:26,383 - INFO: - start listen on 10.91.28.16:55925
[04:48:26] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:48:26] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:48:26] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:48:26,425 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:49:23,445 - INFO: - @tracker All nodes finishes job
2024-02-13 04:49:23,666 - INFO: - Running model on test data...
2024-02-13 04:49:42,704 - INFO: - Calculating metrics...
2024-02-13 04:49:42,758 - INFO: - Calculating metrics...Completed
[I 2024-02-13 04:49:42,884] Trial 44 finished with values: {'roc_auc': 0.6142202582916583, 'logloss': 0.3555057752575358} and parameters: {'learning_rate': 0.06227003367608594, 'min_split_loss': 4.44429663025

--------------------------------------------------------------------------------
 train_auc           :   0.97699
 train_logloss       :   0.24746
 val_auc             :   0.61387
 val_logloss         :   0.35491
 roc_auc             :   0.61425
 logloss             :   0.35491
 accuracy            :   0.88082
 bal_acc             :   0.00329
 top_k_acc           :   1.00000
 F1_score            :   0.00735
 map                 :   0.19944
 pearson_corr        :   0.16591
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 train_auc                 0.98127 +/- 0.00636
 train_logloss             0.23838 +/- 0.01210
 val_auc                   0.61422 +/- 0.00434
 val_logloss               0.35551 +/- 0.00390
 roc_auc                   0.61487 +/- 0.00425
 logloss                   0.35551 +/- 0.00390
 accuracy                  0.88064 +/- 0.00200
 bal_acc                   0.003

2024-02-13 04:49:49,553 - INFO: - CrossValidation split #0
2024-02-13 04:49:49,554 - INFO: - Building training & validation DMatrix...
2024-02-13 04:51:22,942 - INFO: - Training model...
2024-02-13 04:51:22,950 - INFO: - start listen on 10.91.28.16:54957
[04:51:22] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:51:22] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:51:22] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:51:22,989 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:51:31,349 - INFO: - @tracker All nodes finishes job
2024-02-13 04:51:31,502 - INFO: - Running model on test data...
2024-02-13 04:51:50,542 - INFO: - Calculating metrics...
2024-02-13 04:51:50,597 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.70172
 train_logloss       :   0.33544
 val_auc             :   0.57635
 val_logloss         :   0.38423
 roc_auc             :   0.57773
 logloss             :   0.38423
 accuracy            :   0.87381
 bal_acc             :   0.01681
 top_k_acc           :   1.00000
 F1_score            :   0.04906
 map                 :   0.16453
 pearson_corr        :   0.10489
--------------------------------------------------------------------------------
 CV Split 1 -  Training files: [0, 1, 4, 5, 6, 7, 8, 9]   Validation files: [2, 3]  


2024-02-13 04:51:56,496 - INFO: - CrossValidation split #1
2024-02-13 04:51:56,498 - INFO: - Building training & validation DMatrix...
2024-02-13 04:53:30,177 - INFO: - Training model...
2024-02-13 04:53:30,185 - INFO: - start listen on 10.91.28.16:59863
[04:53:30] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:53:30] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:53:30] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:53:30,230 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:53:38,997 - INFO: - @tracker All nodes finishes job
2024-02-13 04:53:39,156 - INFO: - Running model on test data...
2024-02-13 04:53:58,276 - INFO: - Calculating metrics...
2024-02-13 04:53:58,359 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.70154
 train_logloss       :   0.33442
 val_auc             :   0.58738
 val_logloss         :   0.38406
 roc_auc             :   0.58746
 logloss             :   0.38406
 accuracy            :   0.87255
 bal_acc             :   0.01335
 top_k_acc           :   1.00000
 F1_score            :   0.04150
 map                 :   0.16753
 pearson_corr        :   0.10816
--------------------------------------------------------------------------------
 CV Split 2 -  Training files: [0, 1, 2, 3, 6, 7, 8, 9]   Validation files: [4, 5]  


2024-02-13 04:54:04,245 - INFO: - CrossValidation split #2
2024-02-13 04:54:04,246 - INFO: - Building training & validation DMatrix...
2024-02-13 04:55:38,302 - INFO: - Training model...
2024-02-13 04:55:38,310 - INFO: - start listen on 10.91.28.16:37403
[04:55:38] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:55:38] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:55:38] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:55:38,362 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:55:47,369 - INFO: - @tracker All nodes finishes job
2024-02-13 04:55:47,541 - INFO: - Running model on test data...
2024-02-13 04:56:06,700 - INFO: - Calculating metrics...
2024-02-13 04:56:06,755 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.70299
 train_logloss       :   0.33674
 val_auc             :   0.58553
 val_logloss         :   0.37340
 roc_auc             :   0.58596
 logloss             :   0.37340
 accuracy            :   0.87849
 bal_acc             :   0.01749
 top_k_acc           :   1.00000
 F1_score            :   0.04843
 map                 :   0.16327
 pearson_corr        :   0.11101
--------------------------------------------------------------------------------
 CV Split 3 -  Training files: [0, 1, 2, 3, 4, 5, 8, 9]   Validation files: [6, 7]  


2024-02-13 04:56:12,126 - INFO: - CrossValidation split #3
2024-02-13 04:56:12,127 - INFO: - Building training & validation DMatrix...
2024-02-13 04:57:46,768 - INFO: - Training model...
2024-02-13 04:57:46,778 - INFO: - start listen on 10.91.28.16:55841
[04:57:46] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:57:46] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:57:46] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:57:46,821 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 04:57:55,139 - INFO: - @tracker All nodes finishes job
2024-02-13 04:57:55,290 - INFO: - Running model on test data...
2024-02-13 04:58:14,212 - INFO: - Calculating metrics...
2024-02-13 04:58:14,267 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.70259
 train_logloss       :   0.33597
 val_auc             :   0.57064
 val_logloss         :   0.38147
 roc_auc             :   0.57064
 logloss             :   0.38147
 accuracy            :   0.87567
 bal_acc             :   0.01411
 top_k_acc           :   1.00000
 F1_score            :   0.04389
 map                 :   0.15650
 pearson_corr        :   0.09259
--------------------------------------------------------------------------------
 CV Split 4 -  Training files: [0, 1, 2, 3, 4, 5, 6, 7]   Validation files: [8, 9]  


2024-02-13 04:58:20,551 - INFO: - CrossValidation split #4
2024-02-13 04:58:20,552 - INFO: - Building training & validation DMatrix...
2024-02-13 04:59:55,492 - INFO: - Training model...
2024-02-13 04:59:55,501 - INFO: - start listen on 10.91.28.16:42317
[04:59:55] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[04:59:55] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[04:59:55] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 04:59:55,547 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 05:00:04,270 - INFO: - @tracker All nodes finishes job
2024-02-13 05:00:04,418 - INFO: - Running model on test data...
2024-02-13 05:00:22,926 - INFO: - Calculating metrics...
2024-02-13 05:00:22,983 - INFO: - Calculating metrics...Completed
[I 2024-02-13 05:00:23,076] Trial 45 finished with values: {'roc_auc': 0.58133119456533, 'logloss': 0.3803776491121085} and parameters: {'learning_rate': 0.7442330152740122, 'min_split_loss': 8.21849212816352

--------------------------------------------------------------------------------
 train_auc           :   0.70652
 train_logloss       :   0.33454
 val_auc             :   0.58676
 val_logloss         :   0.37873
 roc_auc             :   0.58693
 logloss             :   0.37873
 accuracy            :   0.87571
 bal_acc             :   0.02005
 top_k_acc           :   1.00000
 F1_score            :   0.05410
 map                 :   0.16599
 pearson_corr        :   0.11054
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 train_auc                 0.70307 +/- 0.00181
 train_logloss             0.33542 +/- 0.00087
 val_auc                   0.58133 +/- 0.00668
 val_logloss               0.38038 +/- 0.00402
 roc_auc                   0.58175 +/- 0.00658
 logloss                   0.38038 +/- 0.00402
 accuracy                  0.87525 +/- 0.00201
 bal_acc                   0.016

2024-02-13 05:00:29,764 - INFO: - CrossValidation split #0
2024-02-13 05:00:29,765 - INFO: - Building training & validation DMatrix...
2024-02-13 05:02:04,684 - INFO: - Training model...
2024-02-13 05:02:04,695 - INFO: - start listen on 10.91.28.16:47009
[05:02:04] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[05:02:04] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[05:02:04] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 05:02:04,742 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 05:03:09,330 - INFO: - @tracker All nodes finishes job
2024-02-13 05:03:09,530 - INFO: - Running model on test data...
2024-02-13 05:03:28,709 - INFO: - Calculating metrics...
2024-02-13 05:03:28,763 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.60813
 train_logloss       :   0.35775
 val_auc             :   0.59426
 val_logloss         :   0.36162
 roc_auc             :   0.59482
 logloss             :   0.36162
 accuracy            :   0.87953
 bal_acc             :   0.00000
 top_k_acc           :   1.00000
 F1_score            :   0.00000
 map                 :   0.17563
 pearson_corr        :   0.12363
--------------------------------------------------------------------------------
 CV Split 1 -  Training files: [0, 1, 4, 5, 6, 7, 8, 9]   Validation files: [2, 3]  


2024-02-13 05:03:34,503 - INFO: - CrossValidation split #1
2024-02-13 05:03:34,504 - INFO: - Building training & validation DMatrix...
2024-02-13 05:05:08,302 - INFO: - Training model...
2024-02-13 05:05:08,311 - INFO: - start listen on 10.91.28.16:52673
[05:05:08] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[05:05:08] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[05:05:08] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 05:05:08,354 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 05:06:15,688 - INFO: - @tracker All nodes finishes job
2024-02-13 05:06:15,889 - INFO: - Running model on test data...
2024-02-13 05:06:34,855 - INFO: - Calculating metrics...
2024-02-13 05:06:34,910 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.60572
 train_logloss       :   0.35704
 val_auc             :   0.59990
 val_logloss         :   0.36486
 roc_auc             :   0.60040
 logloss             :   0.36486
 accuracy            :   0.87792
 bal_acc             :   0.00000
 top_k_acc           :   1.00000
 F1_score            :   0.00000
 map                 :   0.17641
 pearson_corr        :   0.12005
--------------------------------------------------------------------------------
 CV Split 2 -  Training files: [0, 1, 2, 3, 6, 7, 8, 9]   Validation files: [4, 5]  


2024-02-13 05:06:40,598 - INFO: - CrossValidation split #2
2024-02-13 05:06:40,599 - INFO: - Building training & validation DMatrix...
2024-02-13 05:08:16,082 - INFO: - Training model...
2024-02-13 05:08:16,090 - INFO: - start listen on 10.91.28.16:52285
[05:08:16] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[05:08:16] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[05:08:16] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 05:08:16,132 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 05:09:20,113 - INFO: - @tracker All nodes finishes job
2024-02-13 05:09:20,322 - INFO: - Running model on test data...
2024-02-13 05:09:39,960 - INFO: - Calculating metrics...
2024-02-13 05:09:40,015 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.60774
 train_logloss       :   0.35997
 val_auc             :   0.59875
 val_logloss         :   0.35344
 roc_auc             :   0.59887
 logloss             :   0.35344
 accuracy            :   0.88336
 bal_acc             :   0.00000
 top_k_acc           :   1.00000
 F1_score            :   0.00000
 map                 :   0.18033
 pearson_corr        :   0.13615
--------------------------------------------------------------------------------
 CV Split 3 -  Training files: [0, 1, 2, 3, 4, 5, 8, 9]   Validation files: [6, 7]  


2024-02-13 05:09:45,886 - INFO: - CrossValidation split #3
2024-02-13 05:09:45,887 - INFO: - Building training & validation DMatrix...
2024-02-13 05:11:21,118 - INFO: - Training model...
2024-02-13 05:11:21,130 - INFO: - start listen on 10.91.28.16:38707
[05:11:21] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[05:11:21] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[05:11:21] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 05:11:21,174 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 05:12:26,695 - INFO: - @tracker All nodes finishes job
2024-02-13 05:12:26,914 - INFO: - Running model on test data...
2024-02-13 05:12:46,488 - INFO: - Calculating metrics...
2024-02-13 05:12:46,540 - INFO: - Calculating metrics...Completed


--------------------------------------------------------------------------------
 train_auc           :   0.60903
 train_logloss       :   0.35885
 val_auc             :   0.59207
 val_logloss         :   0.35742
 roc_auc             :   0.59300
 logloss             :   0.35742
 accuracy            :   0.88164
 bal_acc             :   0.00000
 top_k_acc           :   1.00000
 F1_score            :   0.00000
 map                 :   0.17709
 pearson_corr        :   0.12621
--------------------------------------------------------------------------------
 CV Split 4 -  Training files: [0, 1, 2, 3, 4, 5, 6, 7]   Validation files: [8, 9]  


2024-02-13 05:12:52,301 - INFO: - CrossValidation split #4
2024-02-13 05:12:52,303 - INFO: - Building training & validation DMatrix...
2024-02-13 05:14:27,292 - INFO: - Training model...
2024-02-13 05:14:27,300 - INFO: - start listen on 10.91.28.16:36817
[05:14:27] task [xgboost.dask-0]:tcp://127.0.0.1:33413 got new rank 0
[05:14:27] task [xgboost.dask-1]:tcp://127.0.0.1:41855 got new rank 1
[05:14:27] task [xgboost.dask-2]:tcp://127.0.0.1:42983 got new rank 2
2024-02-13 05:14:27,342 - INFO: - @tracker All of 3 nodes getting started
2024-02-13 05:15:31,081 - INFO: - @tracker All nodes finishes job
2024-02-13 05:15:31,310 - INFO: - Running model on test data...
2024-02-13 05:15:50,872 - INFO: - Calculating metrics...
2024-02-13 05:15:50,928 - INFO: - Calculating metrics...Completed
[I 2024-02-13 05:15:51,024] Trial 46 finished with values: {'roc_auc': 0.5961399338966344, 'logloss': 0.35924618014679427} and parameters: {'learning_rate': 0.011507708277560936, 'min_split_loss': 0.454292171

--------------------------------------------------------------------------------
 train_auc           :   0.60654
 train_logloss       :   0.35849
 val_auc             :   0.59571
 val_logloss         :   0.35889
 roc_auc             :   0.59631
 logloss             :   0.35889
 accuracy            :   0.88074
 bal_acc             :   0.00000
 top_k_acc           :   1.00000
 F1_score            :   0.00000
 map                 :   0.17784
 pearson_corr        :   0.12985
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 train_auc                 0.60743 +/- 0.00117
 train_logloss             0.35842 +/- 0.00100
 val_auc                   0.59614 +/- 0.00287
 val_logloss               0.35925 +/- 0.00386
 roc_auc                   0.59668 +/- 0.00267
 logloss                   0.35925 +/- 0.00386
 accuracy                  0.88064 +/- 0.00185
 bal_acc                   0.000

### Identify / Rerun  / Save best study

In [30]:
print(" Best trials: ", [x.number for x in study.best_trials])
# study.best_params

 Best trials:  [0]


In [None]:
best_trial_copy = copy.deepcopy(study.best_trials[0])

In [None]:
best_model = objective(best_trial_copy, disp_params = True, save = True)

In [None]:
trial.number
best_model.keys()

In [None]:
save_as_filename = f"{study.study_name}_trial_{trial.number:03d}.json"
print(f" Save model to : {save_as_filename}")

In [None]:
best_model['booster'].save_model(save_as_filename)

# Load best saved model and run against a test set

In [None]:
study_name="multiobjective-study-3"

In [None]:
start = datetime.now()
study = optuna.create_study(storage="sqlite:///example.db",
                            study_name=study_name,
                            directions=["minimize", "maximize"], 
                            load_if_exists=True)
# study.set_metric_names(["mse_score", "R2_score"])
print(f"Total time:  {datetime.now() - start}")

In [None]:
best_trial = study.best_trials[0]

In [None]:
best_filename = f"{study.study_name}_trial_{best_trial.number:03d}.model"
print(f" Load model from : {best_filename}")

In [None]:
model_copy = xgb.Booster()
model_copy.load_model(best_filename)
model_copy

## Read test data

In [None]:
# df_ps = dd.read_csv(profileInputFiles, names = all_columns, usecols=Xy_columns, dtype= Xy_columns_dtype)   
# df_ps = read_cell_profiles(profileInputFiles, names = all_columns, usecols = Xy_columns, dtype = Xy_columns_dtype)
# print(f" Number of partitions:  {df_profiles.npartitions}   partition(1) shape: {df_ps.get_partition(0).shape}")

# st1, en1 = get_dd_subset(df_ps, skiprows = 0, nrows = 200000, ss=None, verbose = False)
# st2, en2 = get_dd_subset(df_ps, skiprows = 200000, nrows = 90000, ss=None, verbose = False)

# st1,en1
# st2,en2

# df_train =df_ps.partitions[st1:en1]
# df_test = split_Xy(df_ps.partitions[st2:en2], y_columns = list(y_columns) )

In [None]:
print(f" Reading cell profiles file :  {profileInputFiles}")
# df_ps = dd.read_csv(profileInputFiles, names = all_columns, usecols=Xy_columns, dtype= Xy_columns_dtype)   
df_ps = read_cell_profiles(profileInputFiles, names = all_columns, usecols = Xy_columns, dtype = Xy_columns_dtype)
print(f" Number of partitions:  {df_ps.npartitions}   partition(1) shape: {df_ps.get_partition(0).shape}")

In [None]:
# df_ps.shape
type(df_ps)
df_ps.info()
# df_ps.head()
# df_ps.get_partition(0).compute().shape
# df_ps.get_partition(1).compute().shape
# df_ps.get_partition(100).compute().shape
# _ps.get_partition(150).compute().shape
# df_ps.npartitions
# df_ps.get_partition(0).shape
# df_ps.known_divisions 
# df_ps.get_partition(df_ps.npartitions-1).tail(10)
# df_ps2 = df_ps.tail(50)
# type(df_ps2)
# df_ps2.npartitions
# df_ps2.get_partition(0).shape
# df_ps2.head()

In [None]:
# ss_floorsum = ss_cumsum -ss
# st, en = get_dd_subset(df_ps, skiprows = 230454, nrows = 10, ss =ss)
# st, en = get_dd_subset(df_ps, skiprows = 230455, nrows = 10, ss =ss)
# st, en = get_dd_subset(df_ps, skiprows = 230456, nrows = 10, ss =ss)

In [None]:
df_test.npartitions
dd = df_test.map_partitions(len).compute()
dd_cumsum = dd.cumsum()
dd
dd_cumsum

In [None]:
y_columns

X_test, y_test = split_Xy(df_test, y_columns)

y_test.columns, y_test.shape
X_test.shape, X_test.columns

In [None]:
d_test = xgb.dask.DaskDMatrix(client, X_test, y_test)

In [None]:
# df_pr = read_cell_profiles_2(CompoundProfiles2SampleFileCSV, rows = 40, skiprows = None)
# df_pr.npartitions
# df_pr.get_partition(0).head(2)
y_test.compute().shape

## Run prediction on test data

In [None]:
y_pred = xgb.dask.predict(client, model_copy, d_test).compute()

In [None]:
y_test_np = y_test.compute().to_numpy(copy=True).squeeze()
type(y_test),type(y_test_np), type(y_pred)

In [None]:
y_test_np.shape
y_pred.shape

In [None]:
logging.info(f"Compute R2 & MSE scores...")
R2_score = skm.r2_score(
    y_true = y_test_np,
    y_pred = y_pred,
)

mse_score = skm.mean_squared_error(
    y_true = y_test_np,
    y_pred = y_pred,
    squared=False,
    # compute=True,
)
# logging.info(f"Compute R2 & MSE scores...Completed")

print(f"MSE Score : {mse_score:.6f}")
print(f" R2 Score : {R2_score:.6f}")

In [None]:
pearson_corr, pearson_p = sps.pearsonr(y_test_np, y_pred)

In [None]:
print(pearson_corr, pearson_p)
print(f"  Pearson : {pearson_corr:.6f}      P: {pearson_p:.3e}")

In [None]:
spearman_corr, spearman_p = sps.spearmanr(y_test.squeeze(), y_pred)

In [None]:
print(spearman_corr, spearman_p)
print(f"  Spearman : {spearman_corr:.6f}      P: {spearman_p:.6e}")

In [None]:
print(y_test_np.min(), y_test_np.max(), y_test_np.mean())
print(y_pred.min(), y_pred.max(), y_pred.mean())

In [None]:
fig = plt.figure(figsize=(6,6))
plt.plot(y_test_np, y_pred, '.')
plt.title(f" Predicted vs. True TPSA - Pearson: {pearson_corr:.5f} / p:{pearson_p:.3e}")
plt.xlabel("Y True");
plt.ylabel("Y Pred");
# plt.xlim(1.775, 1.875)
# plt.xlim(1.775, 1.875)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
num_bins = 40
# fig, ax = plt.subplots()
 
# the histogram of the data

fig = plt.figure(figsize=(10,5))
n, bins, patches = plt.hist(y_test_np, num_bins, density=False, label = 'True', range=[1.5,2.25] )
# We can set the number of bins with the *bins* keyword argument.
n, bins, patches = plt.hist(y_pred, num_bins, density=False, label='Predicitons' )
plt.xlabel('TPSA Value')
plt.ylabel('Compounds')
plt.legend()
# ax.set_title('Histogram of normal distribution sample: '
#              fr'$\mu={mu:.0f}$, $\sigma={sigma:.0f}$')
# axs[1].hist(dist2, bins=n_bins)

# Plot feature importance and tree

### Plot importance 

In [None]:
# train_score = xgb_reg.score(train_X, train_y)
# val_score = xgb_reg.score(val_X, val_y)
# test_score = xgb_reg.score(test_X, test_y)

In [None]:
print(f" R2 score - Training   : {train_score:0.6f}")
print(f" R2 score - Validation : {val_score:0.6f}")
print(f" R2 score - Test data  : {test_score:0.6f}")

In [None]:
# fig = plt.figure(figsize=(20, 20))
# plt.yticks(fontsize = 12)
# ax = fig.add_subplot()
# ax.set_xlim(10,50)
ax = xgb.plot_importance(model_copy, max_num_features= None,  grid = False,  show_values = True, )
for label in ( ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(9)
ax.get_yticklabels()
# ax.autoscale(enable=None, axis="y", tight=True)

In [None]:
from graphviz import Source 
xgb.plot_tree(model_copy,num_tress=5)

In [None]:
# model_copy.attributes()
model_copy.feature_names
model_copy.

In [None]:
xgb.get_config()

# XGBoost + Dask - Development 

- Need to convert Dask data frames to dask arrays in order to use data in parm grid search

In [None]:
files = os.walk('./output_11102023')

for i in files :
    print(i)

### Read compound profiles

In [None]:
profileInputFiles = './output_11102023/profiles/compound_profiles_2samples_*.csv'

In [None]:
# del df_profiles
df_profiles = read_cell_profiles(profileInputFiles, rows = 200000)

In [None]:
type(df_profiles)
df_profiles.head(3)
df_profiles.shape

### Training initialization

In [None]:
 
metrics_hist = defaultdict(list)
metrics_keys = ["train_auc","train_logloss", "val_auc", "val_logloss", "roc_auc", "logloss",
                "accuracy","bal_acc","top_k_acc","F1_score","map","pearson_corr"]

# print(study_params)

In [None]:
study_params = {  "booster"            : "gbtree",
                  "device"             : "gpu",
                  "objective"          :  "binary:logistic",
                  "eval_metric"        :  ["auc", "logloss"]
                  "learning_rate"      : 0.001,    ## eta
                  "max_depth"          : 10,
                  "max_leaves"         : 10,
                  "verbosity"          : 0,
                  "min_child_weight"   : 1, 
                  "scale_pos_weight"   : 7.89,
                  "gamma"              : 0.0001,  ## min split loss 
               }

In [None]:
iter_files = make_cv_splits(df_profiles, n_folds=5, y_columns=y_columns)

In [None]:
xgb.__version__
xgb.config_context(device = "gpu", booster = "gbtree", verbosity = 0, validate_parameters = True ) 
# xgb.set_config(use_rmm = True, verbosity = 1)
# xgb.set_config()
# xgb.get_config()

### Beginning of loop

In [None]:
(X_train,y_train), (X_test, y_test) = next(iter_files)

In [None]:
logging.info(f"Building TRAINING/TEST DMatricies...")

d_train = xgb.dask.DaskDMatrix(client, X_train, y_train )
# logging.info(f"Building TRAINING DMatrix...Completed")

# logging.info(f"Building TEST DMatrix...")
d_test = xgb.dask.DaskDMatrix(client, X_test, y_test )
logging.info(f"Building TRAINING/TEST DMatricies...Completed")

In [None]:
# X_train.shape , X_test.shape, # y_train.shape , y_test.shape
booster_model = model['booster'] if model is not None else None

In [None]:
logging.info(f"Training model...")
model = xgb.dask.train(
    None,
    {"verbosity": 0, "tree_method": "hist", **study_params},
    d_train,
    num_boost_round=10000,
    evals=[(d_train, "train"), (d_test, "test")],
    xgb_model = booster_model,
    verbose_eval=True,
    early_stopping_rounds = 50,
)
logging.info(f"Training model...Completed")

In [None]:
model['booster'].best_iteration, model['booster'].best_score

In [None]:
logging.info(f"Running model on test data...")
predictions = xgb.dask.predict(client, model,d_test).compute()
logging.info(f"Running model on test data...Completed")

logging.info(f"Reshape y_test data...")
y_test_c = y_test.compute().to_numpy().squeeze()
logging.info(f"Reshape y_test data...Complete")

In [None]:
print(f" y_test_c   : {type(y_test_c)} \t\t\t shape: {y_test_c.shape}")
print(f" Predictions: {type(predictions)} \t\t\t shape: {predictions.shape}")
np.bincount(y_test_c)

In [None]:
logging.info(f"Calculating metrics...")
# R2_score = skm.r2_score(y_true = y_test_c, y_pred = predictions)
# mse_score = skm.mean_squared_error(y_true = y_test_c, y_pred = predictions, squared=False,)
# metrics_keys = ["train_auc","train_logloss", "val_auc", "val_logloss", "roc_auc", "logloss",
#                "accuracy","bal_acc","top_k_acc","F1_score","map","pearson_corr"]
metrics = compute_classification_metrics(model, y_test_c, predictions)
logging.info(f"Calculating metrics...Completed")

In [None]:
# print(y_test_c)
# print(predictions)
print()
print(f"    log_loss :  {metrics['logloss']:9.6f}")
print(f"    Accuracy :  {metrics['accuracy']:9.6f}")
print(f" Bal. Accrcy :  {metrics['bal_acc']:9.6f}")
print(f" Top 3 Score :  {metrics['top_k_acc']:9.6f}")
print(f"    F1 Score :  {metrics['F1_score']:9.6f}")
print(f"     Pearson :  {metrics['pearson_corr']:9.6f}          P : {pearson_p:9.4e}")
print()
print(f"   Training AUC :  {metrics['train_auc']:.6f}      Logloss : {metrics['train_logloss']:.6f}")
print(f" Validation AUC :  {metrics['val_auc']:.6f}      Logloss : {metrics['val_logloss']:.6f}")
print(f"       Test AUC :  {metrics['roc_auc']:.6f}      Logloss : {metrics['logloss']:.6f}")

In [None]:
for key in metrics_keys:
    print(f" {key:20s}:    {metrics_hist[key]}")

In [None]:
model['booster'].best_score
model['booster'].best_iteration

### plot results

In [None]:
### from sklearn.metrics import RocCurveDisplay
fig, axes = plt.subplots(1, 3, figsize=(16, 5))


roc_display = skm.RocCurveDisplay.from_predictions(
    y_test_c,
    predictions, 
    name=f"Roc Curve",
    color="darkorange",
    plot_chance_level=True,
    ax = axes[0]
)
_ = roc_display.ax_.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title=f"ROC curve - TPSA Classification (XGBoost)\n LogLoss: {metrics_hist['logloss'][-1]:0.3f}      AUC: {metrics_hist['roc_auc'][-1]:0.3f} ",
)
_ = roc_display.ax_.legend(fontsize=8);

# prec, recall, _ = skm.precision_recall_curve(y_test, predictions, pos_label=1)
# pr_display = skm.PrecisionRecallDisplay(precision=prec, recall=recall)
pr_display = skm.PrecisionRecallDisplay.from_predictions(
    y_test_c, 
    predictions, 
    name="Precision/Recall - XGBoost", 
    plot_chance_level=True,
    ax = axes[1]
)
_ = pr_display.ax_.set_title(" Precision-Recall curve");
_ = pr_display.ax_.legend(fontsize=8);
cm_display = skm.ConfusionMatrixDisplay.from_predictions(y_test_c, (predictions >= 0.5), values_format="5d", ax = axes[2])

# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
# roc_display.plot(ax=ax1)
# pr_display.plot(ax=ax2)
# plt.show()

In [None]:
# del d_train, d_test
del model, predictions, pearson_p
del d_train, d_test, X_train, y_train, X_test, y_test, y_test_c

### End of loop

In [None]:
print_metric_hist(metrics_hist)

In [None]:
# type(model)
data = { 'mse_scores' : mse_scores_np,
         'R2_scores' : R2_scores_np}
    
with open(trainingMetrics, 'wb') as f: 
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

In [None]:
with open(trainingMetrics, 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    data = pickle.load(f)
data