 # XGBoost - Classification

**Using Optuna for hyper-parameter search  to predict TPSA from morphology profiles**

# Initialization

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
%load_ext autoreload  
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

## imports 

In [2]:
# Models
import os, sys
import math
import pickle
import itertools
import copy 
import joblib
import logging 
import types

from datetime import datetime, time
from collections.abc import Iterator
if './src' not in sys.path:
    print(f"insert ./src")
    sys.path.insert(0, './src')
print(sys.path)

import numpy as np
np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan')

import pandas as pd
pd.options.display.width = 170

import scipy.stats as sps 

import matplotlib.pyplot as plt

from pprint import PrettyPrinter
pp = PrettyPrinter(indent=4)
from collections import defaultdict
from utils_cellpainting import *
from utils import display_gpu_info, display_gpu_device_info
# (initialize, init_dataloaders, init_environment, init_wandb, training_initializations, model_initializations, 


import warnings
warnings.filterwarnings('ignore')

insert ./src
['./src', '/home/kevin/WSL-shared/Cellpainting/cj-datasets', '/home/kevin/miniforge3/envs/cp311/lib/python311.zip', '/home/kevin/miniforge3/envs/cp311/lib/python3.11', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/lib-dynload', '', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/site-packages']




In [3]:
display_gpu_info()


CUDA Device(s) available
--------------------------
 CUDA device count   :  3
 CUDA current device :  0   name:  Quadro GV100


GPU Device Info 
------------------
 Device : cuda:0
   name:        Quadro GV100
   capability:  (7, 0)
   properties:  _CudaDeviceProperties(name='Quadro GV100', major=7, minor=0, total_memory=32508MB, multi_processor_count=80)
   Allocated :  0
   Reserved  :  0

 Device : cuda:1
   name:        Quadro GV100
   capability:  (7, 0)
   properties:  _CudaDeviceProperties(name='Quadro GV100', major=7, minor=0, total_memory=32508MB, multi_processor_count=80)
   Allocated :  0
   Reserved  :  0

 Device : cuda:2
   name:        NVIDIA TITAN Xp
   capability:  (6, 1)
   properties:  _CudaDeviceProperties(name='NVIDIA TITAN Xp', major=6, minor=1, total_memory=12196MB, multi_processor_count=30)
   Allocated :  0
   Reserved  :  0


GPU Usage Stats 
------------------
| ID | GPU | MEM |
------------------
|  0 | 18% | 41% |
|  1 |  0% | 13% |
|  2 | 22% | 18% |

 to

In [4]:
import joblib
# from dask_cuda import LocalCUDACluster
# from sklearn.model_selection import GridSearchCV
import optuna

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import dask
import dask.array as da
import dask.dataframe as dd
from dask import delayed
from dask.distributed import Client
from dask.distributed import LocalCluster
from dask_cuda import LocalCUDACluster
import dask_ml.model_selection as dcv
# from dask_ml.model_selection import train_test_split
# from dask_ml.model_selection import GridSearchCV, IncrementalSearchCV, HyperbandSearchCV
# from dask_ml.metrics import mean_squared_error, r2_score, mean_squared_log_error
import sklearn.metrics as skm

In [5]:
# os.environ["WANDB_NOTEBOOK_NAME"] = "Adashare_Train.ipynb"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [6]:
logLevel = os.environ.get('LOG_LEVEL', 'INFO').upper()
FORMAT = '%(asctime)s - %(levelname)s: - %(message)s'
logging.basicConfig(level="INFO", format= FORMAT)
logging.getLogger("imported_module").setLevel(logging.CRITICAL)
# logging.info(f" 1/7- engine connected")
# logging.warning(f" 1/7- engine connected")
# logging.error(f" 1/7- engine connected")
# logging.critical(f" 1/7- engine connected")

# print(logging.NOTSET, logging.DEBUG,  logging.INFO, logging.WARN, logging.WARNING, logging.ERROR, logging.CRITICAL,  logging.FATAL)
# xgb.__version__

print()
for time_fmt in ['%x%X', '%X %x %Z', '%X.%f', '%D-%X.%f', '%Y-%m-%d %H:%M:%S.%f']:
    cmd_string = f"datetime.now().strftime('{time_fmt}')"
    print(f" {cmd_string:50s}  : {datetime.now().strftime(time_fmt)}")


 datetime.now().strftime('%x%X')                     : 02/13/2413:32:56
 datetime.now().strftime('%X %x %Z')                 : 13:32:56 02/13/24 
 datetime.now().strftime('%X.%f')                    : 13:32:56.944426
 datetime.now().strftime('%D-%X.%f')                 : 02/13/24-13:32:56.944452
 datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')     : 2024-02-13 13:32:56.944473


## Datasets

In [7]:
prefix = '' ### Target-2' , 'MOA'
input_path ="./metadata/"
output_path ="./output_11102023"
prefix_lc = prefix.lower().replace('-', '_')
CSV = '.csv'

CompoundExtendedMetadata2SampleFile = f"{output_path}{prefix_lc}/compound_extended_metadata_2samples.csv"
CompoundProfiles2SampleFile         = f"{output_path}{prefix_lc}/profiles/compound_profiles_2samples"
CompoundExtendedMetadataSampleFile  = f"{output_path}{prefix_lc}/compound_extended_metadata_samples.csv"
profileInputFiles                   = "./output_11102023/profiles/compound_profiles_2samples_{0:03d}.csv"
binnedProfileFiles                  = "./output_11102023/binned_profiles/binned_2sample_profiles_{0:03d}.csv"
trainingMetrics                     = f"{output_path}{prefix_lc}/training_metrics.pkl"
profileMetadataFile                 = f"{input_path}profile_metadata.pkl"


print()
print(f" Compound Extended Metadata 2 SampleFile  : {CompoundExtendedMetadata2SampleFile }")
print(f" Compound Profiles 2 Samples File CSV     : {CompoundProfiles2SampleFile}")
print(f" ")
print(f" profiles Metadata File                   : {profileMetadataFile}")
print(f" ")
x = 999
print(f" profileInputFiles                        : {profileInputFiles.format(x)}")
print(f" binnedProfileFiles                       : {binnedProfileFiles.format(x)}")
print(f" training metrics                         : {trainingMetrics}")



 Compound Extended Metadata 2 SampleFile  : ./output_11102023/compound_extended_metadata_2samples.csv
 Compound Profiles 2 Samples File CSV     : ./output_11102023/profiles/compound_profiles_2samples
 
 profiles Metadata File                   : ./metadata/profile_metadata.pkl
 
 profileInputFiles                        : ./output_11102023/profiles/compound_profiles_2samples_999.csv
 binnedProfileFiles                       : ./output_11102023/binned_profiles/binned_2sample_profiles_999.csv
 training metrics                         : ./output_11102023/training_metrics.pkl


## Read Features metadata file

In [44]:
print(f" Features select file:  {profileMetadataFile}")

with open(profileMetadataFile, 'rb') as f: 
    pickle_data = pickle.load(f)
pickle_data.keys()    


 Features select file:  ./metadata/profile_metadata.pkl


dict_keys(['all_profile_columns', 'metadata_columns', 'selected_columns'])

In [None]:
ns = types.SimpleNamespace()

In [45]:
COMPOUND_PROFILE_COLUMNS = pickle_data['all_profile_columns']

metadata_categorical_columns = set(pickle_data['metadata_columns']['MetadataCategoricalColumns'])
metadata_categorical_columns_dtype = pickle_data['metadata_columns']['MetadataCategoricalColumns']

metadata_float_columns =  set(pickle_data['metadata_columns']['MetadataFloatColumns'])
metadata_float_columns_dtype =  pickle_data['metadata_columns']['MetadataFloatColumns']

metadata_integer_columns =  set(pickle_data['metadata_columns']['MetadataIntegerColumns'])
metadata_integer_columns_dtype =  pickle_data['metadata_columns']['MetadataIntegerColumns']

metadata_numeric_columns =  set(pickle_data['metadata_columns']['MetadataNumericColumns'])
metadata_numeric_columns_dtype =  pickle_data['metadata_columns']['MetadataNumericColumns']


X_columns = pickle_data['selected_columns']['selected']
X_columns_dtype = {x: np.float32 for x in X_columns}

y_columns = set(["Metadata_Permiation"])
y_columns_dtype = {x: np.int64 for x in y_columns} ## "Metadata_log10TPSA":np.dtype('float64')}

Xy_columns = y_columns | X_columns
Xy_columns_dtype = y_columns_dtype | X_columns_dtype

In [46]:
for k in pickle_data['metadata_columns'].keys():
    print("-"*80)
    print(f" {k}  - length({len(pickle_data['metadata_columns'][k])} )")
    print("-"*80)
    if isinstance(pickle_data['metadata_columns'][k], list):
        for v in pickle_data['metadata_columns'][k]:
            print(f" \t : list item : {v}")

    elif isinstance(pickle_data['metadata_columns'][k], dict):    
        for i,v in pickle_data['metadata_columns'][k].items():
            print(f" \t : key :  {i:25s}     item: {v}")
    print()

--------------------------------------------------------------------------------
 MetadataOriginalNames  - length(10 )
--------------------------------------------------------------------------------
 	 : list item : Metadata_Source
 	 : list item : Metadata_Batch
 	 : list item : Metadata_Plate
 	 : list item : Metadata_Well
 	 : list item : Metadata_JCP2022
 	 : list item : Metadata_Hash
 	 : list item : TPSA
 	 : list item : lnTPSA
 	 : list item : log10TPSA
 	 : list item : permiation

--------------------------------------------------------------------------------
 MetadataNames  - length(11 )
--------------------------------------------------------------------------------
 	 : list item : Metadata_Source
 	 : list item : Metadata_Batch
 	 : list item : Metadata_Plate
 	 : list item : Metadata_Well
 	 : list item : Metadata_JCP2022
 	 : list item : Metadata_Hash
 	 : list item : Metadata_Bin
 	 : list item : Metadata_TPSA
 	 : list item : Metadata_lnTPSA
 	 : list item : Metadata_

In [79]:
print("\n all_profile_columns")
print("-"*80)
print(f" Length            : {len(COMPOUND_PROFILE_COLUMNS)}")
print(f" profile cols [:5] : {COMPOUND_PROFILE_COLUMNS[:5]}")
print(f" profile cols [:10]: {COMPOUND_PROFILE_COLUMNS[5:10]}")
print(f" profile cols [:14]: {COMPOUND_PROFILE_COLUMNS[10:14]}")
print(f" profile cols [:18]: {COMPOUND_PROFILE_COLUMNS[14:18]}")

print()
print("\n metadata_columns")
print("-"*80)
for i in pickle_data['metadata_columns'].keys():
    print(f" {i:28s} ({len(pickle_data['metadata_columns'][i]):4d}) {list(pickle_data['metadata_columns'][i])[:5]}")
 
print("\n selected_columns")
print("-"*80)
ttl = 0 
for i in pickle_data['selected_columns'].keys():
    ttl += len(pickle_data['selected_columns'][i])
    print(f" Feature columns -/{i:25s}/   {len(pickle_data['selected_columns'][i]):5d}")
print(f" {'total':47s}{ttl:5d}     ")     
 
print('\n')
print(f" metadata_categorical_columms  ({len(metadata_categorical_columns):4d}) {metadata_categorical_columns} ")
print(f" metadata_float_columms        ({  len(metadata_float_columns):4d}) {metadata_float_columns}   ")
print(f" metadata_integer_columms      ({len(metadata_integer_columns):4d}) {metadata_integer_columns} ")
print(f" metadata_numeric_columms      ({len(metadata_numeric_columns):4d}) {metadata_numeric_columns} ")
print(f" COMPOUND_PROFILE_COLUMNS      ({len(COMPOUND_PROFILE_COLUMNS):4d}) {COMPOUND_PROFILE_COLUMNS[:7]}")
print()
print(f" len(X_columms)                ({len(X_columns):4d})")
print(f" len(y_columms)                ({len(y_columns):4d}) {y_columns}")
print(f" len(Xy_columms)               ({len(Xy_columns)}) {list(Xy_columns)[:3]}")
print(f" len(Xy_columms_dtype)         ({len(Xy_columns)}) {list(Xy_columns_dtype)[:3]}")

print("\n Conversion Dictionaries")
print("-"*80)
print(f" categorical_columns_dtype     ({len(metadata_categorical_columns_dtype):4d}) {metadata_categorical_columns_dtype} ")
print(f" float_columns_dtype           ({len(metadata_float_columns_dtype):4d}) {metadata_float_columns_dtype} ")
print(f" integer_columns_dtype         ({len(metadata_integer_columns_dtype):4d}) {metadata_integer_columns_dtype} ")
print('\n')
# print(f" len(label_columms_dtype)     : {len(label_columns_dtype)} ")
# print(f" len(X_columms_dtype)         : {len(X_columns_dtype)}")
# print(f" len(X_columms_dtype)           : {len(X_columns_dtype)}   {X_columns_dtype}")
# print(f" len(y_columms_dtype)         : {len(y_columns_dtype)}")
# print(f" len(Xy_columms_dtype)        : {len(Xy_columns_dtype)}")
# print(f" len(all_columms_dtype)       : {len(all_columns_dtype)} ")
# del all_columns_dtype
# for k  in sorted(all_columns_dtype.keys()):
#     print(f" {k:60s}  {all_columns_dtype[k]}")
# Xy_columns_dtype


 all_profile_columns
--------------------------------------------------------------------------------
 Length            : 3681
 profile cols [:5] : ['Metadata_Source', 'Metadata_Batch', 'Metadata_Plate', 'Metadata_Well', 'Metadata_JCP2022']
 profile cols [:10]: ['Metadata_Hash', 'Metadata_Bin', 'Metadata_TPSA', 'Metadata_lnTPSA', 'Metadata_log10TPSA']
 profile cols [:14]: ['Metadata_Permiation', 'Cells_AreaShape_Area', 'Cells_AreaShape_BoundingBoxArea', 'Cells_AreaShape_BoundingBoxMaximum_X']
 profile cols [:18]: ['Cells_AreaShape_BoundingBoxMaximum_Y', 'Cells_AreaShape_BoundingBoxMinimum_X', 'Cells_AreaShape_BoundingBoxMinimum_Y', 'Cells_AreaShape_Center_X']


 metadata_columns
--------------------------------------------------------------------------------
 MetadataOriginalNames        (  10) ['Metadata_Source', 'Metadata_Batch', 'Metadata_Plate', 'Metadata_Well', 'Metadata_JCP2022']
 MetadataNames                (  11) ['Metadata_Source', 'Metadata_Batch', 'Metadata_Plate', 'Metad

# Load study

  #### Address for optuna dashboard repository:  `sqlite:////home/kevin/WSL-shared/Cellpainting/cj-datasets/optuna_data.db`
  #### Address for optuna dashboard repository:  `sqlite:////home/kevin/WSL-shared/Cellpainting/cj-datasets/example.db`

In [9]:
storage = "sqlite:///example.db"
storage_copy = "sqlite:///example_copy.db"

study_name="classification-study-1"
study_name_clone="classification-study-1-clone"

## https://optuna.readthedocs.io/en/stable/reference/generated/optuna.storages.RetryFailedTrialCallback.html

from optuna.storages import RetryFailedTrialCallback

storage_instance = optuna.storages.RDBStorage(
    url=storage_copy,
    heartbeat_interval=60,
    grace_period=120,
    failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
)
storage_instance


<optuna.storages._rdb.storage.RDBStorage at 0x7f67999b0dd0>

In [7]:
# resp = input(f" Delete study {study_name} ?")
# if resp.lower() in ['yes' ,'y']:
#     print(' You responsed yes')
#     try:
#         optuna.delete_study(storage=storage_instance, study_name=study_name)
#         print("delete successful")
#     except Exception as e:
#         print("delete failed")
# else:
#     print(f" {study_name} will be kept")

# study = optuna.create_study(storage=storage_instance_copy,
#                             study_name=study_name_copy,
#                             directions=["maximize","minimize"], 
#                             load_if_exists=True)
# study.set_metric_names(["roc_auc", "logloss"])

In [10]:
start = datetime.now()
study  = optuna.load_study(study_name= study_name, storage=storage_instance)
study_clone  = optuna.load_study(study_name= study_name_clone, storage=storage_instance)
print(f"Total time:  {datetime.now() - start}")

Total time:  0:00:00.010225


In [11]:
disp_study_history(study)

 classification-study-1  study history

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-04   00:48:09 - 00:54:05    3          *** FAILED ***        
Trial #: 1    2024-02-04   01:15:51 - 01:40:09    1      0.92354        0.20999    
Trial #: 2    2024-02-04   10:36:45 - 12:28:29    1      0.82900        0.34993    
Trial #: 3    2024-02-04   14:50:22 - 15:11:49    1      0.77843        0.32556    
Trial #: 4    2024-02-04   15:11:50 - 16:04:43    1      0.71748        0.36995    
Trial #: 5    2024-02-04   16:04:44 - 16:25:30    1      0.84788        0.29725    
Trial #: 6    2024-02-04   16:25:30 - 16:55:59    1      0.62604        0.36119    
Trial #: 7    2024-02-04   18:06:45 - 18:29:19    3          *** FAILED ***        
Trial #: 8    2024-02-04   23:09:09 - 23:29:46  

In [12]:
disp_study_history(study_clone)

 classification-study-1-clone  study history

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-11   15:31:07 - 16:38:29    1      0.62593        0.35270    
Trial #: 1    2024-02-11   21:50:47 - 22:14:47    1      0.60966        0.35607    
Trial #: 2    2024-02-11   23:21:11 - 00:01:20    1      0.61561        0.36063    
Trial #: 3    2024-02-12   00:01:21 - 00:39:17    1      0.60933        0.35866    
Trial #: 4    2024-02-12   00:39:18 - 00:56:58    1      0.59683        0.36285    
Trial #: 5    2024-02-12   01:55:54 - 02:12:44    1      0.55990        0.56057    
Trial #: 6    2024-02-12   02:12:45 - 02:29:49    1      0.59509        0.36394    
Trial #: 7    2024-02-12   02:29:50 - 03:59:12    1      0.61147        0.37472    
Trial #: 8    2024-02-12   03:59:13 - 04:4

In [13]:
# study.trials_dataframe()
study_clone.trials_dataframe()

Unnamed: 0,number,values_logloss,values_roc_auc,datetime_start,datetime_complete,duration,params_colsample_bylevel,params_colsample_bynode,params_learning_rate,params_max_delta_step,params_max_depth,params_min_child_weight,params_min_split_loss,params_subsample,user_attrs_memo,system_attrs_fixed_params,system_attrs_nsga2:generation,state
0,0,0.352702,0.625932,2024-02-11 15:31:07.275621,2024-02-11 16:38:29.798596,0 days 01:07:22.522975,0.961781,0.611034,0.012489,5.035702,9,6.705902,5.097113,0.837417,,,0.0,COMPLETE
1,1,0.356066,0.609663,2024-02-11 21:50:47.174731,2024-02-11 22:14:47.156695,0 days 00:23:59.981964,0.850865,0.59851,0.004244,3.627402,4,7.524141,9.060415,0.930222,,,0.0,COMPLETE
2,2,0.360627,0.615614,2024-02-11 23:21:11.076371,2024-02-12 00:01:20.691746,0 days 00:40:09.615375,0.627851,0.553138,0.000931,9.350387,9,6.864519,4.622671,0.822365,,,0.0,COMPLETE
3,3,0.358661,0.609327,2024-02-12 00:01:21.649277,2024-02-12 00:39:17.230068,0 days 00:37:55.580791,0.823439,0.71195,0.036443,3.3545,13,0.640387,0.25171,0.736546,,,0.0,COMPLETE
4,4,0.362854,0.596833,2024-02-12 00:39:18.091689,2024-02-12 00:56:58.827582,0 days 00:17:40.735893,0.741545,0.533578,0.193726,5.290435,8,7.932249,0.638827,0.85334,,,0.0,COMPLETE
5,5,0.560568,0.559895,2024-02-12 01:55:54.321211,2024-02-12 02:12:44.511828,0 days 00:16:50.190617,0.852864,0.692054,0.718021,8.219028,11,6.864818,1.274035,0.82163,,,0.0,COMPLETE
6,6,0.363938,0.595092,2024-02-12 02:12:45.046738,2024-02-12 02:29:49.915460,0 days 00:17:04.868722,0.705275,0.812628,0.101733,9.827028,12,3.320796,4.932578,0.622028,,,0.0,COMPLETE
7,7,0.374718,0.611472,2024-02-12 02:29:50.425523,2024-02-12 03:59:12.655144,0 days 01:29:22.229621,0.947233,0.510696,0.000121,5.16697,12,0.244069,4.749472,0.620393,,,0.0,COMPLETE
8,8,0.354772,0.616641,2024-02-12 03:59:13.129237,2024-02-12 04:49:11.655533,0 days 00:49:58.526296,0.857213,0.509796,0.010357,1.03411,15,4.793893,1.719604,0.412073,,,0.0,COMPLETE
9,9,0.372122,0.527256,2024-02-12 04:49:12.123042,2024-02-12 05:08:23.327686,0 days 00:19:11.204644,0.860123,0.874122,0.000303,7.04718,1,8.575491,0.326933,0.89776,,,0.0,COMPLETE


# Clone study

In [18]:
from optuna.study import create_study, load_study

In [10]:
storage = "sqlite:///example.db"
storage_copy = "sqlite:///example_copy.db"

In [None]:
study_name="classification-study-1"
study_name_copy="classification-study-1-clone"

In [11]:
## https://optuna.readthedocs.io/en/stable/reference/generated/optuna.storages.RetryFailedTrialCallback.html

from optuna.storages import RetryFailedTrialCallback

storage_instance = optuna.storages.RDBStorage(
    url=storage_copy,
    heartbeat_interval=60,
    grace_period=120,
    failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
)
storage_instance

  failed_trial_callback=RetryFailedTrialCallback(max_retry=3),


<optuna.storages._rdb.storage.RDBStorage at 0x7fee65f65250>

In [15]:
from_storage = storage_instance
to_storage = storage_instance
from_study_name = "classification-study-1"
to_study_name = "classification-study-1-clone"

In [19]:
from_study = load_study(study_name=from_study_name, storage=storage_instance)

In [20]:
disp_study_history(from_study)

 classification-study-1  study history

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-04   00:48:09 - 00:54:05    3          *** FAILED ***        
Trial #: 1    2024-02-04   01:15:51 - 01:40:09    1      0.92354        0.20999    
Trial #: 2    2024-02-04   10:36:45 - 12:28:29    1      0.82900        0.34993    
Trial #: 3    2024-02-04   14:50:22 - 15:11:49    1      0.77843        0.32556    
Trial #: 4    2024-02-04   15:11:50 - 16:04:43    1      0.71748        0.36995    
Trial #: 5    2024-02-04   16:04:44 - 16:25:30    1      0.84788        0.29725    
Trial #: 6    2024-02-04   16:25:30 - 16:55:59    1      0.62604        0.36119    
Trial #: 7    2024-02-04   18:06:45 - 18:29:19    3          *** FAILED ***        
Trial #: 8    2024-02-04   23:09:09 - 23:29:46  

In [21]:
to_study = load_study(study_name=study_name_clone, storage=storage_instance)

In [22]:
disp_study_history(to_study)

 classification-study-1-clone  study history

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-11   15:31:07 - 16:38:29    1      0.62593        0.35270    
Trial #: 1    2024-02-11   21:50:47 - 22:14:47    1      0.60966        0.35607    
Trial #: 2    2024-02-11   23:21:11 - 00:01:20    1      0.61561        0.36063    
Trial #: 3    2024-02-12   00:01:21 - 00:39:17    1      0.60933        0.35866    
Trial #: 4    2024-02-12   00:39:18 - 00:56:58    1      0.59683        0.36285    
Trial #: 5    2024-02-12   01:55:54 - 02:12:44    1      0.55990        0.56057    
Trial #: 6    2024-02-12   02:12:45 - 02:29:49    1      0.59509        0.36394    
Trial #: 7    2024-02-12   02:29:50 - 03:59:12    1      0.61147        0.37472    
Trial #: 8    2024-02-12   03:59:13 - 04:4

####  Create a new study (to copy over to)

In [23]:
# to_study = create_study(
#     study_name=to_study_name ,
#     storage=to_storage,
#     directions=from_study.directions,
#     load_if_exists=False,
# )

  ##### Copy system attributes

In [24]:
# for key, value in from_study._storage.get_study_system_attrs(from_study._study_id).items():
#     to_study._storage.set_study_system_attr(to_study._study_id, key, value)

  ##### Copy user attributes

In [25]:
# for key, value in from_study.user_attrs.items():
#     to_study.set_user_attr(key, value)

In [23]:
# disp_study_history(from_study)

#### Get trials from source study (to copy over)

In [26]:
# Trials are deep copied on `add_trials`.
from_trials = from_study.get_trials(deepcopy=False)
len(from_trials)
type(from_trials)

62

list

In [27]:
copy_trials = from_trials[52:62]
len(copy_trials)

10

In [28]:
# Trials are deep copied on `add_trials`.
to_study.add_trials(copy_trials)

In [37]:
# Trials are deep copied on `add_trials`.
# to_study.add_trials(from_study.get_trials(deepcopy=False))

In [29]:
disp_study_history(to_study)

 classification-study-1-clone  study history

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-11   15:31:07 - 16:38:29    1      0.62593        0.35270    
Trial #: 1    2024-02-11   21:50:47 - 22:14:47    1      0.60966        0.35607    
Trial #: 2    2024-02-11   23:21:11 - 00:01:20    1      0.61561        0.36063    
Trial #: 3    2024-02-12   00:01:21 - 00:39:17    1      0.60933        0.35866    
Trial #: 4    2024-02-12   00:39:18 - 00:56:58    1      0.59683        0.36285    
Trial #: 5    2024-02-12   01:55:54 - 02:12:44    1      0.55990        0.56057    
Trial #: 6    2024-02-12   02:12:45 - 02:29:49    1      0.59509        0.36394    
Trial #: 7    2024-02-12   02:29:50 - 03:59:12    1      0.61147        0.37472    
Trial #: 8    2024-02-12   03:59:13 - 04:4

# Optuna trying to copy /delete trials from study

In [None]:
study_name="classification-study-1"
storage = "sqlite:///example.db"
study_name_copy="classification-study-1-copy"
storage_copy = "sqlite:///example_copy.db"

In [None]:
## https://optuna.readthedocs.io/en/stable/reference/generated/optuna.storages.RetryFailedTrialCallback.html

from optuna.storages import RetryFailedTrialCallback

storage_instance = optuna.storages.RDBStorage(
    url=storage_copy,
    heartbeat_interval=60,
    grace_period=120,
    failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
)
storage_instance

In [91]:
# optuna.copy_study(
#     from_study_name=study_name,
#     from_storage="sqlite:///example.db",
#     to_storage="sqlite:///example_copy.db",
# )
# study = optuna.load_study(storage="sqlite:///example_copy.db",
                            # study_name="classification-study-1-copy",)

study = optuna.load_study(study_name = study_name_copy, 
                          storage = storage_instance)


In [92]:
disp_study_history(study)

 classification-study-1-copy  study history

            status        validation metrics


TypeError: 'NoneType' object is not subscriptable

In [87]:
ho_study_name = "classification-study-1-copy"
ho_storage_name = storage_copy
ho_trials_to_delete = [0,7,14]

num_trials = study.trials[-1].number
num_trials_to_keep = 50

num_trials

29

In [67]:
t = study.trials[0] 
t 

FrozenTrial(number=0, state=3, values=None, datetime_start=datetime.datetime(2024, 2, 4, 0, 48, 9, 793433), datetime_complete=datetime.datetime(2024, 2, 4, 0, 54, 5, 488762), params={'learning_rate': 0.14327608105242665, 'min_split_loss': 0.013665061540799606, 'max_depth': 11, 'min_child_weight': 9.683348905910576, 'max_delta_step': 0.0746562549161478, 'subsample': 0.9869150345493672, 'colsample_bylevel': 0.5856183883769744, 'colsample_bynode': 0.5820866507844484}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=1.0, log=True, low=0.0001, step=None), 'min_split_loss': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'max_depth': IntDistribution(high=15, log=False, low=1, step=1), 'min_child_weight': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'max_delta_step': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low

In [69]:
trials_to_keep = [
    optuna.trial.create_trial(
        state=optuna.trial.TrialState.COMPLETE,
        params=t.params,
        user_attrs=t.user_attrs,
        system_attrs=t.system_attrs,
        intermediate_values=t.intermediate_values,
        distributions=t.distributions,
        values=t.values,
        value =t.value
    )   t in study.trials if t.state = optuna.trial.TrialState.COMPLETE ] ##t.number not in ho_trials_to_delete]

SyntaxError: invalid syntax. Perhaps you forgot a comma? (757330297.py, line 2)

In [85]:
trials_to_keep = [] 
for t in study.trials:
     if t.state == optuna.trial.TrialState.COMPLETE : 
        copy_t =  optuna.trial.create_trial(
            
                            trial_id = t.trial_id
                            state=optuna.trial.TrialState.COMPLETE,
                            params=t.params,
                            user_attrs=t.user_attrs,
                            system_attrs=t.system_attrs,
                            intermediate_values=t.intermediate_values,
                            distributions=t.distributions,
                            values=t.values,
                            # value =t.value
                        ) 
        trials_to_keep.append(copy_t)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (582776261.py, line 6)

In [10]:
# trials_to_keep

In [88]:
ho_storage_name

'sqlite:///example_copy.db'

In [90]:
# Delete study before recreating
# optuna.delete_study(study_name=ho_study_name, storage=ho_storage_name)

# Recreate study and add trials to keep
new_study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    directions=["maximize","minimize"],     
    study_name=ho_study_name,
    storage=ho_storage_name,
    load_if_exists=True)
new_study.add_trials(trials_to_keep)

[I 2024-02-12 13:13:32,369] A new study created in RDB with name: classification-study-1-copy


In [58]:

study.trials[0]
# study.trials.pop(0)
study.trials[1].value

FrozenTrial(number=0, state=3, values=None, datetime_start=datetime.datetime(2024, 2, 4, 0, 48, 9, 793433), datetime_complete=datetime.datetime(2024, 2, 4, 0, 54, 5, 488762), params={'learning_rate': 0.14327608105242665, 'min_split_loss': 0.013665061540799606, 'max_depth': 11, 'min_child_weight': 9.683348905910576, 'max_delta_step': 0.0746562549161478, 'subsample': 0.9869150345493672, 'colsample_bylevel': 0.5856183883769744, 'colsample_bynode': 0.5820866507844484}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=1.0, log=True, low=0.0001, step=None), 'min_split_loss': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'max_depth': IntDistribution(high=15, log=False, low=1, step=1), 'min_child_weight': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'max_delta_step': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low

RuntimeError: This attribute is not available during multi-objective optimization.