# Optuna

**Using Optuna for hyper-parameter search  to predict TPSA from morphology profiles**

# Initialization

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
%load_ext autoreload  
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

## imports 

In [2]:
# Models
import os, sys
import math
import pickle
import itertools
import copy 
import joblib
import logging 
import types
import joblib

from datetime import datetime, time
from collections.abc import Iterator
for p in ['./src','../..']:
    if p not in sys.path:
        print(f"insert {p}")
        sys.path.insert(0, p)
print(sys.path)

import numpy as np
np.set_printoptions(edgeitems=3, infstr='inf', linewidth=150, nanstr='nan')

import pandas as pd
pd.options.display.width = 170

import scipy.stats as sps 

import matplotlib.pyplot as plt

from pprint import PrettyPrinter
pp = PrettyPrinter(indent=4)
from collections import defaultdict
from utils.utils_cellpainting import *
from KevinsRoutines.utils.utils_gpu import display_gpu_info, display_gpu_device_info
# (initialize, init_dataloaders, init_environment, init_wandb, training_initializations, model_initializations, 


import warnings
warnings.filterwarnings('ignore')

insert ./src
insert ../..
['../..', './src', '/home/kevin/WSL-shared/cellpainting/cj-datasets', '/home/kevin/miniforge3/envs/cp311/lib/python311.zip', '/home/kevin/miniforge3/envs/cp311/lib/python3.11', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/lib-dynload', '', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/site-packages', '/home/kevin/miniforge3/envs/cp311/lib/python3.11/site-packages/huggingface_hub-0.20.3-py3.8.egg']




In [3]:
# display_gpu_info()

In [4]:
logLevel = os.environ.get('LOG_LEVEL', 'INFO').upper()
FORMAT = '%(asctime)s - %(levelname)s: - %(message)s'
logging.basicConfig(level="INFO", format= FORMAT)
logging.getLogger("imported_module").setLevel(logging.CRITICAL)
# logging.info(f" 1/7- engine connected")
# logging.warning(f" 1/7- engine connected")
# logging.error(f" 1/7- engine connected")
# logging.critical(f" 1/7- engine connected")
print()
for time_fmt in ['%x%X', '%X %x %Z', '%X.%f', '%D-%X.%f', '%Y-%m-%d %H:%M:%S.%f']:
    cmd_string = f"datetime.now().strftime('{time_fmt}')"
    print(f" {cmd_string:50s}  : {datetime.now().strftime(time_fmt)}")


 datetime.now().strftime('%x%X')                     : 08/06/2421:03:22
 datetime.now().strftime('%X %x %Z')                 : 21:03:22 08/06/24 
 datetime.now().strftime('%X.%f')                    : 21:03:22.643271
 datetime.now().strftime('%D-%X.%f')                 : 08/06/24-21:03:22.643310
 datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')     : 2024-08-06 21:03:22.643341


In [5]:
def disp_study_history(study):
 
    print(f"\n\n Study  : {study.study_name}")
    print(f" Metrics: {study.metric_names}")
    metric_0_ttl = study.metric_names[0] if study.metric_names is not None else 'n/a'
    metric_1_ttl = study.metric_names[1] if study.metric_names is not None else 'n/a'
    print()
    print(f"                start     -   completion      status        validation metrics")
    print(f" trial#         time      -      time          code      {metric_0_ttl}        {metric_1_ttl}")
    print("-"*80)
    for st in study.trials:
        dt_start = st.datetime_start.strftime('%Y-%m-%d   %H:%M:%S') if st.datetime_start is not None else '-- n/a --' 
        dt_end   = st.datetime_complete.strftime('%H:%M:%S') if st.datetime_complete is not None else ' -n/a- ' 
        print(f"Trial #: {st.number:<4d} {dt_start:^21s} - {dt_end:^8s}  {st.state:3d}  ", end="")
        if st.state == TrialState.COMPLETE:
            print(f" {st.values[0]:10.5f}   {st.values[1]:12.5f}    {st.user_attrs.get('memo', '')}")
        elif st.state == TrialState.RUNNING:
            print(f"        *** RUNNING ***       {st.user_attrs.get('memo', '')}")            
        elif st.state == TrialState.PRUNED:
            print(f"        *** PRUNED ***        {st.user_attrs.get('memo', '')}")
        elif st.state == TrialState.FAIL:
            print(f"        *** FAILED ***        {st.user_attrs.get('memo', '')}")
        elif st.state == TrialState.WAITING:
            print(f"        *** WAITING ***       {st.user_attrs.get('memo', '')}")            
        else:
            print("\n")
    print(" *** end of trials *** ")
    print(" Best trials: ", [x.number for x in study.best_trials])    
    

# Open Optuna databases 

  #### Address for optuna dashboard repository:  `sqlite:////home/kevin/WSL-shared/Cellpainting/cj-datasets/optuna_database.db`


In [6]:
import optuna
import sqlite3
import pandas as pd

In [7]:
# del storage, storage_example, storage_instance

In [10]:
# db_1 = "example.db"
# db_2 = "example_copy.db"
# db_3 = "optuna_data.db"
# sqlite_1 = "sqlite:///example.db"
# sqlite_2 = "sqlite:///example_copy.db"
# sqlite_3 = "sqlite:///optuna_data.db"
# storage_1 = optuna.storages.RDBStorage(
#     url=sqlite_1,
#     heartbeat_interval=60,
#     grace_period=120,
#     failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
# )

# storage_2 = optuna.storages.RDBStorage(
#     url=sqlite_2,
#     heartbeat_interval=60,
#     grace_period=120,
#     failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
# )

# storage_3= optuna.storages.RDBStorage(
#     url=sqlite_3,
#     heartbeat_interval=60,
#     grace_period=120,
#     failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
# )

In [11]:
db_4 = "optuna_database.db"
sqlite_4 = "sqlite:///optuna_database.db"

In [12]:
## https://optuna.readthedocs.io/en/stable/reference/generated/optuna.storages.RetryFailedTrialCallback.html

from optuna.storages import RetryFailedTrialCallback


storage_4 = optuna.storages.RDBStorage(
    url=sqlite_4,
    heartbeat_interval=60,
    grace_period=120,
    failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
)

In [10]:
# study_name="classification-study-1"
# study_name_clone="classification-study-1-clone"

In [13]:
with sqlite3.connect(db_4) as conn:
    # conn.row_factory = sqlite3.Row
    tables = conn.execute("SELECT tbl_name FROM sqlite_master WHERE type='table';").fetchall()
    tables = [t[0] for t in tables]

In [14]:
tables

['studies',
 'version_info',
 'study_directions',
 'study_user_attributes',
 'study_system_attributes',
 'trials',
 'trial_user_attributes',
 'trial_system_attributes',
 'trial_params',
 'trial_values',
 'trial_intermediate_values',
 'trial_heartbeats',
 'alembic_version']

In [15]:
# resp = input(f" Delete study {study_name} ?")
# if resp.lower() in ['yes' ,'y']:
#     print(' You responsed yes')
#     try:
#         optuna.delete_study(storage=storage_instance, study_name=study_name)
#         print("delete successful")
#     except Exception as e:
#         print("delete failed")
# else:
#     print(f" {study_name} will be kept")

# study = optuna.create_study(storage=storage_instance_copy,
#                             study_name=study_name_copy,
#                             directions=["maximize","minimize"], 
#                             load_if_exists=True)
# study.set_metric_names(["roc_auc", "logloss"])

In [16]:
# start = datetime.now()
# study_example = optuna.load_study(study_name= study_name, storage=storage_example)
# study_example_copy = optuna.load_study(study_name= study_name, storage=storage_example_copy)
# study_optuna_data = optuna.load_study(study_name= study_name, storage=storage_optuna_data)
# study_optuna_database = optuna.load_study(study_name= study_name, storage=storage_optuna_database)
# study_clone  = optuna.load_study(study_name= study_name_clone, storage=storage_instance)
# print(f"Total time:  {datetime.now() - start}")

## storage_4 = `optuna_database`

In [19]:
STORAGE = storage_4
DB = db_4
print(DB, STORAGE)
with sqlite3.connect(DB) as conn:
    conn.row_factory = sqlite3.Row
    results = conn.execute("SELECT * FROM studies ;").fetchall()
    db_4_studies = [dict(r) for r in results]
    db_4_studies

for study in db_4_studies:
    loaded_study = optuna.load_study(study_name= study['study_name'], storage=STORAGE)
    disp_study_history(loaded_study)

optuna_database.db <optuna.storages._rdb.storage.RDBStorage object at 0x7f7ef4f1a710>


[{'study_id': 2, 'study_name': 'multiobjective-study-1'},
 {'study_id': 3, 'study_name': 'multiobjective-study-2'},
 {'study_id': 4, 'study_name': 'multiobjective-study-3'},
 {'study_id': 6, 'study_name': 'classification-study-Apr24'},
 {'study_id': 7, 'study_name': 'classification-3SampleStudy-Apr24'},
 {'study_id': 8, 'study_name': 'classification-study-1'}]



 Study  : multiobjective-study-1
 Metrics: ['mse_score', 'R2_score']

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      mse_score        R2_score
--------------------------------------------------------------------------------
Trial #: 0    2024-01-10   15:34:20 - 15:34:20    3          *** FAILED ***        
Trial #: 1    2024-01-10   15:51:14 - 16:00:22    1      0.02377       -0.00016    
Trial #: 2    2024-01-10   16:00:23 - 16:10:23    1      0.02362        0.00629    
Trial #: 3    2024-01-10   16:10:23 - 16:20:46    1      0.02377       -0.00001    
Trial #: 4    2024-01-10   16:20:46 - 16:31:21    1      0.02377       -0.00016    
Trial #: 5    2024-01-10   16:31:22 - 16:41:59    1   20903136613044846592.00000   -869609181496980406272.00000    
Trial #: 6    2024-01-10   16:41:59 - 16:52:38    1      0.02378       -0.00009    
Trial #: 7    2024-01-10   16:52:39 - 17:03:34    1          inf 

## Display study as Pandas dataframe 

In [21]:
loaded_study = optuna.load_study(study_name= 'classification-study-1', storage=storage_4)
disp_study_history(loaded_study)



 Study  : classification-study-1
 Metrics: ['roc_auc', 'logloss']

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-04   00:48:09 - 00:54:05    3          *** FAILED ***        
Trial #: 1    2024-02-04   01:15:51 - 01:40:09    1      0.92354        0.20999    
Trial #: 2    2024-02-04   10:36:45 - 12:28:29    1      0.82900        0.34993    
Trial #: 3    2024-02-04   14:50:22 - 15:11:49    1      0.77843        0.32556    
Trial #: 4    2024-02-04   15:11:50 - 16:04:43    1      0.71748        0.36995    
Trial #: 5    2024-02-04   16:04:44 - 16:25:30    1      0.84788        0.29725    
Trial #: 6    2024-02-04   16:25:30 - 16:55:59    1      0.62604        0.36119    
Trial #: 7    2024-02-04   18:06:45 - 18:29:19    3          *** FAILED ***        
Trial #: 8    2024-

In [49]:
df_trials = loaded_study.trials_dataframe()
df_trials.sort_values('values_roc_auc', ascending=False, inplace = True)

In [None]:
loaded_study.trials_dataframe().iloc[1:2]
loaded_study.trials_dataframe().iloc[13:14]

Unnamed: 0,number,values_logloss,values_roc_auc,datetime_start,datetime_complete,duration,params_colsample_bylevel,params_colsample_bynode,params_learning_rate,params_max_delta_step,params_max_depth,params_min_child_weight,params_min_split_loss,params_subsample,user_attrs_memo,system_attrs_fixed_params,system_attrs_nsga2:generation,state
1,1,0.209986,0.923542,2024-02-04 01:15:51.892559,2024-02-04 01:40:09.121813,0 days 00:24:17.229254,0.947399,0.659437,0.034551,1.081959,13,4.008506,4.749165,0.719159,,,0.0,COMPLETE


Unnamed: 0,number,values_logloss,values_roc_auc,datetime_start,datetime_complete,duration,params_colsample_bylevel,params_colsample_bynode,params_learning_rate,params_max_delta_step,params_max_depth,params_min_child_weight,params_min_split_loss,params_subsample,user_attrs_memo,system_attrs_fixed_params,system_attrs_nsga2:generation,state
13,13,0.229483,0.917122,2024-02-05 01:11:06.944139,2024-02-05 01:34:06.921264,0 days 00:22:59.977125,0.980188,0.656057,0.06227,0.650368,10,4.423521,4.444297,0.834164,,,0.0,COMPLETE


In [42]:
for i in loaded_study.best_trials:
    print(f" Trial {i.number}  ")
    for k,v in i.params.items():
        print(f" {k:25s}   {v:.5f}")
    print('\n\n\n')

 Trial 1  
 learning_rate               0.03455
 min_split_loss              4.74917
 max_depth                   13.00000
 min_child_weight            4.00851
 max_delta_step              1.08196
 subsample                   0.71916
 colsample_bylevel           0.94740
 colsample_bynode            0.65944




 Trial 12  
 learning_rate               0.85865
 min_split_loss              3.60513
 max_depth                   10.00000
 min_child_weight            9.58354
 max_delta_step              2.11724
 subsample                   0.88346
 colsample_bylevel           0.61197
 colsample_bynode            0.85980






In [None]:
{"learning_rate"     : [0.03455,  0.85865],
 "min_split_loss"    : [4.74917,  3.60513],
 "max_depth"         : [13.0000,  10.00000],
 "min_child_weight"  : [4.00851,  9.58354],
 "max_delta_step"    : [1.08196,  2.11724],
 "subsample"         : [0.71916,  0.88346],
 "colsample_bylevel" : [0.94740,  0.61197],
 "colsample_bynode"  : [0.65944,  0.85980],
}


# Clone study

In [42]:
from optuna.study import create_study, load_study
print(storage_1, storage_4)

<optuna.storages._rdb.storage.RDBStorage object at 0x7ffa50791b10> <optuna.storages._rdb.storage.RDBStorage object at 0x7ffa4fb26510>


In [10]:
# storage = "sqlite:///example.db"
# storage_copy = "sqlite:///example_copy.db"

In [None]:
# study_name="classification-study-1"
# study_name_copy="classification-study-1-clone"

In [41]:
## https://optuna.readthedocs.io/en/stable/reference/generated/optuna.storages.RetryFailedTrialCallback.html

# from optuna.storages import RetryFailedTrialCallback

# storage_instance = optuna.storages.RDBStorage(
#     url=storage_copy,
#     heartbeat_interval=60,
#     grace_period=120,
#     failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
# )
# storage_instance

In [43]:
to_storage    ="sqlite:///optuna_database.db"
from_storage  = "sqlite:///example_copy.db"
from_study_name = "classification-study-1"
to_study_name   = "classification-study-1"

In [44]:
from_study = load_study(study_name=from_study_name, storage=from_storage)

In [45]:
disp_study_history(from_study)



 Study  : classification-study-1
 Metrics: ['roc_auc', 'logloss']

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-04   00:48:09 - 00:54:05    3          *** FAILED ***        
Trial #: 1    2024-02-04   01:15:51 - 01:40:09    1      0.92354        0.20999    
Trial #: 2    2024-02-04   10:36:45 - 12:28:29    1      0.82900        0.34993    
Trial #: 3    2024-02-04   14:50:22 - 15:11:49    1      0.77843        0.32556    
Trial #: 4    2024-02-04   15:11:50 - 16:04:43    1      0.71748        0.36995    
Trial #: 5    2024-02-04   16:04:44 - 16:25:30    1      0.84788        0.29725    
Trial #: 6    2024-02-04   16:25:30 - 16:55:59    1      0.62604        0.36119    
Trial #: 7    2024-02-04   18:06:45 - 18:29:19    3          *** FAILED ***        
Trial #: 8    2024-

In [69]:
to_study = load_study(study_name=to_study_name, storage=to_storage)

In [70]:
disp_study_history(to_study)



 Study  : classification-study-1
 Metrics: ['roc_auc', 'logloss']

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-04   00:48:09 - 00:54:05    3          *** FAILED ***        
Trial #: 1    2024-02-04   01:15:51 - 01:40:09    1      0.92354        0.20999    
Trial #: 2    2024-02-04   10:36:45 - 12:28:29    1      0.82900        0.34993    
Trial #: 3    2024-02-04   14:50:22 - 15:11:49    1      0.77843        0.32556    
Trial #: 4    2024-02-04   15:11:50 - 16:04:43    1      0.71748        0.36995    
Trial #: 5    2024-02-04   16:04:44 - 16:25:30    1      0.84788        0.29725    
Trial #: 6    2024-02-04   16:25:30 - 16:55:59    1      0.62604        0.36119    
Trial #: 7    2024-02-04   18:06:45 - 18:29:19    3          *** FAILED ***        
Trial #: 8    2024-

## Start Copy Process

In [52]:
disp_study_history(from_study)



 Study  : classification-study-1
 Metrics: ['roc_auc', 'logloss']

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-04   00:48:09 - 00:54:05    3          *** FAILED ***        
Trial #: 1    2024-02-04   01:15:51 - 01:40:09    1      0.92354        0.20999    
Trial #: 2    2024-02-04   10:36:45 - 12:28:29    1      0.82900        0.34993    
Trial #: 3    2024-02-04   14:50:22 - 15:11:49    1      0.77843        0.32556    
Trial #: 4    2024-02-04   15:11:50 - 16:04:43    1      0.71748        0.36995    
Trial #: 5    2024-02-04   16:04:44 - 16:25:30    1      0.84788        0.29725    
Trial #: 6    2024-02-04   16:25:30 - 16:55:59    1      0.62604        0.36119    
Trial #: 7    2024-02-04   18:06:45 - 18:29:19    3          *** FAILED ***        
Trial #: 8    2024-

### Create a new study (to copy over to)

In [48]:
to_study = create_study(
    study_name=to_study_name ,
    storage=to_storage,
    directions=from_study.directions,
    load_if_exists=False,
)

[I 2024-08-01 17:02:36,732] A new study created in RDB with name: classification-study-1


In [57]:
# disp_study_history(to_study)
# to_study.study_name
# to_study.

{}

### Copy system attributes

In [61]:
for key, value in from_study._storage.get_study_system_attrs(from_study._study_id).items():
    print(f" Copy System Attributes FROM: {from_study.study_name} trial: {key} : {value}  TO: {to_study.study_name}")
    to_study._storage.set_study_system_attr(to_study._study_id, key, value)

 Copy from classification-study-1 trial: study:metric_names : ['roc_auc', 'logloss']  TO: classification-study-1


### Copy user attributes

In [63]:
for key, value in from_study.user_attrs.items():
    print(f" Copy User Attributes FROM: {from_study.study_name} trial: {key} : {value}  TO: {to_study.study_name}")
    to_study.set_user_attr(key, value)

In [23]:
# disp_study_history(from_study)

### Get trials from source study (to copy over)

In [64]:
# Trials are deep copied on `add_trials`.
from_trials = from_study.get_trials(deepcopy=False)

In [65]:
len(from_trials)
type(from_trials)
from_trials[0]

62

list

FrozenTrial(number=0, state=3, values=None, datetime_start=datetime.datetime(2024, 2, 4, 0, 48, 9, 793433), datetime_complete=datetime.datetime(2024, 2, 4, 0, 54, 5, 488762), params={'learning_rate': 0.14327608105242665, 'min_split_loss': 0.013665061540799606, 'max_depth': 11, 'min_child_weight': 9.683348905910576, 'max_delta_step': 0.0746562549161478, 'subsample': 0.9869150345493672, 'colsample_bylevel': 0.5856183883769744, 'colsample_bynode': 0.5820866507844484}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=1.0, log=True, low=0.0001, step=None), 'min_split_loss': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'max_depth': IntDistribution(high=15, log=False, low=1, step=1), 'min_child_weight': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'max_delta_step': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low

In [27]:
# copy_trials = from_trials[52:62]
# len(copy_trials)

10

In [67]:
# Trials are deep copied on `add_trials`.
to_study.add_trials(from_trials)

# Trials are deep copied on `add_trials`.
# to_study.add_trials(from_study.get_trials(deepcopy=False))

In [68]:
disp_study_history(to_study)



 Study  : classification-study-1
 Metrics: ['roc_auc', 'logloss']

                start     -   completion      status        validation metrics
 trial#         time      -      time          code      roc_auc        logloss
--------------------------------------------------------------------------------
Trial #: 0    2024-02-04   00:48:09 - 00:54:05    3          *** FAILED ***        
Trial #: 1    2024-02-04   01:15:51 - 01:40:09    1      0.92354        0.20999    
Trial #: 2    2024-02-04   10:36:45 - 12:28:29    1      0.82900        0.34993    
Trial #: 3    2024-02-04   14:50:22 - 15:11:49    1      0.77843        0.32556    
Trial #: 4    2024-02-04   15:11:50 - 16:04:43    1      0.71748        0.36995    
Trial #: 5    2024-02-04   16:04:44 - 16:25:30    1      0.84788        0.29725    
Trial #: 6    2024-02-04   16:25:30 - 16:55:59    1      0.62604        0.36119    
Trial #: 7    2024-02-04   18:06:45 - 18:29:19    3          *** FAILED ***        
Trial #: 8    2024-

# Optuna Delete Study

In [38]:
optuna.delete_study(study_name="classification-study-1",storage=storage_4)

# Optuna trying to copy /delete trials from study

In [None]:
study_name="classification-study-1"
storage = "sqlite:///example.db"
study_name_copy="classification-study-1-copy"
storage_copy = "sqlite:///example_copy.db"

In [None]:
## https://optuna.readthedocs.io/en/stable/reference/generated/optuna.storages.RetryFailedTrialCallback.html

from optuna.storages import RetryFailedTrialCallback

# storage_instance = optuna.storages.RDBStorage(
#     url=storage_copy,
#     heartbeat_interval=60,
#     grace_period=120,
#     failed_trial_callback=RetryFailedTrialCallback(max_retry=3),
# )
# storage_instance

In [91]:
# optuna.copy_study(
#     from_study_name=study_name,
#     from_storage="sqlite:///example.db",
#     to_storage="sqlite:///example_copy.db",
# )
# study = optuna.load_study(storage="sqlite:///example_copy.db",
                            # study_name="classification-study-1-copy",)

study = optuna.load_study(study_name = study_name_copy, storage = storage_instance)


In [92]:
disp_study_history(study)

 classification-study-1-copy  study history

            status        validation metrics


TypeError: 'NoneType' object is not subscriptable

In [87]:
ho_study_name = "classification-study-1-copy"
ho_storage_name = storage_copy
ho_trials_to_delete = [0,7,14]

num_trials = study.trials[-1].number
num_trials_to_keep = 50

num_trials

29

In [67]:
t = study.trials[0] 
t 

FrozenTrial(number=0, state=3, values=None, datetime_start=datetime.datetime(2024, 2, 4, 0, 48, 9, 793433), datetime_complete=datetime.datetime(2024, 2, 4, 0, 54, 5, 488762), params={'learning_rate': 0.14327608105242665, 'min_split_loss': 0.013665061540799606, 'max_depth': 11, 'min_child_weight': 9.683348905910576, 'max_delta_step': 0.0746562549161478, 'subsample': 0.9869150345493672, 'colsample_bylevel': 0.5856183883769744, 'colsample_bynode': 0.5820866507844484}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=1.0, log=True, low=0.0001, step=None), 'min_split_loss': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'max_depth': IntDistribution(high=15, log=False, low=1, step=1), 'min_child_weight': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'max_delta_step': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low

In [69]:
trials_to_keep = [
    optuna.trial.create_trial(
        state=optuna.trial.TrialState.COMPLETE,
        params=t.params,
        user_attrs=t.user_attrs,
        system_attrs=t.system_attrs,
        intermediate_values=t.intermediate_values,
        distributions=t.distributions,
        values=t.values,
        value =t.value
    )   t in study.trials if t.state = optuna.trial.TrialState.COMPLETE ] ##t.number not in ho_trials_to_delete]

SyntaxError: invalid syntax. Perhaps you forgot a comma? (757330297.py, line 2)

In [85]:
trials_to_keep = [] 
for t in study.trials:
     if t.state == optuna.trial.TrialState.COMPLETE : 
        copy_t =  optuna.trial.create_trial(
            
                            trial_id = t.trial_id
                            state=optuna.trial.TrialState.COMPLETE,
                            params=t.params,
                            user_attrs=t.user_attrs,
                            system_attrs=t.system_attrs,
                            intermediate_values=t.intermediate_values,
                            distributions=t.distributions,
                            values=t.values,
                            # value =t.value
                        ) 
        trials_to_keep.append(copy_t)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (582776261.py, line 6)

In [10]:
# trials_to_keep

In [88]:
ho_storage_name

'sqlite:///example_copy.db'

In [90]:
# Delete study before recreating
# optuna.delete_study(study_name=ho_study_name, storage=ho_storage_name)

# Recreate study and add trials to keep
new_study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    directions=["maximize","minimize"],     
    study_name=ho_study_name,
    storage=ho_storage_name,
    load_if_exists=True)
new_study.add_trials(trials_to_keep)

[I 2024-02-12 13:13:32,369] A new study created in RDB with name: classification-study-1-copy


In [58]:

study.trials[0]
# study.trials.pop(0)
study.trials[1].value

FrozenTrial(number=0, state=3, values=None, datetime_start=datetime.datetime(2024, 2, 4, 0, 48, 9, 793433), datetime_complete=datetime.datetime(2024, 2, 4, 0, 54, 5, 488762), params={'learning_rate': 0.14327608105242665, 'min_split_loss': 0.013665061540799606, 'max_depth': 11, 'min_child_weight': 9.683348905910576, 'max_delta_step': 0.0746562549161478, 'subsample': 0.9869150345493672, 'colsample_bylevel': 0.5856183883769744, 'colsample_bynode': 0.5820866507844484}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=1.0, log=True, low=0.0001, step=None), 'min_split_loss': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'max_depth': IntDistribution(high=15, log=False, low=1, step=1), 'min_child_weight': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'max_delta_step': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low

RuntimeError: This attribute is not available during multi-objective optimization.

# Optuna Merge

**Merge two dbs within Optuna**

## Datasets

In [10]:
prefix = '' ### Target-2' , 'MOA'
db1  = f"./example.db"
db2  = f"./example_copy.db"

In [12]:
def combine_optuna_dbs(db1_path, db2_path):
    """"
    This function combines two optuna databases into the db1.
    Both dbs should be from the same study.
    The ids of the db2 are updated to be unique.
    """
    
    # get all the tables names
    with sqlite3.connect(db1_path) as con:
        tables = con.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
        tables = [table[0] for table in tables]
    not_tables = ['studies', 'version_info', 'study_directions', 'alembic_version']
    tables = [table for table in tables if table not in not_tables]
    print(f"tbles are: {tables}")
    
    dfs1 = {}
    with sqlite3.connect(db1_path) as con:
        for table in tables:
            dfs1[table] = pd.read_sql_query(f"SELECT * FROM {table}", con)
    return dfs1
    table_ids = {'study_user_attributes'     : ['study_user_attribute_id'],
                 'study_system_attributes'   : ['study_system_attribute_id'],
                 'trials'                    : ['trial_id', 'number'],
                 'trial_user_attributes'     : ['trial_user_attribute_id'],
                 'trial_system_attributes'   : ['trial_system_attribute_id'],
                 'trial_params'              : ['param_id'],
                 'trial_values'              : ['trial_value_id'],
                 'trial_intermediate_values' : ['trial_intermediate_value_id'],
                 'trial_heartbeats'          : ['trial_heartbeat_id'],
    }
    
    max_ids = {}
    
    # # add max id if the table is not empty
    # for table in tables:
    #     if len(dfs1[table]) > 0:
    #         for id in table_ids[table]:
    #             max_ids[id] = dfs1[table].iloc[-1][id]
    #     else:
    #         for id in table_ids[table]:
    #             max_ids[id] = 0
    
    dfs2 = {}
    with sqlite3.connect(db2_path) as con:
        for table in tables:
            dfs2[table] = pd.read_sql_query(f"SELECT * FROM {table}", con)
    
    update_ids = {'study_user_attributes'     : ['study_user_attribute_id'],
                  'study_system_attributes'   : ['study_system_attribute_id'],
                  'trials'                    : ['trial_id', 'number'],
                  'trial_user_attributes'     : ['trial_user_attribute_id'  , 'trial_id'],
                  'trial_system_attributes'   : ['trial_system_attribute_id', 'trial_id'],
                  'trial_params'              : ['param_id', 'trial_id'],
                  'trial_values'              : ['trial_value_id', 'trial_id'],
                  'trial_intermediate_values' : ['trial_intermediate_value_id', 'trial_id'],
                  'trial_heartbeats'          : ['trial_heartbeat_id', 'trial_id'],
    }
    
    # # update the ids of the second db
    # for table in tables:
    #     if len(dfs2[table]) > 0:
    #         for id in update_ids[table]:
    #             dfs2[table][id] = dfs2[table][id] + max_ids[id]
    
    # # add the second db to the first db
    # with sqlite3.connect(db1_path) as con1:
    #     for table in tables:
    #         dfs2[table].to_sql(table, con1, if_exists='append', index=False)

In [13]:
tbls =  combine_optuna_dbs(db1, db2)

tbles are: ['study_user_attributes', 'study_system_attributes', 'trials', 'trial_user_attributes', 'trial_system_attributes', 'trial_params', 'trial_values', 'trial_intermediate_values', 'trial_heartbeats']


In [14]:
tbls.keys()

dict_keys(['study_user_attributes', 'study_system_attributes', 'trials', 'trial_user_attributes', 'trial_system_attributes', 'trial_params', 'trial_values', 'trial_intermediate_values', 'trial_heartbeats'])

In [24]:
tbls['study_user_attributes']
tbls['trial_user_attributes']
tbls['trial_intermediate_values']
tbls['trial_heartbeats']

tbls['study_system_attributes']
tbls['trial_system_attributes']
tbls['trials']
tbls['trial_params']
tbls['trial_values']

Unnamed: 0,study_user_attribute_id,study_id,key,value_json


Unnamed: 0,trial_user_attribute_id,trial_id,key,value_json


Unnamed: 0,trial_intermediate_value_id,trial_id,step,intermediate_value,intermediate_value_type


Unnamed: 0,trial_heartbeat_id,trial_id,heartbeat


Unnamed: 0,study_system_attribute_id,study_id,key,value_json
0,1,2,study:metric_names,"[""mse_score"", ""R2_score""]"
1,2,4,study:metric_names,"[""mse_score"", ""pearson_corr""]"
2,3,5,study:metric_names,"[""roc_auc"", ""logloss""]"


Unnamed: 0,trial_system_attribute_id,trial_id,key,value_json
0,1,17,nsga2:generation,0
1,2,18,nsga2:generation,0
2,3,19,nsga2:generation,0
3,4,20,nsga2:generation,0
4,5,21,nsga2:generation,0
...,...,...,...,...
99,100,116,nsga2:generation,0
100,101,117,nsga2:generation,0
101,102,118,nsga2:generation,0
102,103,119,nsga2:generation,0


Unnamed: 0,trial_id,number,study_id,state,datetime_start,datetime_complete
0,17,0,2,FAIL,2024-01-10 15:34:20.240464,2024-01-10 15:34:20.458984
1,18,1,2,COMPLETE,2024-01-10 15:51:14.898264,2024-01-10 16:00:22.971070
2,19,2,2,COMPLETE,2024-01-10 16:00:23.425742,2024-01-10 16:10:23.476871
3,20,3,2,COMPLETE,2024-01-10 16:10:23.958746,2024-01-10 16:20:46.199603
4,21,4,2,COMPLETE,2024-01-10 16:20:46.713100,2024-01-10 16:31:21.687379
...,...,...,...,...,...,...
99,116,25,5,COMPLETE,2024-02-12 01:55:54.321211,2024-02-12 02:12:44.511828
100,117,26,5,COMPLETE,2024-02-12 02:12:45.046738,2024-02-12 02:29:49.915460
101,118,27,5,COMPLETE,2024-02-12 02:29:50.425523,2024-02-12 03:59:12.655144
102,119,28,5,COMPLETE,2024-02-12 03:59:13.129237,2024-02-12 04:49:11.655533


Unnamed: 0,param_id,trial_id,param_name,param_value,distribution_json
0,49,17,learning_rate,0.021931,"{""name"": ""FloatDistribution"", ""attributes"": {""..."
1,50,17,colsample_bytree,0.614273,"{""name"": ""FloatDistribution"", ""attributes"": {""..."
2,51,17,colsample_bylevel,0.891675,"{""name"": ""FloatDistribution"", ""attributes"": {""..."
3,52,17,colsample_bynode,0.598044,"{""name"": ""FloatDistribution"", ""attributes"": {""..."
4,53,17,reg_lambda,0.677960,"{""name"": ""FloatDistribution"", ""attributes"": {""..."
...,...,...,...,...,...
827,876,120,min_child_weight,8.575491,"{""name"": ""FloatDistribution"", ""attributes"": {""..."
828,877,120,max_delta_step,7.047180,"{""name"": ""FloatDistribution"", ""attributes"": {""..."
829,878,120,subsample,0.897760,"{""name"": ""FloatDistribution"", ""attributes"": {""..."
830,879,120,colsample_bylevel,0.860123,"{""name"": ""FloatDistribution"", ""attributes"": {""..."


Unnamed: 0,trial_value_id,trial_id,objective,value,value_type
0,15,18,0,0.023774,FINITE
1,16,18,1,-0.000156,FINITE
2,17,19,0,0.023621,FINITE
3,18,19,1,0.006289,FINITE
4,19,20,0,0.023774,FINITE
...,...,...,...,...,...
177,192,118,1,0.374718,FINITE
178,193,119,0,0.616641,FINITE
179,194,119,1,0.354772,FINITE
180,195,120,0,0.527256,FINITE
