In [1]:
import os
import time

import joblib
import numpy as np
import optuna
import pandas as pd
from optuna.storages import JournalStorage, JournalFileStorage

from project_resources.it4i_resources import fp_from_smiles, parse_jazzy_df, HyperparamTuner


# Axes of the experiment grid: feature type x splitting strategy x model x dataset x split.
types = ["morgan", "jazzy"]
splitters = ["rand", "scaff", "time"]
model_identifiers = ["linear", "KRR", "GB", "RF", "ANN"]
isozymes = ["3A4", "RLM", "HLC"]
data_splits = ["train", "test"]

# Directory holding the CSV splits of each feature type.
_feature_dirs = {"morgan": "base_splits", "jazzy": "jazzy_splits"}
# Directory name of each splitting strategy (listed in the order the keys are inserted below).
_splitter_dirs = {"scaff": "scaffold_splitter", "rand": "random", "time": "time_split"}

# Maps "<type>_<isozyme>_<split>_<splitter>" to the relative path of the matching CSV file,
# e.g. "morgan_3A4_train_scaff" -> "project_resources/base_splits/scaffold_splitter/3A4_train.csv".
rel_paths = {
    f"{feature_type}_{dataset}_{part}_{strategy}":
        f"project_resources/{feature_dir}/{strategy_dir}/{dataset}_{part}.csv"
    for feature_type, feature_dir in _feature_dirs.items()
    for part in data_splits
    for dataset in isozymes
    for strategy, strategy_dir in _splitter_dirs.items()
}
# sampler - the strategy that proposes a new set of hyperparameters in each iteration of the optimization
# Registry of sampler *classes* keyed by class name (they still have to be instantiated before use).
samplers = {
    sampler_name: getattr(sampler_module, sampler_name)
    for sampler_module, sampler_name in (
        (optuna.samplers, 'RandomSampler'),        # Draws hyperparameters independently at random from the search space.
        (optuna.samplers, 'GridSampler'),          # Performs a grid search over the hyperparameter space.
        (optuna.samplers, 'TPESampler'),           # Models the objective with a tree-structured Parzen estimator and samples promising points.
        (optuna.samplers, 'CmaEsSampler'),         # Searches with the Covariance Matrix Adaptation Evolution Strategy.
        (optuna.samplers, 'NSGAIISampler'),        # Multi-objective evolutionary algorithm (non-dominated sorting + crowding distance selection).
        (optuna.samplers, 'QMCSampler'),           # Quasi-Monte Carlo sampler based on low-discrepancy sequences; covers the space more evenly than random sampling.
        (optuna.integration, 'BoTorchSampler'),    # Bayesian optimization backed by the BoTorch library.
        (optuna.samplers, 'BruteForceSampler'),    # Exhaustively evaluates every combination of hyperparameters in the search space.
    )
}
# pruner - the strategy that stops unpromising trials early during hyperparameter optimization
# Registry of pruner *classes* keyed by class name (they still have to be instantiated before use).
pruners = {
    pruner_name: getattr(optuna.pruners, pruner_name)
    for pruner_name in (
        'BasePruner',               # Abstract base class of all pruners; a skeleton for custom pruning strategies.
        'MedianPruner',             # Prunes a trial whose intermediate result is worse than the median of earlier trials at the same step.
        'SuccessiveHalvingPruner',  # Repeatedly keeps only the better-performing fraction of trials as resources grow.
        'HyperbandPruner',          # Hyperband: runs successive halving under several resource allocation schemes.
        'PercentilePruner',         # Prunes trials based on their percentile rank among earlier trials.
        'NopPruner',                # Never prunes any trial.
        'ThresholdPruner',          # Prunes trials whose objective value crosses a given threshold.
        'PatientPruner',            # Prunes trials that show no improvement over a given number of steps (or epochs).
    )
}
# Nested result containers, filled as <container>[type][splitter][isozyme][split].
smiles, halflives, features = {}, {}, {}

In [2]:
# Every container below is indexed as container[type][splitter][isozyme][split].

# Phase 1: read the SMILES strings and half-lives used for ML with Morgan features
morgan_smiles = smiles["morgan"] = {}
morgan_halflives = halflives["morgan"] = {}
for splitter in splitters:
    morgan_smiles[splitter] = {isozyme: {} for isozyme in isozymes}
    morgan_halflives[splitter] = {isozyme: {} for isozyme in isozymes}
    for isozyme in isozymes:
        for split in data_splits:
            frame = pd.read_csv(rel_paths[f"morgan_{isozyme}_{split}_{splitter}"])
            morgan_smiles[splitter][isozyme][split] = list(frame["smiles"])
            morgan_halflives[splitter][isozyme][split] = list(frame["half-life"])

# Phase 2: convert the SMILES strings to Morgan fingerprints.
# Morgan and Jazzy data are kept apart because Jazzy omits some molecules.
features["morgan"] = {
    splitter: {
        isozyme: {
            data_split: np.array(fp_from_smiles(morgan_smiles[splitter][isozyme][data_split]))
            for data_split in data_splits
        }
        for isozyme in isozymes
    }
    for splitter in splitters
}

# Phase 3: load the Jazzy features from their csv files together with the matching SMILES and half-lives
jazzy_features = features["jazzy"] = {}
jazzy_smiles_store = smiles["jazzy"] = {}
jazzy_halflives = halflives["jazzy"] = {}
for splitter in splitters:
    for store in (jazzy_features, jazzy_smiles_store, jazzy_halflives):
        store[splitter] = {isozyme: {} for isozyme in isozymes}
    for isozyme in isozymes:
        for split in data_splits:
            frame = pd.read_csv(rel_paths[f"jazzy_{isozyme}_{split}_{splitter}"])
            # the fourth value returned by parse_jazzy_df is not needed here
            parsed_smiles, parsed_features, parsed_halflives, _ = parse_jazzy_df(frame)
            jazzy_smiles_store[splitter][isozyme][split] = parsed_smiles
            jazzy_features[splitter][isozyme][split] = parsed_features
            jazzy_halflives[splitter][isozyme][split] = parsed_halflives

     56, [63, 'CC(C)(O)c1cc(F)c2c(c1)C(=O)N(Cc1ccc(Cl)cn1)[C@@]2(OCC1(O)CC1)c1ccc(Cl)cc1', 0.9999999999999998, 13.1753, 1.4177, 11.3044, -7.3559, -125.8906, -119.4889]
     14, [23, 'Cc1ncsc1-c1ccc([C@H](CC(=O)NCCCCCCNC(=O)COc2c(-c3csc(N4CCOCC4)n3)ccc(F)c2F)NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC(=O)C2(F)CC2)C(C)(C)C)cc1', 0.0622624201077031, 28.4327, 3.274, 23.4872, -4.0348, -286.1545, -257.0326]
     removed index 325 corresponding to NaN
     removed index 678 corresponding to NaN
     removed index 725 corresponding to NaN
     removed index 1053 corresponding to NaN
     1417, [439, 'CN1C(=O)c2ccccc2[S@+]([O-])c2ccc(C(=O)NCc3ccc(Br)cc3)cc21', 0.4181064270905321, 10.3104, 0.7256, 7.5559, -14.2816, -86.2027, -96.3091]
     356, [66, 'CC(=O)c1c(C)[nH]c(C(=O)Nc2cccc([S+](=O)([O-])Nc3cccc(C#N)c3)c2)c1C', 0.8327574291637871, 10.4683, 2.4345, 9.1374, -15.7732, -116.2014, -124.8128]
     151, [36, 'N#Cc1ccc(CN2CCC(N3CCNC3=O)CC2)cc1', 0.7615658362989324, 8.7086, 0.7824, 4.7992, -7.9813, -

In [3]:
# Hyperparameter search driver: for 24 hours, repeatedly sweep over every
# (feature type, splitter, isozyme, model) combination and add `n_trials`
# trials to that combination's persistent Optuna study.

# The registries hold classes, while optuna.create_study() expects instances,
# so the classes are instantiated per study below.
# BasePruner is only the abstract interface and cannot act as a pruner;
# NopPruner is the explicit "never prune" strategy.
sampler = samplers['TPESampler']
pruner = pruners["NopPruner"]

# Directory name used on disk for each splitter abbreviation.
splitter_dirs = {"rand": "random", "scaff": "scaffold_splitter", "time": "time_split"}

n_trials = 5  # trials added to each study per sweep
t_end = time.time() + (60 * 60 * 24)  # wall-clock budget of 24 hours
# The deadline is only checked between full sweeps, so the sweep in progress always finishes.
while time.time() < t_end:
    # while loop is needed; if instead n_trials was large only one model would be trained
    for _type in types:
        for splitter in splitters:
            # anything that is not "rand"/"scaff" is stored under "time_split"
            splitter_name = splitter_dirs.get(splitter, "time_split")

            for isozyme in isozymes:
                X_train = features[_type][splitter][isozyme]["train"]
                y_train = np.array(halflives[_type][splitter][isozyme]["train"])
                X_test = features[_type][splitter][isozyme]["test"]
                y_test = np.array(halflives[_type][splitter][isozyme]["test"])

                # Journal, lock and pickle files all live here; create the directory
                # up front so a fresh checkout does not fail when the journal is opened.
                study_dir = f"./project_resources/optuna/{_type}/{splitter_name}/{isozyme}"
                os.makedirs(study_dir, exist_ok=True)

                for model_identifier in model_identifiers:
                    print(splitter_name, isozyme, model_identifier)
                    journal_path = f"{study_dir}/{model_identifier}_journal.log"
                    lock_obj = optuna.storages.JournalFileOpenLock(journal_path)

                    storage = JournalStorage(
                        JournalFileStorage(journal_path, lock_obj=lock_obj)
                    )
                    # load_if_exists=True resumes the study stored in the journal,
                    # so trials accumulate across sweeps and across notebook runs.
                    study = optuna.create_study(study_name=model_identifier, directions=['minimize'],
                                                sampler=sampler(), pruner=pruner(),
                                                storage=storage, load_if_exists=True)
                    # NOTE(review): the tuner receives the test split; if its objective scores on
                    # X_test/y_test, hyperparameters are being selected on test data - confirm.
                    tuner = HyperparamTuner(model_identifier, X_train, y_train, X_test, y_test)
                    study.optimize(tuner.objective, n_trials=n_trials, n_jobs=-1)  # catch=(ValueError,)
                    joblib.dump(study, f"{study_dir}/{model_identifier}.pkl")

[I 2023-11-06 21:35:56,631] A new study created in Journal with name: linear
[I 2023-11-06 21:35:56,801] Trial 0 finished with value: 0.2159085003240563 and parameters: {'alpha': 0.0599499141658115, 'l1_ratio': 0.15526440061980828}. Best is trial 2 with value: 0.20835933375232604.


random 3A4 linear


[I 2023-11-06 21:35:56,819] Trial 2 finished with value: 0.20835933375232604 and parameters: {'alpha': 0.057518229349610095, 'l1_ratio': 0.431838410817209}. Best is trial 1 with value: 0.2061043810369936.
[I 2023-11-06 21:35:56,821] Trial 4 finished with value: 0.20966882633842435 and parameters: {'alpha': 0.010375806481240549, 'l1_ratio': 0.06820299316668954}. Best is trial 1 with value: 0.2061043810369936.
[I 2023-11-06 21:35:56,823] Trial 1 finished with value: 0.2061043810369936 and parameters: {'alpha': 0.08223262035912703, 'l1_ratio': 0.8548894306546737}. Best is trial 1 with value: 0.2061043810369936.
[I 2023-11-06 21:35:56,826] Trial 3 finished with value: 0.21391781836608137 and parameters: {'alpha': 0.02665149954462294, 'l1_ratio': 0.5386761667345472}. Best is trial 1 with value: 0.2061043810369936.
[I 2023-11-06 21:35:56,837] A new study created in Journal with name: KRR


random 3A4 KRR


[I 2023-11-06 21:35:57,055] Trial 1 finished with value: 0.18072474944245775 and parameters: {'alpha': 0.15884370123074237, 'gamma': 9.622427114467683e-15, 'kernel': 'laplacian'}. Best is trial 3 with value: 0.1748468113682428.
[I 2023-11-06 21:35:57,058] Trial 0 finished with value: 0.20029196713495878 and parameters: {'alpha': 0.9899157705258239, 'gamma': 2.4882257673187645e-15, 'kernel': 'linear'}. Best is trial 3 with value: 0.1748468113682428.
[I 2023-11-06 21:35:57,070] Trial 2 finished with value: 0.18370761362757035 and parameters: {'alpha': 0.4858482219873495, 'gamma': 5.352261283261603e-15, 'kernel': 'laplacian'}. Best is trial 3 with value: 0.1748468113682428.
[I 2023-11-06 21:35:57,073] Trial 3 finished with value: 0.1748468113682428 and parameters: {'alpha': 0.2835092231640621, 'gamma': 6.504728565566763e-15, 'kernel': 'linear'}. Best is trial 3 with value: 0.1748468113682428.
[I 2023-11-06 21:35:57,075] Trial 4 finished with value: 0.1848646961612148 and parameters: {'alp

random 3A4 GB


[I 2023-11-06 21:35:58,599] Trial 0 finished with value: 0.21511603079320263 and parameters: {'n_estimators': 500, 'learning_rate': 0.4011559068534995, 'max_depth': 4}. Best is trial 0 with value: 0.21511603079320263.
[I 2023-11-06 21:35:58,634] Trial 3 finished with value: 0.20869574526828072 and parameters: {'n_estimators': 200, 'learning_rate': 0.08813019407814766, 'max_depth': 4}. Best is trial 3 with value: 0.20869574526828072.
[I 2023-11-06 21:35:58,648] Trial 2 finished with value: 0.1965343433619402 and parameters: {'n_estimators': 200, 'learning_rate': 0.3812017555592774, 'max_depth': 1}. Best is trial 2 with value: 0.1965343433619402.
[I 2023-11-06 21:35:58,672] Trial 4 finished with value: 0.19807730756660202 and parameters: {'n_estimators': 200, 'learning_rate': 0.28802411415608065, 'max_depth': 3}. Best is trial 2 with value: 0.1965343433619402.
[I 2023-11-06 21:35:58,674] Trial 1 finished with value: 0.20269884038782082 and parameters: {'n_estimators': 10, 'learning_rate'

random 3A4 RF


[I 2023-11-06 21:36:04,740] Trial 0 finished with value: 0.19660310779741044 and parameters: {'n_estimators': 200, 'max_features': 'log2', 'max_depth': 10}. Best is trial 0 with value: 0.19660310779741044.
[I 2023-11-06 21:36:04,776] Trial 1 finished with value: 0.17679154369017414 and parameters: {'n_estimators': 500, 'max_features': 'auto', 'max_depth': 10}. Best is trial 1 with value: 0.17679154369017414.
[I 2023-11-06 21:36:04,796] Trial 3 finished with value: 0.17891148304368312 and parameters: {'n_estimators': 200, 'max_features': 'log2', 'max_depth': 2}. Best is trial 1 with value: 0.17679154369017414.
[I 2023-11-06 21:36:04,813] Trial 4 finished with value: 0.2098998106563891 and parameters: {'n_estimators': 20, 'max_features': 'sqrt', 'max_depth': None}. Best is trial 2 with value: 0.17668758006349547.
[I 2023-11-06 21:36:04,815] Trial 2 finished with value: 0.17668758006349547 and parameters: {'n_estimators': 500, 'max_features': 'auto', 'max_depth': None}. Best is trial 2 wi

random 3A4 ANN


[I 2023-11-06 21:36:07,397] Trial 1 finished with value: 0.24999915314194265 and parameters: {'learning_rate_init': 0.05219077072187204, 'hidden_layer_sizes': [10, 10, 10]}. Best is trial 1 with value: 0.24999915314194265.
[I 2023-11-06 21:36:07,489] Trial 0 finished with value: 0.26647591166445284 and parameters: {'learning_rate_init': 0.028475064371844678, 'hidden_layer_sizes': [20, 20]}. Best is trial 1 with value: 0.24999915314194265.
[I 2023-11-06 21:36:07,624] Trial 2 finished with value: 0.2501532440392772 and parameters: {'learning_rate_init': 0.06627351158563095, 'hidden_layer_sizes': [10, 10, 10]}. Best is trial 1 with value: 0.24999915314194265.
[I 2023-11-06 21:36:07,626] Trial 4 finished with value: 0.27422234585024624 and parameters: {'learning_rate_init': 0.021757721427106226, 'hidden_layer_sizes': [10, 10]}. Best is trial 1 with value: 0.24999915314194265.
[I 2023-11-06 21:36:07,658] Trial 3 finished with value: 0.2322754409752719 and parameters: {'learning_rate_init': 

random RLM linear


[I 2023-11-06 21:36:07,864] Trial 0 finished with value: 0.2569745426662995 and parameters: {'alpha': 0.07516623791553981, 'l1_ratio': 0.27165961603705924}. Best is trial 0 with value: 0.2569745426662995.
[I 2023-11-06 21:36:07,888] Trial 1 finished with value: 0.2571033570069799 and parameters: {'alpha': 0.020874760606872052, 'l1_ratio': 0.4429513853300632}. Best is trial 0 with value: 0.2569745426662995.
[I 2023-11-06 21:36:07,915] Trial 2 finished with value: 0.2569794187586049 and parameters: {'alpha': 0.03871367101376458, 'l1_ratio': 0.7767477514125511}. Best is trial 0 with value: 0.2569745426662995.
[I 2023-11-06 21:36:07,925] Trial 3 finished with value: 0.2569485466550108 and parameters: {'alpha': 0.05325544469536814, 'l1_ratio': 0.09767998747703388}. Best is trial 3 with value: 0.2569485466550108.
[I 2023-11-06 21:36:07,928] Trial 4 finished with value: 0.2569895217444133 and parameters: {'alpha': 0.03623706357320289, 'l1_ratio': 0.5770140182006642}. Best is trial 3 with valu

random RLM KRR


[I 2023-11-06 21:36:09,507] Trial 0 finished with value: 0.2509067334877077 and parameters: {'alpha': 0.9230219868500257, 'gamma': 6.2213568089945545e-15, 'kernel': 'rbf'}. Best is trial 0 with value: 0.2509067334877077.
[I 2023-11-06 21:36:09,552] Trial 1 finished with value: 0.25103933778604776 and parameters: {'alpha': 0.15639569368353134, 'gamma': 8.148588294896296e-15, 'kernel': 'laplacian'}. Best is trial 0 with value: 0.2509067334877077.
[I 2023-11-06 21:36:09,658] Trial 2 finished with value: 0.25155923825456855 and parameters: {'alpha': 0.036077683015076786, 'gamma': 1.1617860735502961e-15, 'kernel': 'rbf'}. Best is trial 3 with value: 0.25035753476362244.
[I 2023-11-06 21:36:09,661] Trial 4 finished with value: 0.25143161322788976 and parameters: {'alpha': 0.4018149473120357, 'gamma': 2.797772180419963e-15, 'kernel': 'linear'}. Best is trial 3 with value: 0.25035753476362244.
[I 2023-11-06 21:36:09,664] Trial 3 finished with value: 0.25035753476362244 and parameters: {'alpha'

random RLM GB


[I 2023-11-06 21:36:15,699] Trial 3 finished with value: 0.24234112253846937 and parameters: {'n_estimators': 50, 'learning_rate': 0.4043371295952096, 'max_depth': 2}. Best is trial 3 with value: 0.24234112253846937.
[I 2023-11-06 21:36:15,712] Trial 2 finished with value: 0.24374012420732824 and parameters: {'n_estimators': 500, 'learning_rate': 0.0561704561834485, 'max_depth': 5}. Best is trial 3 with value: 0.24234112253846937.
[I 2023-11-06 21:36:15,755] Trial 1 finished with value: 0.2427785767223036 and parameters: {'n_estimators': 50, 'learning_rate': 0.6122634926289041, 'max_depth': 5}. Best is trial 0 with value: 0.24226224000909816.
[I 2023-11-06 21:36:15,758] Trial 0 finished with value: 0.24226224000909816 and parameters: {'n_estimators': 20, 'learning_rate': 0.4903477777835727, 'max_depth': 5}. Best is trial 0 with value: 0.24226224000909816.
[I 2023-11-06 21:36:15,775] Trial 4 finished with value: 0.24345150732768348 and parameters: {'n_estimators': 10, 'learning_rate': 0

random RLM RF


[I 2023-11-06 21:36:36,957] Trial 2 finished with value: 0.23753108987620522 and parameters: {'n_estimators': 20, 'max_features': 'sqrt', 'max_depth': 4}. Best is trial 2 with value: 0.23753108987620522.
[I 2023-11-06 21:36:37,010] Trial 1 finished with value: 0.2373498185548646 and parameters: {'n_estimators': 200, 'max_features': 'auto', 'max_depth': 4}. Best is trial 1 with value: 0.2373498185548646.
[I 2023-11-06 21:36:37,063] Trial 3 finished with value: 0.23821174979854068 and parameters: {'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 10}. Best is trial 1 with value: 0.2373498185548646.
[I 2023-11-06 21:36:37,102] Trial 4 finished with value: 0.23727899489964566 and parameters: {'n_estimators': 20, 'max_features': 'auto', 'max_depth': 5}. Best is trial 4 with value: 0.23727899489964566.
[I 2023-11-06 21:36:37,116] Trial 0 finished with value: 0.23767215214167206 and parameters: {'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': 5}. Best is trial 4 with value: 0

KeyboardInterrupt: 