In [1]:
import optuna
import joblib
import numpy as np
import pandas as pd
import sys
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.it4i_resources import fp_from_smiles, HyperparamTuner, parse_jazzy_df

importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\it4i_resources.ipynb


In [2]:
# Experiment axes iterated over by the cells below.
types = ["morgan", "jazzy"]                            # feature representations
splitters = ["rand", "scaff", "time"]                  # train/test splitting strategies
model_identifiers = ["linear", "KRR", "GB", "RF", "ANN"]
isozymes = ["3A4", "RLM", "HLC"]
data_splits = ["train", "test"]

# Relative CSV path for every "<type>_<isozyme>_<data split>_<splitter>" key.
# Each feature type lives in its own directory and each splitter abbreviation
# maps to a sub-directory; the file itself is named "<isozyme>_<data split>.csv".
# The loop nesting (type -> data split -> isozyme -> splitter) reproduces the
# key order of the hand-written table this replaces.
rel_paths = {
    f"{feature_type}_{isozyme}_{data_split}_{splitter}":
        f"project_resources/{feature_dir}/{splitter_dir}/{isozyme}_{data_split}.csv"
    for feature_type, feature_dir in (
        ("morgan", "base_splits"),
        ("jazzy", "jazzy_splits"),
    )
    for data_split in data_splits
    for isozyme in isozymes
    for splitter, splitter_dir in (
        ("scaff", "scaffold_splitter"),
        ("rand", "random"),
        ("time", "time_split"),
    )
}
# Sampler: the strategy Optuna uses to propose the next set of hyperparameters.
# The values are the sampler classes themselves (not instances).
samplers = {
    # Draws every parameter independently at random from the search space.
    "RandomSampler": optuna.samplers.RandomSampler,
    # Grid search over an explicitly supplied search space.
    "GridSampler": optuna.samplers.GridSampler,
    # Tree-structured Parzen Estimator: models the objective and samples promising regions.
    "TPESampler": optuna.samplers.TPESampler,
    # Covariance Matrix Adaptation Evolution Strategy.
    "CmaEsSampler": optuna.samplers.CmaEsSampler,
    # NSGA-II evolutionary algorithm, aimed at multi-objective optimization.
    "NSGAIISampler": optuna.samplers.NSGAIISampler,
    # Quasi-Monte Carlo: low-discrepancy sequences that cover the space more evenly than random sampling.
    "QMCSampler": optuna.samplers.QMCSampler,
    # Bayesian optimization backed by the BoTorch library.
    "BoTorchSampler": optuna.integration.BoTorchSampler,
    # Exhaustively evaluates every combination in the search space.
    "BruteForceSampler": optuna.samplers.BruteForceSampler,
}

# Pruner: the strategy Optuna uses to stop unpromising trials early.
# The values are the pruner classes themselves (not instances).
pruners = {
    # Abstract base class of all pruners; meant for subclassing, not for direct use.
    "BasePruner": optuna.pruners.BasePruner,
    # Prunes a trial whose intermediate value is worse than the median of earlier trials at the same step.
    "MedianPruner": optuna.pruners.MedianPruner,
    # Asynchronous Successive Halving: only the best fraction of trials advances to each next rung.
    "SuccessiveHalvingPruner": optuna.pruners.SuccessiveHalvingPruner,
    # Hyperband: runs several successive-halving brackets with different resource budgets.
    "HyperbandPruner": optuna.pruners.HyperbandPruner,
    # Prunes a trial whose intermediate value falls outside the kept percentile of trials at the same step.
    "PercentilePruner": optuna.pruners.PercentilePruner,
    # Never prunes any trial.
    "NopPruner": optuna.pruners.NopPruner,
    # Prunes a trial whose intermediate value crosses a fixed lower/upper threshold.
    "ThresholdPruner": optuna.pruners.ThresholdPruner,
    # Wraps another pruner and tolerates a number of non-improving steps before letting it prune.
    "PatientPruner": optuna.pruners.PatientPruner,
}

# Top-level stores filled by the loading cells below, keyed first by feature type.
smiles, halflives, features = {}, {}, {}

In [3]:
# Read the SMILES strings and half-lives used with Morgan features.
# Result layout: smiles["morgan"][splitter][isozyme][data split] -> list,
# and the same nesting for halflives.
smiles["morgan"], halflives["morgan"] = {}, {}
for splitter in splitters:
    print("\n")
    print(splitter)
    # Keep a handle on each freshly created level instead of re-indexing from the root.
    smiles_by_isozyme = smiles["morgan"][splitter] = {}
    halflives_by_isozyme = halflives["morgan"][splitter] = {}
    for isozyme in isozymes:
        smiles_by_split = smiles_by_isozyme[isozyme] = {}
        halflives_by_split = halflives_by_isozyme[isozyme] = {}
        for split in data_splits:
            print(isozyme, split)
            df = pd.read_csv(rel_paths[f"morgan_{isozyme}_{split}_{splitter}"])
            df_smiles = list(df["smiles"])
            df_halflives = list(df["half-life"])
            smiles_by_split[split] = df_smiles
            halflives_by_split[split] = df_halflives
            # Show the first record of each file as a quick sanity check.
            print(df_smiles[0], df_halflives[0])



rand
3A4 train
CC(C)(O)c1cc(F)c2c(c1)C(=O)N(Cc1ccc(Cl)cn1)[C@@]2(OCC1(O)CC1)c1ccc(Cl)cc1 0.9999999999999998
3A4 test
Cc1ncsc1-c1ccc([C@H](CC(=O)NCCCCCCNC(=O)COc2c(-c3csc(N4CCOCC4)n3)ccc(F)c2F)NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC(=O)C2(F)CC2)C(C)(C)C)cc1 0.0622624201077031
RLM train
CN1C(=O)c2ccccc2[S@+]([O-])c2ccc(C(=O)NCc3ccc(Br)cc3)cc21 0.4181064270905321
RLM test
CC(=O)c1c(C)[nH]c(C(=O)Nc2cccc([S+](=O)([O-])Nc3cccc(C#N)c3)c2)c1C 0.8327574291637871
HLC train
N#Cc1ccc(CN2CCC(N3CCNC3=O)CC2)cc1 0.7615658362989324
HLC test
c1ccc(Nc2ncc(-c3cncnc3)c3c2OCC3)cc1 1.0


scaff
3A4 train
COc1cccc([C@@H](CO)NC(=O)[C@@H](C)N2Cc3ccc(-c4nc(NC5CCOCC5)ncc4Cl)cc3C2=O)c1 0.0147310164129507
3A4 test
O=C1CCC(N2C(=O)c3cccc(NCCOCCOCCNC(=O)c4ccc5c(c4)nc(Nc4cccc(Cl)c4)c4ccncc45)c3C2=O)C(=O)N1 0.0400167292931223
RLM train
CNC(=O)C1CCN(c2nc(-c3ccc(Br)cc3)cs2)CC1 0.8728403593642018
RLM test
O=c1cc(CN2CCCN(c3ccc(C(F)(F)F)cn3)CC2)nc2ccccn12 0.2695231513476157
HLC train
Cc1cc(F)ccc1OC1CN(Cc2ccc(C#N)cc2)C1 0.2

In [4]:
# Convert the Morgan SMILES lists into fingerprint arrays.
# Morgan and Jazzy data are kept under separate keys because the Jazzy set omits some molecules.
# Result layout: features["morgan"][splitter][isozyme][data split] -> np.ndarray.
features["morgan"] = {}
for splitter in splitters:
    fps_by_isozyme = features["morgan"][splitter] = {}
    for isozyme in isozymes:
        fps_by_split = fps_by_isozyme[isozyme] = {}
        for data_split in data_splits:
            # fp_from_smiles presumably returns one fingerprint per input SMILES - confirm in it4i_resources.
            fps = fp_from_smiles(smiles["morgan"][splitter][isozyme][data_split])
            fps_by_split[data_split] = np.array(fps)
            print(splitter, isozyme, data_split)
            print(f"first ten bits of the frist element: {fps[0][:10]}, number of bit vectors: {len(fps)}")
        print("\n")

rand 3A4 train
first ten bits of the frist element: [1 1 1 0 1 0 1 1 1 1], number of bit vectors: 56
rand 3A4 test
first ten bits of the frist element: [1 1 1 1 1 1 0 1 1 0], number of bit vectors: 14


rand RLM train
first ten bits of the frist element: [0 1 1 0 0 1 1 1 0 0], number of bit vectors: 1421
rand RLM test
first ten bits of the frist element: [0 0 0 0 0 1 1 0 1 0], number of bit vectors: 356


rand HLC train
first ten bits of the frist element: [0 0 0 1 1 0 1 0 0 1], number of bit vectors: 151
rand HLC test
first ten bits of the frist element: [0 1 0 0 0 0 0 0 0 0], number of bit vectors: 38


scaff 3A4 train
first ten bits of the frist element: [0 1 0 1 0 1 1 1 0 1], number of bit vectors: 56
scaff 3A4 test
first ten bits of the frist element: [0 1 1 1 0 1 0 0 0 1], number of bit vectors: 14


scaff RLM train
first ten bits of the frist element: [0 1 0 1 0 1 0 0 0 0], number of bit vectors: 1421
scaff RLM test
first ten bits of the frist element: [0 1 0 1 0 0 0 0 1 1], num

In [5]:
# Read the Jazzy CSVs and split each one into SMILES, feature rows and half-lives.
# Result layout: <store>["jazzy"][splitter][isozyme][data split] for smiles, features and halflives.
smiles["jazzy"], halflives["jazzy"], features["jazzy"] = {}, {}, {}
for splitter in splitters:
    print("\n")
    print(splitter)
    # Keep a handle on each freshly created level instead of re-indexing from the root.
    features_by_isozyme = features["jazzy"][splitter] = {}
    smiles_by_isozyme = smiles["jazzy"][splitter] = {}
    halflives_by_isozyme = halflives["jazzy"][splitter] = {}
    for isozyme in isozymes:
        features_by_split = features_by_isozyme[isozyme] = {}
        smiles_by_split = smiles_by_isozyme[isozyme] = {}
        halflives_by_split = halflives_by_isozyme[isozyme] = {}
        for split in data_splits:
            print(isozyme, split)
            df = pd.read_csv(rel_paths[f"jazzy_{isozyme}_{split}_{splitter}"])
            # parse_jazzy_df yields four values; the last one reports whether NaNs were found
            # (exact semantics live in it4i_resources - not visible here).
            jazzy_smiles, df_features, thalfs, contains_nan = parse_jazzy_df(df)
            smiles_by_split[split] = jazzy_smiles
            features_by_split[split] = df_features
            halflives_by_split[split] = thalfs
            # Shapes of the three parallel containers, then the first feature row.
            print(jazzy_smiles.shape, df_features.shape, thalfs.shape)
            print(f"     {df_features[0]}")
            print(f"     {isozyme} mol_features {split} contains NaN: {contains_nan}")



rand
3A4 train
     56, [63, 'CC(C)(O)c1cc(F)c2c(c1)C(=O)N(Cc1ccc(Cl)cn1)[C@@]2(OCC1(O)CC1)c1ccc(Cl)cc1', 0.9999999999999998, 13.1753, 1.4177, 11.3044, -7.3559, -125.8906, -119.4889]
(56,) (56, 6) (56,)
     [  13.1753    1.4177   11.3044   -7.3559 -125.8906 -119.4889]
     3A4 mol_features train contains NaN: False
3A4 test
     14, [23, 'Cc1ncsc1-c1ccc([C@H](CC(=O)NCCCCCCNC(=O)COc2c(-c3csc(N4CCOCC4)n3)ccc(F)c2F)NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC(=O)C2(F)CC2)C(C)(C)C)cc1', 0.0622624201077031, 28.4327, 3.274, 23.4872, -4.0348, -286.1545, -257.0326]
(14,) (14, 6) (14,)
     [  28.4327    3.274    23.4872   -4.0348 -286.1545 -257.0326]
     3A4 mol_features test contains NaN: False
RLM train
     removed index 325 corresponding to NaN
     removed index 678 corresponding to NaN
     removed index 725 corresponding to NaN
     removed index 1053 corresponding to NaN
     1417, [439, 'CN1C(=O)c2ccccc2[S@+]([O-])c2ccc(C(=O)NCc3ccc(Br)cc3)cc21', 0.4181064270905321, 10.3104, 0.7256, 7

In [7]:
# Hyperparameter search for every (feature type, splitter, isozyme, model) combination.
#
# `sampler` and `pruner` hold classes from the registries; a fresh instance of each
# is created per study below. Previously the sampler was selected but never handed
# to Optuna, and the pruner was the uninstantiated abstract BasePruner class, which
# would fail as soon as an objective called trial.should_prune(). NopPruner never
# prunes, so results are unchanged provided the objective never called should_prune().
sampler = samplers['TPESampler']
pruner = pruners["NopPruner"]

# Trials added to each study per pass of the outer loop.
n_trials = 20

# Splitter abbreviation -> directory name used in the Optuna storage layout.
splitter_dirs = {"rand": "random", "scaff": "scaffold_splitter", "time": "time_split"}

while True:
    # Deliberately endless: each pass gives every study another n_trials trials, so all
    # models advance in turn (a single large n_trials would keep the first study busy
    # and the others would never be trained). Stop with a kernel interrupt; the studies
    # live in SQLite storage and are resumed on the next run (load_if_exists=True).
    for _type in types:
        for splitter in splitters:
            # Any unknown abbreviation falls back to "time_split", as before.
            splitter_name = splitter_dirs.get(splitter, "time_split")

            for isozyme in isozymes:
                X_train = features[_type][splitter][isozyme]["train"]
                y_train = np.array(halflives[_type][splitter][isozyme]["train"])
                X_test = features[_type][splitter][isozyme]["test"]
                y_test = np.array(halflives[_type][splitter][isozyme]["test"])

                for model_identifier in model_identifiers:
                    print(splitter_name, isozyme, model_identifier)
                    study = optuna.create_study(
                        study_name=model_identifier,
                        directions=['minimize'],
                        sampler=sampler(),
                        pruner=pruner(),
                        storage=f"sqlite:///project_resources/optuna/{_type}/{splitter_name}/{isozyme}/db.{model_identifier}",
                        load_if_exists=True,
                    )
                    tuner = HyperparamTuner(model_identifier, X_train, y_train, X_test, y_test)
                    # NOTE(review): n_jobs=-1 runs trials in parallel against a SQLite file - confirm
                    # this is acceptable for the storage backend. Optional: catch=(ValueError,)
                    study.optimize(tuner.objective, n_trials=n_trials, n_jobs=-1)
                    # Snapshot of the study object next to its database after every pass.
                    joblib.dump(study, f"./project_resources/optuna/{_type}/{splitter_name}/{isozyme}/{model_identifier}.pkl")

[I 2023-11-02 21:35:07,435] Using an existing study with name 'linear' instead of creating a new one.


random 3A4 linear


[I 2023-11-02 21:35:08,305] Trial 212 finished with value: 0.21038843437527613 and parameters: {'alpha': 0.018953978042114406, 'l1_ratio': 0.8624139078755827}. Best is trial 25 with value: 0.20464433707863924.
[I 2023-11-02 21:35:08,705] Trial 218 finished with value: 0.20866986233529672 and parameters: {'alpha': 0.018887892441939894, 'l1_ratio': 0.8935831193093403}. Best is trial 25 with value: 0.20464433707863924.
[I 2023-11-02 21:35:08,779] Trial 208 finished with value: 0.21051999367985139 and parameters: {'alpha': 0.019075753354313465, 'l1_ratio': 0.8635420777341933}. Best is trial 25 with value: 0.20464433707863924.
[I 2023-11-02 21:35:08,783] Trial 213 finished with value: 0.21191917522691653 and parameters: {'alpha': 0.01897663402610377, 'l1_ratio': 0.8966779736230629}. Best is trial 25 with value: 0.20464433707863924.
[I 2023-11-02 21:35:08,894] Trial 217 finished with value: 0.21215494624919384 and parameters: {'alpha': 0.019327206756950797, 'l1_ratio': 0.8693229954787127}. B

random 3A4 KRR


[I 2023-11-02 21:35:11,253] Trial 208 finished with value: 0.17883435214122437 and parameters: {'alpha': 0.2888954608109726, 'gamma': 9.890745506245817e-15, 'kernel': 'rbf'}. Best is trial 5 with value: 0.1479540658359624.
[I 2023-11-02 21:35:11,276] Trial 207 finished with value: 0.18703247353961813 and parameters: {'alpha': 0.7327953555329771, 'gamma': 6.5920953628506964e-15, 'kernel': 'rbf'}. Best is trial 5 with value: 0.1479540658359624.
[I 2023-11-02 21:35:11,278] Trial 214 finished with value: 0.18131112001385905 and parameters: {'alpha': 0.2835794806436533, 'gamma': 7.318153351025836e-15, 'kernel': 'rbf'}. Best is trial 5 with value: 0.1479540658359624.
[I 2023-11-02 21:35:11,356] Trial 211 finished with value: 0.17444946552840668 and parameters: {'alpha': 0.7306362836297297, 'gamma': 7.646307526621747e-15, 'kernel': 'rbf'}. Best is trial 5 with value: 0.1479540658359624.
[I 2023-11-02 21:35:11,527] Trial 209 finished with value: 0.17337591261988194 and parameters: {'alpha': 0.

KeyboardInterrupt: 

[I 2023-11-02 21:35:12,881] Trial 226 finished with value: 0.18077744001830423 and parameters: {'alpha': 0.6326587591293352, 'gamma': 8.970094238158804e-15, 'kernel': 'laplacian'}. Best is trial 5 with value: 0.1479540658359624.
