In [1]:
import os
import sys

import joblib
import numpy as np
import optuna
import pandas as pd

from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import fp_from_smiles, HyperparamTuner

importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [2]:
# Axes of the experiment grid iterated over by the cells below.
model_identifiers = ["linear", "KRR", "GB", "RF", "ANN"]
isozymes = ["3A4", "RLM", "HLC"]
data_splits = ["train", "test"]
splitters = ["rand", "scaff"]

# Per isozyme: file name of the raw export and the delimiter used in that file.
_raw_sources = {
    "3A4": ("ChEMBL_3A4.csv", ";"),
    "RLM": ("AID_1508591_datatable_all.csv", ","),
    "HLC": ("AID_1508603_datatable_all.csv", ","),
}
# Splitter key suffix -> directory under base_splits/ that holds its CSV files.
_split_dirs = {"scaff": "scaffold_splitter", "rand": "random"}

# Lookup of every data file, keyed as "<isozyme>_source", "<isozyme>_sep",
# "<isozyme>" and "<isozyme>_<train|test>_<scaff|rand>".
rel_paths = {}
for _isozyme, (_source_file, _sep) in _raw_sources.items():
    rel_paths[f"{_isozyme}_source"] = f"project_resources/{_source_file}"
    rel_paths[f"{_isozyme}_sep"] = _sep
    rel_paths[_isozyme] = f"project_resources/{_isozyme}.csv"
    for _splitter, _split_dir in _split_dirs.items():
        for _data_split in ("train", "test"):
            rel_paths[f"{_isozyme}_{_data_split}_{_splitter}"] = (
                f"project_resources/base_splits/{_split_dir}/{_isozyme}_{_data_split}.csv"
            )

# Samplers decide which hyperparameter values the next trial will try.
# The values are the sampler classes themselves; instantiate before passing to a study.
samplers = {
    # draws every parameter independently at random
    'RandomSampler': optuna.samplers.RandomSampler,
    # walks a user-supplied grid (needs a search_space argument when instantiated)
    'GridSampler': optuna.samplers.GridSampler,
    # Tree-structured Parzen Estimator
    'TPESampler': optuna.samplers.TPESampler,
    # Covariance Matrix Adaptation Evolution Strategy
    'CmaEsSampler': optuna.samplers.CmaEsSampler,
    # NSGA-II evolutionary algorithm, aimed at multi-objective studies
    'NSGAIISampler': optuna.samplers.NSGAIISampler,
    # quasi-Monte Carlo, low-discrepancy sequences
    'QMCSampler': optuna.samplers.QMCSampler,
    # Bayesian optimisation backed by the BoTorch library
    'BoTorchSampler': optuna.integration.BoTorchSampler,
    # enumerates every combination of the search space
    'BruteForceSampler': optuna.samplers.BruteForceSampler,
}

# Pruners decide whether a running trial should be stopped early, based on the
# intermediate values it has reported. The values are the pruner classes themselves.
pruners = {
    # abstract base class for custom pruners; not usable as a pruner on its own
    'BasePruner': optuna.pruners.BasePruner,
    # median stopping rule: stop a trial whose best intermediate value is worse
    # than the median of earlier trials at the same step
    'MedianPruner': optuna.pruners.MedianPruner,
    # asynchronous successive halving
    'SuccessiveHalvingPruner': optuna.pruners.SuccessiveHalvingPruner,
    # Hyperband (several successive-halving brackets with different budgets)
    'HyperbandPruner': optuna.pruners.HyperbandPruner,
    # like the median rule, but with a configurable percentile
    'PercentilePruner': optuna.pruners.PercentilePruner,
    # never prunes
    'NopPruner': optuna.pruners.NopPruner,
    # stops a trial whose reported value crosses a fixed lower/upper bound or is NaN
    'ThresholdPruner': optuna.pruners.ThresholdPruner,
    # wraps another pruner and tolerates a number of non-improving steps first
    'PatientPruner': optuna.pruners.PatientPruner,
}

# Nested result containers, filled by later cells as [splitter][isozyme][data_split].
smiles = {}
halflives = {}
fingerprints = {}

In [3]:
# Load the train/test CSV of every (splitter, isozyme) pair and store the
# "smiles" and "half-life" columns as numpy arrays in the nested dicts
# smiles[splitter][isozyme][data_split] and halflives[splitter][isozyme][data_split].
for splitter in splitters:
    print(splitter)
    smiles[splitter] = {}
    halflives[splitter] = {}
    for isozyme in isozymes:
        smiles[splitter][isozyme] = {}
        halflives[splitter][isozyme] = {}

        for data_split in data_splits:
            # read the split file once; both columns are taken from the same frame
            split_df = pd.read_csv(rel_paths[f"{isozyme}_{data_split}_{splitter}"])

            # molecule structures (model inputs before featurisation)
            split_smi = np.array(split_df["smiles"])
            smiles[splitter][isozyme][data_split] = split_smi

            # measured half-lives (regression targets)
            split_halflife = np.array(split_df["half-life"])
            halflives[splitter][isozyme][data_split] = split_halflife

        # quick sanity peek: first SMILES and first three targets of each split
        print(f"""{isozyme}
    x_train: {smiles[splitter][isozyme]["train"][0]}
    x_test: {smiles[splitter][isozyme]["test"][0]}
    y_train: {halflives[splitter][isozyme]["train"][:3]}
    y_test: {halflives[splitter][isozyme]["test"][:3]}
    """)

rand
3A4
    x_train: CC(C)(O)c1cc(F)c2c(c1)C(=O)N(Cc1ccc(Cl)cn1)[C@@]2(OCC1(O)CC1)c1ccc(Cl)cc1
    x_test: Cc1ncsc1-c1ccc([C@H](CC(=O)NCCCCCCNC(=O)COc2c(-c3csc(N4CCOCC4)n3)ccc(F)c2F)NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC(=O)C2(F)CC2)C(C)(C)C)cc1
    y_train: [6.   0.02 0.5 ]
    y_test: [0.3767  0.3333  0.01433]
    
RLM
    x_train: O=c1cc(N2CCOCC2)oc2c1ccc1ccccc12
    x_test: Cc1ccc(OCCn2c(CCNC(=O)N3CCCCC3)nc3ccccc32)cc1
    y_train: [30.    4.4  26.58]
    y_test: [ 1.7  1.7 30. ]
    
HLC
    x_train: N#Cc1ccc(CN2CCC(N3CCNC3=O)CC2)cc1
    x_test: c1ccc(Nc2ncc(-c3cncnc3)c3c2OCC3)cc1
    y_train: [93.2 21.  50.8]
    y_test: [120.  111.9  57.4]
    
scaff
3A4
    x_train: COc1cccc([C@@H](CO)NC(=O)[C@@H](C)N2Cc3ccc(-c4nc(NC5CCOCC5)ncc4Cl)cc3C2=O)c1
    x_test: O=C1CCC(N2C(=O)c3cccc(NCCOCCOCCNC(=O)c4ccc5c(c4)nc(Nc4cccc(Cl)c4)c4ccncc45)c3C2=O)C(=O)N1
    y_train: [0.09167 0.08333 0.8167 ]
    y_test: [0.2433 0.055  0.2667]
    
RLM
    x_train: CS(=O)(=O)c1ccccc1-c1csc(N2CCC(C(N)=O)C

In [4]:
# Featurise every loaded SMILES array with fp_from_smiles and keep the result as
# a numpy array in fingerprints[splitter][isozyme][data_split].
for splitter in splitters:
    splitter_fps = {}
    fingerprints[splitter] = splitter_fps
    for isozyme in isozymes:
        isozyme_fps = {}
        splitter_fps[isozyme] = isozyme_fps
        for data_split in data_splits:
            fps = fp_from_smiles(smiles[splitter][isozyme][data_split])
            isozyme_fps[data_split] = np.array(fps)
            # show which subset was processed, its first fingerprint and its size
            print(splitter, isozyme, data_split)
            print(fps[0], len(fps))
        print("\n")

rand 3A4 train
[1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 1 0 1
 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0
 1 0 1 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1 0 0
 1 0 1 0 0 1 0 0 0 1 1 1 0] 56
rand 3A4 test
[1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 1 1 0 0 1 1
 1 0 0 0 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 0 1 1 0 0 0 0 0
 1 1 1 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 1 0 1
 1 1 0 0 0 1 0 1 1 1 1 1 1] 14


rand RLM train
[0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0
 1 0 1 0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1
 1 0 0 1 0 0 0 0 0 1 0 1 0] 2024
rand RLM test
[1 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0
 1 1 1 0 1 1 1 1 1 1 0 0 1 0 1 0 1 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0
 1 0 1 0 0 0 1 0 0 0

In [5]:
# Hyperparameter search: one Optuna study per (splitter, isozyme, model).
# Each study is stored in its own SQLite file and additionally dumped with joblib.
sampler = samplers['TPESampler']
# BasePruner is abstract and cannot be instantiated; NopPruner is the explicit
# "never prune" choice and is passed to the study as an instance below.
pruner = pruners["NopPruner"]
n_trials = 1
for splitter in splitters:
    # directory name used on disk for this splitter
    splitter_name = "random" if splitter == "rand" else "scaffold_splitter"

    for isozyme in isozymes:
        # SQLite and joblib do not create missing parent directories themselves
        out_dir = f"project_resources/optuna/morgan/{splitter_name}/{isozyme}"
        os.makedirs(out_dir, exist_ok=True)

        X_train = fingerprints[splitter][isozyme]["train"]
        # natural log half-lives -> values are less spread out
        # NOTE(review): np.log yields -inf/nan for half-lives <= 0 - confirm the data contains none
        y_train = np.log(halflives[splitter][isozyme]["train"])
        X_test = fingerprints[splitter][isozyme]["test"]
        y_test = np.log(halflives[splitter][isozyme]["test"])

        for model_identifier in model_identifiers:
            print(splitter_name, isozyme, model_identifier)
            study = optuna.create_study(
                study_name=model_identifier,
                directions=['minimize'],
                sampler=sampler(),  # fresh sampler instance per study
                pruner=pruner(),    # instance, not the class
                storage=f"sqlite:///{out_dir}/db.{model_identifier}",
                load_if_exists=True,  # resume the stored study when the cell is re-run
            )
            # NOTE(review): the test split is handed to the tuner; if its objective is
            # scored on that split, hyperparameters are selected on test data - confirm
            test = HyperparamTuner(model_identifier, X_train, y_train, X_test, y_test)
            # n_jobs=-1 runs trials in parallel threads, so log lines may interleave
            study.optimize(test.objective, n_trials=n_trials, n_jobs=-1)  # catch=(ValueError,)
            joblib.dump(study, f"./{out_dir}/{model_identifier}.pkl")

random 3A4 linear


[I 2023-10-10 22:07:41,425] A new study created in RDB with name: linear
[I 2023-10-10 22:07:41,501] Trial 0 finished with value: 1.425459235038666 and parameters: {'alpha': 0.08255489771040272, 'l1_ratio': 0.20761210522970164}. Best is trial 0 with value: 1.425459235038666.
[I 2023-10-10 22:07:41,700] A new study created in RDB with name: KRR


random 3A4 KRR


[I 2023-10-10 22:07:41,786] Trial 0 finished with value: 1.308819709409171 and parameters: {'alpha': 0.0018795661757690966, 'gamma': 9.270499031709152e-15, 'kernel': 'rbf'}. Best is trial 0 with value: 1.308819709409171.
[I 2023-10-10 22:07:41,916] A new study created in RDB with name: GB


random 3A4 GB


[I 2023-10-10 22:07:42,442] Trial 0 finished with value: 1.2807484970065994 and parameters: {'n_estimators': 10, 'learning_rate': 0.7162800349625453, 'max_depth': 4}. Best is trial 0 with value: 1.2807484970065994.
[I 2023-10-10 22:07:42,569] A new study created in RDB with name: RF


random 3A4 RF


[I 2023-10-10 22:07:43,998] Trial 0 finished with value: 1.194520236563144 and parameters: {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 10}. Best is trial 0 with value: 1.194520236563144.
[I 2023-10-10 22:07:44,126] A new study created in RDB with name: ANN


random 3A4 ANN


[I 2023-10-10 22:07:45,548] Trial 0 finished with value: 1.1678425906626275 and parameters: {'learning_rate_init': 0.011786841122018759, 'hidden_layer_sizes': [10]}. Best is trial 0 with value: 1.1678425906626275.
[I 2023-10-10 22:07:45,678] A new study created in RDB with name: linear


random RLM linear


[I 2023-10-10 22:07:45,792] Trial 0 finished with value: 1.097787850303627 and parameters: {'alpha': 0.06386539808364722, 'l1_ratio': 0.46782392495881153}. Best is trial 0 with value: 1.097787850303627.
[I 2023-10-10 22:07:45,920] A new study created in RDB with name: KRR


random RLM KRR


[I 2023-10-10 22:07:47,171] Trial 0 finished with value: 1.010931164075948 and parameters: {'alpha': 0.9659149300589076, 'gamma': 1.0176113633690576e-15, 'kernel': 'rbf'}. Best is trial 0 with value: 1.010931164075948.


random RLM GB


[I 2023-10-10 22:07:47,413] A new study created in RDB with name: GB
[I 2023-10-10 22:07:55,188] Trial 0 finished with value: 0.976563425836126 and parameters: {'n_estimators': 200, 'learning_rate': 0.16288320855366117, 'max_depth': 5}. Best is trial 0 with value: 0.976563425836126.
[I 2023-10-10 22:07:55,312] A new study created in RDB with name: RF


random RLM RF


[I 2023-10-10 22:08:18,283] Trial 0 finished with value: 0.9562615238924675 and parameters: {'n_estimators': 50, 'max_features': 'auto', 'max_depth': 5}. Best is trial 0 with value: 0.9562615238924675.
[I 2023-10-10 22:08:18,424] A new study created in RDB with name: ANN


random RLM ANN


[I 2023-10-10 22:08:41,379] Trial 0 finished with value: 1.0639709567255349 and parameters: {'learning_rate_init': 0.029388203193950597, 'hidden_layer_sizes': [20, 20]}. Best is trial 0 with value: 1.0639709567255349.
[I 2023-10-10 22:08:41,518] A new study created in RDB with name: linear


random HLC linear


[I 2023-10-10 22:08:41,597] Trial 0 finished with value: 0.626079959918068 and parameters: {'alpha': 0.017301114132892727, 'l1_ratio': 0.024273445911196623}. Best is trial 0 with value: 0.626079959918068.
[I 2023-10-10 22:08:41,727] A new study created in RDB with name: KRR
[I 2023-10-10 22:08:41,806] Trial 0 finished with value: 0.6273681281183437 and parameters: {'alpha': 0.8343066865309305, 'gamma': 7.687316438198187e-15, 'kernel': 'laplacian'}. Best is trial 0 with value: 0.6273681281183437.


random HLC KRR


[I 2023-10-10 22:08:41,940] A new study created in RDB with name: GB


random HLC GB


[I 2023-10-10 22:08:42,787] Trial 0 finished with value: 0.5736892443323853 and parameters: {'n_estimators': 20, 'learning_rate': 0.1369687140562429, 'max_depth': 3}. Best is trial 0 with value: 0.5736892443323853.


random HLC RF


[I 2023-10-10 22:08:43,030] A new study created in RDB with name: RF
[I 2023-10-10 22:08:45,102] Trial 0 finished with value: 0.5653730643750247 and parameters: {'n_estimators': 200, 'max_features': 'log2', 'max_depth': 4}. Best is trial 0 with value: 0.5653730643750247.
[I 2023-10-10 22:08:45,232] A new study created in RDB with name: ANN


random HLC ANN


[I 2023-10-10 22:08:45,624] Trial 0 finished with value: 1.0617275470746037 and parameters: {'learning_rate_init': 0.020195610623569005, 'hidden_layer_sizes': [5, 5]}. Best is trial 0 with value: 1.0617275470746037.
[I 2023-10-10 22:08:45,760] A new study created in RDB with name: linear


scaffold_splitter 3A4 linear


[I 2023-10-10 22:08:45,840] Trial 0 finished with value: 1.4126590024116428 and parameters: {'alpha': 0.040945467116598314, 'l1_ratio': 0.7721784381966122}. Best is trial 0 with value: 1.4126590024116428.
[I 2023-10-10 22:08:45,970] A new study created in RDB with name: KRR
[I 2023-10-10 22:08:46,048] Trial 0 finished with value: 1.4292979844575795 and parameters: {'alpha': 0.7249913725698782, 'gamma': 9.723181323036458e-15, 'kernel': 'laplacian'}. Best is trial 0 with value: 1.4292979844575795.


scaffold_splitter 3A4 KRR


[I 2023-10-10 22:08:46,187] A new study created in RDB with name: GB


scaffold_splitter 3A4 GB


[I 2023-10-10 22:08:46,724] Trial 0 finished with value: 1.2346694221513719 and parameters: {'n_estimators': 200, 'learning_rate': 0.7671687980653608, 'max_depth': 1}. Best is trial 0 with value: 1.2346694221513719.
[I 2023-10-10 22:08:46,861] A new study created in RDB with name: RF


scaffold_splitter 3A4 RF


[I 2023-10-10 22:08:48,377] Trial 0 finished with value: 1.2710593957483545 and parameters: {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 4}. Best is trial 0 with value: 1.2710593957483545.


scaffold_splitter 3A4 ANN


[I 2023-10-10 22:08:48,640] A new study created in RDB with name: ANN
[I 2023-10-10 22:08:50,244] Trial 0 finished with value: 1.2676004220102657 and parameters: {'learning_rate_init': 0.07331971264271707, 'hidden_layer_sizes': [10, 10]}. Best is trial 0 with value: 1.2676004220102657.
[I 2023-10-10 22:08:50,398] A new study created in RDB with name: linear


scaffold_splitter RLM linear


[I 2023-10-10 22:08:50,520] Trial 0 finished with value: 1.1097657604592588 and parameters: {'alpha': 0.08459120629250794, 'l1_ratio': 0.29103946445220885}. Best is trial 0 with value: 1.1097657604592588.
[I 2023-10-10 22:08:50,661] A new study created in RDB with name: KRR


scaffold_splitter RLM KRR


[I 2023-10-10 22:08:51,975] Trial 0 finished with value: 1.0266570584541566 and parameters: {'alpha': 0.6167807623626737, 'gamma': 2.3144905454073005e-15, 'kernel': 'rbf'}. Best is trial 0 with value: 1.0266570584541566.
[I 2023-10-10 22:08:52,110] A new study created in RDB with name: GB


scaffold_splitter RLM GB


[I 2023-10-10 22:09:00,127] Trial 0 finished with value: 0.9820169050235356 and parameters: {'n_estimators': 500, 'learning_rate': 0.9156130552571269, 'max_depth': 4}. Best is trial 0 with value: 0.9820169050235356.
[I 2023-10-10 22:09:00,264] A new study created in RDB with name: RF


scaffold_splitter RLM RF


[I 2023-10-10 22:09:24,103] Trial 0 finished with value: 0.9645015180875737 and parameters: {'n_estimators': 20, 'max_features': 'auto', 'max_depth': None}. Best is trial 0 with value: 0.9645015180875737.
[I 2023-10-10 22:09:24,236] A new study created in RDB with name: ANN


scaffold_splitter RLM ANN


[I 2023-10-10 22:09:47,207] Trial 0 finished with value: 1.0467954808042317 and parameters: {'learning_rate_init': 0.09581362183764155, 'hidden_layer_sizes': [20]}. Best is trial 0 with value: 1.0467954808042317.
[I 2023-10-10 22:09:47,339] A new study created in RDB with name: linear


scaffold_splitter HLC linear


[I 2023-10-10 22:09:47,411] Trial 0 finished with value: 0.49806108695940704 and parameters: {'alpha': 0.05394828923173165, 'l1_ratio': 0.17656950132704696}. Best is trial 0 with value: 0.49806108695940704.


scaffold_splitter HLC KRR


[I 2023-10-10 22:09:47,660] A new study created in RDB with name: KRR
[I 2023-10-10 22:09:47,736] Trial 0 finished with value: 0.5835187029827895 and parameters: {'alpha': 0.05142101444504084, 'gamma': 5.760986465322482e-16, 'kernel': 'laplacian'}. Best is trial 0 with value: 0.5835187029827895.
[I 2023-10-10 22:09:47,862] A new study created in RDB with name: GB


scaffold_splitter HLC GB


[I 2023-10-10 22:09:48,676] Trial 0 finished with value: 0.48155930976594513 and parameters: {'n_estimators': 200, 'learning_rate': 0.15924173224463375, 'max_depth': 2}. Best is trial 0 with value: 0.48155930976594513.
[I 2023-10-10 22:09:48,802] A new study created in RDB with name: RF


scaffold_splitter HLC RF


[I 2023-10-10 22:09:50,748] Trial 0 finished with value: 0.45762883726933007 and parameters: {'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': 10}. Best is trial 0 with value: 0.45762883726933007.
[I 2023-10-10 22:09:50,879] A new study created in RDB with name: ANN


scaffold_splitter HLC ANN


[I 2023-10-10 22:09:51,294] Trial 0 finished with value: 1.1657691938634063 and parameters: {'learning_rate_init': 0.08834060665637472, 'hidden_layer_sizes': [10, 10, 10]}. Best is trial 0 with value: 1.1657691938634063.
