In [1]:
import sys
from pathlib import Path

import joblib
import numpy as np
import optuna
import pandas as pd

from project_resources.import_utils import NotebookFinder

# The notebook finder must be registered before importing from .ipynb modules.
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import HyperparamTuner, mol_fts, abs_file_path

importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [2]:
# Identifiers shared by every cell below.
model_identifiers = ["linear", "KRR", "GB", "RF", "ANN"]
isozymes = ["3A4", "RLM", "HLC"]
data_splits = ["train", "test"]
splitters = ["rand", "scaff"]


def _build_rel_paths():
    """Assemble the table of project-relative file locations.

    For every data set the table holds:
      "<name>_source" / "<name>_sep"       raw export and its CSV delimiter
      "<name>"                             the CSV named after the data set
      "<name>_<split>_<splitter>"          files under base_splits/
      "<name>_jazzy_<split>_<splitter>"    files under jazzy_splits/
    """
    root = "project_resources"
    raw_exports = {
        "3A4": ("ChEMBL_3A4.csv", ";"),
        "RLM": ("AID_1508591_datatable_all.csv", ","),
        "HLC": ("AID_1508603_datatable_all.csv", ","),
    }
    splitter_dirs = {"scaff": "scaffold_splitter", "rand": "random"}
    # (key infix, directory, splitter order) for the two families of split files
    families = (
        ("", "base_splits", ("scaff", "rand")),
        ("jazzy_", "jazzy_splits", ("rand", "scaff")),
    )

    paths = {}
    for name, (source_csv, sep) in raw_exports.items():
        paths[f"{name}_source"] = f"{root}/{source_csv}"
        paths[f"{name}_sep"] = sep
        paths[name] = f"{root}/{name}.csv"
        for infix, family_dir, splitter_order in families:
            for splitter_key in splitter_order:
                for part in ("train", "test"):
                    paths[f"{name}_{infix}{part}_{splitter_key}"] = (
                        f"{root}/{family_dir}/{splitter_dirs[splitter_key]}/{name}_{part}.csv"
                    )
    return paths


rel_paths = _build_rel_paths()

# Sampler: the strategy that proposes the next set of hyperparameters in each trial.
samplers = dict(
    RandomSampler=optuna.samplers.RandomSampler,          # independent random draws from the search space
    GridSampler=optuna.samplers.GridSampler,              # grid search over a user-specified search space
    TPESampler=optuna.samplers.TPESampler,                # Tree-structured Parzen Estimator
    CmaEsSampler=optuna.samplers.CmaEsSampler,            # Covariance Matrix Adaptation Evolution Strategy
    NSGAIISampler=optuna.samplers.NSGAIISampler,          # NSGA-II multi-objective evolutionary algorithm
    QMCSampler=optuna.samplers.QMCSampler,                # quasi-Monte Carlo (low-discrepancy sequences)
    BoTorchSampler=optuna.integration.BoTorchSampler,     # Bayesian optimization backed by the BoTorch library
    BruteForceSampler=optuna.samplers.BruteForceSampler,  # exhaustive enumeration of all combinations
)

# Pruner: the strategy that stops unpromising trials early.
pruners = dict(
    BasePruner=optuna.pruners.BasePruner,                            # abstract base class for custom pruners
    MedianPruner=optuna.pruners.MedianPruner,                        # median stopping rule
    SuccessiveHalvingPruner=optuna.pruners.SuccessiveHalvingPruner,  # asynchronous successive halving
    HyperbandPruner=optuna.pruners.HyperbandPruner,                  # Hyperband
    PercentilePruner=optuna.pruners.PercentilePruner,                # keeps only the specified percentile of trials
    NopPruner=optuna.pruners.NopPruner,                              # never prunes
    ThresholdPruner=optuna.pruners.ThresholdPruner,                  # prunes when a metric crosses a threshold
    PatientPruner=optuna.pruners.PatientPruner,                      # wraps another pruner with a patience window
)

# Filled by the loading cell: [splitter][isozyme][split] -> numpy array
halflives = {}
mol_features = {}

In [3]:
# Load Jazzy features from the cached CSV files; if a cache file is missing,
# generate the features, write the cache file, and then load from it.

# Column positions in the Jazzy CSV: index, smiles, half-life, then the features.
HALFLIFE_COLUMN = 2
FEATURE_COLUMNS = slice(3, 11)
# Column whose NaN marks a molecule for which feature generation failed.
FAILURE_MARKER_COLUMN = "dgtot"


def _generate_jazzy_csv(isozyme, split, splitter):
    """Compute molecular features for one base split and cache them as CSV.

    Reads the base split (columns "index", "smiles", "half-life"), calls
    ``mol_fts`` on the SMILES, writes one column per feature next to the
    original three columns, and returns the written file re-read with pandas
    so that a first run sees exactly what later runs will load.

    Raises:
        ValueError: if ``mol_fts`` produced no usable entry for any molecule.
    """
    base_df = pd.read_csv(rel_paths[f"{isozyme}_{split}_{splitter}"])
    smiles = base_df["smiles"]
    per_mol_fts = mol_fts(smiles, isozyme)  # calculate mol features for list of smiles
    # Positional access mirrors how the result has always been consumed here.
    entries = [per_mol_fts[i] for i in range(len(per_mol_fts))]

    # A failed molecule is represented by NaN, the only value with entry != entry.
    first_valid = next((entry for entry in entries if entry == entry), None)
    if first_valid is None:
        raise ValueError(
            f"feature generation failed for every molecule in {isozyme} {split} ({splitter})"
        )

    mol_fts_df = pd.DataFrame()
    mol_fts_df["index"] = base_df["index"]
    mol_fts_df["smiles"] = smiles
    mol_fts_df["half-life"] = base_df["half-life"]
    # Take the feature names from the first successful molecule (not blindly
    # from entry 0, which may itself be a failure).
    for feature in list(first_valid):
        mol_fts_df[feature] = [
            entry[feature] if entry == entry else np.nan for entry in entries
        ]

    jazzy_rel_path = rel_paths[f"{isozyme}_jazzy_{split}_{splitter}"]
    # NOTE(review): the cache is written via abs_file_path() but read via the
    # relative path in the loop below - confirm both resolve to the same file.
    jazzy_abs_path = abs_file_path(jazzy_rel_path)
    mol_fts_df.to_csv(jazzy_abs_path, index=False)
    print(f'{jazzy_rel_path} was created successfully')
    return pd.read_csv(jazzy_abs_path)


def _jazzy_arrays(jazzy_df, split):
    """Turn a Jazzy CSV frame into ``(features, halflives)`` numpy arrays.

    Rows whose FAILURE_MARKER_COLUMN is NaN are dropped (and reported) first.
    ``split`` is only used in the progress messages.

    Raises:
        ValueError: if no row is left after dropping the failed molecules.
    """
    # rows has the form [[idx1, smi1, thalf1, fts1...], [idx2, smi2, thalf2, fts2...], ...]
    rows = [list(row) for row in zip(*(list(jazzy_df[col]) for col in jazzy_df.columns))]

    # remove all mols for which Jazzy features generation wasn't successful
    failed = np.isnan(jazzy_df[FAILURE_MARKER_COLUMN].to_numpy(dtype=float))
    for nan_idx in np.flatnonzero(failed):
        print(f"     removed index {nan_idx} corresponding to NaN in {split}")
    rows = [row for row, is_failed in zip(rows, failed) if not is_failed]
    if not rows:
        raise ValueError(f"no molecule with valid Jazzy features in {split}")
    print(f"     {len(rows)}, {rows[0]}")

    # filter out only the features
    features = np.array([row[FEATURE_COLUMNS] for row in rows])
    thalf = np.array([row[HALFLIFE_COLUMN] for row in rows])
    contains_nan = np.any(np.isnan(features))
    print(f"     {features[0]}")
    return features, thalf, contains_nan


for splitter in splitters:
    print("\n")
    print(splitter)
    mol_features[splitter] = {}
    halflives[splitter] = {}
    for isozyme in isozymes:
        mol_features[splitter][isozyme] = {}
        halflives[splitter][isozyme] = {}
        for split in data_splits:
            print(isozyme, split)

            # Only the cache read is guarded, so an unrelated FileNotFoundError
            # cannot silently trigger regeneration.
            try:
                jazzy_df = pd.read_csv(rel_paths[f"{isozyme}_jazzy_{split}_{splitter}"])
            except FileNotFoundError:
                jazzy_df = _generate_jazzy_csv(isozyme, split, splitter)

            # Same extraction path for cached and freshly generated data, so the
            # containers are always filled per split.
            features, thalf, contains_nan = _jazzy_arrays(jazzy_df, split)
            mol_features[splitter][isozyme][split] = features
            halflives[splitter][isozyme][split] = thalf
            print(f"     {isozyme} mol_features {split} contains NaN: {contains_nan}")



rand
3A4 train
     56, [63, 'CC(C)(O)c1cc(F)c2c(c1)C(=O)N(Cc1ccc(Cl)cn1)[C@@]2(OCC1(O)CC1)c1ccc(Cl)cc1', 6.0, 13.1753, 1.4177, 11.3044, -7.3559, -125.8906, -119.4889]
     [  13.1753    1.4177   11.3044   -7.3559 -125.8906 -119.4889]
     3A4 mol_features train contains NaN: False
3A4 test
     14, [23, 'Cc1ncsc1-c1ccc([C@H](CC(=O)NCCCCCCNC(=O)COc2c(-c3csc(N4CCOCC4)n3)ccc(F)c2F)NC(=O)[C@@H]2C[C@@H](O)CN2C(=O)[C@@H](NC(=O)C2(F)CC2)C(C)(C)C)cc1', 0.3767, 28.4327, 3.274, 23.4872, -4.0348, -286.1545, -257.0326]
     [  28.4327    3.274    23.4872   -4.0348 -286.1545 -257.0326]
     3A4 mol_features test contains NaN: False
RLM train
     removed index 1047 corresponding to NaN in train
     removed index 1517 corresponding to NaN in train
     2022, [158, 'O=c1cc(N2CCOCC2)oc2c1ccc1ccccc12', 30.0, 8.0486, 0.0, 5.2165, -13.5913, -63.7029, -67.7504]
     [  8.0486   0.       5.2165 -13.5913 -63.7029 -67.7504]
     RLM mol_features train contains NaN: False
RLM test
     removed index 174 c

In [5]:
# Tune every model type on the Jazzy features of every (splitter, isozyme)
# pair. Each study is stored in its own SQLite file and also dumped with joblib.
sampler = samplers['TPESampler']
# BasePruner is abstract and cannot be instantiated; NopPruner is the explicit
# "never prune" choice.
pruner = pruners["NopPruner"]
n_trials = 1
for splitter in splitters:
    if splitter == "rand":
        splitter_name = "random"
    else:
        splitter_name = "scaffold_splitter"

    for isozyme in isozymes:
        X_train = mol_features[splitter][isozyme]["train"]
        y_train = np.log(halflives[splitter][isozyme]["train"])  # natural log half-lives -> values are less spread out
        X_test = mol_features[splitter][isozyme]["test"]
        y_test = np.log(halflives[splitter][isozyme]["test"])

        # SQLite cannot create the database file inside a missing directory.
        study_dir = f"project_resources/optuna/jazzy/{splitter_name}/{isozyme}"
        Path(study_dir).mkdir(parents=True, exist_ok=True)

        for model_identifier in model_identifiers:
            print(splitter_name, isozyme, model_identifier)
            # create_study expects sampler/pruner *instances*; build fresh ones
            # per study so no state is shared between studies.
            study = optuna.create_study(study_name=model_identifier, directions=['minimize'],
                                        sampler=sampler(), pruner=pruner(),
                                        storage=f"sqlite:///{study_dir}/db.{model_identifier}", load_if_exists=True)
            # NOTE(review): the tuner is given the test split as well - confirm the
            # objective does not select hyperparameters on test data.
            tuner = HyperparamTuner(model_identifier, X_train, y_train, X_test, y_test)
            study.optimize(tuner.objective, n_trials=n_trials, n_jobs=-1)  # catch=(ValueError,)
            joblib.dump(study, f"./{study_dir}/{model_identifier}.pkl")

[I 2023-10-10 22:05:55,404] Using an existing study with name 'linear' instead of creating a new one.


random 3A4 linear


[I 2023-10-10 22:05:55,479] Trial 1 finished with value: 1.4052157267487904 and parameters: {'alpha': 0.09756333251827164, 'l1_ratio': 0.2842031992425431}. Best is trial 0 with value: 1.4038627192898365.
[I 2023-10-10 22:05:55,609] A new study created in RDB with name: KRR


random 3A4 KRR


[I 2023-10-10 22:05:55,695] Trial 0 finished with value: 1.3804652303052911 and parameters: {'alpha': 0.38984005784428544, 'gamma': 7.247000271055202e-15, 'kernel': 'rbf'}. Best is trial 0 with value: 1.3804652303052911.
[I 2023-10-10 22:05:55,822] A new study created in RDB with name: GB


random 3A4 GB


[I 2023-10-10 22:05:56,145] Trial 0 finished with value: 1.1714652221427115 and parameters: {'n_estimators': 200, 'learning_rate': 0.5975822104728432, 'max_depth': 3}. Best is trial 0 with value: 1.1714652221427115.
[I 2023-10-10 22:05:56,272] A new study created in RDB with name: RF


random 3A4 RF


[I 2023-10-10 22:05:57,381] Trial 0 finished with value: 1.3342143583288308 and parameters: {'n_estimators': 500, 'max_features': 'auto', 'max_depth': None}. Best is trial 0 with value: 1.3342143583288308.
[I 2023-10-10 22:05:57,507] A new study created in RDB with name: ANN


random 3A4 ANN


[I 2023-10-10 22:05:58,118] Trial 0 finished with value: 3.2120353102656423 and parameters: {'learning_rate_init': 0.013975285882685533, 'hidden_layer_sizes': [20]}. Best is trial 0 with value: 3.2120353102656423.
[I 2023-10-10 22:05:58,243] A new study created in RDB with name: linear
[I 2023-10-10 22:05:58,319] Trial 0 finished with value: 1.0537742063632034 and parameters: {'alpha': 0.0738166003971161, 'l1_ratio': 0.7239777216625384}. Best is trial 0 with value: 1.0537742063632034.


random RLM linear


[I 2023-10-10 22:05:58,444] A new study created in RDB with name: KRR


random RLM KRR


[I 2023-10-10 22:05:59,587] Trial 0 finished with value: 1.108322815229059 and parameters: {'alpha': 0.4491530336826782, 'gamma': 8.863021458677126e-15, 'kernel': 'linear'}. Best is trial 0 with value: 1.108322815229059.


random RLM GB


[I 2023-10-10 22:05:59,827] A new study created in RDB with name: GB
[I 2023-10-10 22:06:03,473] Trial 0 finished with value: 0.9946741084381537 and parameters: {'n_estimators': 10, 'learning_rate': 0.5062073363088264, 'max_depth': 2}. Best is trial 0 with value: 0.9946741084381537.
[I 2023-10-10 22:06:03,598] A new study created in RDB with name: RF


random RLM RF


[I 2023-10-10 22:06:11,503] Trial 0 finished with value: 0.9926985913490077 and parameters: {'n_estimators': 20, 'max_features': 'auto', 'max_depth': 2}. Best is trial 0 with value: 0.9926985913490077.
[I 2023-10-10 22:06:11,630] A new study created in RDB with name: ANN


random RLM ANN


[I 2023-10-10 22:06:18,778] Trial 0 finished with value: 1.0440669369038673 and parameters: {'learning_rate_init': 0.0437041328527414, 'hidden_layer_sizes': [20]}. Best is trial 0 with value: 1.0440669369038673.
[I 2023-10-10 22:06:18,906] A new study created in RDB with name: linear
[I 2023-10-10 22:06:18,987] Trial 0 finished with value: 0.6091618191984409 and parameters: {'alpha': 0.09067758468806048, 'l1_ratio': 0.7921924132667718}. Best is trial 0 with value: 0.6091618191984409.


random HLC linear


[I 2023-10-10 22:06:19,115] A new study created in RDB with name: KRR
[I 2023-10-10 22:06:19,189] Trial 0 finished with value: 0.8915941225887049 and parameters: {'alpha': 0.051345874009050135, 'gamma': 6.96385638171956e-15, 'kernel': 'linear'}. Best is trial 0 with value: 0.8915941225887049.


random HLC KRR


[I 2023-10-10 22:06:19,385] A new study created in RDB with name: GB


random HLC GB


[I 2023-10-10 22:06:19,819] Trial 0 finished with value: 0.6320929573668942 and parameters: {'n_estimators': 20, 'learning_rate': 0.18420158500170125, 'max_depth': 4}. Best is trial 0 with value: 0.6320929573668942.


random HLC RF


[I 2023-10-10 22:06:20,052] A new study created in RDB with name: RF
[I 2023-10-10 22:06:21,344] Trial 0 finished with value: 0.5887749788625218 and parameters: {'n_estimators': 50, 'max_features': 'auto', 'max_depth': 10}. Best is trial 0 with value: 0.5887749788625218.
[I 2023-10-10 22:06:21,468] A new study created in RDB with name: ANN


random HLC ANN


[I 2023-10-10 22:06:21,974] Trial 0 finished with value: 2.6112125638809442 and parameters: {'learning_rate_init': 0.08836646202580983, 'hidden_layer_sizes': [5, 5, 5]}. Best is trial 0 with value: 2.6112125638809442.
[I 2023-10-10 22:06:22,099] A new study created in RDB with name: linear
[I 2023-10-10 22:06:22,171] Trial 0 finished with value: 1.419257015292315 and parameters: {'alpha': 0.07369358217606732, 'l1_ratio': 0.029129653845802794}. Best is trial 0 with value: 1.419257015292315.


scaffold_splitter 3A4 linear


[I 2023-10-10 22:06:22,299] A new study created in RDB with name: KRR
[I 2023-10-10 22:06:22,370] Trial 0 finished with value: 1.4310348444717338 and parameters: {'alpha': 0.7649941054916107, 'gamma': 2.5195634920626997e-15, 'kernel': 'linear'}. Best is trial 0 with value: 1.4310348444717338.


scaffold_splitter 3A4 KRR


[I 2023-10-10 22:06:22,499] A new study created in RDB with name: GB


scaffold_splitter 3A4 GB


[I 2023-10-10 22:06:22,816] Trial 0 finished with value: 1.086335124039331 and parameters: {'n_estimators': 20, 'learning_rate': 0.22118883462600292, 'max_depth': 1}. Best is trial 0 with value: 1.086335124039331.
[I 2023-10-10 22:06:22,944] A new study created in RDB with name: RF


scaffold_splitter 3A4 RF


[I 2023-10-10 22:06:24,102] Trial 0 finished with value: 1.226473090401944 and parameters: {'n_estimators': 10, 'max_features': 'log2', 'max_depth': 2}. Best is trial 0 with value: 1.226473090401944.


scaffold_splitter 3A4 ANN


[I 2023-10-10 22:06:24,357] A new study created in RDB with name: ANN
[I 2023-10-10 22:06:24,918] Trial 0 finished with value: 3.587870843320496 and parameters: {'learning_rate_init': 0.017553126739662868, 'hidden_layer_sizes': [5, 5]}. Best is trial 0 with value: 3.587870843320496.
[I 2023-10-10 22:06:25,047] A new study created in RDB with name: linear
[I 2023-10-10 22:06:25,123] Trial 0 finished with value: 1.060306211969177 and parameters: {'alpha': 0.02264573835035236, 'l1_ratio': 0.027929688173000233}. Best is trial 0 with value: 1.060306211969177.


scaffold_splitter RLM linear


[I 2023-10-10 22:06:25,252] A new study created in RDB with name: KRR


scaffold_splitter RLM KRR


[I 2023-10-10 22:06:26,327] Trial 0 finished with value: 1.1120909107559847 and parameters: {'alpha': 0.4681177342387701, 'gamma': 3.659206904728363e-16, 'kernel': 'rbf'}. Best is trial 0 with value: 1.1120909107559847.
[I 2023-10-10 22:06:26,465] A new study created in RDB with name: GB


scaffold_splitter RLM GB


[I 2023-10-10 22:06:30,080] Trial 0 finished with value: 1.0062548683072043 and parameters: {'n_estimators': 10, 'learning_rate': 0.3270569931829844, 'max_depth': 4}. Best is trial 0 with value: 1.0062548683072043.
[I 2023-10-10 22:06:30,206] A new study created in RDB with name: RF


scaffold_splitter RLM RF


[I 2023-10-10 22:06:38,121] Trial 0 finished with value: 0.996736952308356 and parameters: {'n_estimators': 200, 'max_features': 'auto', 'max_depth': 3}. Best is trial 0 with value: 0.996736952308356.
[I 2023-10-10 22:06:38,249] A new study created in RDB with name: ANN


scaffold_splitter RLM ANN


[I 2023-10-10 22:06:46,221] Trial 0 finished with value: 1.0714399293778747 and parameters: {'learning_rate_init': 0.09388700367296468, 'hidden_layer_sizes': [5]}. Best is trial 0 with value: 1.0714399293778747.
[I 2023-10-10 22:06:46,359] A new study created in RDB with name: linear


scaffold_splitter HLC linear


[I 2023-10-10 22:06:46,435] Trial 0 finished with value: 0.49222880245650563 and parameters: {'alpha': 0.08354423739180682, 'l1_ratio': 0.6771087670427723}. Best is trial 0 with value: 0.49222880245650563.


scaffold_splitter HLC KRR


[I 2023-10-10 22:06:46,688] A new study created in RDB with name: KRR
[I 2023-10-10 22:06:46,762] Trial 0 finished with value: 0.8159988144385038 and parameters: {'alpha': 0.45138889652195213, 'gamma': 9.25112512766806e-15, 'kernel': 'laplacian'}. Best is trial 0 with value: 0.8159988144385038.
[I 2023-10-10 22:06:46,893] A new study created in RDB with name: GB


scaffold_splitter HLC GB


[I 2023-10-10 22:06:47,334] Trial 0 finished with value: 0.5036686982461732 and parameters: {'n_estimators': 10, 'learning_rate': 0.2048964546603394, 'max_depth': 5}. Best is trial 0 with value: 0.5036686982461732.
[I 2023-10-10 22:06:47,480] A new study created in RDB with name: RF


scaffold_splitter HLC RF


[I 2023-10-10 22:06:48,751] Trial 0 finished with value: 0.4514864164705113 and parameters: {'n_estimators': 50, 'max_features': 'auto', 'max_depth': None}. Best is trial 0 with value: 0.4514864164705113.
[I 2023-10-10 22:06:48,876] A new study created in RDB with name: ANN


scaffold_splitter HLC ANN


[I 2023-10-10 22:06:49,342] Trial 0 finished with value: 2.7861343135035814 and parameters: {'learning_rate_init': 0.025107979675308954, 'hidden_layer_sizes': [5, 5, 5]}. Best is trial 0 with value: 2.7861343135035814.
