In [27]:
from joblib import Parallel, delayed
from functools import partial

In [1]:
import pandas as pd
import os
import re
import csv
import sys
import joblib
import requests
import warnings
import py3Dmol
from ase import Atoms
from ase.io import read, write
from ase.calculators.singlepoint import SinglePointCalculator
from chembl_webresource_client.new_client import new_client as client
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.MolStandardize.rdMolStandardize import FragmentParent
from rdkit import DataStructs
from jazzy.api import molecular_vector_from_smiles as mol_vect
import numpy as np
import pubchempy as pcp
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
import optuna
from concurrent.futures import ThreadPoolExecutor
from optuna.storages import JournalStorage, JournalFileStorage
from tdc.single_pred import ADME
from project_resources.import_utils import NotebookFinder
from sklearn.preprocessing import MinMaxScaler
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import fp_from_smiles, parse_jazzy_df, optuna_trial_logging

  from .autonotebook import tqdm as notebook_tqdm


importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [2]:
feature_types = ["morgan", "jazzy"]
tdc_benchmarks = ["obach", "microsome", "hepatocyte"]
model_identifiers = ["linear", "KRR", "GB", "RF", "ANN"]
tdc_datasets = {}
smiles = {}
halflives = {}
mol_features = {}

# sampler - a method used to generate new sets of hyperparameters in each iteration of the optimization process
samplers = {
    'RandomSampler': optuna.samplers.RandomSampler,          # Sampler that selects hyperparameters randomly from the search space.
    'GridSampler': optuna.samplers.GridSampler,              # Sampler that performs a grid search over the hyperparameter space.
    'TPESampler': optuna.samplers.TPESampler,                # Sampler that uses a tree-structured Parzen estimator to model the objective function and sample new points from the search space.
    'CmaEsSampler': optuna.samplers.CmaEsSampler,            # Sampler that uses the Covariance Matrix Adaptation Evolution Strategy algorithm to efficiently search the hyperparameter space.
    'NSGAIISampler': optuna.samplers.NSGAIISampler,          # Multi-objective evolutionary algorithm that generates new samples using non-dominated sorting and crowding distance selection.
    'QMCSampler': optuna.samplers.QMCSampler,                # Quasi-Monte Carlo sampler that uses low-discrepancy sequences to sample the search space in a more efficient and evenly distributed way than random sampling.
    'BoTorchSampler': optuna.integration.BoTorchSampler,     # Sampler that leverages the BoTorch library for Bayesian optimization and can handle both continuous and categorical hyperparameters.
    'BruteForceSampler': optuna.samplers.BruteForceSampler,  # Sampler that exhaustively evaluates all possible combinations of hyperparameters in the search space.
}
# pruner - a technique used to eliminate unpromising trials during the course of hyperparameter optimization.
pruners = {
    'BasePruner': optuna.pruners.BasePruner,                            # This is the base class for all pruning strategies in Optuna. It provides a skeleton for implementing custom pruning strategies.
    'MedianPruner': optuna.pruners.MedianPruner,                        # A pruner that prunes unpromising trials that have median objective values, as determined in previous steps.
    'SuccessiveHalvingPruner': optuna.pruners.SuccessiveHalvingPruner,  # This pruner repeatedly splits trials into halves, discarding the lower performing half at each iteration.
    'HyperbandPruner': optuna.pruners.HyperbandPruner,                  # This pruner implements the Hyperband algorithm, which selects promising trials and runs them with different resource allocation schemes to determine the best one.
    'PercentilePruner': optuna.pruners.PercentilePruner,                # A pruner that prunes unpromising trials based on their percentile rank relative to all completed trials.
    'NopPruner': optuna.pruners.NopPruner,                              # A pruner that does nothing and does not prune any trials.
    'ThresholdPruner': optuna.pruners.ThresholdPruner,                  # This pruner prunes trials that have not reached a certain level of performance (i.e., objective value).
    'PatientPruner': optuna.pruners.PatientPruner,                      # This pruner prunes trials that do not show improvement over a certain number of steps (or epochs).
}

In [3]:
obach = ADME(name='Half_Life_Obach')
obach_split = obach.get_split()
tdc_datasets["obach"] = obach_split
microsome = ADME(name='Clearance_Microsome_AZ')
microsome_split = microsome.get_split()
tdc_datasets["microsome"] = microsome_split
hepatocyte = ADME(name='Clearance_Hepatocyte_AZ')
hepatocyte_split = hepatocyte.get_split()
tdc_datasets["hepatocyte"] = hepatocyte_split

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


In [4]:
halflives["morgan"] = {}
for benchmark in tdc_benchmarks:
    print(benchmark)
    smiles[benchmark] = {}
    halflives["morgan"][benchmark] = {}
    
    benchmark_train_smiles = tdc_datasets[benchmark]["train"]["Drug"]
    benchmark_test_smiles = tdc_datasets[benchmark]["test"]["Drug"]
    smiles[benchmark]["train"] = np.array(benchmark_train_smiles)
    smiles[benchmark]["test"] = np.array(benchmark_test_smiles)
    
    benchmark_train_halflives = tdc_datasets[benchmark]["train"]["Y"]
    benchmark_test_halflives = tdc_datasets[benchmark]["test"]["Y"]
    
    reshaped_train_halflife = np.array(benchmark_train_halflives).reshape(-1, 1)
    scaler = MinMaxScaler().fit(reshaped_train_halflife)
    train_halflife_scaled = scaler.transform(reshaped_train_halflife)
    train_halflives_scaled = np.array([val[0] for val in train_halflife_scaled])

    reshaped_test_halflife = np.array(benchmark_test_halflives).reshape(-1, 1)
    scaler = MinMaxScaler().fit(reshaped_test_halflife)
    test_halflife_scaled = scaler.transform(reshaped_test_halflife)
    test_halflives_scaled = np.array([val[0] for val in test_halflife_scaled])
    
    halflives["morgan"][benchmark]["train"] = np.array(train_halflives_scaled)
    halflives["morgan"][benchmark]["test"] = np.array(test_halflives_scaled)
    
    print(halflives["morgan"].keys())
    
    print(benchmark_train_smiles.shape, benchmark_train_halflives.shape, benchmark_test_smiles.shape, benchmark_test_halflives.shape)

obach
dict_keys(['obach'])
(467,) (467,) (133,) (133,)
microsome
dict_keys(['obach', 'microsome'])
(772,) (772,) (220,) (220,)
hepatocyte
dict_keys(['obach', 'microsome', 'hepatocyte'])
(849,) (849,) (243,) (243,)


In [5]:
mol_features["morgan"] = {}
for benchmark in tdc_benchmarks:
    print(benchmark)
    mol_features["morgan"][benchmark] = {}
    train_morgan_fps = np.array(fp_from_smiles(smiles[benchmark]["train"]))
    test_morgan_fps = np.array(fp_from_smiles(smiles[benchmark]["test"]))
    mol_features["morgan"][benchmark]["train"] = train_morgan_fps
    mol_features["morgan"][benchmark]["test"] = test_morgan_fps
    print(train_morgan_fps.shape, test_morgan_fps.shape)

obach
(467, 124) (133, 124)
microsome
(772, 124) (220, 124)
hepatocyte
(849, 124) (243, 124)


In [6]:
mol_features["jazzy"] = {}
halflives["jazzy"] = {}
for benchmark in tdc_benchmarks:
    print(benchmark)
    train_jazzy_df = pd.read_csv(f"project_resources/jazzy_splits/{benchmark}_train.csv")
    test_jazzy_df = pd.read_csv(f"project_resources/jazzy_splits/{benchmark}_test.csv")
    train_fts, train_jazzy_thalfs, contains_nan = parse_jazzy_df(train_jazzy_df, no_idx_smi=True)
    test_fts, test_jazzy_thalfs, contains_nan = parse_jazzy_df(test_jazzy_df, no_idx_smi=True)
    
    mol_features["jazzy"][benchmark] = {}
    mol_features["jazzy"][benchmark]["train"] = train_fts
    mol_features["jazzy"][benchmark]["test"] = test_fts
    halflives["jazzy"][benchmark] = {}
    halflives["jazzy"][benchmark]["train"] = train_jazzy_thalfs
    halflives["jazzy"][benchmark]["test"] = test_jazzy_thalfs
    
    print(np.array(train_fts).shape, np.array(train_jazzy_thalfs).shape, np.array(test_fts).shape, np.array(test_jazzy_thalfs).shape)

obach
     525, [0.0033169983665033, 11.3305, 0.0, 3.3351, -4.054, -69.5826, -60.2942]
     130, [0.0078064793190067, 10.9705, 1.8136, 5.8249, -16.4281, -118.9807, -121.6516]
(525, 6) (525,) (130, 6) (130,)
microsome
     882, [0.0652380952380952, 10.5072, 1.4478, 4.6964, -13.5025, -92.8889, -106.3914]
     220, [0.0884353741496598, 3.5084, 3.5128, 4.5042, -12.6756, -80.3865, -84.2257]
(882, 6) (882,) (220, 6) (220,)
hepatocyte
     970, [0.0, 9.8552, 1.4451, 4.4407, -15.1209, -91.0733, -102.926]
     243, [0.0825850340136054, 10.2098, 1.292, 5.2199, -18.4498, -95.9609, -98.199]
(970, 6) (970,) (243, 6) (243,)


In [7]:
class HyperparamTuner():
    def __init__(self, log_csv_path, model_identifier, X_train, y_train, X_test, y_test):
        self.log_csv_path = log_csv_path
        self.model_identifier = model_identifier
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    def sample_params(self, trial: optuna.Trial, model_identifier):
        if model_identifier == 'linear':
            alpha = trial.suggest_float('alpha', 1e-5, 1e-1)
            l1_ratio = trial.suggest_float('l1_ratio', 0, 1)
            return {
                "alpha": alpha,
                "l1_ratio": l1_ratio
            }, ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

        if model_identifier == 'KRR':
            alpha = trial.suggest_float("alpha", 1e-4, 1)
            gamma = trial.suggest_float("gamma", 0, 1e-14)
            kernel = trial.suggest_categorical("kernel", ["linear", "laplacian", "rbf"])
            return {
                "alpha": alpha,
                "gamma": gamma,
                "kernel": kernel
            }, KernelRidge(alpha=alpha, gamma=gamma, kernel=kernel)

        if model_identifier == 'GB':
            n_estimators = trial.suggest_categorical("n_estimators", [10, 20, 50, 200, 500])
            learning_rate = trial.suggest_float("learning_rate", 0.005, 1)
            max_depth = trial.suggest_categorical("max_depth", [1, 2, 3, 4, 5])
            return {
                "n_estimators": n_estimators,
                "learning_rate": learning_rate,
                "max_depth": max_depth
            }, GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

        if model_identifier == 'RF':
            n_estimators = trial.suggest_categorical("n_estimators", [10, 20, 50, 200, 500])
            max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])
            max_depth = trial.suggest_categorical("max_depth", [None, 2, 3, 4, 5, 10])
            return {
                "n_estimators": n_estimators,
                "max_features": max_features,
                "max_depth": max_depth
            }, RandomForestRegressor(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth)

        if model_identifier == 'ANN':
            learning_rate_init = trial.suggest_float("learning_rate_init", 0.001, 0.1)
            hidden_layer_sizes = trial.suggest_categorical("hidden_layer_sizes",
                                                           [[5], [10], [20], [50], [5]*2, [10]*2, [20]*2, [50]*2, [5]*3, [10]*3, [50]*3])
            return {
            "learning_rate_init": learning_rate_init,
            "hidden_layer_sizes": hidden_layer_sizes
            }, MLPRegressor(learning_rate_init=learning_rate_init, hidden_layer_sizes=hidden_layer_sizes)

    def cross_validation_splits(self, X_train, X_test, y_train, y_test, cv_splits=5):
        """
        Splits the data into cv_splits different combinations for cross-validation.

        Parameters:
        - X_train: Training data features
        - X_test: Testing data features
        - y_train: Training data labels
        - y_test: Testing data labels
        - cv_splits: Number of cross-validation splits

        Returns:
        - List of tuples, where each tuple contains (X_train_fold, X_test_fold, y_train_fold, y_test_fold)
        """
        # Initialize StratifiedKFold with the desired number of splits
        kf = KFold(n_splits=cv_splits, shuffle=True)  # random_state=42)

        # Initialize an empty list to store the data splits
        data_splits = []

        # Loop through the cross-validation splits
        for train_index, test_index in kf.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[test_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[test_index]

            # Append the current split to the list
            data_splits.append((X_train_fold, X_val_fold, y_train_fold, y_val_fold))

        # Append the original test data to the list
        data_splits.append((X_train, X_test, y_train, y_test))

        return data_splits

    def evaluate(self, model, X_test, y_test, return_predictions=False):
        predictions = model.predict(X_test)
        rmsd = mean_squared_error(y_test, predictions, squared=False)
        return rmsd, predictions

    def train_test_return(self, parameters, model, trial_number):
        runs = 3
        # average over all runs
        runs_results = []
        y_tests_predicted = []

        for run in range(runs):
            validation_splits = self.cross_validation_splits(self.X_train, self.X_test, self.y_train, self.y_test)
            # average over all splits in a given run
            cv_fold_results = []
            y_test_predicted = []
            fold_num = 0

            # cross-validation
            for (X_train_val, X_test_val, y_train_val, y_test_val) in validation_splits:
                fold_num += 1
                
                # train the model on the given validation split
                model.fit(X_train_val, y_train_val)
                cv_fold_rmsd, validation_predictions = self.evaluate(model, X_test_val, y_test_val)
                
                # and save the result of that split
                cv_fold_results.append(cv_fold_rmsd)
                
                # after all five folds, append the final predictions
                if fold_num == 6:
                    y_test_predicted.append(validation_predictions)

            runs_results.append(np.mean(cv_fold_results))
            y_tests_predicted.append(y_test_predicted)

        # calculate the standard deviation of predictions
        y_tests_predicted = np.array(y_tests_predicted)
        std = np.std(y_tests_predicted, axis=0)[0]
        
        average_predictions = np.average(y_tests_predicted, axis=0)[0]
        average_result = np.mean(runs_results)
        
        # write the result and hyperparameters of a run to csv file
        optuna_trial_logging(self.log_csv_path, trial_number, parameters, average_result, average_predictions, std)

        return average_result

    def objective(self, trial=None):
        parameters, model = self.sample_params(trial, self.model_identifier)
        return self.train_test_return(parameters, model, trial.number)

In [8]:
sampler = samplers['TPESampler']
pruner = pruners["HyperbandPruner"]
n_trials = 1
with ThreadPoolExecutor() as executor:
    futures = []
    for _type in feature_types:
        for benchmark in tdc_benchmarks:
            X_train = mol_features[_type][benchmark]["train"]
            y_train = halflives[_type][benchmark]["train"]
            X_test = mol_features[_type][benchmark]["test"]
            y_test = halflives[_type][benchmark]["test"]

            for model_identifier in model_identifiers:
                print(_type, benchmark, model_identifier)
                lock_obj = optuna.storages.JournalFileOpenLock(
                    f"./project_resources/optuna/{_type}/{benchmark}/{model_identifier}_journal.log"
                )

                storage = JournalStorage(
                                JournalFileStorage(f"./project_resources/optuna/{_type}/{benchmark}/{model_identifier}_journal.log", lock_obj=lock_obj)
                            )
                study = optuna.create_study(study_name=model_identifier, directions=['minimize'], pruner=pruner,
                                            storage=storage, load_if_exists=True)
                
                trial_log_csv_path = f"./project_resources/optuna/{_type}/{benchmark}/{model_identifier}.csv"
                tuner = HyperparamTuner(trial_log_csv_path, model_identifier, X_train, y_train, X_test, y_test)
                
                futures.append(executor.submit(study.optimize, tuner.objective, n_trials=n_trials))
                joblib.dump(study, f"./project_resources/optuna/{_type}/{benchmark}/{model_identifier}.pkl")
    for future in futures:
        future.result()

[I 2023-12-17 17:12:15,733] Using an existing study with name 'linear' instead of creating a new one.


morgan obach linear
morgan obach KRR


[I 2023-12-17 17:12:15,950] Using an existing study with name 'KRR' instead of creating a new one.
[I 2023-12-17 17:12:16,119] Using an existing study with name 'GB' instead of creating a new one.
[I 2023-12-17 17:12:16,121] Trial 1202 finished with value: 0.06667361802284581 and parameters: {'alpha': 0.018537127958601657, 'l1_ratio': 0.518882288269876}. Best is trial 1007 with value: 0.05832439668234344.


morgan obach GB
Successfully updated linear.csv with results of trial 1202
morgan obach RF


[I 2023-12-17 17:12:16,302] Using an existing study with name 'RF' instead of creating a new one.
[I 2023-12-17 17:12:16,489] Using an existing study with name 'ANN' instead of creating a new one.


morgan obach ANN
morgan microsome linear


[I 2023-12-17 17:12:16,755] Using an existing study with name 'linear' instead of creating a new one.
[I 2023-12-17 17:12:16,953] Trial 1201 finished with value: 0.06349276693262527 and parameters: {'alpha': 0.2625922342887563, 'gamma': 2.252945740857649e-16, 'kernel': 'rbf'}. Best is trial 1154 with value: 0.05763325633365717.
[I 2023-12-17 17:12:16,956] Using an existing study with name 'KRR' instead of creating a new one.


morgan microsome KRR
Successfully updated KRR.csv with results of trial 1201
morgan microsome GB


[I 2023-12-17 17:12:17,167] Using an existing study with name 'GB' instead of creating a new one.
[I 2023-12-17 17:12:17,260] Trial 1201 finished with value: 0.2890649394527991 and parameters: {'alpha': 0.06130801761853888, 'l1_ratio': 0.05322476970582926}. Best is trial 1159 with value: 0.28783839269336703.


morgan microsome RF
Successfully updated linear.csv with results of trial 1201


[I 2023-12-17 17:12:17,398] Using an existing study with name 'RF' instead of creating a new one.
[I 2023-12-17 17:12:17,617] Using an existing study with name 'ANN' instead of creating a new one.


morgan microsome ANN
morgan hepatocyte linear


[I 2023-12-17 17:12:17,952] Using an existing study with name 'linear' instead of creating a new one.


morgan hepatocyte KRR


[I 2023-12-17 17:12:18,184] Using an existing study with name 'KRR' instead of creating a new one.


morgan hepatocyte GB


[I 2023-12-17 17:12:18,429] Using an existing study with name 'GB' instead of creating a new one.
[I 2023-12-17 17:12:18,548] Trial 1201 finished with value: 0.3261168240349056 and parameters: {'alpha': 0.04604510437675812, 'l1_ratio': 0.10319759046147191}. Best is trial 1107 with value: 0.32443212075276023.


morgan hepatocyte RF
Successfully updated linear.csv with results of trial 1201


[I 2023-12-17 17:12:18,722] Using an existing study with name 'RF' instead of creating a new one.
[I 2023-12-17 17:12:18,788] Trial 1201 finished with value: 0.30546019610756997 and parameters: {'alpha': 0.1785407332397495, 'gamma': 6.141941910422675e-15, 'kernel': 'laplacian'}. Best is trial 580 with value: 0.3041671844687009.
[I 2023-12-17 17:12:18,844] Trial 1001 finished with value: 0.09304810932026464 and parameters: {'n_estimators': 10, 'learning_rate': 0.6901048398192198, 'max_depth': 3}. Best is trial 590 with value: 0.06843477254509721.


Successfully updated KRR.csv with results of trial 1201
morgan hepatocyte ANN
Successfully updated GB.csv with results of trial 1001


[I 2023-12-17 17:12:18,982] Using an existing study with name 'ANN' instead of creating a new one.


jazzy obach linear


[I 2023-12-17 17:12:19,408] Using an existing study with name 'linear' instead of creating a new one.


jazzy obach KRR


[I 2023-12-17 17:12:19,648] Using an existing study with name 'KRR' instead of creating a new one.


jazzy obach GB
Successfully updated KRR.csv with results of trial 1201
Successfully updated linear.csv with results of trial 1201


[I 2023-12-17 17:12:19,924] Trial 1201 finished with value: 0.3386709898012134 and parameters: {'alpha': 0.5025235230219876, 'gamma': 9.658854848797873e-15, 'kernel': 'rbf'}. Best is trial 1094 with value: 0.33833623719315026.
[I 2023-12-17 17:12:19,930] Using an existing study with name 'GB' instead of creating a new one.
[I 2023-12-17 17:12:19,935] Trial 1201 finished with value: 0.06353120657874102 and parameters: {'alpha': 0.0681160254857361, 'l1_ratio': 0.33006347800048336}. Best is trial 76 with value: 0.05617515370850514.
[I 2023-12-17 17:12:20,038] Trial 1201 finished with value: 0.06515377107546293 and parameters: {'n_estimators': 20, 'max_features': 'sqrt', 'max_depth': 2}. Best is trial 1103 with value: 0.058874125865533845.
[I 2023-12-17 17:12:20,172] Using an existing study with name 'RF' instead of creating a new one.


jazzy obach RF
Successfully updated RF.csv with results of trial 1201


[I 2023-12-17 17:12:20,424] Using an existing study with name 'ANN' instead of creating a new one.


jazzy obach ANN
jazzy microsome linear


[I 2023-12-17 17:12:20,735] Using an existing study with name 'linear' instead of creating a new one.
[I 2023-12-17 17:12:20,774] Trial 1201 finished with value: 0.06164069792792839 and parameters: {'alpha': 0.9366166495009393, 'gamma': 8.507692915949687e-15, 'kernel': 'rbf'}. Best is trial 1144 with value: 0.05888163806771656.


Successfully updated KRR.csv with results of trial 1201
jazzy microsome KRR


[I 2023-12-17 17:12:21,113] Using an existing study with name 'KRR' instead of creating a new one.


jazzy microsome GB
Successfully updated linear.csv with results of trial 1201


[I 2023-12-17 17:12:21,380] Using an existing study with name 'GB' instead of creating a new one.
[I 2023-12-17 17:12:21,388] Trial 1201 finished with value: 0.29887574015642876 and parameters: {'alpha': 0.08193895939576343, 'l1_ratio': 0.4105370634966366}. Best is trial 1182 with value: 0.2980294497536738.


jazzy microsome RF


[I 2023-12-17 17:12:21,695] Using an existing study with name 'RF' instead of creating a new one.


jazzy microsome ANN


[I 2023-12-17 17:12:22,003] Using an existing study with name 'ANN' instead of creating a new one.
[I 2023-12-17 17:12:22,154] Trial 1201 finished with value: 0.29670224365887593 and parameters: {'alpha': 0.7715950286933926, 'gamma': 2.319782213079134e-15, 'kernel': 'linear'}. Best is trial 724 with value: 0.2956542734080018.


jazzy hepatocyte linear
Successfully updated KRR.csv with results of trial 1201


[I 2023-12-17 17:12:22,279] Using an existing study with name 'linear' instead of creating a new one.


jazzy hepatocyte KRR


[I 2023-12-17 17:12:22,542] Using an existing study with name 'KRR' instead of creating a new one.


jazzy hepatocyte GB
Successfully updated linear.csv with results of trial 1201


[I 2023-12-17 17:12:22,815] Trial 1201 finished with value: 0.3338284599330294 and parameters: {'alpha': 0.030252963491125964, 'l1_ratio': 0.5941734280432983}. Best is trial 1080 with value: 0.3329620582740734.
[I 2023-12-17 17:12:22,823] Using an existing study with name 'GB' instead of creating a new one.


jazzy hepatocyte RF


[I 2023-12-17 17:12:23,299] Using an existing study with name 'RF' instead of creating a new one.
[I 2023-12-17 17:12:23,436] Trial 1102 finished with value: 0.0615650650155006 and parameters: {'n_estimators': 10, 'learning_rate': 0.027130580419708512, 'max_depth': 3}. Best is trial 99 with value: 0.05917832364524326.


jazzy hepatocyte ANN
Successfully updated GB.csv with results of trial 1102


[I 2023-12-17 17:12:23,636] Using an existing study with name 'ANN' instead of creating a new one.
[I 2023-12-17 17:12:23,920] Trial 1201 finished with value: 0.33884006001965034 and parameters: {'alpha': 0.332320881970816, 'gamma': 3.5223164507414932e-15, 'kernel': 'laplacian'}. Best is trial 1003 with value: 0.338662400699703.


Successfully updated KRR.csv with results of trial 1201


[I 2023-12-17 17:12:25,174] Trial 1201 finished with value: 0.3307067912995554 and parameters: {'n_estimators': 10, 'max_features': 'sqrt', 'max_depth': 3}. Best is trial 1029 with value: 0.32781508966158346.


Successfully updated RF.csv with results of trial 1201


[I 2023-12-17 17:12:27,181] Trial 1102 finished with value: 0.30094807935879303 and parameters: {'n_estimators': 20, 'learning_rate': 0.24192275021364867, 'max_depth': 5}. Best is trial 861 with value: 0.2879621591584269.


Successfully updated GB.csv with results of trial 1102


[I 2023-12-17 17:12:27,532] Trial 1201 finished with value: 0.07417765890305218 and parameters: {'learning_rate_init': 0.030703527554322707, 'hidden_layer_sizes': [10]}. Best is trial 1021 with value: 0.06688075265089947.


Successfully updated ANN.csv with results of trial 1201


[I 2023-12-17 17:12:28,054] Trial 1102 finished with value: 0.3309167455382222 and parameters: {'n_estimators': 20, 'learning_rate': 0.07065431629303412, 'max_depth': 5}. Best is trial 1069 with value: 0.3283807093809974.


Successfully updated GB.csv with results of trial 1102


[I 2023-12-17 17:12:31,533] Trial 1201 finished with value: 0.3388470384405495 and parameters: {'learning_rate_init': 0.09969259003136288, 'hidden_layer_sizes': [10]}. Best is trial 1186 with value: 0.3377094095552518.


Successfully updated ANN.csv with results of trial 1201


[I 2023-12-17 17:12:35,548] Trial 1173 finished with value: 0.10585132272506141 and parameters: {'learning_rate_init': 0.027868774372077538, 'hidden_layer_sizes': [10, 10, 10]}. Best is trial 1008 with value: 0.07157636029273418.


Successfully updated ANN.csv with results of trial 1173


[I 2023-12-17 17:12:41,342] Trial 1147 finished with value: 0.35143903593513026 and parameters: {'learning_rate_init': 0.02104829827015474, 'hidden_layer_sizes': [5, 5, 5]}. Best is trial 1067 with value: 0.3363088992610564.


Successfully updated ANN.csv with results of trial 1147


[I 2023-12-17 17:12:45,146] Trial 1102 finished with value: 0.31984236232106117 and parameters: {'n_estimators': 200, 'learning_rate': 0.472893147995886, 'max_depth': 3}. Best is trial 331 with value: 0.2829879661329832.


Successfully updated GB.csv with results of trial 1102


[I 2023-12-17 17:12:49,663] Trial 1124 finished with value: 0.3192601294758885 and parameters: {'learning_rate_init': 0.063007092141672, 'hidden_layer_sizes': [50, 50]}. Best is trial 1003 with value: 0.3055344753665534.


Successfully updated ANN.csv with results of trial 1124


[I 2023-12-17 17:12:50,246] Trial 1142 finished with value: 0.3285672191442115 and parameters: {'learning_rate_init': 0.040824442352958824, 'hidden_layer_sizes': [50, 50, 50]}. Best is trial 1108 with value: 0.3017441531185905.
[I 2023-12-17 17:12:50,345] Trial 1127 finished with value: 0.06363277359066731 and parameters: {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 2}. Best is trial 1091 with value: 0.059055450943547895.


Successfully updated ANN.csv with results of trial 1142
Successfully updated RF.csv with results of trial 1127


[I 2023-12-17 17:12:53,034] Trial 1123 finished with value: 0.27316324886330395 and parameters: {'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': None}. Best is trial 1006 with value: 0.27173792118810963.


Successfully updated RF.csv with results of trial 1123


[I 2023-12-17 17:12:54,249] Trial 1122 finished with value: 0.2822410657749162 and parameters: {'n_estimators': 200, 'max_features': 'log2', 'max_depth': None}. Best is trial 1115 with value: 0.27912236584646655.


Successfully updated RF.csv with results of trial 1122


[I 2023-12-17 17:12:56,259] Trial 1102 finished with value: 0.334031268298576 and parameters: {'n_estimators': 500, 'learning_rate': 0.11466223750037101, 'max_depth': 2}. Best is trial 949 with value: 0.3188718800318547.


Successfully updated GB.csv with results of trial 1102


[I 2023-12-17 17:13:07,375] Trial 1151 finished with value: 0.3188620998283546 and parameters: {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': None}. Best is trial 1083 with value: 0.3165965720272566.


Successfully updated RF.csv with results of trial 1151


In [30]:
class FOO():
    def __init__(self, log_csv_path, model_identifier, X_train, y_train, X_test, y_test):
        self.log_csv_path = log_csv_path
        self.model_identifier = model_identifier
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    def sample_params(self, trial: optuna.Trial, model_identifier):
        if model_identifier == 'linear':
            alpha = trial.suggest_float('alpha', 1e-5, 1e-1)
            l1_ratio = trial.suggest_float('l1_ratio', 0, 1)
            return {
                "alpha": alpha,
                "l1_ratio": l1_ratio
            }, ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

        if model_identifier == 'KRR':
            alpha = trial.suggest_float("alpha", 1e-4, 1)
            gamma = trial.suggest_float("gamma", 0, 1e-14)
            kernel = trial.suggest_categorical("kernel", ["linear", "laplacian", "rbf"])
            return {
                "alpha": alpha,
                "gamma": gamma,
                "kernel": kernel
            }, KernelRidge(alpha=alpha, gamma=gamma, kernel=kernel)

        if model_identifier == 'GB':
            n_estimators = trial.suggest_categorical("n_estimators", [10, 20, 50, 200, 500])
            learning_rate = trial.suggest_float("learning_rate", 0.005, 1)
            max_depth = trial.suggest_categorical("max_depth", [1, 2, 3, 4, 5])
            return {
                "n_estimators": n_estimators,
                "learning_rate": learning_rate,
                "max_depth": max_depth
            }, GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

        if model_identifier == 'RF':
            n_estimators = trial.suggest_categorical("n_estimators", [10, 20, 50, 200, 500])
            max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])
            max_depth = trial.suggest_categorical("max_depth", [None, 2, 3, 4, 5, 10])
            return {
                "n_estimators": n_estimators,
                "max_features": max_features,
                "max_depth": max_depth
            }, RandomForestRegressor(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth)

        if model_identifier == 'ANN':
            learning_rate_init = trial.suggest_float("learning_rate_init", 0.001, 0.1)
            hidden_layer_sizes = trial.suggest_categorical("hidden_layer_sizes",
                                                           [[5], [10], [20], [50], [5]*2, [10]*2, [20]*2, [50]*2, [5]*3, [10]*3, [50]*3])
            return {
            "learning_rate_init": learning_rate_init,
            "hidden_layer_sizes": hidden_layer_sizes
            }, MLPRegressor(learning_rate_init=learning_rate_init, hidden_layer_sizes=hidden_layer_sizes)

    def cross_validation_splits(self, X_train, X_test, y_train, y_test, cv_splits=5):
        """
        Splits the data into cv_splits different combinations for cross-validation.

        Parameters:
        - X_train: Training data features
        - X_test: Testing data features
        - y_train: Training data labels
        - y_test: Testing data labels
        - cv_splits: Number of cross-validation splits

        Returns:
        - List of tuples, where each tuple contains (X_train_fold, X_test_fold, y_train_fold, y_test_fold)
        """
        # Initialize StratifiedKFold with the desired number of splits
        kf = KFold(n_splits=cv_splits, shuffle=True)  # random_state=42)

        # Initialize an empty list to store the data splits
        data_splits = []

        # Loop through the cross-validation splits
        for train_index, test_index in kf.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[test_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[test_index]

            # Append the current split to the list
            data_splits.append((X_train_fold, X_val_fold, y_train_fold, y_val_fold))

        # Append the original test data to the list
        data_splits.append((X_train, X_test, y_train, y_test))

        return data_splits

    def evaluate(self, model, X_test, y_test, return_predictions=False):
        predictions = model.predict(X_test)
        rmsd = mean_squared_error(y_test, predictions, squared=False)
        return rmsd, predictions

    def train_predict(self, model, X_train, X_test, y_train, y_test):
        validation_splits = self.cross_validation_splits(self.X_train, self.X_test, self.y_train, self.y_test)
        # average over all splits in a given run
        cv_fold_results = []
        y_test_predicted = []
        fold_num = 0

        # cross-validation
        for (X_train_val, X_test_val, y_train_val, y_test_val) in validation_splits:
            fold_num += 1

            # train the model on the given validation split
            model.fit(X_train_val, y_train_val)
            cv_fold_rmsd, validation_predictions = self.evaluate(model, X_test_val, y_test_val)

            # and save the result of that split
            cv_fold_results.append(cv_fold_rmsd)

            # after all five folds, append the final predictions
            if fold_num == 6:
                y_test_predicted.append(validation_predictions)

        return np.mean(cv_fold_results), y_test_predicted
    
    def train_test_return(self, parameters, model, trial_number):
        runs = 3
        # average over all runs
        runs_rmsds = []
        y_tests_predicted = []
        
        partial_train_predict = partial(self.train_predict, model, self.X_train, self.X_test, self.y_train, self.y_test)
        
        runs_results = Parallel(n_jobs=-1)(
            delayed(partial_train_predict)() for run in range(runs)
        )

        runs_rmsds, y_tests_predicted = zip(*results)

        # calculate the standard deviation of predictions
        y_tests_predicted = np.array(y_tests_predicted)
        std = np.std(y_tests_predicted, axis=0)[0]
        
        average_predictions = np.average(y_tests_predicted, axis=0)[0]
        average_rmsd = np.mean(runs_rmsds)
        
        # write the result and hyperparameters of a run to csv file
        optuna_trial_logging(self.log_csv_path, trial_number, parameters, average_rmsd, average_predictions, std)

        return average_rmsd

    def objective(self, trial=None):
        parameters, model = self.sample_params(trial, self.model_identifier)
        return self.train_test_return(parameters, model, trial.number)

In [34]:
sampler = samplers['TPESampler']
pruner = pruners["HyperbandPruner"]
n_trials = 1
for _type in feature_types:
    for benchmark in tdc_benchmarks:
        X_train = mol_features[_type][benchmark]["train"]
        y_train = halflives[_type][benchmark]["train"]
        X_test = mol_features[_type][benchmark]["test"]
        y_test = halflives[_type][benchmark]["test"]

        for model_identifier in model_identifiers:
            print(_type, benchmark, model_identifier)
            lock_obj = optuna.storages.JournalFileOpenLock(
                f"./project_resources/optuna/{_type}/{benchmark}/{model_identifier}_journal.log"
            )

            storage = JournalStorage(
                            JournalFileStorage(f"./project_resources/optuna/{_type}/{benchmark}/{model_identifier}_journal.log", lock_obj=lock_obj)
                        )
            study = optuna.create_study(study_name=model_identifier, directions=['minimize'], pruner=pruner,
                                        storage=storage, load_if_exists=True)

            trial_log_csv_path = f"./project_resources/optuna/{_type}/{benchmark}/{model_identifier}.csv"
            tuner = FOO(trial_log_csv_path, model_identifier, X_train, y_train, X_test, y_test)

            study.optimize(tuner.objective, n_trials=n_trials, n_jobs=-1)
            joblib.dump(study, f"./project_resources/optuna/{_type}/{benchmark}/{model_identifier}.pkl")

[I 2023-12-17 18:32:17,042] Using an existing study with name 'linear' instead of creating a new one.


morgan obach linear


[W 2023-12-17 18:32:19,088] Trial 1212 failed with parameters: {'alpha': 0.01420339305330507, 'l1_ratio': 0.8060740387091849} because of the following error: BrokenProcessPool('A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.').
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\Lukas\anaconda3\envs\soc\Lib\site-packages\joblib\externals\loky\process_executor.py", line 426, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lukas\anaconda3\envs\soc\Lib\multiprocessing\queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'project_resources.cytochrome_P450'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\

morgan obach KRR


[W 2023-12-17 18:32:21,262] Trial 1210 failed with parameters: {'alpha': 0.2153667170857962, 'gamma': 3.2602102606395927e-16, 'kernel': 'rbf'} because of the following error: BrokenProcessPool('A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.').
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\Lukas\anaconda3\envs\soc\Lib\site-packages\joblib\externals\loky\process_executor.py", line 426, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lukas\anaconda3\envs\soc\Lib\multiprocessing\queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'project_resources.cytochrome_P450'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):


morgan obach GB


[W 2023-12-17 18:32:23,462] Trial 1010 failed with parameters: {'n_estimators': 10, 'learning_rate': 0.49728044655000503, 'max_depth': 1} because of the following error: BrokenProcessPool('A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.').
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\Lukas\anaconda3\envs\soc\Lib\site-packages\joblib\externals\loky\process_executor.py", line 426, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lukas\anaconda3\envs\soc\Lib\multiprocessing\queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'project_resources.cytochrome_P450'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  Fil

morgan obach RF


[I 2023-12-17 18:32:23,853] Using an existing study with name 'RF' instead of creating a new one.
[W 2023-12-17 18:32:25,891] Trial 1210 failed with parameters: {'n_estimators': 20, 'max_features': 'sqrt', 'max_depth': 2} because of the following error: BrokenProcessPool('A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.').
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\Lukas\anaconda3\envs\soc\Lib\site-packages\joblib\externals\loky\process_executor.py", line 426, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lukas\anaconda3\envs\soc\Lib\multiprocessing\queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'project_resources.cytochrome_P450'
"""

The above exception was th

KeyboardInterrupt: 