In [1]:
import optuna
import joblib
import numpy as np
import sys
from concurrent.futures import ThreadPoolExecutor
from optuna.storages import JournalStorage, JournalFileStorage
from tdc.single_pred import ADME
from scipy.stats import spearmanr
from sklearn.metrics import r2_score
from jazzy.api import molecular_vector_from_smiles as mol_vect
import matplotlib.pyplot as plt
from project_resources.import_utils import NotebookFinder
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import MinMaxScaler
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import fp_from_smiles, HyperparamTuner, tanimoto, create_interactive_scatter_plot

importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [2]:
model_identifiers = ["linear", "KRR", "GB", "RF", "ANN"]
splitters = ["rand", "scaff", "time"]
data_splits = ["train", "test"]
feature_types = ["morgan", "jazzy"]

# sampler - a method used to generate new sets of hyperparameters in each iteration of the optimization process
samplers = {
    'RandomSampler': optuna.samplers.RandomSampler,          # Sampler that selects hyperparameters randomly from the search space.
    'GridSampler': optuna.samplers.GridSampler,              # Sampler that performs a grid search over the hyperparameter space.
    'TPESampler': optuna.samplers.TPESampler,                # Sampler that uses a tree-structured Parzen estimator to model the objective function and sample new points from the search space.
    'CmaEsSampler': optuna.samplers.CmaEsSampler,            # Sampler that uses the Covariance Matrix Adaptation Evolution Strategy algorithm to efficiently search the hyperparameter space.
    'NSGAIISampler': optuna.samplers.NSGAIISampler,          # Multi-objective evolutionary algorithm that generates new samples using non-dominated sorting and crowding distance selection.
    'QMCSampler': optuna.samplers.QMCSampler,                # Quasi-Monte Carlo sampler that uses low-discrepancy sequences to sample the search space in a more efficient and evenly distributed way than random sampling.
    'BoTorchSampler': optuna.integration.BoTorchSampler,     # Sampler that leverages the BoTorch library for Bayesian optimization and can handle both continuous and categorical hyperparameters.
    'BruteForceSampler': optuna.samplers.BruteForceSampler,  # Sampler that exhaustively evaluates all possible combinations of hyperparameters in the search space.
}
# pruner - a technique used to eliminate unpromising trials during the course of hyperparameter optimization.
pruners = {
    'BasePruner': optuna.pruners.BasePruner,                            # This is the base class for all pruning strategies in Optuna. It provides a skeleton for implementing custom pruning strategies.
    'MedianPruner': optuna.pruners.MedianPruner,                        # A pruner that prunes unpromising trials that have median objective values, as determined in previous steps.
    'SuccessiveHalvingPruner': optuna.pruners.SuccessiveHalvingPruner,  # This pruner repeatedly splits trials into halves, discarding the lower performing half at each iteration.
    'HyperbandPruner': optuna.pruners.HyperbandPruner,                  # This pruner implements the Hyperband algorithm, which selects promising trials and runs them with different resource allocation schemes to determine the best one.
    'PercentilePruner': optuna.pruners.PercentilePruner,                # A pruner that prunes unpromising trials based on their percentile rank relative to all completed trials.
    'NopPruner': optuna.pruners.NopPruner,                              # A pruner that does nothing and does not prune any trials.
    'ThresholdPruner': optuna.pruners.ThresholdPruner,                  # This pruner prunes trials that have not reached a certain level of performance (i.e., objective value).
    'PatientPruner': optuna.pruners.PatientPruner,                      # This pruner prunes trials that do not show improvement over a certain number of steps (or epochs).
}

In [3]:
data = ADME(name='Clearance_Hepatocyte_AZ')
split = data.get_split()

Found local copy...
Loading...
Done!


In [4]:
train_smiles = np.array(list(split["train"]["Drug"]) + list(split["valid"]["Drug"]))
train_halflives = np.array(list(split["train"]["Y"]) + list(split["valid"]["Y"]))
test_smiles = np.array(list(split["test"]["Drug"]) + list(split["test"]["Drug"]))
test_halflives = np.array(list(split["test"]["Y"]) + list(split["test"]["Y"]))

reshaped_train_halflife = np.array(train_halflives).reshape(-1, 1)
scaler = MinMaxScaler().fit(reshaped_train_halflife)
train_halflife_scaled = scaler.transform(reshaped_train_halflife)
train_halflives_scaled = np.array([val[0] for val in train_halflife_scaled])

reshaped_test_halflife = np.array(test_halflives).reshape(-1, 1)
scaler = MinMaxScaler().fit(reshaped_test_halflife)
test_halflife_scaled = scaler.transform(reshaped_test_halflife)
test_halflives_scaled = np.array([val[0] for val in test_halflife_scaled])

print(train_smiles.shape, train_halflives_scaled.shape, test_smiles.shape, test_halflives_scaled.shape)

(970,) (970,) (486,) (486,)


In [5]:
train_jazzy_fps = []
train_jazzy_thalfs = []
test_jazzy_fps = []
test_jazzy_thalfs = []

for smi, thalf in zip(train_smiles, train_halflives_scaled):
    try:
        jazzy_fp = mol_vect(smi)
    except:
        jazzy_fp = None
    if jazzy_fp and not np.isnan(np.array(list(jazzy_fp.values()))).any():
        jazzy_fp_list = np.array([fp for fp in jazzy_fp.values()])
        train_jazzy_fps.append(jazzy_fp_list)
        train_jazzy_thalfs.append(thalf)

for smi, thalf in zip(test_smiles, test_halflives_scaled):
    try:
        jazzy_fp = mol_vect(smi)
    except:
        jazzy_fp = None
    if jazzy_fp and not np.isnan(np.array(list(jazzy_fp.values()))).any():
        jazzy_fp_list = np.array([fp for fp in jazzy_fp.values()])
        test_jazzy_fps.append(jazzy_fp_list)
        test_jazzy_thalfs.append(thalf)

In [6]:
train_jazzy_fps = np.array(train_jazzy_fps)
train_jazzy_thalfs = np.array(train_jazzy_thalfs)
test_jazzy_fps = np.array(test_jazzy_fps)
test_jazzy_thalfs = np.array(test_jazzy_thalfs)
print(train_jazzy_fps.shape, train_jazzy_thalfs.shape, test_jazzy_fps.shape, test_jazzy_thalfs.shape)

(970, 6) (970,) (486, 6) (486,)


In [7]:
train_morgan_fps = np.array(fp_from_smiles(train_smiles))
test_morgan_fps = np.array(fp_from_smiles(test_smiles))
print(train_morgan_fps.shape, test_morgan_fps.shape)

(970, 124) (486, 124)


In [8]:
best_model_hyperparams = {}
for _type in feature_types:
    best_model_hyperparams[_type] = {}
    for model_id in model_identifiers:
        jl = joblib.load(f"project_resources/optuna/AZ_Hepatocyte/{_type}/{model_id}.pkl")
        best_model_hyperparams[_type][model_id] = jl.best_trial.params
print(best_model_hyperparams["morgan"]["linear"])

OperationalError: (sqlite3.OperationalError) unable to open database file
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [None]:
y_predicted = {}
rmsds = {}
stds = {}
best_models = {}
# test models with the best hyperparams
for _type in feature_types:
    y_predicted[_type] = {}
    rmsds[_type] = {}
    stds[_type] = {}
    best_models[_type] = {}
    group_rmsds = {}
    for model_id in model_identifiers:
        hyperparams = best_model_hyperparams[_type][model_id]
        if _type == "morgan":
            X_train = train_morgan_fps
            y_train = train_halflives_scaled
            X_test = test_morgan_fps
            y_test = test_halflives_scaled

        elif _type == "jazzy":
            X_train = train_jazzy_fps
            y_train = train_jazzy_thalfs
            X_test = test_jazzy_fps
            y_test = test_jazzy_thalfs

        if model_id == 'linear':
            alpha = hyperparams["alpha"]
            l1_ratio = hyperparams["l1_ratio"]
            reg = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True)

        if model_id == 'KRR':
            alpha = hyperparams["alpha"]
            gamma = hyperparams["gamma"]
            kernel = hyperparams["kernel"]
            reg = KernelRidge(alpha=alpha, gamma=gamma, kernel=kernel)

        if model_id == 'GB':
            n_estimators = hyperparams["n_estimators"]
            learning_rate = hyperparams["learning_rate"]
            max_depth = hyperparams["max_depth"]
            reg = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

        if model_id == 'RF':
            n_estimators = hyperparams["n_estimators"]
            max_features = hyperparams["max_features"]
            max_depth = hyperparams["max_depth"]
            reg = RandomForestRegressor(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth)

        if model_id == 'ANN':
            learning_rate_init = hyperparams["learning_rate_init"]
            hidden_layer_sizes = hyperparams["hidden_layer_sizes"]
            reg = MLPRegressor(learning_rate_init=learning_rate_init, hidden_layer_sizes=hidden_layer_sizes)

        # create an instance of HyperparamTuner without specifying any model_identifier
        tuner = HyperparamTuner("foo", X_train, y_train, X_test, y_test)
        # and use the train_test_return function with return_predictions to get the rmsd values
        # average over multiple runs of the same model
        runs_rmsds = []
        runs_y_test_predictions = []
        runs_stds = []
        for i in range(10):
            rmsd, y_test_predictions, std = tuner.train_test_return("foo", reg, return_predictions=True)
            runs_rmsds.append(rmsd)
            runs_y_test_predictions.append(y_test_predictions)
            runs_stds.append(std)
        mean_rmsd = np.mean(runs_rmsds, axis=0)
        mean_y_test_predictions = np.mean(runs_y_test_predictions, axis=0)
        mean_stds = np.mean(runs_stds, axis=0)
        group_rmsds[mean_rmsd] = model_id
        rmsds[_type][model_id] = mean_rmsd
        y_predicted[_type][model_id] = mean_y_test_predictions
        stds[_type][model_id] = mean_stds
        print(mean_rmsd, f"y_test predictions: {y_test_predictions[:5]}, {len(y_test_predictions)}")
        print(f"     standard deviations: {mean_stds[:4]}, {len(mean_stds)}")

    # find best model for each dataset and its rmsd
    min_rmsd = min(group_rmsds.keys())
    best_model = group_rmsds[min_rmsd]
    best_models[_type] = (best_model, min_rmsd)
    print(f"best was {best_model} with rmsd {min_rmsd}")

    print("\n")

In [None]:
tanimoto_similarities = tanimoto(test_morgan_fps, train_morgan_fps)
median = np.median(tanimoto_similarities)
mean = np.mean(tanimoto_similarities)
print(f"length: {len(tanimoto_similarities)}, median: {median}, arithmetic mean: {mean}, ",
      tanimoto_similarities[:10])

In [None]:
model_id = best_models["morgan"][0]
create_interactive_scatter_plot(tanimoto_similarities, y_predicted["morgan"][model_id],
                                "Tanimoto Similarity", "Predicted Test Half-life",
                                "Obach Tanimoto Similartiy Between Test and Train", "Molecular Similarity vs. Predicted Values")

In [None]:
model_id = best_models["morgan"][0]
create_interactive_scatter_plot(test_halflives_scaled, y_predicted["morgan"][model_id],
                                "Real Test Half-life", "Predicted Test Half-life",
                                f"Prediction of the Obach Dataset Using {model_id} on the Morgan Fingerprints",
                                "Real vs. Predicted Values", include_diagonal=True, y_error=stds["morgan"][model_id])

In [None]:
model_id = best_models["jazzy"][0]
create_interactive_scatter_plot(test_halflives_scaled, y_predicted["jazzy"][model_id],
                                "Real Test Half-life", "Predicted Test Half-life",
                                f"Prediction of the Obach Dataset Using {model_id} on the Jazzy Fingerprints",
                                "Real vs. Predicted Values", include_diagonal=True, y_error=stds["jazzy"][model_id])

In [None]:
plt.hist(train_halflives_scaled)
plt.hist(train_halflives_scaled, range=(0, 0.1))

In [None]:
plt.hist(train_halflives_scaled, range=(0, 0.2))

In [None]:
plt.hist(test_halflives_scaled)
plt.hist(train_halflives_scaled, range=(0, 0.1))

In [None]:
plt.hist(test_halflives_scaled, range=(0, 0.2))

In [None]:
model_id = best_models["morgan"][0]
print("morgan")
print(spearmanr(test_halflives_scaled, y_predicted["morgan"][model_id]))
print("R^2 score:", r2_score(test_halflives_scaled, y_predicted["morgan"][model_id]))

In [None]:
model_id = best_models["jazzy"][0]
print("jazzy")
print(spearmanr(test_jazzy_thalfs, y_predicted["jazzy"][model_id]))
print("R^2 score:", r2_score(test_jazzy_thalfs, y_predicted["jazzy"][model_id]))

In [None]:
sampler = samplers['TPESampler']
pruner = pruners["BasePruner"]
n_trials = 200
with ThreadPoolExecutor() as executor:
    futures = []
    for _type in feature_types:
        if _type == "morgan":
            X_train = train_morgan_fps
            y_train = train_halflives_scaled
            X_test = test_morgan_fps
            y_test = test_halflives_scaled

        elif _type == "jazzy":
            X_train = train_jazzy_fps
            y_train = train_jazzy_thalfs
            X_test = test_jazzy_fps
            y_test = test_jazzy_thalfs

        for model_identifier in model_identifiers:
            print(model_identifier)
            lock_obj = optuna.storages.JournalFileOpenLock(
                f"./project_resources/optuna/AZ_Hepatocyte/{_type}/{model_identifier}_journal.log"
            )

            storage = JournalStorage(
                            JournalFileStorage(f"./project_resources/optuna/AZ_Hepatocyte/{_type}/{model_identifier}_journal.log", lock_obj=lock_obj)
                        )
            study = optuna.create_study(study_name=model_identifier, directions=['minimize'], pruner=pruner,
                                        storage=storage, load_if_exists=True)
            tuner = HyperparamTuner(model_identifier, X_train, y_train, X_test, y_test)
            futures.append(executor.submit(study.optimize, tuner.objective, n_trials=n_trials))
            joblib.dump(study, f"./project_resources/optuna/AZ_Hepatocyte/{_type}/{model_identifier}.pkl")
    for future in futures:
        future.result()