In [11]:
import pandas as pd
import os
import re
import csv
import sys
import time
import joblib
import py3Dmol
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from tdc.single_pred import ADME
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from scipy.stats import spearmanr
import optuna
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import load_ml_data

In [2]:
class HyperparamTuner():
    """Optuna objective that samples, fits and scores one family of regressors.

    An instance holds the train/validation split and the identifier of the
    model family to tune. ``objective`` is the callable meant to be handed to
    an Optuna study: it samples hyperparameters, fits the model several times,
    scores it on the validation split with the absolute Spearman correlation,
    and logs the averaged outcome through ``optuna_trial_logging``.

    NOTE(review): ``optuna_trial_logging`` is not defined in this class; it is
    expected to exist in the notebook namespace when ``train_val_return`` runs.
    """

    # Default upper bound on refits per run when the score is NaN.
    DEFAULT_MAX_FIT_ATTEMPTS = 20

    def __init__(self, log_csv_path, model_identifier, X_train, y_train, X_val, y_val,
                 max_fit_attempts=DEFAULT_MAX_FIT_ATTEMPTS):
        """Store the data split and tuning configuration.

        Args:
            log_csv_path: Forwarded unchanged to ``optuna_trial_logging``.
            model_identifier: One of ``'linear'``, ``'KRR'``, ``'GB'``, ``'RF'``
                or ``'ANN'``; selects the search space in ``sample_params``.
            X_train, y_train: Data passed to ``model.fit``.
            X_val, y_val: Data used to score the fitted model.
            max_fit_attempts: Maximum number of fits tried per run before the
                trial is pruned because the score stays NaN.

        Raises:
            ValueError: If ``max_fit_attempts`` is smaller than 1.
        """
        if max_fit_attempts < 1:
            raise ValueError("max_fit_attempts must be at least 1")
        self.log_csv_path = log_csv_path
        self.model_identifier = model_identifier
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.max_fit_attempts = max_fit_attempts

    def sample_params(self, trial: "optuna.Trial", model_identifier):
        """Sample hyperparameters for ``model_identifier`` and build the model.

        Args:
            trial: Optuna trial used to draw the suggestions.
            model_identifier: Model family key (see ``__init__``).

        Returns:
            A ``(parameters, model)`` tuple: the sampled values as a dict and
            an unfitted estimator configured with them.

        Raises:
            ValueError: If ``model_identifier`` is not a supported key.
        """
        if model_identifier == 'linear':
            alpha = trial.suggest_float('alpha', 0.005, 0.1)
            l1_ratio = trial.suggest_float('l1_ratio', 0, 0.05)
            return {
                "alpha": alpha,
                "l1_ratio": l1_ratio
            }, ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

        if model_identifier == 'KRR':
            alpha = trial.suggest_float("alpha", 0.3, 1)
            gamma = trial.suggest_float("gamma", 0.1, 0.3)
            kernel = trial.suggest_categorical("kernel", ["laplacian", "rbf"])
            return {
                "alpha": alpha,
                "gamma": gamma,
                "kernel": kernel
            }, KernelRidge(alpha=alpha, gamma=gamma, kernel=kernel)

        if model_identifier == 'GB':
            n_estimators = trial.suggest_categorical("n_estimators", [5, 10, 20, 50])
            learning_rate = trial.suggest_float("learning_rate", 0.05, 0.175)
            max_depth = trial.suggest_categorical("max_depth", [1, 2, 3])
            return {
                "n_estimators": n_estimators,
                "learning_rate": learning_rate,
                "max_depth": max_depth
            }, GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

        if model_identifier == 'RF':
            n_estimators = trial.suggest_categorical("n_estimators", [500, 750, 1000])
            max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])
            max_depth = trial.suggest_categorical("max_depth", [None, 2, 5, 10, 20])
            return {
                "n_estimators": n_estimators,
                "max_features": max_features,
                "max_depth": max_depth
            }, RandomForestRegressor(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth)

        if model_identifier == 'ANN':
            # NOTE(review): scikit-learn documents learning_rate_init as used only
            # by the 'sgd' and 'adam' solvers, so with solver="lbfgs" this
            # dimension probably has no effect on the fit - confirm.
            learning_rate_init = trial.suggest_float("learning_rate_init", 0.05, 0.15)
            # NOTE(review): Optuna warns about categorical choices that are lists
            # (not None/bool/int/float/str); kept as-is so logged values do not change.
            hidden_layer_sizes = trial.suggest_categorical("hidden_layer_sizes",
                                                           [[3]*3, [5]*3, [3]*5, [5]*5, [10]*3])
            return {
                "learning_rate_init": learning_rate_init,
                "hidden_layer_sizes": hidden_layer_sizes
            }, MLPRegressor(solver="lbfgs", learning_rate_init=learning_rate_init, hidden_layer_sizes=hidden_layer_sizes)

        # Previously fell through and returned None, which surfaced later as an
        # unpacking TypeError in objective(); fail here with a clear message.
        raise ValueError(
            f"Unknown model_identifier {model_identifier!r}; "
            "expected one of 'linear', 'KRR', 'GB', 'RF', 'ANN'"
        )

    def evaluate(self, model, X_val, y_val):
        """Score ``model`` on the validation data.

        Returns:
            ``(score, predictions)`` where ``score`` is the absolute value of
            the Spearman correlation between ``y_val`` and the predictions.
            Because the absolute value is taken, a strongly negative
            correlation scores as high as a strongly positive one. The score
            is NaN when the correlation itself is NaN.
        """
        predictions = model.predict(X_val)
        spearman = spearmanr(y_val, predictions).statistic
        return abs(spearman), predictions

    def train_val_return(self, trial, parameters, model, trial_number):
        """Fit and score ``model`` three times, log the averages, return the mean score.

        Each run refits the model until the score is not NaN, trying at most
        ``self.max_fit_attempts`` fits. Every run's score is reported to the
        trial so the study's pruner can stop it early.

        Args:
            trial: Optuna trial used for reporting and pruning.
            parameters: Sampled hyperparameters, forwarded to the CSV logger.
            model: Unfitted estimator exposing ``fit`` and ``predict``.
            trial_number: Trial index, forwarded to the CSV logger.

        Returns:
            Mean of the per-run scores.

        Raises:
            optuna.TrialPruned: If the pruner asks to stop, or if a run never
                produces a non-NaN score within ``self.max_fit_attempts`` fits.
        """
        runs = 3
        runs_results = []
        runs_predictions = []

        for run in range(runs):
            # A NaN score is retried with a refit. The retry count is bounded
            # because a deterministic model reproduces the same NaN on every
            # refit, which made the former `while True` loop spin forever.
            for _attempt in range(self.max_fit_attempts):
                model.fit(self.X_train, self.y_train)
                run_spearman, run_predicts = self.evaluate(model, self.X_val, self.y_val)
                if not np.isnan(run_spearman):
                    break
            else:
                raise optuna.TrialPruned(
                    f"Spearman correlation was NaN in all {self.max_fit_attempts} "
                    f"fit attempts of run {run}"
                )
            # Handle pruning based on the intermediate value.
            trial.report(run_spearman, run)
            if trial.should_prune():
                raise optuna.TrialPruned()
            runs_results.append(run_spearman)
            runs_predictions.append(run_predicts)

        # Per-sample spread and mean of the predictions across the runs.
        runs_predictions = np.array(runs_predictions)
        std = np.std(runs_predictions, axis=0)

        average_result = np.mean(runs_results)
        average_predictions = np.mean(runs_predictions, axis=0)

        # write the result and hyperparameters of a run to csv file
        optuna_trial_logging(self.log_csv_path, trial_number, parameters, average_result, average_predictions, std)

        return average_result

    def objective(self, trial=None):
        """Optuna objective: sample a configuration, evaluate it, return its score.

        ``trial`` defaults to ``None`` only in the signature; a real trial is
        required because both sampling and ``trial.number`` are used.
        """
        parameters, model = self.sample_params(trial, self.model_identifier)
        return self.train_val_return(trial, parameters, model, trial.number)

In [6]:
def _evaluate(model, X_val, y_val):
    predictions = model.predict(X_val)
    spearman = spearmanr(y_val, predictions).statistic
    return abs(spearman), predictions

In [64]:
# Benchmark: how long does one "fit until the Spearman score is not NaN" evaluation
# take for a given MLPRegressor architecture? The printed value is the average
# wall-clock time of 10 repetitions, each consisting of `runs` evaluations.
X_train, y_train, X_val, y_val, X_test, y_test = load_ml_data("morgan", "obach")

# `runs` used to be read from leftover kernel state (it is only a local variable
# inside HyperparamTuner.train_val_return), so this cell raised NameError on a
# fresh kernel. Define it here with the same value the tuner uses.
runs = 3
# Architecture under test; change both lines together to benchmark another one.
hidden_layer_sizes = [10]*3
hidden_layer_label = "[10]*3"

model = MLPRegressor(solver="lbfgs", learning_rate_init=0.05, hidden_layer_sizes=hidden_layer_sizes)
print(f"Running: {hidden_layer_label}")

all_times = []
for i in range(10):
    # perf_counter is monotonic and intended for measuring elapsed time.
    t_start = time.perf_counter()
    # ensure that the models don't predict only one value (if they do, spearman is nan)
    for run in range(runs):
        # pythonic "do-while loop": refit until the score is a valid number
        while True:
            model.fit(X_train, y_train)
            run_spearman, run_predicts = _evaluate(model, X_val, y_val)
            if not np.isnan(run_spearman):
                break
    t_end = time.perf_counter()
    total_time = t_end - t_start
    all_times.append(total_time)
# Round once at the end so per-iteration rounding does not distort the average.
print(f"Average: {round(np.average(all_times), 2)}")

Found local copy...
Loading...
Done!
100%|███████████████████████████████████████████████████████████████████████████████| 667/667 [00:00<00:00, 795.09it/s]


Running: [10]*3
Average: 0.52


In [63]:
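# Approximate "Average" (seconds) printed by the timing cell above,
# noted per MLPRegressor hidden_layer_sizes setting:
# [3]*3 -> 1.05
# [3]*5 -> 1.75
# [5]*3 -> 0.6
# [5]*5 -> 0.75
# [10]*3 -> 0.5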