In [4]:
import pandas as pd
import os
import re
import csv
import joblib
import py3Dmol
from urllib.request import urlopen
from urllib.parse import quote
from functools import partial
import ipywidgets as widgets
from tdc.single_pred import ADME
from IPython.display import display
from rdkit import Chem
from rdkit.Chem import AllChem
from jazzy.api import molecular_vector_from_smiles as mol_vect
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
import optuna

In [5]:
def fp_from_smiles(list_smiles):
    """Convert SMILES strings into Morgan fingerprint bit arrays.

    Parameters
    ----------
    list_smiles : iterable of str
        SMILES strings to featurise.

    Returns
    -------
    list of numpy.ndarray
        One array per input SMILES, in input order, obtained by converting a
        chirality-aware Morgan bit vector (radius 2, 124 bits) with ``np.array``.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    list_fingerprint = []
    for smi in list_smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit reports a parse failure by returning None; stop here with the
            # offending string instead of failing inside the fingerprint call.
            raise ValueError(f"Could not parse SMILES string: {smi!r}")
        # NOTE(review): nBits=124 is kept unchanged because previously saved models
        # presumably expect this exact feature length - confirm before altering it.
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, useChirality=True, radius=2, nBits=124)
        list_fingerprint.append(np.array(fingerprint))
    return list_fingerprint

In [2]:
def param_tuning(x_train, x_test, y_train, y_test, type_ml_use):
    """Legacy hyperparameter optimisation using a randomised search.

    Parameters
    ----------
    x_train, y_train
        Training features and targets used for the 5-fold randomised search.
    x_test, y_test
        Held-out features and targets used to compute the absolute error.
    type_ml_use : str
        One of ``'linear'``, ``'KRR'``, ``'GB'``, ``'RF'`` or ``'ANN'``.

    Returns
    -------
    tuple
        ``(y_train_predict, y_test_predict, abs_error)`` produced by the best
        estimator found by the search.

    Raises
    ------
    ValueError
        If ``type_ml_use`` is not one of the supported identifiers.
    """
    if type_ml_use == 'linear':
        param_grid = {
            'fit_intercept': [True],
            'alpha': [1e-5, 1e-4, 1e-3, 1e-2],
            'l1_ratio': [0, 0.1, 0.5, 0.9, 1]
        }
        reg = ElasticNet()
    elif type_ml_use == 'KRR':
        param_grid = {
            "alpha": np.logspace(-4, 1, 20),
            "gamma": np.logspace(-14, 0, 20),
            "kernel": ['linear', 'laplacian', 'rbf']
        }
        reg = KernelRidge()
    elif type_ml_use == 'GB':
        param_grid = {
            'n_estimators': [10, 20, 50, 200, 400],
            'learning_rate': [0.02, 0.05],
            'max_depth': [1, 2, 3, 5],
        }
        reg = GradientBoostingRegressor()
    elif type_ml_use == 'RF':
        param_grid = {
            'max_depth': [None, 2, 3, 5, 10],
            # 1.0 (use every feature) replaces the former 'auto' option, which
            # current scikit-learn releases reject for random forests.
            'max_features': [1.0, 'sqrt', 'log2'],
            'n_estimators': [10, 20, 50, 100, 200],
        }
        reg = RandomForestRegressor()
    elif type_ml_use == 'ANN':
        param_grid = {
            'learning_rate_init': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05],
            'hidden_layer_sizes': [[5], [10], [20], [50], [5]*2, [10]*2, [20]*2, [50]*2, [5]*3, [10]*3]
        }
        reg = MLPRegressor()
    else:
        raise ValueError(
            f"Unknown model type {type_ml_use!r}; expected one of 'linear', 'KRR', 'GB', 'RF', 'ANN'"
        )

    # No random_state is set on the splitter, so repeated calls shuffle differently.
    grid = RandomizedSearchCV(reg, param_grid, cv=KFold(n_splits=5, shuffle=True), verbose=0)
    grid.fit(x_train, y_train)

    # With the default refit=True the search already refits the best estimator on
    # the full training set, so no additional fit is needed before predicting.
    best_reg = grid.best_estimator_
    y_train_predict = best_reg.predict(x_train)
    y_test_predict = best_reg.predict(x_test)
    abs_error = np.abs(y_test_predict-y_test)
    print(f"     best {type_ml_use} hyperparams: {best_reg}")

    return y_train_predict, y_test_predict, abs_error

In [1]:
def mol_predict_and_std(models, x_train, x_test, y_train, y_test):
    """Legacy evaluation: repeat the hyperparameter search and aggregate test predictions.

    For every identifier in ``models`` the search in ``param_tuning`` is run three
    times. The three test-set predictions are averaged element-wise, their
    element-wise standard deviation is taken, and the root-mean-square deviation
    of the averaged prediction from ``y_test`` is computed.

    Returns
    -------
    tuple of dict
        ``(average_predictions, standard_deviations, rmsds)``, each keyed by the
        model identifier.
    """
    repetitions = 3
    averaged_by_model = {}
    spread_by_model = {}
    rmsd_by_model = {}

    for model in models:
        # Only the test predictions (second element of the returned tuple) are needed.
        stacked = np.array([
            param_tuning(x_train, x_test, y_train, y_test, model)[1]
            for _ in range(repetitions)
        ])

        mean_prediction = stacked.mean(axis=0)
        residual = mean_prediction - y_test

        averaged_by_model[model] = mean_prediction
        spread_by_model[model] = np.std(stacked, axis=0)
        # root-mean-square deviation of the averaged prediction
        rmsd_by_model[model] = np.sqrt(np.average(np.square(residual)))

    return averaged_by_model, spread_by_model, rmsd_by_model

In [1]:
def show(smi, style='stick'):
    """Render an interactive 3D view of a molecule given as a SMILES string.

    The molecule is parsed, hydrogens are added, a 3D conformer is embedded and
    relaxed with MMFF (at most 200 iterations), and the result is shown in a
    500x400 py3Dmol viewer.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule to display.
    style : str, optional
        py3Dmol style key applied to the whole model (default ``'stick'``).

    Raises
    ------
    ValueError
        If the SMILES cannot be parsed or no 3D conformer can be generated.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # RDKit returns None rather than raising when parsing fails.
        raise ValueError(f"Could not parse SMILES string: {smi!r}")
    mol = Chem.AddHs(mol)

    # EmbedMolecule returns -1 when it cannot produce a conformer; optimising a
    # molecule without a conformer would otherwise fail with an unclear error.
    if AllChem.EmbedMolecule(mol) == -1:
        raise ValueError(f"Could not generate 3D coordinates for SMILES: {smi!r}")
    AllChem.MMFFOptimizeMolecule(mol, maxIters=200)
    mblock = Chem.MolToMolBlock(mol)

    view = py3Dmol.view(width=500, height=400)
    view.addModel(mblock, 'mol')
    view.setStyle({style: {}})
    view.zoomTo()
    view.show()

In [2]:
# CODE SPECIFIC TO INFERENCE:

In [6]:
def retrain_and_save(model, file_path, X_train, y_train):
    """Fit ``model`` on the given training data and persist it with joblib.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)``; it is expected to already carry the
        chosen hyperparameters.
    file_path
        Destination handed to ``joblib.dump``.
    X_train, y_train
        Training features and targets.
    """
    confirmation = " was successfully created"

    model.fit(X_train, y_train)
    joblib.dump(model, file_path)

    print(file_path, confirmation)

In [8]:
def CIRconvert(ids, timeout=30):
    """Resolve a user-supplied molecule name to SMILES via the NCI CACTUS service.

    Parameters
    ----------
    ids : str
        Molecule name (or other identifier) typed by the user.
    timeout : float, optional
        Seconds to wait for the web service before giving up (default 30).

    Returns
    -------
    str or None
        The decoded response body on success. On any failure a Czech-language
        hint asking the user to check the name for typos is printed and ``None``
        is returned.
    """
    try:
        url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
        # The context manager closes the HTTP response even if decoding fails.
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf8')
    except Exception:
        # Best-effort lookup: report the problem to the user instead of raising,
        # but let KeyboardInterrupt / SystemExit propagate.
        print(f'Něco se nepovedlo. Jste si jistí, že nemáte v názvu "{ids}" překlep?')
        return None

In [1]:
def _is_nan_value(value):
    """Return True if ``value`` is a NaN float or an array holding at least one NaN."""
    if isinstance(value, np.ndarray):
        return bool(np.isnan(value).any())
    # np.floating covers NumPy floats such as float32 that are not `float` subclasses.
    # Integers and booleans can never be NaN, so they fall through to False.
    if isinstance(value, (float, np.floating)):
        return bool(np.isnan(value))
    return False


def contains_nan(lst):
    """Report whether an iterable of feature containers holds any NaN.

    Parameters
    ----------
    lst : iterable
        Items may be dicts (their values are inspected), NumPy arrays, or bare
        floating-point scalars. Anything else is ignored.

    Returns
    -------
    bool
        ``True`` as soon as a NaN is found, otherwise ``False``.
    """
    for item in lst:
        if isinstance(item, dict):
            if any(_is_nan_value(v) for v in item.values()):
                return True
        elif _is_nan_value(item):
            # Also catches a bare NaN entry at the top level of the list.
            return True
    return False

In [1]:
def inference_predict(_type, benchmark, model_id, user_smi):
    """Load a saved model and print its prediction for a single molecule.

    Parameters
    ----------
    _type : str
        Feature type; ``"morgan"`` uses Morgan fingerprints, any other value uses
        the Jazzy feature vector.
    benchmark : str
        One of ``"obach"``, ``"microsome"`` or ``"hepatocyte"``; selects the model
        directory, the TDC dataset and the printed unit.
    model_id : str
        File stem of the ``.joblib`` model inside the benchmark directory.
    user_smi : str
        SMILES string of the molecule to predict.

    Returns
    -------
    None
        The (Czech-language) result or NaN warning is printed.
    """
    model_path = f"project_resources/optuna/{_type}/{benchmark}/{model_id}.joblib"
    model = joblib.load(model_path)
    dataset_names = {"obach": 'Half_Life_Obach', "microsome": 'Clearance_Microsome_AZ', "hepatocyte": 'Clearance_Hepatocyte_AZ'}
    dataset_units = {"obach": "h", "microsome": "ml.min-1.g-1", "hepatocyte": "μl.min-1.(10^6 buněk)-1"}
    nan_message = "X_test obsahuje hodnotu NaN"

    # Build and validate the features BEFORE predicting, so that unusable input
    # produces the NaN message instead of an exception from the model.
    if _type == "morgan":
        user_fp = fp_from_smiles([user_smi])
    else:
        jazzy_vectors = mol_fts([user_smi])
        # mol_fts stores a bare NaN for molecules whose featurisation failed.
        if any(isinstance(fts, float) and np.isnan(fts) for fts in jazzy_vectors):
            print(nan_message)
            return
        user_fp = np.array([list(fts.values()) for fts in jazzy_vectors])

    if contains_nan(user_fp):
        print(nan_message)
        return

    y_predict = model.predict(user_fp)

    if _type != "morgan":
        # Map the prediction back to the original target range with a min-max
        # scaler fitted on the train + test labels of the benchmark dataset.
        # NOTE(review): presumably these models were trained on scaled targets - confirm.
        adme_dataset = ADME(name=dataset_names[benchmark])
        adme_split = adme_dataset.get_split()
        adme_train_y = adme_split["train"]["Y"]
        adme_test_y = adme_split["test"]["Y"]
        adme_train_test_y = list(adme_train_y) + list(adme_test_y)
        reshaped_halflife = np.array(adme_train_test_y).reshape(-1, 1)
        scaler = MinMaxScaler().fit(reshaped_halflife)
        reshaped_predict = np.array(y_predict).reshape(-1, 1)
        y_predict = scaler.inverse_transform(reshaped_predict)[0]

    y_predict_rounded = np.round(np.abs(y_predict)[0], decimals=1)
    benchmark_units = dataset_units[benchmark]
    if benchmark == "obach":
        print(f"Predikovaná hodnota eliminačního poločasu: {y_predict_rounded} {benchmark_units}")
    else:
        print(f"Predikovaná hodnota clearance: {y_predict_rounded} {benchmark_units}")

In [10]:
# Handle the button click event
def on_button_click(selected, dropdowns, _):
    """Copy the current value of every dropdown into ``selected`` (in place).

    Parameters
    ----------
    selected : list
        List that is emptied and then refilled; the same object is returned.
    dropdowns : iterable
        Objects exposing a ``value`` attribute.
    _ :
        The argument supplied by the click event; unused.
    """
    del selected[:]
    selected.extend(menu.value for menu in dropdowns)
    return selected

In [1]:
def inference_dataset_selection():
    """Display the feature / dataset / model dropdowns and a confirmation button.

    Returns
    -------
    list
        An initially empty list. It is passed to ``on_button_click`` together
        with the dropdowns, so it is filled in place once the button is clicked.
    """
    menu_specs = (
        ('Features:', ["morgan", "jazzy"]),
        ('Dataset:', ["obach", "microsome", "hepatocyte"]),
        ('Model:', ["linear", "KRR", "GB", "RF", "ANN"]),
    )

    # Build and show one dropdown per specification, pre-selecting its first option
    dropdowns = []
    for label, choices in menu_specs:
        menu = widgets.Dropdown(options=choices, value=choices[0], description=label)
        dropdowns.append(menu)
        display(menu)

    # The click handler writes into this list, which the caller keeps a reference to
    selected = []
    click_handler = partial(on_button_click, selected, dropdowns)

    button = widgets.Button(description="Parse Output")
    button.on_click(click_handler)
    display(button)

    return selected

In [2]:
# CODE SPECIFIC TO JAZZY:

In [1]:
def mol_fts(smiles):
    """Compute the Jazzy feature vector for each SMILES string.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to featurise.

    Returns
    -------
    list
        One entry per input, in order: the result of ``mol_vect`` on success, or
        ``np.nan`` when featurisation raised an error.
    """
    features = []
    for smi in smiles:
        try:
            features.append(mol_vect(smi))
        except Exception:
            # JazzyError is not importable by name here, so catch Exception instead.
            # Unlike a bare `except`, this still lets KeyboardInterrupt stop a long run.
            features.append(np.nan)
    return features

In [1]:
def parse_jazzy_df(df, no_idx_smi = False):
    """Split a Jazzy feature table into features and targets, dropping failed rows.

    Rows whose ``"dgtot"`` value is NaN (feature generation was not successful)
    are removed. Columns are then selected by position.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``"dgtot"`` column. With ``no_idx_smi=False`` the code
        reads column 1 as SMILES, column 2 as the target and columns 3-8 as
        features; with ``no_idx_smi=True`` it reads column 0 as the target and
        columns 1-6 as features.
    no_idx_smi : bool, optional
        Set to True when the table has no index and SMILES columns.

    Returns
    -------
    tuple
        ``(smiles, mol_features, halflives, contains_nan)`` or, when
        ``no_idx_smi`` is True, ``(mol_features, halflives, contains_nan)``.
        ``contains_nan`` tells whether any remaining feature value is NaN.
    """
    cols = list(df.columns)
    # all data from the table, one list per column (indexes, smiles, targets, features)
    data = {col: list(df[col]) for col in cols}

    # flatnonzero yields plain 1-D positions, avoiding the deprecated conversion of
    # one-element arrays to int that np.argwhere rows would require.
    nan_idxs = np.flatnonzero(np.isnan(data["dgtot"])).tolist()

    # same data row-wise: [[idx1, smi1, thalf1, fts1...], [idx2, smi2, thalf2, fts2...], ...]
    data_clumped = [list(row) for row in zip(*(data[col] for col in cols))]

    # remove all mols for which Jazzy features generation wasn't successful
    failed = set(nan_idxs)
    data_clumped = [row for i, row in enumerate(data_clumped) if i not in failed]
    for nan_idx in nan_idxs:
        print(f"     removed index {nan_idx} corresponding to NaN")
    # Guard the preview so a table with no usable rows does not raise IndexError.
    first_row = data_clumped[0] if data_clumped else None
    print(f"     {len(data_clumped)}, {first_row}")

    # filter out only the features
    if no_idx_smi:
        mol_features = np.array([feature[1:7] for feature in data_clumped])
        halflives = np.array([feature[0] for feature in data_clumped])
        contains_nan = np.any(np.isnan(mol_features))
        return mol_features, halflives, contains_nan
    else:
        mol_features = np.array([feature[3:9] for feature in data_clumped])
        halflives = np.array([feature[2] for feature in data_clumped])
        smiles = np.array([feature[1] for feature in data_clumped])
        contains_nan = np.any(np.isnan(mol_features))
        return smiles, mol_features, halflives, contains_nan

In [1]:
# CODE SPECIFIC TO OPTUNA:

In [2]:
def optuna_trial_logging(log_csv_path, trial_number, parameters, rmsd, predictions, std):
    """Append the outcome of one Optuna trial to a CSV log and report it.

    Parameters
    ----------
    log_csv_path : str
        Path of the CSV log; a header row is written when the file did not exist.
    trial_number
        Identifier of the trial being logged.
    parameters
        Hyperparameters used in the trial (written via their string form).
    rmsd
        Score of the trial.
    predictions, std
        Objects exposing ``tolist()`` (e.g. NumPy arrays) holding the averaged
        predictions and their standard deviations.
    """
    columns = ['Trial Number', 'Parameters', 'RMSD', 'Predictions', 'Std']

    # Remember whether the log pre-existed: decides header and status message
    already_there = os.path.isfile(log_csv_path)

    with open(log_csv_path, 'a', newline='') as log_file:
        log_writer = csv.DictWriter(log_file, fieldnames=columns)
        if not already_there:
            log_writer.writeheader()

        # Arrays are turned into plain lists so they serialise readably
        values = (trial_number, parameters, rmsd, predictions.tolist(), std.tolist())
        log_writer.writerow(dict(zip(columns, values)))

    # Show only the trailing file-name part of the path, if one can be found
    tail = re.search(r'[^\\/:*?"<>|\r\n]+$', log_csv_path)
    if tail:
        file_name = tail.group()
    else:
        file_name = log_csv_path

    if already_there:
        print(f"Successfully updated {file_name} with results of trial {trial_number}")
    else:
        print(f"Successfully created {file_name} with results of trial {trial_number} as the first entry")

In [2]:
class HyperparamTuner():
    """Optuna objective that samples a model, cross-validates it and logs the trial.

    ``objective`` is the callable meant to be handed to an Optuna study: it draws
    hyperparameters for ``model_identifier``, trains and evaluates the resulting
    model (``train_val_return``) and writes the outcome to ``log_csv_path``.
    """

    def __init__(self, log_csv_path, model_identifier, X_train, y_train, X_val, y_val):
        """Store the log destination, the model identifier and the data splits."""
        self.log_csv_path = log_csv_path
        self.model_identifier = model_identifier
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val

    def sample_params(self, trial: "optuna.Trial", model_identifier):
        """Draw hyperparameters from ``trial`` and build the matching estimator.

        Parameters:
        - trial: Optuna trial used to suggest values
        - model_identifier: one of 'linear', 'KRR', 'GB', 'RF', 'ANN'

        Returns:
        - Tuple (parameters dict, unfitted estimator)

        Raises:
        - ValueError if model_identifier is not supported
        """
        if model_identifier == 'linear':
            alpha = trial.suggest_float('alpha', 0.085, 0.15)
            l1_ratio = trial.suggest_float('l1_ratio', 0, 0.1)
            return {
                "alpha": alpha,
                "l1_ratio": l1_ratio
            }, ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

        if model_identifier == 'KRR':
            alpha = trial.suggest_float("alpha", 0.3, 1)
            gamma = trial.suggest_float("gamma", 0.1, 0.3)
            kernel = trial.suggest_categorical("kernel", ["laplacian", "rbf"])
            return {
                "alpha": alpha,
                "gamma": gamma,
                "kernel": kernel
            }, KernelRidge(alpha=alpha, gamma=gamma, kernel=kernel)

        if model_identifier == 'GB':
            n_estimators = trial.suggest_categorical("n_estimators", [5, 10, 20, 50])
            learning_rate = trial.suggest_float("learning_rate", 0.05, 0.175)
            max_depth = trial.suggest_categorical("max_depth", [1, 2, 3])
            return {
                "n_estimators": n_estimators,
                "learning_rate": learning_rate,
                "max_depth": max_depth
            }, GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

        if model_identifier == 'RF':
            n_estimators = trial.suggest_categorical("n_estimators", [500, 750, 1000])
            max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])
            max_depth = trial.suggest_categorical("max_depth", [None, 2, 5, 10, 20])
            return {
                "n_estimators": n_estimators,
                "max_features": max_features,
                "max_depth": max_depth
            }, RandomForestRegressor(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth)

        if model_identifier == 'ANN':
            learning_rate_init = trial.suggest_float("learning_rate_init", 0.05, 0.15)
            # NOTE(review): Optuna documents categorical choices as None/bool/int/
            # float/str; the list-valued choices are kept so existing studies stay
            # comparable, but they may trigger a warning - confirm for the Optuna
            # version in use.
            hidden_layer_sizes = trial.suggest_categorical("hidden_layer_sizes",
                                                           [[3]*3, [5]*3, [3]*5, [5]*5, [10]*3])
            return {
                "learning_rate_init": learning_rate_init,
                "hidden_layer_sizes": hidden_layer_sizes
            }, MLPRegressor(learning_rate_init=learning_rate_init, hidden_layer_sizes=hidden_layer_sizes)

        # Fail loudly instead of returning None, which callers cannot unpack.
        raise ValueError(
            f"Unknown model identifier {model_identifier!r}; expected one of 'linear', 'KRR', 'GB', 'RF', 'ANN'"
        )

    def cross_validation_splits(self, X_train, X_val, y_train, y_val, cv_splits=5):
        """
        Splits the data into cv_splits different combinations for cross-validation.

        Parameters:
        - X_train: Training data features
        - X_val: Validation data features
        - y_train: Training data labels
        - y_val: Validation data labels
        - cv_splits: Number of cross-validation splits

        Returns:
        - List of cv_splits + 1 tuples (X_train_fold, X_val_fold, y_train_fold, y_val_fold):
          the K-fold splits of the training data followed by the original
          (X_train, X_val, y_train, y_val) split as the last element
        """
        # Shuffled K-fold; no random_state is set, so every call shuffles differently
        kf = KFold(n_splits=cv_splits, shuffle=True)

        # Plain lists/tuples (e.g. a list of fingerprint arrays) do not support
        # indexing with an index array, so convert those; other containers are
        # indexed as they are.
        X_source = np.asarray(X_train) if isinstance(X_train, (list, tuple)) else X_train
        y_source = np.asarray(y_train) if isinstance(y_train, (list, tuple)) else y_train

        data_splits = []
        for train_index, val_index in kf.split(X_source, y_source):
            data_splits.append((
                X_source[train_index], X_source[val_index],
                y_source[train_index], y_source[val_index],
            ))

        # Append the original train/validation split as the final element
        data_splits.append((X_train, X_val, y_train, y_val))

        return data_splits

    def evaluate(self, model, X_val, y_val):
        """Predict on X_val and return (root-mean-square deviation, predictions).

        The RMSD is computed with NumPy so that it does not depend on the
        ``squared`` argument of ``mean_squared_error``, which recent scikit-learn
        releases no longer accept. For 2-D targets the per-column RMSD values
        are averaged.
        """
        predictions = model.predict(X_val)

        y_true = np.asarray(y_val, dtype=float)
        y_pred = np.asarray(predictions, dtype=float)
        # Work on (n_samples, n_outputs) so (n,) and (n, 1) inputs never broadcast
        # against each other into an (n, n) matrix.
        if y_true.ndim == 1:
            y_true = y_true.reshape(-1, 1)
        if y_pred.ndim == 1:
            y_pred = y_pred.reshape(-1, 1)
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"y_val and predictions have incompatible shapes: {y_true.shape} vs {y_pred.shape}"
            )

        rmsd = float(np.mean(np.sqrt(np.mean(np.square(y_true - y_pred), axis=0))))
        return rmsd, predictions

    def train_val_return(self, parameters, model, trial_number):
        """Train and evaluate ``model`` over repeated cross-validation and log the trial.

        For each of three runs the model is fitted on every split returned by
        ``cross_validation_splits`` (the K folds plus the original
        train/validation split) and the RMSD values of all those splits are
        averaged. The predictions on the original validation set are collected
        once per run; their mean and standard deviation across runs are logged.

        Returns:
        - Mean over runs of the per-run average RMSD
        """
        runs = 3
        runs_results = []       # one averaged RMSD per run
        y_vals_predicted = []   # held-out validation predictions, one entry per run

        for _ in range(runs):
            validation_splits = self.cross_validation_splits(self.X_train, self.X_val, self.y_train, self.y_val)
            # The original train/validation split is always the last element,
            # whatever number of folds precedes it.
            final_split = len(validation_splits) - 1
            cv_fold_results = []

            for split_num, (X_train_split, X_val_split, y_train_split, y_val_split) in enumerate(validation_splits):
                # train the model on the given split and score it
                model.fit(X_train_split, y_train_split)
                cv_fold_rmsd, validation_predictions = self.evaluate(model, X_val_split, y_val_split)
                cv_fold_results.append(cv_fold_rmsd)

                # keep the predictions made on the original validation set
                if split_num == final_split:
                    y_vals_predicted.append(validation_predictions)

            runs_results.append(np.mean(cv_fold_results))

        # spread and mean of the validation predictions across runs
        y_vals_predicted = np.array(y_vals_predicted)
        std = np.std(y_vals_predicted, axis=0)
        average_predictions = np.average(y_vals_predicted, axis=0)
        average_result = np.mean(runs_results)

        # write the result and hyperparameters of the trial to the csv file
        optuna_trial_logging(self.log_csv_path, trial_number, parameters, average_result, average_predictions, std)

        return average_result

    def objective(self, trial=None):
        """Optuna objective: sample a model for ``trial`` and return its averaged RMSD.

        ``trial`` defaults to None only for signature compatibility; a real
        Optuna trial is required because its suggestions and number are used.
        """
        parameters, model = self.sample_params(trial, self.model_identifier)
        return self.train_val_return(parameters, model, trial.number)