In [1]:
import pandas as pd
import os
import warnings
from chembl_webresource_client.new_client import new_client as client
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.MolStandardize.rdMolStandardize import FragmentParent
from rdkit import DataStructs
from jazzy.api import molecular_vector_from_smiles as mol_vect
import numpy as np
import pubchempy as pcp
import matplotlib.pyplot as plt
import pathlib
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn import linear_model
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

In [4]:
# Silence every Python warning for the rest of the session. Note that this is
# global: deprecation warnings from the imported libraries are hidden as well.
warnings.filterwarnings("ignore")

In [5]:
# Directory the kernel is currently running in; used as the base for the
# graphs directory below.
working_dir = os.getcwd()
# Relative location of the graphs that are used in the actual essay.
graphs_rel_path = r"project_results/graphs"
# Absolute path of the graphs directory (working_dir + graphs_rel_path).
project_results_graphs = os.path.join(working_dir, graphs_rel_path)

In [1]:
def abs_file_path(rel_path):
    """Resolve ``rel_path`` against the current working directory.

    Backslashes in ``rel_path`` are replaced by forward slashes before the
    path is joined onto ``os.getcwd()``.

    Args:
        rel_path: path string relative to the current working directory.

    Returns:
        The joined path as a string.
    """
    base_dir = os.getcwd()
    normalised = rel_path.replace("\\", "/")
    return os.path.join(base_dir, normalised)

In [1]:
def smiles_from_mol_id(list_mol_id):
    """Return one SMILES string per molecule id, in input order.

    The data source is chosen once for the whole collection: if the text
    "CHEMBL" occurs anywhere in ``str(list_mol_id)`` every id is looked up
    through the ChEMBL web client (``canonical_smiles``), otherwise every id
    is treated as a PubChem CID (``isomeric_smiles``).

    Args:
        list_mol_id: iterable of ChEMBL ids or PubChem CIDs.

    Returns:
        list of SMILES strings.
    """
    looks_like_chembl = "CHEMBL" in str(list_mol_id)

    if not looks_like_chembl:
        # PubChem branch: one request per CID.
        return [pcp.Compound.from_cid(cid).isomeric_smiles for cid in list_mol_id]

    # ChEMBL branch: take the first hit of each query.
    return [
        client.molecule.filter(chembl_id=chembl_id)[0]['molecule_structures']["canonical_smiles"]
        for chembl_id in list_mol_id
    ]

In [2]:
def halflife_formatting(source_df, isozyme):
    """Pick the half-life column that belongs to ``isozyme`` out of ``source_df``.

    Args:
        source_df: DataFrame holding the raw data of one data set.
        isozyme: "3A4", "RLM" or "HLC".

    Returns:
        The matching column of ``source_df``:
        "3A4" -> "Standard Value",
        "RLM" -> "Half-life (minutes)" with every ">30" entry replaced by '30',
        "HLC" -> "Half-life".
        An empty list for any other ``isozyme``.
    """
    if isozyme == "3A4":
        return source_df["Standard Value"]

    if isozyme == "RLM":
        # censored values are reported as ">30"; store them as plain '30'
        capped_df = source_df.replace({">30": '30'})
        return capped_df["Half-life (minutes)"]

    if isozyme == "HLC":
        return source_df["Half-life"]

    return []

In [2]:
def isz_csv_data_formatting(source_csv_file, isozyme, sep=","):
    """Write ``project_resources/<isozyme>.csv`` for use with the chemprop library.

    The output has the columns ``mol_idx`` (1-based running number),
    ``smiles`` and ``half-life``. Nothing is written when a file with that
    name is already listed in ``project_resources``.

    Args:
        source_csv_file: path of the raw csv file, relative to the working dir.
        isozyme: name of the data set; also the stem of the output file.
        sep: column separator of the raw csv file.
    """
    source_df = pd.read_csv(abs_file_path(source_csv_file), sep=sep)

    target_name = isozyme + ".csv"
    if target_name in os.listdir(abs_file_path("project_resources")):
        print(f"{isozyme}.csv already exists in dir")
        return

    if isozyme == "3A4":
        # additional formatting, since not all molecules have the desired property
        source_df = source_df[source_df["Standard Type"] == "T1/2"]

    # the id column is named differently depending on where the data came from
    try:
        mol_ids = source_df["Molecule ChEMBL ID"]
    except KeyError:
        mol_ids = source_df["PUBCHEM_CID"]

    running_numbers = list(range(1, len(mol_ids) + 1))

    # replace smiles with two disjoint parts with only the bigger mol
    parent_smiles = [
        Chem.MolToSmiles(FragmentParent(Chem.MolFromSmiles(smi)))
        for smi in smiles_from_mol_id(mol_ids)
    ]

    halflives = list(halflife_formatting(source_df, isozyme))

    final_df = pd.DataFrame({
        "mol_idx": running_numbers,
        "smiles": parent_smiles,
        "half-life": halflives,
    })
    final_df.to_csv(abs_file_path(f"project_resources/{isozyme}.csv"), index=False)
    print(f"{isozyme}.csv was successfully created")

In [1]:
def split_csv_data_formatting(isozyme, smiles_as_index, split_smiles, split_type):
    """Save the "train" and "test" split of one data set as csv files.

    For each split a file
    ``project_resources/data_splits/<split_type>/<isozyme>_<split>.csv`` with
    the columns ``index``, ``smiles`` and ``half-life`` is written, unless a
    file of that name already exists.

    Args:
        isozyme: name of the data set; first key into ``smiles_as_index``.
        smiles_as_index: nested mapping ``[isozyme][smiles]`` whose values
            hold the numerical index at position 0 and the half-life at
            position 1.
        split_smiles: mapping with the keys "train" and "test", each giving
            the SMILES strings that belong to that split.
        split_type: name of the sub-directory of ``data_splits`` to write to.
    """
    location = f"project_resources/data_splits/{split_type}"

    for split in ["train", "test"]:
        file_name = f"{isozyme}_{split}.csv"
        # Check and write through the very same absolute path.
        target_path = abs_file_path(f"{location}/{file_name}")

        if os.path.isfile(target_path):
            print(f"{file_name} already exists in {location}")
            continue

        # Materialise as a plain list: assigning a pandas Series to a
        # DataFrame column aligns on the index and could silently yield NaN.
        smiles_in_split = list(split_smiles[split])

        split_indexes = []
        split_halflifes = []
        for smi in smiles_in_split:
            record = smiles_as_index[isozyme][smi]
            split_indexes.append(record[0])    # numerical index of smiles
            split_halflifes.append(record[1])  # half-life value for the specific molecule

        split_df = pd.DataFrame({
            "index": split_indexes,
            "smiles": smiles_in_split,
            "half-life": split_halflifes,
        })
        split_df.to_csv(target_path, index=False)
        print(f"{file_name} was successfully created in {location}")

In [6]:
def fp_from_smiles(list_smiles, radius=2, n_bits=124, use_chirality=True):
    """Compute a Morgan fingerprint bit vector for every SMILES string.

    Args:
        list_smiles: iterable of SMILES strings.
        radius: Morgan radius passed to RDKit (default 2, as before).
        n_bits: length of the bit vector (default 124, as before).
            NOTE(review): 124 is an unusual length - confirm it is not a
            typo for 1024.
        use_chirality: whether chirality is encoded (default True, as before).

    Returns:
        list with one numpy array per input SMILES, in input order.
    """
    list_fingerprint = []
    for smi in list_smiles:
        mol = Chem.MolFromSmiles(smi)
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(
            mol, useChirality=use_chirality, radius=radius, nBits=n_bits
        )
        list_fingerprint.append(np.array(fingerprint))
    return list_fingerprint

In [1]:
def create_scatter_plot(x_axis, y_axis, plot_title, x_label, y_label, diag=False, error_bars=False, x_min=None, y_min=None, x_max=None, y_max=None, save_dir=None, save_file_name=None):
    """Draw, optionally save, and show a scatter plot.

    Args:
        x_axis, y_axis: input values to be displayed on their respective axis.
        plot_title: text used as the figure title.
        x_label, y_label: axis labels.
        diag: if true, a dotted y = x line is drawn across the x range.
        error_bars: currently unused; accepted for compatibility.
        x_min, y_min, x_max, y_max: decide the span of the graph; ``None``
            leaves the respective limit to matplotlib.
        save_dir, save_file_name: the figure is saved to
            ``save_dir/save_file_name`` only when both are given.
    """
    # TODO: perhaps shorten the input of this function by using a class?
    plt.scatter(x_axis, y_axis, edgecolors=None, c='b', alpha=0.2)

    if diag:
        # Fall back to the limits matplotlib picked for the data when no
        # explicit span was requested; np.linspace(None, None) would raise.
        auto_min, auto_max = plt.xlim()
        diag_start = auto_min if x_min is None else x_min
        diag_end = auto_max if x_max is None else x_max
        diag_points = np.linspace(diag_start, diag_end)
        plt.plot(diag_points, diag_points, linestyle='dotted')

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.suptitle(plot_title)

    # Save before show(), while the figure is still the current one.
    if save_dir and save_file_name:
        plt.savefig(os.path.join(save_dir, save_file_name))
    plt.show()

In [10]:
def to_rdkit_fingerprint(fps):
    """Convert array fingerprints into RDKit bit-vector objects.

    Each element of ``fps`` is turned into a bit string by joining its
    entries (via ``astype(str)``) and handed to RDKit's
    ``CreateFromBitString``.

    Args:
        fps: iterable of fingerprint arrays.

    Returns:
        list of RDKit fingerprints, in input order.
    """
    return [
        DataStructs.cDataStructs.CreateFromBitString("".join(bits.astype(str)))
        for bits in fps
    ]

In [2]:
def tanimoto(fps, fp2s):
    """For every fingerprint in ``fps`` find its highest Tanimoto similarity to ``fp2s``.

    Both inputs are first converted with ``to_rdkit_fingerprint``. The first
    25 results and the total count are printed.

    Args:
        fps: fingerprints to score.
        fp2s: fingerprints to compare against.

    Returns:
        list with one value per element of ``fps``: the maximum similarity
        to any element of ``fp2s``, rounded to three decimals.
    """
    query_fps = to_rdkit_fingerprint(fps)
    reference_fps = to_rdkit_fingerprint(fp2s)

    tanimoto_similarities = [
        round(
            max(DataStructs.TanimotoSimilarity(query, reference) for reference in reference_fps),
            3,
        )
        for query in query_fps
    ]

    print(tanimoto_similarities[:25], f"length: {len(tanimoto_similarities)}")
    return tanimoto_similarities

In [1]:
def param_tuning(x_train, x_test, y_train, y_test, type_ml_use):
    """Tune one regressor with a randomized search and predict with the best model.

    Args:
        x_train, y_train: training features and targets.
        x_test, y_test: test features and targets.
        type_ml_use: which model to tune - 'linear' (ElasticNet), 'KRR'
            (KernelRidge), 'GB' (GradientBoostingRegressor), 'RF'
            (RandomForestRegressor) or 'ANN' (MLPRegressor).

    Returns:
        Tuple ``(y_train_predict, y_test_predict, abs_error)`` where
        ``abs_error`` is ``|y_test_predict - y_test|``.

    Raises:
        ValueError: if ``type_ml_use`` is not one of the supported names.
    """
    # TODO: the candidate values for the search could be generated, e.g. with
    # np.random.randint, instead of being listed by hand.

    supported_models = ('linear', 'KRR', 'GB', 'RF', 'ANN')
    if type_ml_use not in supported_models:
        raise ValueError(
            f"unknown type_ml_use {type_ml_use!r}; expected one of {supported_models}"
        )

    if type_ml_use == 'linear':
        param_grid = {
            'fit_intercept': [True],
            'alpha': [1e-5, 1e-4, 1e-3, 1e-2],
            'l1_ratio': [0, 0.1, 0.5, 0.9, 1]
        }
        reg = linear_model.ElasticNet()

    elif type_ml_use == 'KRR':
        param_grid = {
            "alpha": np.logspace(-4, 1, 20),
            "gamma": np.logspace(-14, 0, 20),
            "kernel": ['linear', 'laplacian', 'rbf']
        }
        reg = KernelRidge()

    elif type_ml_use == 'GB':
        param_grid = {
            'n_estimators': [10, 20, 50, 200, 400],
            'learning_rate': [0.02, 0.05],
            'max_depth': [1, 2, 3, 5],
        }
        reg = GradientBoostingRegressor()

    elif type_ml_use == 'RF':
        param_grid = {
            'max_depth': [None, 2, 3, 5, 10],
            # 1.0 (use all features) replaces the former 'auto', which newer
            # scikit-learn releases no longer accept for regressors.
            'max_features': [1.0, 'sqrt', 'log2'],
            'n_estimators': [10, 20, 50, 100, 200],
        }
        reg = RandomForestRegressor()

    else:  # 'ANN'
        param_grid = {
            'learning_rate_init': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05],
            'hidden_layer_sizes': [[5], [10], [20], [50], [5]*2, [10]*2, [20]*2, [50]*2, [5]*3, [10]*3]
        }
        reg = MLPRegressor()

    grid = RandomizedSearchCV(reg, param_grid, cv=KFold(n_splits=5, shuffle=True), verbose=0)
    grid.fit(x_train, y_train)

    # With the default refit=True the search has already retrained
    # best_estimator_ on the whole training set, so no further fit is needed.
    best_reg = grid.best_estimator_
    y_train_predict = best_reg.predict(x_train)
    y_test_predict = best_reg.predict(x_test)
    abs_error = np.abs(y_test_predict-y_test)
    print(f"     best {type_ml_use} hyperparams: {best_reg}")

    return y_train_predict, y_test_predict, abs_error

In [1]:
def mol_predict_and_std(models, x_train, x_test, y_train, y_test):
    """Run ``param_tuning`` three times per model and aggregate the test predictions.

    Args:
        models: iterable of model names understood by ``param_tuning``.
        x_train, x_test, y_train, y_test: data handed on to ``param_tuning``.

    Returns:
        Three dicts keyed by model name:
        the test predictions averaged over the three runs,
        the standard deviation of the predictions over the three runs,
        and the root-mean-square deviation of the averaged prediction
        from ``y_test``.
    """
    avg_prediction_by_model = {}
    std_by_model = {}
    rmsd_by_model = {}

    for model in models:
        # only the test predictions (second return value) are needed here
        repeated_predictions = np.array([
            param_tuning(x_train, x_test, y_train, y_test, model)[1]
            for _ in range(3)
        ])

        mean_prediction = np.average(repeated_predictions, axis=0)
        spread = np.std(repeated_predictions, axis=0)
        squared_residuals = np.square(mean_prediction-y_test)
        rmsd = np.sqrt(np.average(squared_residuals))  # root-mean-square deviation

        avg_prediction_by_model[model] = mean_prediction
        std_by_model[model] = spread
        rmsd_by_model[model] = rmsd

    return avg_prediction_by_model, std_by_model, rmsd_by_model

In [5]:
# CODE SPECIFIC TO JAZZY:

In [1]:
def mol_fts(smiles, isozyme):
    """Compute a feature vector for every SMILES string with ``mol_vect``.

    A SMILES for which ``mol_vect`` raises is reported on stdout and
    represented by ``np.nan`` so the output keeps the input length.

    Args:
        smiles: iterable of SMILES strings.
        isozyme: not used inside this function.

    Returns:
        list with one entry per input SMILES.
    """
    vectors = []
    for smi in smiles:
        try:
            vector = mol_vect(smi)
        except Exception as err:
            # Original author's note: catching JazzyError specifically did not
            # work ("NameError: name 'JazzyError' is not defined"), hence the
            # broad except.
            print(f"{err} caused by {smi}")
            vector = np.nan
        vectors.append(vector)
    return vectors

In [1]:
# CODE SPECIFIC TO NEQUIP:

In [2]:
def list_splitter(list_to_split, ratio):
    """Cut a sequence into two consecutive parts.

    Args:
        list_to_split: sequence supporting ``len`` and slicing.
        ratio: fraction of the elements that goes into the first part; the
            cut position is ``int(len(list_to_split) * ratio)``.

    Returns:
        ``[first_part, second_part]``.
    """
    cut = int(len(list_to_split) * ratio)
    head, tail = list_to_split[:cut], list_to_split[cut:]
    return [head, tail]

In [5]:
def get_atom_chars(smi):
    """List the single-atom SMILES of every atom in ``smi``.

    The input is parsed without sanitization; each atom is then copied into
    an empty molecule of its own and written back out as SMILES.

    Args:
        smi: SMILES string.

    Returns:
        list of strings, one per atom, in atom order.
    """
    def single_atom_smiles(atom):
        # a fresh, otherwise empty molecule that holds just this atom
        lone_atom = Chem.RWMol()
        lone_atom.AddAtom(atom)
        return str(Chem.MolToSmiles(lone_atom))

    mol = Chem.MolFromSmiles(smi, sanitize=False)
    return [single_atom_smiles(atom) for atom in mol.GetAtoms()]

In [1]:
def get_unique_symbols(list_smiles):
    """Collect the distinct atom symbols that occur in a list of SMILES.

    Example: ["C1=CC=C(C=C1)O", "C1=CSC=C1"] -> ["C", "O", "S"]

    Every SMILES is passed to ``get_atom_chars`` on its own. Symbols in
    brackets (isotopes and ions) and symbols starting with a lower case
    letter are left out.
    NOTE(review): because lower case symbols are dropped, an element that
    only ever appears in lower case would not be reported - confirm this is
    intended.

    Args:
        list_smiles: iterable of SMILES strings.

    Returns:
        Alphabetically sorted list of symbols, so the order is the same on
        every run.
    """
    seen_symbols = set()
    for smiles in list_smiles:
        seen_symbols.update(get_atom_chars(smiles))

    # .upper() is not viable for the lower case filter, since some elements
    # have more than one letter, e.g. He.
    return sorted(
        sym for sym in seen_symbols
        if "[" not in sym and sym[0].isupper()
    )