In [1]:
import pandas as pd
import os
import warnings
from chembl_webresource_client.new_client import new_client as client
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np
import pubchempy as pcp
import matplotlib.pyplot as plt
import pathlib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn import linear_model
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

In [4]:
# Suppress every warning category, from every module, from this point on.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries imported above; consider narrowing it with category=/module= arguments.
warnings.filterwarnings("ignore")

In [5]:
# Current working directory at the moment this cell runs; the project-relative
# paths below are resolved against it.
working_dir = os.getcwd()
# Relative location of the directory holding graphs intended for the written essay.
graphs_rel_path = r"project_results/graphs"
# Absolute path of that graphs directory (only the path is built; nothing is created here).
project_results_graphs = os.path.join(working_dir, graphs_rel_path)

In [1]:
def abs_file_path(rel_path):
    """Resolve a project-relative path against the current working directory.

    Args:
        rel_path: Relative path as a string. Backslash separators are converted
            to forward slashes before the path is joined.

    Returns:
        The current working directory joined with the normalised ``rel_path``.
    """
    normalised_rel_path = "/".join(rel_path.split("\\"))
    return os.path.join(os.getcwd(), normalised_rel_path)

In [1]:
def _load_smiles_cache(csv_path):
    """Read the SMILES cache at ``csv_path``; return an empty frame if it is missing or empty."""
    if not os.path.isfile(csv_path):
        return pd.DataFrame()
    try:
        return pd.read_csv(csv_path, index_col=0)
    except pd.errors.EmptyDataError:
        # A zero-byte file (e.g. left behind by an interrupted earlier run) holds no cache.
        return pd.DataFrame()


def _fetch_smiles(list_mol_id, database):
    """Download one SMILES string per identifier from ``database``."""
    if database == "ChEMBL":
        molecule = client.molecule
        return [
            molecule.filter(chembl_id=chembl_id)[0]["molecule_structures"]["canonical_smiles"]
            for chembl_id in list_mol_id
        ]
    if database == "PubChem":
        return [pcp.Compound.from_cid(cid).isomeric_smiles for cid in list_mol_id]
    # Fail before anything is written, so an unusable column never lands in the cache.
    raise ValueError(f"unknown database {database!r}; expected 'ChEMBL' or 'PubChem'")


def _store_smiles_column(smiles_csv, isozyme, list_smiles, csv_path):
    """Write ``list_smiles`` into the ``isozyme`` column of the cache and save it to disk."""
    fetched = pd.Series(list_smiles, dtype=object)
    # Columns of different isozymes may differ in length: grow the frame to the
    # longest one and let the shorter columns be padded with NaN.
    # Assumes the cache uses a positional 0..n-1 index, as written by this function.
    full_index = smiles_csv.index.union(fetched.index)
    updated = smiles_csv.reindex(full_index)
    updated[isozyme] = fetched.reindex(full_index)
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    updated.to_csv(csv_path, index=True)  # Save DataFrame with index


def smiles_from_mol_id(list_mol_id, database, isozyme):
    """Return the SMILES strings for an isozyme, using a CSV cache when possible.

    The cache lives at ``project_resources/smiles.csv`` under ``working_dir`` and
    holds one column per isozyme. The cache is keyed by ``isozyme`` only: when a
    non-empty column for ``isozyme`` exists it is returned as-is and neither
    ``list_mol_id`` nor ``database`` is consulted. Otherwise the SMILES are
    fetched from ``database``, stored in the cache, and returned.

    Args:
        list_mol_id: Molecule identifiers (ChEMBL ids or PubChem CIDs).
        database: ``"ChEMBL"`` or ``"PubChem"``.
        isozyme: Name of the cache column for this set of molecules.

    Returns:
        A list of SMILES strings with missing (NaN) entries removed.

    Raises:
        ValueError: If a fetch is required and ``database`` is not recognised.
    """
    smiles_csv_rel = r"project_resources/smiles.csv"
    smiles_csv_abs = os.path.join(working_dir, smiles_csv_rel)
    smiles_csv = _load_smiles_cache(smiles_csv_abs)

    if isozyme in smiles_csv.columns:
        cached = smiles_csv[isozyme].dropna()
        # A column without any value is treated as a cache miss so that an
        # empty column written by an earlier failed run gets repaired.
        if not cached.empty:
            return cached.tolist()

    list_smiles = _fetch_smiles(list_mol_id, database)
    _store_smiles_column(smiles_csv, isozyme, list_smiles, smiles_csv_abs)

    # NaN is the only value that is not equal to itself; drop such entries.
    return [smile for smile in list_smiles if smile == smile]

In [6]:
def fp_from_smiles(list_smiles, radius=2, n_bits=124, use_chirality=True):
    """Convert SMILES strings into Morgan fingerprint bit arrays.

    Args:
        list_smiles: Iterable of SMILES strings.
        radius: Morgan fingerprint radius.
        n_bits: Length of each fingerprint bit vector.
            NOTE(review): the default of 124 is kept from the original code;
            confirm it is intentional and not a typo for a larger size.
        use_chirality: Whether chirality is encoded in the fingerprint.

    Returns:
        A list with one NumPy array per input SMILES, in input order.

    Raises:
        ValueError: If a SMILES string cannot be parsed into a molecule.
    """
    list_fingerprint = []
    for position, smi in enumerate(list_smiles):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit signals a parse failure by returning None; report which
            # entry is broken instead of failing inside the fingerprint call.
            raise ValueError(f"could not parse SMILES at position {position}: {smi!r}")
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=radius, nBits=n_bits, useChirality=use_chirality
        )
        list_fingerprint.append(np.array(fingerprint))
    return list_fingerprint

In [1]:
def xy_split(fps_array, halflife, fps_arrays=False, test_size=0.2, random_state=42):
    """Split fingerprints and target values into train and test subsets.

    The split is performed on row positions, so features and targets stay aligned.

    Args:
        fps_array: Feature rows (one fingerprint per sample). Lists and tuples
            are converted to NumPy arrays; other inputs are indexed as given.
        halflife: Target values, one per sample; converted like ``fps_array``.
        fps_arrays: When True, additionally return the train/test fingerprints
            as two extra trailing values and skip the shape printout.
        test_size: Passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(x_train, x_test, y_train, y_test)``, followed by
        ``(fps_filtered_train, fps_filtered_test)`` when ``fps_arrays`` is True.

    Raises:
        ValueError: If features and targets differ in length.
    """
    # Plain sequences cannot be indexed with an array of positions; convert them.
    x = np.asarray(fps_array) if isinstance(fps_array, (list, tuple)) else fps_array
    y = np.asarray(halflife) if isinstance(halflife, (list, tuple)) else halflife

    n_x = x.shape[0] if hasattr(x, "shape") else len(x)
    n_y = len(y)
    if n_x != n_y:
        raise ValueError(f"fps_array has {n_x} rows but halflife has {n_y} values")

    idx_train, idx_test = train_test_split(np.arange(n_y), test_size=test_size, random_state=random_state)

    x_train = x[idx_train]
    x_test = x[idx_test]

    y_train = y[idx_train]
    y_test = y[idx_test]

    if fps_arrays:
        # Indexed again so the extra outputs are independent of x_train/x_test.
        fps_filtered_train = x[idx_train]
        fps_filtered_test = x[idx_test]
        return x_train, x_test, y_train, y_test, fps_filtered_train, fps_filtered_test

    print(f"shapes of variables ... fps array: {x.shape}, halflife: {y.shape},\n x_train: {x_train.shape}, x_test: {x_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
    return x_train, x_test, y_train, y_test

In [8]:
def _diagonal_span(x_axis, x_min, x_max):
    """Return (start, stop) for the y = x guide line, or None if no span can be derived."""
    if x_min is not None and x_max is not None:
        return x_min, x_max
    x_values = np.ravel(np.asarray(x_axis, dtype=float))
    if x_values.size == 0:
        return None
    start = np.min(x_values) if x_min is None else x_min
    stop = np.max(x_values) if x_max is None else x_max
    return start, stop


def create_scatter_plot(x_axis, y_axis, plot_title, x_label, y_label, diag=False, error_bars=False, x_min=None, y_min=None, x_max=None, y_max=None, save_dir=None, save_file_name=None):
    """Draw, optionally save, and show a scatter plot.

    Args:
        x_axis, y_axis: Values displayed on the respective axis.
        plot_title: Figure title.
        x_label, y_label: Axis labels.
        diag: When True, draw a dotted y = x line. It spans ``x_min``..``x_max``;
            a missing limit falls back to the range of ``x_axis``.
        error_bars: ``False``/``None`` for no error bars, an array-like of
            y errors (one per point, or a scalar) to draw them, or a callable
            that is invoked to draw custom error bars.
        x_min, y_min, x_max, y_max: Axis limits; ``None`` leaves a limit automatic.
        save_dir, save_file_name: When both are given the figure is saved to
            ``save_dir/save_file_name``; ``save_dir`` is created if missing.

    Raises:
        TypeError: If ``error_bars`` is ``True``, which carries no error values.
    """
    # TODO: the long parameter list could be bundled into a small config class.
    is_flag = isinstance(error_bars, (bool, np.bool_))
    if is_flag and error_bars:
        # Checked before drawing so that no half-finished figure is left behind.
        raise TypeError("error_bars=True carries no error values; pass an array-like of y errors or a callable")

    plt.scatter(x_axis, y_axis, edgecolors=None, c='b', alpha=0.2)

    if diag:
        span = _diagonal_span(x_axis, x_min, x_max)
        if span is not None:
            diag_points = np.linspace(span[0], span[1])
            plt.plot(diag_points, diag_points, linestyle='dotted')

    if callable(error_bars):
        error_bars()
    elif not is_flag and error_bars is not None and np.size(error_bars) > 0:
        # fmt='none' draws the bars only; the points come from the scatter above.
        plt.errorbar(x_axis, y_axis, yerr=error_bars, fmt='none', ecolor='b', alpha=0.2)

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.suptitle(plot_title)
    if save_dir and save_file_name:
        os.makedirs(save_dir, exist_ok=True)
        plt.savefig(os.path.join(save_dir, save_file_name))
    plt.show()

In [10]:
def to_rdkit_fingerprint(fps):
    """Convert 0/1 fingerprint arrays into RDKit bit vectors.

    Args:
        fps: Iterable of one-dimensional array-likes; every non-zero entry is
            treated as a set bit.

    Returns:
        A list with one object per input fingerprint, created by
        ``DataStructs.cDataStructs.CreateFromBitString``.
    """
    rdkit_fingerprints = []
    for prnt in fps:
        # Normalise to booleans first: converting float or bool arrays straight
        # to text would yield tokens such as "1.0" or "True" and corrupt the
        # bit string.
        bits = np.asarray(prnt).astype(bool)
        bitstring = "".join(np.where(bits, "1", "0"))
        fp = DataStructs.cDataStructs.CreateFromBitString(bitstring)
        rdkit_fingerprints.append(fp)
    return rdkit_fingerprints

In [1]:
def tanimoto(fps, fp2s):
    """Return, for each fingerprint in ``fps``, its highest Tanimoto similarity to ``fp2s``.

    Both inputs are first passed through ``to_rdkit_fingerprint``. The first 25
    results and the total count are printed before the list is returned.

    Args:
        fps: Fingerprints to score.
        fp2s: Fingerprints to compare against.

    Returns:
        A list with one maximum similarity per entry of ``fps``.
    """
    query_fps = to_rdkit_fingerprint(fps)
    reference_fps = to_rdkit_fingerprint(fp2s)
    nearest_similarities = [
        max([DataStructs.TanimotoSimilarity(query, reference) for reference in reference_fps])
        for query in query_fps
    ]
    print(nearest_similarities[:25], f"length: {len(nearest_similarities)}")
    return nearest_similarities

In [1]:
def _search_space(type_ml_use):
    """Return ``(estimator, parameter candidates)`` for the requested model family."""
    # TODO: the candidate values could be sampled (e.g. with np.random.randint)
    # instead of being listed by hand.
    if type_ml_use == 'linear':
        # TODO: adjust the linear hyperparameters so tuning beats the untuned model.
        param_grid = {
            'fit_intercept': [True],
            'alpha': [1e-5, 1e-4, 1e-3, 1e-2],
            'l1_ratio': [0, 0.1, 0.5, 0.9, 1]
        }
        return linear_model.ElasticNet(), param_grid

    if type_ml_use == 'KRR':
        param_grid = {
            "alpha": np.logspace(-4, 1, 20),
            "gamma": np.logspace(-14, 0, 20),
            "kernel": ['linear', 'laplacian', 'rbf']
        }
        return KernelRidge(), param_grid

    if type_ml_use == 'GB':
        param_grid = {
            'n_estimators': [10, 20, 50, 200, 400],
            'learning_rate': [0.02, 0.05],
            'max_depth': [1, 2, 3, 5],
        }
        return GradientBoostingRegressor(), param_grid

    if type_ml_use == 'RF':
        param_grid = {
            'max_depth': [None, 2, 3, 5, 10],
            # 1.0 (= use all features) replaces the former 'auto' option, which
            # current scikit-learn releases reject for regressors.
            'max_features': [1.0, 'sqrt', 'log2'],
            'n_estimators': [10, 20, 50, 100, 200],
        }
        return RandomForestRegressor(), param_grid

    if type_ml_use == 'ANN':
        param_grid = {
            'learning_rate_init': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05],
            'hidden_layer_sizes': [[5], [10], [20], [50], [5]*2, [10]*2, [20]*2, [50]*2, [5]*3, [10]*3]
        }
        return MLPRegressor(), param_grid

    raise ValueError(
        f"unknown type_ml_use {type_ml_use!r}; expected one of 'linear', 'KRR', 'GB', 'RF', 'ANN'"
    )


def param_tuning(x_train, x_test, y_train, y_test, type_ml_use, show_print=False):
    """Tune one model family with a randomized search and predict with the best model.

    Args:
        x_train, y_train: Training features and targets used for the search.
        x_test, y_test: Held-out features and targets.
        type_ml_use: Model family: 'linear', 'KRR', 'GB', 'RF' or 'ANN'.
        show_print: Accepted for compatibility; it currently has no effect
            (the best-estimator line is always printed).

    Returns:
        ``(y_train_predict, y_test_predict, abs_error)`` where ``abs_error`` is
        the element-wise absolute difference between the test predictions and
        ``y_test``.

    Raises:
        ValueError: If ``type_ml_use`` is not a supported model family.
    """
    reg, param_grid = _search_space(type_ml_use)

    # Neither the search nor the folds are seeded, so repeated calls can select
    # different hyperparameters.
    grid = RandomizedSearchCV(reg, param_grid, cv=KFold(n_splits=5, shuffle=True), verbose=0)
    grid.fit(x_train, y_train)

    # With the default refit=True the search has already retrained the best
    # candidate on all of x_train, so no further fit is needed.
    best_reg = grid.best_estimator_
    y_train_predict = best_reg.predict(x_train)
    y_test_predict = best_reg.predict(x_test)
    abs_error = np.abs(y_test_predict-y_test)
    print(f"     best {type_ml_use} hyperparams: {best_reg}")

    return y_train_predict, y_test_predict, abs_error

In [1]:
def mol_predict_and_std(models, x_train, x_test, y_train, y_test, n_repeats=3):
    """Repeat hyperparameter tuning per model and summarise the test predictions.

    For every model name, ``param_tuning`` is run ``n_repeats`` times and the
    resulting test predictions are combined.

    Args:
        models: Iterable of model names understood by ``param_tuning``.
        x_train, x_test, y_train, y_test: Data passed through to ``param_tuning``.
        n_repeats: Number of tuning runs per model (must be at least 1).

    Returns:
        Three dicts keyed by model name:
        the per-sample mean of the test predictions over the runs,
        the per-sample standard deviation over the runs (``np.std`` default,
        i.e. population standard deviation), and the root-mean-square deviation
        between the mean prediction and ``y_test``.

    Raises:
        ValueError: If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError(f"n_repeats must be at least 1, got {n_repeats}")

    y_test_avg_predict_dict = {}
    std_dict = {}
    rmsd_dict = {}
    for model in models:
        # Only the test predictions (second return value) are needed here.
        y_test_predicts_array = np.array([
            param_tuning(x_train, x_test, y_train, y_test, model)[1]
            for _ in range(n_repeats)
        ])

        y_test_avg_predict = np.average(y_test_predicts_array, axis=0)
        standard_deviation = np.std(y_test_predicts_array, axis=0)
        # root-mean-square deviation
        rmsd = np.sqrt(np.average(np.square(y_test_avg_predict-y_test)))

        y_test_avg_predict_dict[model] = y_test_avg_predict
        std_dict[model] = standard_deviation
        rmsd_dict[model] = rmsd
    return y_test_avg_predict_dict, std_dict, rmsd_dict