#### Dependencies:

In [2]:
from kcat_prediction import *

Before using kcat prediction for the first time, we have to download the ESM-1b model:

In [2]:
import esm
import torch

# Fetch the pretrained ESM-1b weights through torch hub (a local hub cache is reused
# when present). `torch` is imported explicitly so that this cell does not rely on the
# star import above happening to export it.
model, alphabet = torch.hub.load("facebookresearch/esm:v0.4.0", "esm1b_t33_650M_UR50S")

Using cache found in C:\Users\marle/.cache\torch\hub\facebookresearch_esm_v0.4.0


Example for calculating kcat predictions. You can enter multiple enzymes and substrates. Enzyme 1 will be paired with substrates 1/products 1, enzyme 2 will be paired with substrates 2/products 2, and so on.

Enter enzymes as amino acid sequences and substrates/products either as SMILES strings, KEGG Compound IDs, or InChI strings. If the reaction has multiple substrates or products, separate them with a semicolon (";").

In [5]:
import pandas as pd
from os.path import join  # explicit: this cell must not depend on a later import cell

# organism = "Arabidopsis thaliana"

# data = pickle.load(open("../../data/final_kcat_dataset_" + organism + ".pkl", "rb"))

# Load the validation part of the selected data split.
split = "full"
data = pd.read_pickle(join("../data", "splits", split, "val_data.pkl"))
print(data)

# Collapse each row's collection of metabolite IDs into one '#'-separated string
# (the fingerprint cell further down splits the strings on '#' again).
for id_column in ("substrate_IDs", "product_IDs"):
    data[id_column] = data[id_column].apply(lambda ids: "#".join(map(str, ids)))

# import plotnine
# (plotnine.ggplot(data, plotnine.aes(x="log10_kcat")) + 
#   plotnine.geom_histogram(binwidth=0.1,color='black', fill='white'))

          ECs              Organism Uniprot IDs      PMID      Type  \
0  1.14.14.43  Arabidopsis thaliana      P48421  11553739  wildtype   
1  1.14.14.45  Arabidopsis thaliana      O65782  11158532  wildtype   
2   3.2.1.147  Arabidopsis thaliana      Q8GRX1  19703694  wildtype   
3    3.2.1.21  Arabidopsis thaliana      Q8GRX1  19703694  wildtype   
4     6.3.2.2  Arabidopsis thaliana      P46309  15180996  wildtype   
5     6.3.2.2  Arabidopsis thaliana      P46309  15180996  wildtype   
6     6.3.2.2  Arabidopsis thaliana      P46309  15180996  wildtype   

        kcat Temperature   pH                                   Substrates  \
0   2.333333        28.0  7.6  O2;3-Indoleacetaldoxime;L-Cysteine;NADPH;H+   
1   0.883333        28.0  7.6  O2;NADPH;3-Indoleacetaldoxime;L-Cysteine;H+   
2  12.000000        37.0  4.5                                 Sinigrin;H2O   
3   7.300000        37.0  4.5           p-Nitrophenyl-beta-D-glucoside;H2O   
4   0.075000        25.0  7.0            

In [4]:
# Expose the sequence identifier under the plain name "Sequence ID", which is the
# column the prediction call reads (the "_x" suffix presumably stems from a merge).
column_renames = {"Sequence ID_x": "Sequence ID"}
data = data.rename(columns=column_renames)

In [5]:
# Run the prediction pipeline for every entry of `data`. The measured kcat values are
# handed over on log10 scale together with the reaction/sequence IDs and EC numbers.
df = kcat_predicton(
    substrates=data["substrate_IDs"].tolist(),
    products=data["product_IDs"].tolist(),
    enzymes=data["Sequence"].tolist(),
    values=np.log10(data["kcat"].tolist()),
    RID=data["Reaction ID"].tolist(),
    SID=data["Sequence ID"].tolist(),
    EC=data["ECs"].tolist(),
)

Step 1/3: Calculating numerical representations for all substrates and products.
Step 2/3: Calculating numerical representations for all enzymes.
.....2(a) Loading ESM-1b model.
.....2(b) Loading model parameters for task-specific model.
.....2(c) Calculating enzyme representations.
Step 3/3: Making predictions for kcat.
                                          substrates  \
0  InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-17...   
1  InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-17...   
2  InChI=1S/H2O/h1H2#InChI=1S/C10H17NO9S2/c1-2-3-...   
3  InChI=1S/C12H15NO8/c14-5-8-9(15)10(16)11(17)12...   
4  InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...   
5  InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...   
6  InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...   

                                            products  \
0  InChI=1S/C13H15N3O3S/c14-10(13(17)18)7-20-12(1...   
1  InChI=1S/C13H15N3O3S/c14-10(13(17)18)7-20-12(1...   
2  InChI=1S/C4H5NS/c1-2-3-5-4-6/h2H,1,3H2#InChI=1...   
3  I

In [6]:
# df.to_pickle('../data/output_' + organism + '.pkl', 'wb')

In [18]:
import sklearn.metrics as sk
import scipy as sci
import scipy.stats  # noqa: F401 -- makes sure the `sci.stats` submodule is actually loaded
from math import sqrt

# Keep only the rows flagged as complete. `.copy()` gives later cells an independent
# frame, so adding columns to `df` does not write into a slice of the original.
df = df[df["complete"] == True].copy()  # noqa: E712

# Measured ("value") and predicted ("kcat [s^(-1)]") columns as plain lists, first as
# stored and then exponentiated with base 10 (both are treated as log10 values here).
log_true = df["value"].tolist()
log_pred = df["kcat [s^(-1)]"].tolist()
lin_true = (10 ** df["value"]).tolist()
lin_pred = (10 ** df["kcat [s^(-1)]"]).tolist()

# `mean_squared_error` returns the MSE by default; the former `squared=True` keyword was
# redundant and is rejected by recent scikit-learn releases, so it is no longer passed.
rms = sqrt(sk.mean_squared_error(log_true, log_pred))
R2 = sk.r2_score(log_true, log_pred)
Pearson = sci.stats.pearsonr(log_true, log_pred)[0]

print(rms, R2, Pearson)

rms = sqrt(sk.mean_squared_error(lin_true, lin_pred))
R2 = sk.r2_score(lin_true, lin_pred)
Pearson = sci.stats.pearsonr(lin_true, lin_pred)[0]
abs_errors = abs(np.array(lin_true) - np.array(lin_pred))
MAE = np.mean(abs_errors)
MedAE = np.median(abs_errors)

print(rms, R2, Pearson, MAE, MedAE)

2.0853557608491196 -6.111261258387371 -0.2945819497398372
745.7519392187198 -27410.56564073479 -0.4565931622739521 361.62409315186767 43.09756088256836


In [None]:
# Training set used below as the reference for sequence identity and reaction similarity
# (later cells read its "Sequence" and "structural_fp" columns).
# The original line was missing a closing parenthesis and opened the pickle in text mode;
# `pd.read_pickle` opens the file correctly and matches how the validation data is loaded.
# NOTE: unpickling executes arbitrary code -- only load files from a trusted source.
data_train = pd.read_pickle("../data/train_df_kcat.pkl")

In [None]:
# The alignment cell below looks the protein sequences up under "Sequence".
df = df.rename(columns=dict(enzyme="Sequence"))

In [None]:
def calculate_identity_ignore_gaps(seq1, seq2):
    """Return the fraction of non-gap positions of ``seq1`` that are identical in ``seq2``.

    The two sequences are compared position by position (``zip`` stops at the shorter
    one); "-" marks a gap. Positions where ``seq1`` has a gap are neither counted as
    matches nor included in the denominator.

    Returns:
        A float in [0, 1]. ``0.0`` is returned when ``seq1`` has no non-gap position at
        all (empty or gap-only input) instead of raising ``ZeroDivisionError``.
    """
    non_gap_positions = sum(1 for residue in seq1 if residue != "-")
    if non_gap_positions == 0:
        return 0.0
    identical_residues = sum(1 for x, y in zip(seq1, seq2) if x == y and x != "-")
    return identical_residues / non_gap_positions

In [None]:
from Bio import Align
from Bio.Align import substitution_matrices

# Global pairwise alignment scored with BLOSUM62 and affine gap penalties.
aligner = Align.PairwiseAligner()
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
aligner.mode = "global"
aligner.extend_gap_score = -0.5
aligner.open_gap_score = -10

# 'U' is replaced by 'C' before aligning (presumably because the substitution matrix has
# no entry for 'U' -- verify). Duplicate training sequences are aligned only once: the
# maximum over the unique sequences equals the maximum over all rows.
train_sequences = {str(seq).replace("U", "C") for seq in data_train["Sequence"]}

df["max_identity"] = np.nan
identity_by_query = {}  # cache so identical test sequences are not re-aligned

for i, raw_sequence in df["Sequence"].items():
    query = str(raw_sequence).replace("U", "C")
    if query not in identity_by_query:
        identities = []
        for target in train_sequences:
            try:
                best_alignment = aligner.align(query, target)[0]
            except Exception as err:
                # Report the pair and skip it. The previous bare `except` fell through
                # and scored a stale (or undefined) alignment from an earlier pair.
                print("Alignment failed (%s); skipping pair:" % err, query, target)
                continue
            identities.append(calculate_identity_ignore_gaps(best_alignment[0], best_alignment[1]))
        # NaN when no training sequence could be aligned at all.
        identity_by_query[query] = max(identities, default=np.nan)
    # `.loc` writes into `df` itself; chained `df[col][i] = ...` may write to a copy.
    df.loc[i, "max_identity"] = identity_by_query[query]

In [None]:
# Convert the identity fraction to percent.
# NOTE: not idempotent -- re-running this cell multiplies the column by 100 again.
df["max_identity"] = df["max_identity"].mul(100)

In [None]:
import math  # used for the RMSE below; previously only imported in a later cell
from os.path import join  # likewise only imported explicitly further down

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize= (10,8))
plt.rcParams.update({'font.size': 28})

# Bins of maximal sequence identity (in percent) to any training enzyme.
splits = ["0-40%", "40-80%", "80-99%","99-100%"]
lower_bounds = [0,40,80,99]
upper_bounds = [40,80,99,100]

points1 = []    # R2 per bin
points2 = []    # RMSE per bin
n_points1 = []  # number of data points per bin

for label, lb, ub in zip(splits, lower_bounds, upper_bounds):
    # Both bounds are inclusive (as before), so an entry lying exactly on a shared
    # boundary is counted in two neighbouring bins.
    help_df = df.loc[df["max_identity"].between(lb, ub)]
    y_true = np.array(help_df["value"])
    y_pred = np.array(help_df["kcat [s^(-1)]"])
    n_kcat = len(y_pred)
    if n_kcat == 0:
        # r2_score raises on empty input; report the bin as undefined instead.
        R2, rmse = np.nan, np.nan
    else:
        R2 = sk.r2_score(y_true, y_pred)
        rmse = math.sqrt(np.mean(abs(y_true - y_pred)**2))
    print(len(y_true))
    print(label, R2, rmse)
    points1.append(R2)
    points2.append(rmse)
    n_points1.append(n_kcat)


ticks2 = np.array(range(len(splits)))
labs = splits
ax.set_xticks(ticks2)
ax.set_xticklabels(labs,  y= -0.03, fontsize=26)
ax.tick_params(axis='x', length=0, rotation = 0)

plt.ylim((-0.1,2.5))
plt.xlim((-0.2, 3.2))
# No legend: none of the artists carries a label, so plt.legend() only emitted a warning.
plt.ylabel('RMSE')
plt.xlabel('Enzyme sequence identity')
ax.yaxis.set_label_coords(-0.15, 0.5)
ax.xaxis.set_label_coords(0.5,-0.13)

# Dashed reference line at RMSE = 0.
plt.plot([-0.15,4], [0,0], color='grey', linestyle='dashed')

plt.plot(list(range(len(splits))), points2, c= "black", linewidth=2)

# One marker per bin, annotated with the number of data points it contains.
for i, (bin_rmse, bin_size) in enumerate(zip(points2, n_points1)):
    if np.isnan(bin_rmse):
        continue  # empty bin: nothing to draw
    plt.scatter(i, bin_rmse, c='black', marker="o", linewidths= 8)
    ax.annotate(bin_size, (i-0.08, bin_rmse+0.08), fontsize=17, c= "red", weight = "bold")

plt.savefig(join("..", "data", "sequence_identity.png"))
plt.show()

In [None]:
import matplotlib.colors as colors
import matplotlib.cm as cmx

# Reaction IDs in order of first appearance. A plain set() has no stable order for
# strings across kernel restarts, which made the colour of each reaction change per run.
uniq = list(df['Reaction ID'].unique())
hot = plt.get_cmap('hsv')
cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=hot)

# Measured vs. predicted values (back-transformed with 10**), one colour per reaction.
for i, reaction_id in enumerate(uniq):
    indx = df['Reaction ID'] == reaction_id
    plt.scatter(10**df.loc[indx, "value"], 10**df.loc[indx, "kcat [s^(-1)]"], s=15,
                color=scalarMap.to_rgba(i), label=reaction_id)

plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('Real value', fontsize=15)
plt.ylabel('Estimated value', fontsize=15)
plt.title('Predictions for A. thaliana data colored by reaction', fontsize=15)
# Identity line: points on it are predicted exactly.
plt.axline((1, 1), slope=1, c='red')
plt.show()

In [None]:
# Normal probability plot of the residuals (measured minus predicted value).
measured_values = np.array(df["value"].tolist())
predicted_values = np.array(df["kcat [s^(-1)]"].tolist())
sci.stats.probplot(np.subtract(measured_values, predicted_values), dist="norm", plot=plt)
plt.show()

In [None]:
# Group measured and predicted values by top-level EC class; list position k holds
# the entries of class k+1.
EC_kcat_pred = [[] for _ in range(6)]
EC_kcat = [[] for _ in range(6)]
for ind in df.index:
    try:
        # First character of the first element of the EC entry, read as the class number.
        ec_class = int(df["EC"][ind][0][0])
        EC_kcat[ec_class - 1].append(df["value"][ind])
        EC_kcat_pred[ec_class - 1].append(df["kcat [s^(-1)]"][ind])
    except IndexError:
        # Raised for an empty EC entry and also for a class number above 6;
        # such rows are left out of the per-class statistics.
        continue

In [None]:
from os.path import join  # explicit: only imported in a later cell otherwise

fig, ax = plt.subplots(figsize= (8,8))
plt.rcParams.update({'font.size': 28})

# One filled circle per EC class at (mean measured, mean predicted); the radius grows
# with the square root of the number of entries in the class.
for i, (measured, predicted) in enumerate(zip(EC_kcat, EC_kcat_pred)):
    if len(measured) == 0:
        continue  # np.mean of an empty list is NaN (plus a warning); nothing to draw
    mean_measured = np.mean(measured)
    mean_predicted = np.mean(predicted)

    circle = plt.Circle((mean_measured, mean_predicted),
                        np.sqrt(len(predicted))/300, color='navy', fill = True)
    ax.add_artist(circle)
    # The sixth class gets a different label offset than the others.
    dx, dy = (0.01, -0.05) if i == 5 else (0.03, -0.01)
    ax.annotate("EC"+ str(i+1), (mean_measured + dx, mean_predicted + dy),
                fontsize=17, c='red', weight = "bold")


ticks2 = [0.2, 0.6,1,1.4,1.8]
labs = ticks2
ax.set_xticks(ticks2)
ax.set_xticklabels(labs,  y= -0.03, fontsize=26)
ax.tick_params(axis='x', length=0, rotation = 0)

ax.set_yticks(ticks2)
ax.set_yticklabels(labs,  y= -0.03, fontsize=26)
ax.tick_params(axis='y', length=0, rotation = 0)

plt.ylim((0,2))
plt.xlim((0, 2))
# No legend: none of the artists carries a label, so plt.legend() only emitted a warning.
# The backslash of \log is escaped; '\l' is not a valid escape sequence in a plain string.
plt.xlabel('mean measured \n $k_{cat}$ value on $\\log_{10}$-scale')
plt.ylabel('mean predicted \n $k_{cat}$ value on $\\log_{10}$-scale')
ax.yaxis.set_label_coords(-0.15, 0.5)
ax.xaxis.set_label_coords(0.5,-0.13)

# Diagonal: mean prediction equals mean measurement.
plt.plot([0,2], [0,2], color='grey', alpha = 0.3, linestyle='dashed')
plt.savefig(join("..", "data", "EC_classes_mean_kcat.png"))
plt.show()

In [None]:
import pandas as pd
import numpy as np
from os.path import join
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from drfp import DrfpEncoder
# Working directory of the kernel at the time this cell runs.
CURRENT_DIR = os.getcwd()

# Folder holding the <KEGG ID>.mol files read by the reaction-site helpers.
# NOTE(review): this points two levels up, while the other data paths in this notebook
# use "../data" -- confirm which location is intended.
mol_folder = os.path.join(os.pardir, os.pardir, "data", "mol-files")
def _load_metabolite_mol(met):
    """Return an RDKit molecule for one metabolite identifier, or None if unavailable.

    Identifiers starting with "C" are treated as KEGG Compound IDs and read from
    ``<mol_folder>/<ID>.mol``; every other identifier is parsed as an InChI string.
    """
    if met.startswith("C"):  # also safe for an empty identifier, unlike met[0]
        try:
            return Chem.MolFromMolFile(join(mol_folder, met + '.mol'))
        except OSError:
            # The mol file for this KEGG ID could not be opened.
            return None
    return Chem.inchi.MolFromInchi(met)


def _build_reaction_site(metabolites, mol_to_string):
    """Join the string form of every metabolite with "."; NaN if any one is unavailable."""
    parts = []
    for met in metabolites:
        mol = _load_metabolite_mol(met)
        if mol is None:
            return np.nan
        parts.append(mol_to_string(mol))
    return ".".join(parts)


def get_reaction_site_smarts(metabolites):
    """Return the SMARTS patterns of all ``metabolites`` joined by ".".

    Args:
        metabolites: iterable of KEGG Compound IDs (leading "C") and/or InChI strings.

    Returns:
        The joined SMARTS string ("" for an empty input), or ``np.nan`` as soon as one
        metabolite cannot be turned into a molecule (missing or unparsable mol file,
        invalid or empty InChI).
    """
    return _build_reaction_site(metabolites, lambda mol: Chem.MolToSmarts(mol))


def get_reaction_site_smiles(metabolites):
    """Return the SMILES strings of all ``metabolites`` joined by ".".

    Same contract as ``get_reaction_site_smarts``, but each molecule is written as
    SMILES instead of SMARTS.
    """
    return _build_reaction_site(metabolites, lambda mol: Chem.MolToSmiles(mol))

def convert_fp_to_array(difference_fp_dict):
    """Expand a sparse ``{position: value}`` mapping into a dense vector of length 2048.

    Positions that do not occur in ``difference_fp_dict`` stay 0.
    """
    dense = np.zeros(2048)
    for position, value in difference_fp_dict.items():
        dense[position] = value
    return dense

In [None]:
# Structural reaction fingerprint (bit string) for every row; "" marks rows for which
# no fingerprint could be computed.
df["structural_fp"] = ""

for ind in df.index:
    substrates = df["substrates"][ind].split('#')
    products = df["products"][ind].split('#')
    left_site = get_reaction_site_smarts(substrates)
    right_site = get_reaction_site_smarts(products)
    if pd.isnull(left_site) or pd.isnull(right_site):
        # At least one metabolite could not be resolved: keep the "" placeholder.
        # Previously the fingerprint of the preceding row was stored here (or a
        # NameError was raised when this happened for the first row).
        continue
    rxn_forward = AllChem.ReactionFromSmarts(left_site + ">>" + right_site)
    structural_fp = Chem.rdChemReactions.CreateStructuralFingerprintForReaction(rxn_forward).ToBitString()
    # `.loc` writes into `df` itself; chained `df[col][ind] = ...` may write to a copy.
    df.loc[ind, "structural_fp"] = structural_fp

n_missing_fp = int((df["structural_fp"] == "").sum())
if n_missing_fp:
    print("No structural fingerprint for %s reaction(s)." % n_missing_fp)

In [None]:
import scipy
from scipy.spatial.distance import cdist  # explicit: `import scipy` alone need not load it


def _fingerprint_matrix(bitstrings):
    """Stack the first 3276 characters of each bit string into a 2-D integer array."""
    return np.array([[int(bit) for bit in fp[:3276]] for fp in bitstrings])


# Rows without a usable fingerprint string get NaN as similarity.
has_fp = df["structural_fp"].apply(lambda fp: isinstance(fp, str) and len(fp) > 0)

train_fps = _fingerprint_matrix(data_train["structural_fp"])
test_fps = _fingerprint_matrix(df.loc[has_fp, "structural_fp"])

# Highest Jaccard similarity (1 - Jaccard distance) of each reaction to any training
# reaction, obtained from one distance matrix instead of one cdist call per pair.
df["reaction_sim"] = np.nan
if len(test_fps) > 0:
    jaccard_sim = 1 - cdist(test_fps, train_fps, metric='jaccard')
    max_sim = jaccard_sim.max(axis=1)
    df.loc[has_fp, "reaction_sim"] = max_sim

# Min-max scale the similarities to the range [0, 1].
df["reaction_sim"] = df["reaction_sim"] - df["reaction_sim"].min()
df["reaction_sim"] = df["reaction_sim"] / df["reaction_sim"].max()


In [None]:
# Combined score: sequence identity rescaled from percent to a fraction, multiplied by
# the scaled reaction similarity.
identity_fraction = df["max_identity"] / 100
df["global_sim"] = identity_fraction * df["reaction_sim"]

In [None]:
import matplotlib.pyplot as plt
import math
help_df = df

# Bins of the scaled reaction similarity; both bounds are inclusive (as before), so a
# value lying exactly on a shared boundary is counted in two neighbouring bins.
sim_bins_lb = [0.0, 0.4, 0.8]
sim_bins_ub = [0.4, 0.8, 1]
r2_scores, n_points, pearson_r, rmse = [], [], [], []
for lb, ub in zip(sim_bins_lb, sim_bins_ub):
    help_df2 = help_df.loc[help_df["reaction_sim"].between(lb, ub)]
    pred = np.array(help_df2["kcat [s^(-1)]"])
    true = np.array(help_df2["value"])
    if len(pred) >= 2:
        r2_scores.append(sk.r2_score(true, pred))
        pearson_r.append(sci.stats.pearsonr(true, pred)[0])
    else:
        # R2 and Pearson r need at least two points; the library calls would raise.
        r2_scores.append(np.nan)
        pearson_r.append(np.nan)
    rmse.append(math.sqrt(np.mean(abs(true - pred)**2)) if len(pred) > 0 else np.nan)
    n_points.append(len(pred))
    print("%s - %s" % (lb, ub), r2_scores[-1], pearson_r[-1], rmse[-1], len(pred))


plt.rcParams.update({'font.size': 24})

fig, ax = plt.subplots(figsize= (8,6))

# One marker per bin, annotated with the number of data points it contains.
for i, (bin_rmse, bin_size) in enumerate(zip(rmse, n_points)):
    if np.isnan(bin_rmse):
        continue  # empty bin: nothing to draw
    plt.scatter(i, bin_rmse, c='navy', marker="o", linewidths= 8)
    ax.annotate(bin_size, (i-0.08, bin_rmse+0.05), fontsize=17, c= "black", weight = "bold")


plt.xlabel('Reaction similarity score')
plt.ylabel('RMSE')
ax.yaxis.set_label_coords(-0.2, 0.5)
ax.xaxis.set_label_coords(0.5,-0.23)

ticks2 = np.array(range(len(sim_bins_lb)))
labs = ["%s - %s" % (lb, ub) for lb, ub in zip(sim_bins_lb, sim_bins_ub)]
ax.set_xticks(ticks2)
ax.set_xticklabels(labs,  y= -0.03, fontsize=20)
ax.tick_params(axis='x', length=0, rotation = 0)

plt.ylim((0.5,2))
#plt.xlim((-0.5, 3.2))

# plt.plot([-0.49, 4], [0,0], color='grey', linestyle='dashed')
#plt.savefig(join("..","..", "data", "figures", "Reaction_Similarity_Score.eps"))
plt.show()

In [None]:
# df_kcat = pd.read_pickle(join("..", "data", "merged_and_grouped_kcat_dataset.pkl"))
# df2 = pd.DataFrame({"Reaction": df_kcat["Reaction ID"], "Sequence" : df_kcat["Sequence ID"],
#                   "kcats" :df_kcat["kcat_values"]})

# deviations = []
# x_value = []
# y_value = []

# for ind in df2.index:
#     kcats = df2["kcats"][ind]
#     if len(kcats) > 1 :
#         for i in range(len(kcats)):
#             for j in range(i+1, len(kcats)):
#                 if np.log10(float(kcats[i])) > -2.5 and np.log10(float(kcats[j])) > -2.5:
#                     deviations.append(abs(np.log10(float(kcats[i])) - np.log10(float(kcats[j]))))
#                     x_value.append(np.log10(float(kcats[i])))
#                     y_value.append(np.log10(float(kcats[j])))
                
                
# np.round(np.mean(deviations),2), np.round(10**np.mean(deviations),2)

# x_value = np.array(x_value)
# y_value = np.array(y_value)

# fig, ax = plt.subplots(figsize= (8,8))
# plt.rcParams.update({'font.size': 28})



# # x0, x1, y0, y1 = -3, 7, -3,7
# # plt.ylim(ymax = y1, ymin = y0)
# # plt.xlim(xmax = x1, xmin = x0)

# ax.tick_params(axis='x', length=10)
# ax.tick_params(axis='y', length=10)

# ax.yaxis.set_label_coords(-0.18, 0.5)
# ax.xaxis.set_label_coords(0.5, -0.1)

# plt.xticks([-2,0,2,4,6], ["$10^{-2}$", "$10^{0}$", "$10^{2}$", "$10^{4}$", "$10^{6}$"])
# plt.yticks([-2,0,2,4,6], ["$10^{-2}$", "$10^{0}$", "$10^{2}$", "$10^{4}$", "$10^{6}$"])

# plt.xlabel("Measured $k_{cat}$-values [$s^{-1}$]", fontsize = 22)
# plt.ylabel("Additional measurment for $k_{cat}$-values [$s^{-1}$] \n \
# for same enzyme-reaction pairs", fontsize = 22)

# plt.scatter(x_value, y_value, alpha = 0.4, s=30, c="navy")

In [None]:
# `organism` is only assigned in a commented-out line of the data-loading cell, so on a
# fresh kernel this cell raised a NameError. Reuse the name if the kernel defines it,
# otherwise fall back to the species named in this notebook's plot title.
organism = globals().get("organism", "Arabidopsis thaliana")

# Persist the predictions together with all columns added in the cells above.
df.to_pickle('../data/final_output_' + organism + '.pkl')

In [None]:
import plotnine

# Residual = measured value minus predicted value.
df["residuals"] = df["value"] - df["kcat [s^(-1)]"]

# Residuals against predictions, both axes limited to [-4, 4]. The plot object is the
# last expression of the cell so that the notebook renders it.
(
    plotnine.ggplot(df, plotnine.aes(x="kcat [s^(-1)]", y="residuals"))
    + plotnine.geom_point()
    + plotnine.xlim(-4, 4)
    + plotnine.ylim(-4, 4)
)