In [2]:
import pandas as pd
import numpy as np
from os.path import join
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from ete3 import NCBITaxa
import random
random.seed(10)
import torch
import esm
from bioservices import *
from data_preprocessing import *
from functions_and_dicts_data_preprocessing_GNN import *
from build_GNN import *
import warnings
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
warnings.filterwarnings('ignore')
datasets_dir = "../../data"

CURRENT_DIR = os.getcwd()

  from .autonotebook import tqdm as notebook_tqdm


## 1. Loading in Sabio data

#### Loading Sabio data

In [3]:
organism = "Seed plants"

df_Sabio = pd.read_table(join(datasets_dir, "kcat_model_" + organism + ".tsv"))

df_Sabio["kcat"] = df_Sabio["kcat"].astype('float')
df_Sabio["PMID"] = df_Sabio["PMID"].astype('Int64')

df_Sabio["substrate_IDs"] = df_Sabio["substrate_IDs"].str.split('#').apply(set)
df_Sabio["product_IDs"] = df_Sabio["product_IDs"].str.split('#').apply(set)

df_Sabio["Type"][df_Sabio['Type'].str.contains("wildtype")] = "wildtype"
df_Sabio["Type"][df_Sabio['Type'].str.contains("mutant")] = "mutant"

print("Number of data points: %s" % len(df_Sabio))
print("Number of UniProt IDs: %s" % len(set(df_Sabio["Uniprot IDs"])))

df_kcat = df_Sabio

Number of data points: 1344
Number of UniProt IDs: 370


In [66]:
glucosinolates = pd.read_table(join(datasets_dir,"glucosinolates.tsv"))["Entry"].tolist()
df_validation = df_kcat[df_kcat["Uniprot IDs"].isin(glucosinolates)]

In [69]:
df_validation

Unnamed: 0,ECs,Organism,Uniprot IDs,PMID,Type,kcat,Temperature,pH,Substrates,Products,substrate_IDs,product_IDs,Main Substrate,Sequence
295,1.14.14.43,Arabidopsis thaliana,P48421,11553739,wildtype,2.333333,28.0,7.6,O2;3-Indoleacetaldoxime;L-Cysteine;NADPH;H+,NADP+;S-(Indolylmethylthiohydroximoyl)-L-cyste...,{InChI=1S/C10H10N2O/c13-12-6-5-8-7-11-10-4-2-1...,{InChI=1S/C13H15N3O3S/c14-10(13(17)18)7-20-12(...,InChI=1S/C10H10N2O/c13-12-6-5-8-7-11-10-4-2-1-...,MEDIIIGVVALAAVLLFFLYQKPKTKRYKLPPGPSPLPVIGNLLQL...
296,1.14.14.45,Arabidopsis thaliana,O65782,11158532,wildtype,0.8833333,28.0,7.6,O2;NADPH;3-Indoleacetaldoxime;L-Cysteine;H+,S-(Indolylmethylthiohydroximoyl)-L-cysteine;NA...,{InChI=1S/C10H10N2O/c13-12-6-5-8-7-11-10-4-2-1...,{InChI=1S/C13H15N3O3S/c14-10(13(17)18)7-20-12(...,InChI=1S/C10H10N2O/c13-12-6-5-8-7-11-10-4-2-1-...,MDLLLIIAGLVAAAAFFFLRSTTKKSLRLPPGPKGLPIIGNLHQME...
450,2.1.1.165,Arabidopsis thaliana,Q0WP12,19419967,wildtype,58000000.0,25.0,-,S-Adenosyl-L-methionine;Thiocyanate,Methyl thiocyanate;S-Adenosyl-L-homocysteine,{InChI=1S/C15H22N6O5S/c1-27(3-2-7(16)15(24)25)...,"{InChI=1S/C2H3NS/c1-4-2-3/h1H3, InChI=1S/C14H2...",InChI=1S/CHNS/c2-1-3/h3H/p-1,MAEEQQNSDQSNGGNVIPTPEEVATFLHKTVEEGGWEKCWEEEITP...
453,2.1.1.165,Arabidopsis thaliana,Q0WP12,19419967,wildtype,2200000.0,25.0,-,S-Adenosyl-L-methionine;Cl-,S-Adenosyl-L-homocysteine;Methyl chloride,"{InChI=1S/ClH/h1H/p-1, InChI=1S/C15H22N6O5S/c1...","{InChI=1S/CH3Cl/c1-2/h1H3, InChI=1S/C14H20N6O5...",InChI=1S/ClH/h1H/p-1,MAEEQQNSDQSNGGNVIPTPEEVATFLHKTVEEGGWEKCWEEEITP...
455,2.1.1.165,Arabidopsis thaliana,Q0WP12,19419967,wildtype,70000000.0,25.0,-,S-Adenosyl-L-methionine;Hydrosulfide,S-Adenosyl-L-homocysteine;Methanethiol,{InChI=1S/C15H22N6O5S/c1-27(3-2-7(16)15(24)25)...,"{InChI=1S/CH4S/c1-2/h2H,1H3, InChI=1S/C14H20N6...",InChI=1S/H2S/h1H2/p-1,MAEEQQNSDQSNGGNVIPTPEEVATFLHKTVEEGGWEKCWEEEITP...
457,2.1.1.165,Arabidopsis thaliana,Q0WP12,19419967,wildtype,36000000.0,25.0,-,S-Adenosyl-L-methionine;Hydrosulfide,Methanethiol;S-Adenosyl-L-homocysteine,{InChI=1S/C15H22N6O5S/c1-27(3-2-7(16)15(24)25)...,"{InChI=1S/CH4S/c1-2/h2H,1H3, InChI=1S/C14H20N6...",InChI=1S/C15H22N6O5S/c1-27(3-2-7(16)15(24)25)4...,MAEEQQNSDQSNGGNVIPTPEEVATFLHKTVEEGGWEKCWEEEITP...
878,3.2.1.147,Arabidopsis thaliana,P37702,19703694,wildtype,2.3,37.0,4.5,Sinigrin;H2O,beta-D-Glucose;Allyl isothiocyanate;Sulfate,"{InChI=1S/H2O/h1H2, InChI=1S/C10H17NO9S2/c1-2-...","{InChI=1S/H2O4S/c1-5(2,3)4/h(H2,1,2,3,4), InCh...","InChI=1S/C10H17NO9S2/c1-2-3-6(11-20-22(16,17)1...",MKLLMLAFVFLLALATCKGDEFVCEENEPFTCNQTKLFNSGNFEKG...
879,3.2.1.147,Arabidopsis thaliana,Q8GRX1,19703694,wildtype,12.0,37.0,4.5,Sinigrin;H2O,Sulfate;Allyl isothiocyanate;beta-D-Glucose,"{InChI=1S/H2O/h1H2, InChI=1S/C10H17NO9S2/c1-2-...","{InChI=1S/H2O4S/c1-5(2,3)4/h(H2,1,2,3,4), InCh...","InChI=1S/C10H17NO9S2/c1-2-3-6(11-20-22(16,17)1...",MAIPKAHYSLAVLVLLFVVVSSSQKVCNPECKAKEPFHCDNTHAFN...
943,3.2.1.21,Arabidopsis thaliana,P37702,19703694,wildtype,1.2,37.0,4.5,p-Nitrophenyl-beta-D-glucoside;H2O,p-Nitrophenol;beta-D-Glucose,{InChI=1S/C12H15NO8/c14-5-8-9(15)10(16)11(17)1...,{InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-...,InChI=1S/C12H15NO8/c14-5-8-9(15)10(16)11(17)12...,MKLLMLAFVFLLALATCKGDEFVCEENEPFTCNQTKLFNSGNFEKG...
944,3.2.1.21,Arabidopsis thaliana,Q8GRX1,19703694,wildtype,7.3,37.0,4.5,p-Nitrophenyl-beta-D-glucoside;H2O,p-Nitrophenol;beta-D-Glucose,{InChI=1S/C12H15NO8/c14-5-8-9(15)10(16)11(17)1...,{InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-...,InChI=1S/C12H15NO8/c14-5-8-9(15)10(16)11(17)12...,MAIPKAHYSLAVLVLLFVVVSSSQKVCNPECKAKEPFHCDNTHAFN...


#### Removing duplicates

In [3]:
droplist = []

for ind in df_kcat.index:
    UID, kcat = df_kcat["Uniprot IDs"][ind], df_kcat["kcat"][ind]
    help_df = df_kcat.loc[df_kcat["Uniprot IDs"] == UID].loc[df_kcat["kcat"] == kcat]
    
    if len(help_df) > 1:
        droplist = droplist + list(help_df.index)[1:]
        

In [4]:
# droplist = []

# for ind in df_kcat.index:
#     UID, kcat, temp, pH, main_substrate, substrates, products  = df_kcat["Uniprot IDs"][ind], df_kcat["kcat"][ind], df_kcat["Temperature"][ind], df_kcat["pH"][ind], df_kcat["Main Substrate"][ind], df_kcat["substrate_IDs"][ind], df_kcat["product_IDs"][ind]
#     help_df = df_kcat.loc[df_kcat["Uniprot IDs"] == UID].loc[df_kcat["kcat"] == kcat].loc[df_kcat["Temperature"] == temp].loc[df_kcat["pH"] == pH].loc[df_kcat["Main Substrate"] == main_substrate].loc[df_kcat["substrate_IDs"] == substrates].loc[df_kcat["product_IDs"] == products]
    
#     if len(help_df) > 1:
#         droplist = droplist + list(help_df.index)[1:]

In [5]:
df_kcat.drop(list(set(droplist)), inplace = True)
print("Dropping %s data points, because they are duplicated." % len(set(droplist)))
df_kcat.reset_index(inplace = True, drop = True)
df_kcat

Dropping 104 data points, because they are duplicated.


Unnamed: 0,ECs,Organism,Uniprot IDs,PMID,Type,kcat,Temperature,pH,Substrates,Products,substrate_IDs,product_IDs,Main Substrate,Sequence
0,1,Petunia hybrida,Q15GI3,16782809,wildtype,0.300000,28.0,6.5,Coniferyl acetate;NADPH,Acetate;NADP+;Isoeugenol,{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,"{InChI=1S/C2H4O2/c1-2(3)4/h1H3,(H,3,4)/p-1, In...",InChI=1S/C12H14O4/c1-9(13)16-7-3-4-10-5-6-11(1...,MTTGKGKILILGATGYLGKYMVKASISLGHPTYAYVMPLKKNSDDS...
1,1,Ocimum basilicum,Q15GI4,16782809,wildtype,0.700000,28.0,6.5,NADPH;Coniferyl acetate,Eugenol;NADP+;Acetate,{InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-1...,{InChI=1S/C10H12O2/c1-3-4-8-5-6-9(11)10(7-8)12...,InChI=1S/C12H14O4/c1-9(13)16-7-3-4-10-5-6-11(1...,MEENGMKSKILIFGGTGYIGNHMVKGSLKLGHPTYVFTRPNSSKTT...
2,1.1.1,Cochlearia officinalis,A7DY56,24583623,wildtype,1.010000,30.0,5.0,NADPH;3-Methylcyclohexanone;H+,NADP+;3-Methylcyclohexanol,"{InChI=1S/p+1, InChI=1S/C21H30N7O17P3/c22-17-1...","{InChI=1S/C7H14O/c1-6-3-2-4-7(8)5-6/h6-8H,2-5H...",InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-17...,MANLRESSRDKSRWSLEGMTALVTGGSKGIGEAVVEELAMLGARVH...
3,1.1.1,Cochlearia officinalis,A7DY56,24583623,wildtype,11.800000,30.0,5.0,3-Methylcyclohexanone;H+;NADH,NAD+;3-Methylcyclohexanol,"{InChI=1S/p+1, InChI=1S/C21H29N7O14P2/c22-17-1...","{InChI=1S/C7H14O/c1-6-3-2-4-7(8)5-6/h6-8H,2-5H...",InChI=1S/C21H29N7O14P2/c22-17-12-19(25-7-24-17...,MANLRESSRDKSRWSLEGMTALVTGGSKGIGEAVVEELAMLGARVH...
4,1.1.1,Cochlearia officinalis,A7DY56,24583623,wildtype,0.160000,30.0,9.5,3-Methylcyclohexanol;NADP+,3-Methylcyclohexanone;H+;NADPH,"{InChI=1S/C7H14O/c1-6-3-2-4-7(8)5-6/h6-8H,2-5H...","{InChI=1S/p+1, InChI=1S/C21H30N7O17P3/c22-17-1...",InChI=1S/C21H28N7O17P3/c22-17-12-19(25-7-24-17...,MANLRESSRDKSRWSLEGMTALVTGGSKGIGEAVVEELAMLGARVH...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1235,6.3.2.2,Arabidopsis thaliana,P46309,15180996,wildtype,0.101667,25.0,7.0,L-Glutamate;L-Cysteine;ATP,Phosphate;ADP;gamma-L-Glutamyl-L-cysteine,"{InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,...","{InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4), InCh...","InChI=1S/C5H9NO4/c6-3(5(9)10)1-2-4(7)8/h3H,1-2...",MALLSQAGGSYTVVPSGVCSKAGTKAVVSGGVRNLDVLRMKEAFGS...
1236,6.3.2.2,Arabidopsis thaliana,P46309,15180996,wildtype,0.113333,25.0,7.0,ATP;L-Glutamate;L-Cysteine,gamma-L-Glutamyl-L-cysteine;ADP;Phosphate,"{InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,...","{InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4), InCh...",InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,MALLSQAGGSYTVVPSGVCSKAGTKAVVSGGVRNLDVLRMKEAFGS...
1237,6.3.2.52,Arabidopsis thaliana,Q8GZ29,29462792,wildtype,0.073333,-,-,(-)-Jasmonic acid;Glutamine;ATP,Diphosphate;Jasmonoyl-glutamine;AMP,{InChI=1S/C12H18O3/c1-2-3-4-5-10-9(8-12(14)15)...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3...",InChI=1S/C12H18O3/c1-2-3-4-5-10-9(8-12(14)15)6...,MLPKFDPTNQKACLSLLEDLTTNVKQIQDSVLEAILSRNAQTEYLR...
1238,6.3.2.52,Arabidopsis thaliana,Q8GZ29,29462792,wildtype,0.066667,-,-,Glutamine;ATP;(-)-Jasmonic acid,Diphosphate;AMP;Jasmonoyl-glutamine,{InChI=1S/C12H18O3/c1-2-3-4-5-10-9(8-12(14)15)...,"{InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3...","InChI=1S/C5H10N2O3/c6-3(5(9)10)1-2-4(7)8/h3H,1...",MLPKFDPTNQKACLSLLEDLTTNVKQIQDSVLEAILSRNAQTEYLR...


#### Removing top and bottom 3% of kcat values

In [6]:
def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

find_outliers_IQR(df_kcat["kcat"])

print(df_kcat['kcat'].quantile(0.03),  df_kcat['kcat'].quantile(0.97))

0.001 1649.7999999999956


In [7]:
print(len(df_kcat))
df_kcat = df_kcat[(df_kcat['kcat'] > df_kcat['kcat'].quantile(0.03)) & (df_kcat['kcat'] < df_kcat['kcat'].quantile(0.97))]
df_kcat.reset_index(inplace = True, drop = True)
print(len(df_kcat))

1240
1161


In [8]:
todrop= []

for ind in df_kcat.index:
    UID = df_kcat["Uniprot IDs"][ind]
    if len(UID.split(';')) > 1:
        todrop.append(ind)
        print(df_kcat["Uniprot IDs"][ind])
        print(todrop)
        
df_kcat.drop(todrop, inplace=True)
df_kcat.reset_index(inplace = True, drop = True)

Q41736;P00221
[281]
Q41736;P00221
[281, 282]
Q41736;P00221
[281, 282, 283]
P19866;P12860
[281, 282, 283, 297]
P19866;P12860
[281, 282, 283, 297, 298]
O04385;O23760
[281, 282, 283, 297, 298, 408]
O04385;O23760
[281, 282, 283, 297, 298, 408, 409]
P09342;P09114
[281, 282, 283, 297, 298, 408, 409, 436]
P09342;P09114
[281, 282, 283, 297, 298, 408, 409, 436, 437]
Q42588;P32260
[281, 282, 283, 297, 298, 408, 409, 436, 437, 478]
Q42588;P16703
[281, 282, 283, 297, 298, 408, 409, 436, 437, 478, 479]
A0A2U7XUE3;Q9FEY5
[281, 282, 283, 297, 298, 408, 409, 436, 437, 478, 479, 499]
A0A2U7XUE3;Q9FEY5
[281, 282, 283, 297, 298, 408, 409, 436, 437, 478, 479, 499, 500]
Q9SC13;P60038
[281, 282, 283, 297, 298, 408, 409, 436, 437, 478, 479, 499, 500, 551]
Q42588;P32260
[281, 282, 283, 297, 298, 408, 409, 436, 437, 478, 479, 499, 500, 551, 581]
P55241;Q947C0
[281, 282, 283, 297, 298, 408, 409, 436, 437, 478, 479, 499, 500, 551, 581, 630]
P23509;P55241;Q947C0
[281, 282, 283, 297, 298, 408, 409, 436, 437, 478, 

In [9]:
df_kcat["substrate_IDs"] = df_kcat["substrate_IDs"].apply(lambda x: (set(x)))
df_kcat["product_IDs"] = df_kcat["product_IDs"].apply(lambda x: (set(x)))

In [10]:
df_kcat.to_pickle(join(datasets_dir, "kcat_data_merged2.pkl"))

## 2. Assigning IDs to every unique sequence and to every unique reaction in the dataset

#### Creating DataFrames for all sequences and for all reactions

In [11]:
# df_reactions = pd.DataFrame({"substrates": df_kcat["substrate_IDs"],
#                             "products" : df_kcat["product_IDs"]})

# df_reactions = df_reactions.loc[df_reactions["substrates"] != set([])]
# df_reactions = df_reactions.loc[df_reactions["products"] != set([])]


# droplist = []
# for ind in df_reactions.index:
#     sub_IDs, pro_IDs = df_reactions["substrates"][ind], df_reactions["products"][ind]
#     help_df = df_reactions.loc[df_reactions["substrates"] == sub_IDs].loc[df_reactions["products"] == pro_IDs]
#     if len(help_df):
#         for ind in list(help_df.index)[1:]:
#             droplist.append(ind)
            
# df_reactions.drop(list(set(droplist)), inplace = True)
# df_reactions.reset_index(inplace = True, drop =True)

# df_reactions["Reaction ID"] = ["Reaction_" + str(ind) for ind in df_reactions.index]

In [12]:
# df_sequences = pd.DataFrame(data = {"Sequence" : df_kcat["Sequence"].unique()})
# df_sequences = df_sequences.loc[~pd.isnull(df_sequences["Sequence"])]
# df_sequences.reset_index(inplace = True, drop = True)
# df_sequences["Sequence ID"] = ["Sequence_" + str(ind) for ind in df_sequences.index]

# df_sequences

#### Calculating maximal kcat value for each reaction and sequence

In [13]:
# df_reactions["max_kcat_for_RID"] = np.nan
# for ind in df_reactions.index:
#     df_reactions["max_kcat_for_RID"][ind] = max(df_kcat.loc[df_kcat["substrate_IDs"] == df_reactions["substrates"][ind]].loc[df_kcat["product_IDs"] == df_reactions["products"][ind]]["kcat"])

In [14]:
# df_sequences["max_kcat_for_UID"] = np.nan
# for ind in df_sequences.index:
#     df_sequences["max_kcat_for_UID"][ind] = max(df_kcat.loc[df_kcat["Sequence"] == df_sequences['Sequence'][ind]]["kcat"])

#### Calculating the sum of the molecular weights of all substrates and of all products

In [15]:
# df_reactions["MW_frac"] = np.nan

# for ind in df_reactions.index:
#     substrates = list(df_reactions["substrates"][ind])
#     products = list(df_reactions["products"][ind])
    
#     mw_subs = mw_mets(metabolites = substrates)
#     mw_pros = mw_mets(metabolites = products)
    
#     if mw_subs == np.nan or mw_pros == np.nan:
#         df_reactions["MW_frac"][ind] = np.inf
#     if mw_pros != 0:
#         df_reactions["MW_frac"][ind] = mw_subs/mw_pros
#     else:
#         df_reactions["MW_frac"][ind] = np.inf
        
# df_reactions

#### Calculating enzyme, reaction and substrate features

In [16]:
# model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")

In [17]:
# #creating model input:
# df_sequences["model_input"] = [seq[:1022] for seq in df_sequences["Sequence"]]
# model_input = [(df_sequences["Sequence ID"][ind], df_sequences["model_input"][ind]) for ind in df_sequences.index]
# seqs = [model_input[i][1] for i in range(len(model_input))]
# #loading ESM-2 model:
# print(".....2(a) Loading ESM-2 model.")
# model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
# batch_converter = alphabet.get_batch_converter()
# #convert input into batches:

# #Calculate ESM-2 representations
# print(".....2(b) Calculating enzyme representations.")
# df_sequences["Enzyme rep"] = ""

# for ind in df_sequences.index:
#     print(ind,"/",len(df_sequences))    
#     batch_labels, batch_strs, batch_tokens = batch_converter([(df_sequences["Sequence ID"][ind], df_sequences["model_input"][ind])])
#     with torch.no_grad():
#         results = model(batch_tokens, repr_layers=[33])
#     df_sequences["Enzyme rep"][ind] = results["representations"][33][0, 1 : len(df_sequences["model_input"][ind]) + 1].mean(0).numpy()
    
# df_sequences.head(5)

In [18]:
# def get_metabolite_type(met):
#     if is_KEGG_ID(met):
#         return("KEGG")
#     elif is_InChI(met):
#         return("InChI")
#     else:
#         return("invalid")

# def get_reaction_site_smarts(metabolites):
#     reaction_site = ""
#     for met in metabolites:
#         met_type = get_metabolite_type(met)
#         if met_type == "KEGG":
#             try:
#                 Smarts = Chem.MolToSmarts(Chem.MolFromMolFile(join("", "", "data", "mol-files",  met + ".mol")))
#             except OSError:
#                 return(np.nan)
#         elif met_type == "InChI":
#             Smarts = Chem.MolToSmarts(Chem.inchi.MolFromInchi(met))
#         else:
#             Smarts = "invalid"
#         reaction_site = reaction_site + "." + Smarts
#     return(reaction_site[1:])


# def is_KEGG_ID(met):
#     #a valid KEGG ID starts with a "C" or "D" followed by a 5 digit number:
#     if len(met) == 6 and met[0] in ["C", "D"]:
#         try:
#             int(met[1:])
#             return(True)
#         except: 
#             pass
#     return(False)

# def is_InChI(met):
#     m = Chem.inchi.MolFromInchi(met,sanitize=False)
#     if m is None:
#       return(False)
#     else:
#       try:
#         Chem.SanitizeMol(m)
#       except:
#         print('.......Metabolite string "%s" is in InChI format but has invalid chemistry' % met)
#         return(False)
#     return(True)

# def convert_fp_to_array(difference_fp_dict):
#     fp = np.zeros(2048)
#     for key in difference_fp_dict.keys():
#         fp[key] = difference_fp_dict[key]
#     return(fp)

In [19]:
# df_reactions["difference_fp"], df_reactions["structural_fp"],  = "", ""
# #each metabolite should be either a KEGG ID, InChI string, or a SMILES:
# for ind in df_reactions.index:
#     left_site = get_reaction_site_smarts(df_reactions["substrates"][ind])
#     right_site = get_reaction_site_smarts(df_reactions["products"][ind])
#     if not pd.isnull(left_site) and not pd.isnull(right_site):
#         rxn_forward = AllChem.ReactionFromSmarts(left_site + ">>" + right_site)
#         difference_fp = Chem.rdChemReactions.CreateDifferenceFingerprintForReaction(rxn_forward)
#         difference_fp = convert_fp_to_array(difference_fp.GetNonzeroElements())
#         df_reactions["difference_fp"][ind] = difference_fp
#         df_reactions["structural_fp"][ind] = Chem.rdChemReactions.CreateStructuralFingerprintForReaction(rxn_forward).ToBitString()

# df_reactions.head(5)

In [20]:
# df_sequences.to_pickle(join(datasets_dir, "all_sequences_with_IDs.pkl"))
# df_reactions.to_pickle(join(datasets_dir, "all_reactions_with_IDs.pkl"))

In [21]:
df_sequences = pd.read_pickle(join(datasets_dir, "all_sequences_with_IDs.pkl"))
df_reactions = pd.read_pickle(join(datasets_dir, "all_reactions_with_IDs.pkl"))
df_sequences["max_kcat_for_UID"] = np.nan
for ind in df_sequences.index:
    df_sequences["max_kcat_for_UID"][ind] = max(df_kcat.loc[df_kcat["Sequence"] == df_sequences['Sequence'][ind]]["kcat"])

#### Mapping Sequence and Reaction IDs to kcat_df

In [22]:
df_kcat = df_kcat.merge(df_sequences, on = "Sequence", how = "left")

In [23]:
df_reactions.rename(columns = {"substrates" : "substrate_IDs",
                              "products" : "product_IDs"}, inplace = True)

df_kcat["Reaction ID"] = np.nan
df_kcat["MW_frac"] = np.nan
df_kcat["max_kcat_for_RID"] = np.nan
df_kcat["difference_fp"] = ""
df_kcat["structural_fp"] = ""

for ind in df_kcat.index:
    sub_set, pro_set = df_kcat["substrate_IDs"][ind], df_kcat["product_IDs"][ind]
    
    help_df = df_reactions.loc[df_reactions["substrate_IDs"] == sub_set].loc[df_reactions["product_IDs"] == pro_set]
    if len(help_df) == 1:
        df_kcat["Reaction ID"][ind] = list(help_df["Reaction ID"])[0]
        df_kcat["max_kcat_for_RID"][ind] = list(help_df["max_kcat_for_RID"])[0]
        df_kcat["MW_frac"][ind] = list(help_df["MW_frac"])[0]
        df_kcat["difference_fp"][ind] = list(help_df["difference_fp"])[0]
        df_kcat["structural_fp"][ind] = list(help_df["structural_fp"])[0]
df_kcat.head(2)

Unnamed: 0,ECs,Organism,Uniprot IDs,PMID,Type,kcat,Temperature,pH,Substrates,Products,...,Sequence,Sequence ID,model_input,Enzyme rep,max_kcat_for_UID,Reaction ID,MW_frac,max_kcat_for_RID,difference_fp,structural_fp
0,1,Petunia hybrida,Q15GI3,16782809,wildtype,0.3,28.0,6.5,Coniferyl acetate;NADPH,Acetate;NADP+;Isoeugenol,...,MTTGKGKILILGATGYLGKYMVKASISLGHPTYAYVMPLKKNSDDS...,Sequence_0,MTTGKGKILILGATGYLGKYMVKASISLGHPTYAYVMPLKKNSDDS...,"[-0.032205846, -0.031796537, -0.051493276, 0.0...",0.3,Reaction_0,1.001043,0.3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100111100000001001000110110010001001111111100...
1,1,Ocimum basilicum,Q15GI4,16782809,wildtype,0.7,28.0,6.5,NADPH;Coniferyl acetate,Eugenol;NADP+;Acetate,...,MEENGMKSKILIFGGTGYIGNHMVKGSLKLGHPTYVFTRPNSSKTT...,Sequence_1,MEENGMKSKILIFGGTGYIGNHMVKGSLKLGHPTYVFTRPNSSKTT...,"[-0.016749386, -0.048214775, -0.049711384, 0.0...",0.7,Reaction_1,1.001043,0.7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100111100000001001000110110010001001111111100...


In [24]:
df_kcat["MACCS FP"] = ""

for ind in df_kcat.index:
    id = df_kcat["Main Substrate"][ind]
    # try:
    #     id = df_kcat['Substrate_IDs'][ind][df_kcat["Substrates"][ind].split(';').index(substrate)]
    # except:
    #     for i,s in enumerate(df_kcat["Substrates"][ind].split(';')[:-1]):
    #         if substrate in s or s in substrate:
    #             id = list(df_kcat['Substrate_IDs'][ind])[i]
    if id[0] == "C":
        try:
            mol = Chem.MolFromMolFile(join(datasets_dir,"mol-files", id + '.mol'))
        except OSError:
            None
    else:
        try:
            mol = Chem.inchi.MolFromInchi(id,sanitize=False)
        except OSError:
            None
    if mol is not None:
        maccs_fp = MACCSkeys.GenMACCSKeys(mol).ToBitString()
        df_kcat["MACCS FP"][ind] = maccs_fp

#### Calculating the maximal kcat value for every EC number in the dataset

In [25]:
df_EC_kcat = pd.read_csv(join(datasets_dir, "max_EC_" + organism + ".tsv"), sep = "\t", header=0)
# df_EC_kcat = df_EC_kcat.rename(columns={0: "EC", 1: "max_kcat"})

for ind in df_EC_kcat.index:
    try:
        kcat_max = df_EC_kcat[df_EC_kcat["EC"] == df_kcat["ECs"]]["max_kcat"]
        df_EC_kcat["max_kcat"][ind] = kcat_max
        print(ind, kcat_max)
    except:
        pass
        
df_EC_kcat.describe()

Unnamed: 0,max_kcat
count,271.0
mean,260863.4
std,4252168.0
min,2.666667e-06
25%,0.32
50%,5.1
75%,69.505
max,70000000.0


In [26]:
df_EC_kcat = pd.read_csv(join(datasets_dir, "max_EC_" + organism + ".tsv"), sep = "\t", header=0)

df_EC_kcat.head(5)
df_kcat["max_kcat_for_EC"] = np.nan

for ind in df_kcat.index:
    EC = df_kcat["ECs"][ind]
    max_kcat = 0
    try:
        print(EC)
        max_kcat = df_EC_kcat.loc[df_EC_kcat["EC"] == EC, "max_kcat"].iloc[0]
        print(max_kcat)
    except:
        pass
    if max_kcat != 0:
        df_kcat["max_kcat_for_EC"][ind] = max_kcat
df_kcat.to_pickle(join(datasets_dir, "merged_and_grouped_kcat_dataset2.pkl"))     

1
0.7
1
0.7
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.0
1.1.1
44.

In [27]:
glucosinolates = pd.read_table(join(datasets_dir,"glucosinolates.tsv"))["Entry"].tolist()
df_validation = df_kcat[df_kcat["Uniprot IDs"].isin(glucosinolates)]
df_validation.reset_index(inplace=True, drop = True)
df_kcat = df_kcat[~df_kcat["Uniprot IDs"].isin(glucosinolates)]
df_kcat.reset_index(inplace=True, drop = True)
split = "full"

In [28]:
df_kcat.dtypes

ECs                  object
Organism             object
Uniprot IDs          object
PMID                  Int64
Type                 object
kcat                float64
Temperature          object
pH                   object
Substrates           object
Products             object
substrate_IDs        object
product_IDs          object
Main Substrate       object
Sequence             object
Sequence ID          object
model_input          object
Enzyme rep           object
max_kcat_for_UID    float64
Reaction ID          object
MW_frac             float64
max_kcat_for_RID    float64
difference_fp        object
structural_fp        object
MACCS FP             object
max_kcat_for_EC     float64
dtype: object

## 3. Removing outliers

#### Removing non-optimally measured values

To ignore $kcat$ values that were obtained under non-optimal conditions, we exclude values lower than 0.1\% than the maximal $kcat$ value for the same enzyme, reaction or EC number.

In [29]:
df_kcat["frac_of_max_UID"] = np.nan
df_kcat["frac_of_max_RID"] = np.nan
df_kcat["frac_of_max_EC"] = np.nan

for ind in df_kcat.index:
    df_kcat["frac_of_max_UID"][ind] =  df_kcat["kcat"][ind]/df_kcat["max_kcat_for_UID"][ind]
    df_kcat["frac_of_max_RID"][ind] =  df_kcat["kcat"][ind]/df_kcat["max_kcat_for_RID"][ind]
    df_kcat["frac_of_max_EC"][ind] = df_kcat["kcat"][ind]/df_kcat["max_kcat_for_EC"][ind]

len(df_kcat)

1123

In [30]:
n = len(df_kcat)

df_kcat = df_kcat.loc[df_kcat["frac_of_max_UID"] >= 0.01]
df_kcat = df_kcat.loc[df_kcat["frac_of_max_RID"] >= 0.01]

# df_kcat["frac_of_max_EC"].loc[pd.isnull(df_kcat["frac_of_max_EC"])] = 1
df_kcat = df_kcat.loc[df_kcat["frac_of_max_EC"] <= 10]
df_kcat = df_kcat.loc[df_kcat["frac_of_max_EC"] >= 0.01]

In [31]:
print("We remove %s data points, because we suspect that these kcat values were not measure for the natural reaction " \
    "of an enzyme or under non-optimal conditions." % (n-len(df_kcat)))

We remove 237 data points, because we suspect that these kcat values were not measure for the natural reaction of an enzyme or under non-optimal conditions.


#### Removing data points with reaction queations with uneven fraction of molecular weights

In [32]:
n = len(df_kcat)

df_kcat = df_kcat.loc[df_kcat["MW_frac"] < 3]
df_kcat = df_kcat.loc[df_kcat["MW_frac"] > 1/3]

print("We remove %s data points because the sum of molecular weights of substrates does not match the sum of molecular" \
      "weights of the products." % (n-len(df_kcat)))

We remove 30 data points because the sum of molecular weights of substrates does not match the sum of molecularweights of the products.


In [33]:
print("Size of final kcat dataset: %s" % len(df_kcat))
df_kcat.to_pickle(join(datasets_dir, "final_kcat_dataset_" + organism + "2.pkl"))

Size of final kcat dataset: 856


## 4. Preparing dataset and splitting into train-test

In [7]:
df_kcat = pd.read_pickle(join(datasets_dir, "final_kcat_dataset_" + organism + ".pkl"))
sorted(df_kcat["Reaction ID"].unique())
# df_kcat["log10_kcat"] = [np.log10(x) for x in df_kcat["kcat"]]

['Reaction_0',
 'Reaction_1',
 'Reaction_10',
 'Reaction_100',
 'Reaction_101',
 'Reaction_102',
 'Reaction_103',
 'Reaction_104',
 'Reaction_105',
 'Reaction_106',
 'Reaction_107',
 'Reaction_108',
 'Reaction_109',
 'Reaction_110',
 'Reaction_111',
 'Reaction_112',
 'Reaction_113',
 'Reaction_114',
 'Reaction_115',
 'Reaction_116',
 'Reaction_117',
 'Reaction_118',
 'Reaction_119',
 'Reaction_120',
 'Reaction_121',
 'Reaction_122',
 'Reaction_124',
 'Reaction_125',
 'Reaction_126',
 'Reaction_127',
 'Reaction_128',
 'Reaction_129',
 'Reaction_130',
 'Reaction_131',
 'Reaction_132',
 'Reaction_133',
 'Reaction_134',
 'Reaction_135',
 'Reaction_136',
 'Reaction_137',
 'Reaction_138',
 'Reaction_139',
 'Reaction_14',
 'Reaction_140',
 'Reaction_141',
 'Reaction_142',
 'Reaction_143',
 'Reaction_144',
 'Reaction_145',
 'Reaction_146',
 'Reaction_147',
 'Reaction_15',
 'Reaction_150',
 'Reaction_160',
 'Reaction_161',
 'Reaction_163',
 'Reaction_165',
 'Reaction_168',
 'Reaction_169',
 'Re

In [35]:
# import plotnine
# from plotnine import ggplot, geom_point, aes, theme_matplotlib, theme_set, geom_bin_2d
# # df_kcat['Temperature'] = df_kcat['Temperature'].replace('-', np.nan)
# # df_kcat['pH'] = df_kcat['pH'].replace('-', np.nan)
# # df_kcat['pH'] = df_kcat['pH'].astype('float')
# # df_kcat['Temperature'] = df_kcat['Temperature'].astype('float')
# # df_kcat["log10_kcat"] = np.log10(df_kcat["kcat"])
# theme_set(theme_matplotlib())
# (
#     ggplot(df_kcat) +
#     aes(x="Temperature",y="kcat") +
#     geom_bin_2d()
# )

In [36]:
# import numpy as np
# import matplotlib.pyplot as plt

# fig, ax = plt.subplots(figsize=(7, 4))
# hb = ax.hexbin(df_kcat["Temperature"], df_kcat["kcat"], gridsize=50, cmap='inferno')


# # ax.set_xlim(x.min(), x.max())
# # ax.set_ylim(y.min(), y.max())

# # Add a title and colorbar
# ax.set_title("Hexagon binning")
# fig.colorbar(hb, ax=ax, label='counts')

# plt.show()

#### Making input for GNN

In [37]:
# for ind in df_kcat.index:
#     substrate = df_kcat["Main Substrate"][ind]
#     try:
#         id = list(df_kcat['substrate_IDs'][ind])[df_kcat["Substrates"][ind].split(';').index(substrate)]
#     except:
#         for i,s in enumerate(df_kcat["Substrates"][ind].split(';')[:-1]):
#             if substrate in s or s in substrate:
#                 id = list(df_kcat['substrate_IDs'][ind])[i]
#     df_kcat["Main Substrate"][ind] = id


inchi_ids = {}
for i, element in enumerate(df_validation["Main Substrate"]):
    if element[0] != 'C' and element not in inchi_ids.keys():
        inchi_ids[element] = str(i)
        mol = Chem.inchi.MolFromInchi(element)
        if not mol is None:
            calculate_atom_and_bond_feature_vectors(mol, str(i))
        Chem.rdmolfiles.MolToMolFile(Chem.inchi.MolFromInchi(element), join(datasets_dir,"mol-files", str(i) + ".mol")  )  

#### Splitting glucosinolates into validation dataset

Search UniProt for GO term related to glucosionalte metabolic process, download file as .tsv and filter dataset

In [38]:
# glucosinolates = pd.read_table(join(datasets_dir,"glucosinolates.tsv"))["Entry"].tolist()
# df_validation = df_kcat[df_kcat["Uniprot IDs"].isin(glucosinolates)]
# df_validation.reset_index(inplace=True, drop = True)
# df_kcat = df_kcat[~df_kcat["Uniprot IDs"].isin(glucosinolates)]
# df_kcat.reset_index(inplace=True, drop = True)
# split = "full"

If training-testing with only Arabidopsis data:

In [39]:
# df_kcat = df_kcat[df_kcat["Organism"] == 'Arabidopsis thaliana']
# df_kcat.reset_index(inplace=True, drop = True)
# split = "Arabidopsis"

If training-testing with only Brassicaceae data:

In [40]:
# ncbi = NCBITaxa()

# organisms = {}

# def is_brassicaceae(org):
#     try:
#         tax_id = ncbi.get_name_translator([org])[org][0]
#         lineage = ncbi.get_lineage(tax_id)
#         if 3700 not in lineage:
#             return(False)
#         else:
#             return(True)
#     except KeyError:
#         return(False)
    
# for org in df_kcat["Organism"].tolist():
#     if org not in organisms.keys():
#         organisms[org] = is_brassicaceae(org)

# df_kcat = df_kcat[df_kcat["Organism"].isin([key for key, value in organisms.items() if value is True])]
# df_kcat.reset_index(inplace=True, drop = True)
# split = "Brassicaceae"

If training-testing only with wildtype data:

In [41]:
# df_kcat = df_kcat[df_kcat["Type"].str.contains("wildtype")]
# df_kcat.reset_index(inplace=True, drop = True)
# split = "wildtype"

If training-testing only with secondary metabolite data:

In [42]:
# secondary = pd.read_table(join(datasets_dir,"secondary_metabolites.tsv"))["Entry"].tolist()
# df_kcat = df_kcat[df_kcat["Uniprot IDs"].isin(secondary)]
# df_kcat.reset_index(inplace=True, drop = True)
# split = "secondary"

In [43]:
# os.mkdir(join(datasets_dir, "splits", split))

#### Calculating arithmetic mean for kcat values of same enzyme-reaction-substrate combination-pH-temperature

In [44]:
# df_new = pd.DataFrame(data = {"Reaction ID" : df_kcat["Reaction ID"],
#                                   "Sequence ID" : df_kcat["Sequence ID"],
#                                   "Temperature" : df_kcat["Temperature"],
#                                     "pH" : df_kcat["pH"],
#                                  "Type": df_kcat["Type"],
#                              "MACCS FP" : df_kcat["MACCS FP"]})

# df_new.drop_duplicates(inplace = True)
# df_new.reset_index(inplace = True, drop = True)

# df_new["kcat_values"], df_new["Uniprot IDs"], df_new["ECs"], df_new["Substrates"], df_new["Products"], df_new["ESM2"], df_new["Sequence"], df_new["difference_fp"], df_new["structural_fp"] = "", "", "", "", "", "", "", "", ""

# for ind in df_new.index:
#     RID, SID, Temp, pH, Type, MSubstrate = df_new["Reaction ID"][ind], df_new["Sequence ID"][ind], df_new["Temperature"][ind], df_new["pH"][ind], df_new["Type"][ind], df_new["MACCS FP"][ind]
#     help_df = df_kcat.loc[df_kcat["Reaction ID"] 
#                                  == RID].loc[df_kcat["Sequence ID"] 
#                                              == SID].loc[df_kcat["Temperature"] 
#                                                          == Temp].loc[df_kcat["pH"] 
#                                                                       == pH].loc[df_kcat["Type"] 
#                                                                                  == Type].loc[df_kcat["MACCS FP"] 
#                                                                                               == MSubstrate]
#     print(help_df)
#     df_new["ECs"][ind] = list(help_df["ECs"])
#     df_new["kcat_values"][ind] = list(help_df["kcat"])
#     df_new["Uniprot IDs"][ind] = list(help_df["Uniprot IDs"])
#     df_new["Sequence"][ind] = help_df["Sequence"].values[0]
#     df_new["ESM2"][ind] = help_df["Enzyme rep"].values[0]
#     df_new["difference_fp"][ind], df_new["structural_fp"][ind] = help_df["difference_fp"].values[0], help_df["structural_fp"].values[0]
#     df_new["Substrates"][ind], df_new["Products"][ind] = help_df["Substrates"].values[0], help_df["Products"].values[0]

In [45]:
# df_new2 = pd.DataFrame(data = {"Reaction ID" : df_validation["Reaction ID"],
#                                   "Sequence ID" : df_validation["Sequence ID"],
#                                   "Temperature" : df_validation["Temperature"],
#                                     "pH" : df_validation["pH"],
#                                   "Type" : df_validation["Type"],
#                                   "MACCS FP" : df_validation["MACCS FP"]})

# df_new2.drop_duplicates(inplace = True)
# df_new2.reset_index(inplace = True, drop = True)

# df_new2["kcat_values"], df_new2["Uniprot IDs"], df_new2["ECs"], df_new2["Organisms"], df_new2["Substrates"], df_new2["Products"], df_new2["ESM2"], df_new2["Sequence"], df_new2["difference_fp"], df_new2["structural_fp"] = "", "", "", "", "", "", "", "", "", ""

# for ind in df_new2.index:
#     RID, SID, Temp, pH, Type, MSubstrate = df_new2["Reaction ID"][ind], df_new2["Sequence ID"][ind], df_new2["Temperature"][ind], df_new2["pH"][ind], df_new2["Type"][ind], df_new2["MACCS FP"][ind]
#     help_df = df_validation.loc[df_validation["Reaction ID"] 
#                               == RID].loc[df_validation["Sequence ID"] 
#                                           == SID].loc[df_validation["Temperature"] 
#                                                       == Temp].loc[df_validation["pH"] 
#                                                                     == pH].loc[df_validation["Type"] 
#                                                                               == Type].loc[df_validation["MACCS FP"] 
#                                                                                                             == MSubstrate]
#     df_new2["ECs"][ind] = list(help_df["ECs"])
#     df_new2["kcat_values"][ind] = list(help_df["kcat"])
#     df_new2["Uniprot IDs"][ind] = list(help_df["Uniprot IDs"])
#     df_new2["Organisms"][ind] = list(help_df["Organism"])
#     df_new2["Type"][ind]
#     df_new2["Sequence"][ind] = help_df["Sequence"].values[0]
#     df_new2["ESM2"][ind] = help_df["Enzyme rep"].values[0]
#     df_new2["difference_fp"][ind], df_new2["structural_fp"][ind] = help_df["difference_fp"].values[0], help_df["structural_fp"].values[0]
#     df_new2["Substrates"][ind], df_new2["Products"][ind] = help_df["Substrates"].values[0], help_df["Products"].values[0]

In [46]:
# df_kcat = df_new
# df_validation = df_new2

In [47]:
# df_kcat["geomean_kcat"] = np.nan
# for ind in df_kcat.index:
#     all_kcat = np.array(df_kcat["kcat_values"][ind]).astype(float)
#     max_kcat = max(all_kcat)
#     all_kcat_top = [kcat for kcat in all_kcat  if kcat/max_kcat >= 0.01]
#     df_kcat["geomean_kcat"][ind] = np.mean((all_kcat_top))

In [48]:
# df_validation["geomean_kcat"] = np.nan
# for ind in df_validation.index:
#     all_kcat = np.array(df_validation["kcat_values"][ind]).astype(float)
#     max_kcat = max(all_kcat)
#     all_kcat_top = [kcat for kcat in all_kcat  if kcat/max_kcat >= 0.01]
#     df_validation["geomean_kcat"][ind] = np.mean((all_kcat_top))

# df_validation.to_pickle(join(datasets_dir,"splits", split, "validation_%s.pkl" %organism))


#### Splitting into train-test

In [49]:
# df = df_kcat.copy()
# df = df.sample(frac = 1, random_state=123)
# df.reset_index(drop= True, inplace = True)

# train_df, test_df = split_dataframe_enzyme(frac = 5, df = df.copy())
# print("Test set size: %s" % len(test_df))
# print("Training set size: %s" % len(train_df))
# print("Size of test set in percent: %s" % np.round(100*len(test_df)/ (len(test_df) + len(train_df))))

# train_df.reset_index(inplace = True, drop = True)
# test_df.reset_index(inplace = True, drop = True)

# train_df.to_pickle(join(datasets_dir, "splits", split, "train_df_kcat_%s.pkl" %organism))
# test_df.to_pickle(join(datasets_dir, "splits", split, "test_df_kcat_%s.pkl" %organism))

#### Splitting CV folds

In [50]:
# data_train2 = train_df.copy()
# data_train2["index"] = list(data_train2.index)

# data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=5)
# indices_fold1 = list(df_fold["index"])
# print(len(data_train2), len(indices_fold1))#

# data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=4)
# indices_fold2 = list(df_fold["index"])
# print(len(data_train2), len(indices_fold2))

# data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=3)
# indices_fold3 = list(df_fold["index"])
# print(len(data_train2), len(indices_fold3))

# data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=2)
# indices_fold4 = list(df_fold["index"])
# indices_fold5 = list(data_train2["index"])
# print(len(data_train2), len(indices_fold4))


# fold_indices = [indices_fold1, indices_fold2, indices_fold3, indices_fold4, indices_fold5]

# CV_train_indices = [[], [], [], [], []]
# CV_test_indices = [[], [], [], [], []]

# for i in range(5):
#     for j in range(5):
#         if i != j:
#             CV_train_indices[i] = CV_train_indices[i] + fold_indices[j]
#     CV_test_indices[i] = fold_indices[i]
    
    
# np.save(join(datasets_dir, "splits", split, "CV_train_indices_%s" %organism), CV_train_indices)
# np.save(join(datasets_dir, "splits", split, "CV_test_indices_%s" %organism), CV_test_indices)

## 5. Building GNN for substrate representation

In [51]:
# os.mkdir(join(datasets_dir, "GNN_input_data", split))

# for ind in train_df.index:
#     calculate_and_save_input_matrixes(inchi_ids, sample_ID = "train_" + str(ind), df = train_df,
#                                       save_folder = join(datasets_dir, "GNN_input_data", split))
    
# for ind in test_df.index:
#     calculate_and_save_input_matrixes(inchi_ids, sample_ID = "test_" + str(ind), df = test_df,
#                                       save_folder = join(datasets_dir, "GNN_input_data", split))
    
for ind in df_validation.index:
    calculate_and_save_input_matrixes(inchi_ids, sample_ID = "val_" + str(ind), df = df_validation,
                                    save_folder = join(datasets_dir, "GNN_input_data", split))

In [52]:
# train_indices = os.listdir(join(datasets_dir, "GNN_input_data", split))
# train_indices = [index[:index.rfind("_")] for index in train_indices]
# train_indices = list(set([index for index in train_indices if "train" in index]))

# test_indices = os.listdir(join(datasets_dir, "GNN_input_data", split))
# test_indices = [index[:index.rfind("_")] for index in test_indices]
# test_indices = list(set([index for index in test_indices if "test" in index]))

#### Hyper-parameter optimization with CV

In [53]:
# param_grid = {'batch_size': [32,64,96],
#                 'D': [50,100],
#                 'learning_rate': [0.01, 0.1],
#                 'epochs': [30,50,80],
#                 'l2_reg_fc' : [0.01, 0.1, 1],
#                 'l2_reg_conv': [0.01, 0.1, 1],
#                 'rho': [0.9, 0.95, 0.99]}

# params_list = [(batch_size, D, learning_rate, epochs, l2_reg_fc, l2_reg_conv, rho) for batch_size in param_grid['batch_size'] for D in param_grid["D"] for learning_rate in param_grid['learning_rate']
#                 for epochs in param_grid['epochs'] for l2_reg_fc in param_grid['l2_reg_fc'] for l2_reg_conv in param_grid['l2_reg_conv'] for rho in param_grid["rho"]]

# params_list = random.sample(params_list, 10)

In [54]:
# count = 0
# results=[]

# for params in params_list:

#     batch_size, D, learning_rate, epochs, l2_reg_fc, l2_reg_conv, rho = params
#     count +=1
#     MAE = []

#     for i in range(5):
#         train_index, test_index  = CV_train_indices[i], CV_test_indices[i]
#         train_index = [ind for ind in train_indices if int(ind.split("_")[1]) in train_index]
#         test_index = [ind for ind in train_indices if int(ind.split("_")[1]) in test_index]

#         train_params = {'batch_size': batch_size,
#                     'folder' :join(datasets_dir, "GNN_input_data/full"),
#                     'list_IDs' : np.array(train_index),
#                     'shuffle': True}

#         test_params = {'batch_size': len(test_index),
#                     'folder' : join(datasets_dir, "GNN_input_data/full"),
#                     'list_IDs' : np.array(test_index),
#                     'shuffle': False}

#         training_generator = DataGenerator(**train_params)
#         test_generator = DataGenerator(**test_params)


#         model = DMPNN_without_extra_features(l2_reg_conv = l2_reg_conv, l2_reg_fc = l2_reg_fc, learning_rate = learning_rate,
#                         D = D, N = N, F1 = F1, F2 = F2, F= F, drop_rate = 0.0, ada_rho = rho)
#         model.fit(training_generator, epochs= epochs, shuffle = True, verbose = 1)

#         #get test_y:
#         test_indices_y = [int(ind.split("_")[1]) for ind in train_indices if ind in test_index]
#         test_y = np.array([train_df["kcat"][ind] for ind in test_indices_y])

#         pred_test = model.predict(test_generator)
#         mae = np.median(abs(np.array([10**x for x in pred_test]) - np.reshape(test_y[:len(pred_test)], (-1,1))))
#         print(mae)
#         MAE.append(mae)

#     results.append({"batch_size" : batch_size, "D" : D , "learning_rate" : learning_rate, "epochs" : epochs,
#                     "l2_reg_fc" : l2_reg_fc, "l2_reg_conv" : l2_reg_conv, "rho" : rho, "cv_mae" : np.mean(MAE)})

# params = min(results, key=lambda d: d['cv_mae'])
# print(params)

{'batch_size': 32, 'D': 50, 'learning_rate': 0.01, 'epochs': 30, 'l2_reg_fc': 0.1, 'l2_reg_conv': 1, 'rho': 0.9, 'cv_mae': 2.4853503725624084}

#### Training the model with the best set of hyperparmeters on the whole training set and validate it on the test set

In [55]:
batch_size = 32
D = 50
learning_rate = 0.01
epochs = 30
l2_reg_fc = 0.1
l2_reg_conv = 1
rho = 0.9

In [56]:
# train_indices = os.listdir(join(datasets_dir, "GNN_input_data/full"))
# train_indices = [index[:index.rfind("_")] for index in train_indices]
# train_indices = list(set([index for index in train_indices if "train" in index]))

# test_indices = os.listdir(join(datasets_dir, "GNN_input_data/full"))
# test_indices = [index[:index.rfind("_")] for index in test_indices]
# test_indices = list(set([index for index in test_indices if "test" in index]))

# train_params = {'batch_size': batch_size,
#               'folder' :join(datasets_dir, "GNN_input_data/full"),
#               'list_IDs' : train_indices,
#               'shuffle': True}

# test_params = {'batch_size': batch_size,
#               'folder' :join(datasets_dir, "GNN_input_data/full"),
#               'list_IDs' : test_indices,
#               'shuffle': False}

# training_generator = DataGenerator(**train_params)
# test_generator = DataGenerator(**test_params)

# model = DMPNN_without_extra_features(l2_reg_conv = l2_reg_conv, l2_reg_fc = l2_reg_fc, learning_rate = learning_rate,
#                   D = D, N = N, F1 = F1, F2 = F2, F= F, drop_rate = 0.0, ada_rho = rho)

# model.fit(training_generator, epochs= epochs, shuffle = True, verbose = 1)
# model.save_weights(join(datasets_dir, "model_weights", "saved_model_GNN_best_hyperparameters"))

# pred_test = model.predict(test_generator)
# test_indices_y = [int(ind.split("_")[1]) for ind in np.array(test_indices)]
# test_y = np.array([test_df["kcat"][ind] for ind in test_indices_y])

#### Calculating substrate representation for every data point in training and test set

In [57]:
model = DMPNN_without_extra_features(l2_reg_conv = l2_reg_conv, l2_reg_fc = l2_reg_fc, learning_rate = learning_rate,
                  D = D, N = N, F1 = F1, F2 = F2, F= F, drop_rate = 0.0, ada_rho = rho)
model.load_weights(join(datasets_dir, "model_weights", "saved_model_GNN_best_hyperparameters"))

get_fingerprint_fct = K.function([model.layers[0].input, model.layers[26].input,
                                  model.layers[3].input],
                                  [model.layers[-10].output])

In [58]:
input_data_folder = join(datasets_dir, "GNN_input_data", split)   

def get_representation_input(cid_list):
    XE = ();
    X = ();
    A = ();
    # Generate data
    for cid in cid_list:
        try:
            X = X + (np.load(join(input_data_folder, cid + '_X.npy')), );
            XE = XE + (np.load(join(input_data_folder, cid + '_XE.npy')), );
            A = A + (np.load(join(input_data_folder, cid + '_A.npy')), );
        except FileNotFoundError: #return zero arrays:
            X = X + (np.zeros((N,32)), );
            XE = XE + (np.zeros((N,N,F)), );
            A = A + (np.zeros((N,N,1)), );
    return(XE, X, A)

input_data_folder = join(datasets_dir, "GNN_input_data", split)   
def get_substrate_representations(df, training_set, testing_set, get_fingerprint_fct):
    df["GNN FP"] = ""
    i = 0
    n = len(df)
    
    cid_all = list(df.index)
    if training_set == True:
        prefix = "train_"
    elif testing_set == True:
        prefix = "test_"
    else:
        prefix = "val_"
    cid_all = [prefix + str(cid) for cid in cid_all]
    
    while i*32 <= n:
        if (i+1)*32  <= n:
            XE, X, A = get_representation_input(cid_all[i*32:(i+1)*32])
            representations = get_fingerprint_fct([np.array(XE), np.array(X),np.array(A)])[0]
            df["GNN FP"][i*32:(i+1)*32] = list(representations[:, :52])
        else:
            print(i)
            XE, X, A = get_representation_input(cid_all[-min(32,n):])
            representations = get_fingerprint_fct([np.array(XE), np.array(X),np.array(A)])[0]
            df["GNN FP"][-min(32,n):] = list(representations[:, :52])
        i += 1
        
    ### set all GNN FP-entries with no input matrices to np.nan:
    all_X_matrices = os.listdir(input_data_folder)
    for ind in df.index:
        if prefix +str(ind) +"_X.npy" not in all_X_matrices:
            df["GNN FP"][ind] = np.nan
    return(df)

In [59]:
#Calculating the GNN representations
# train_with_rep = get_substrate_representations(df = train_df, training_set = True, testing_set = False,
#                                                       get_fingerprint_fct = get_fingerprint_fct)
# test_with_rep = get_substrate_representations(df = test_df, training_set = False, testing_set = True,
#                                                      get_fingerprint_fct = get_fingerprint_fct)
val_with_rep = get_substrate_representations(df = df_validation, training_set = False, testing_set = False,
                                                     get_fingerprint_fct = get_fingerprint_fct)

#Saving the DataFrames:
# train_with_rep.to_pickle(join(datasets_dir, "splits", split, "training_data.pkl"))
# test_with_rep.to_pickle(join(datasets_dir, "splits", split, "test_data.pkl"))
val_with_rep.to_pickle(join(datasets_dir, "splits", split, "val_data2.pkl"))

0


In [60]:
# df_sequences.drop("model_input", axis=1, inplace=True)

# for split in ["full", "Arabidopsis", "Brassicaceae", "wildtype", "secondary"]:
#     train_with_rep = pd.read_pickle(join(datasets_dir, "splits", split, "training_data.pkl"))
#     test_with_rep = pd.read_pickle(join(datasets_dir, "splits", split, "test_data.pkl"))
#     val_with_rep = pd.read_pickle(join(datasets_dir, "splits", split, "val_data.pkl"))

#     train_with_rep = train_with_rep.merge(df_sequences, on = "Sequence", how = "left")
#     test_with_rep = test_with_rep.merge(df_sequences, on = "Sequence", how = "left")
#     val_with_rep = val_with_rep.merge(df_sequences, on = "Sequence", how = "left")

#     train_with_rep.to_pickle(join(datasets_dir, "splits", split, "training_data.pkl"))
#     test_with_rep.to_pickle(join(datasets_dir, "splits", split, "test_data.pkl"))
#     val_with_rep.to_pickle(join(datasets_dir, "splits", split, "val_data.pkl"))

In [64]:
df_validation

Unnamed: 0,ECs,Organism,Uniprot IDs,PMID,Type,kcat,Temperature,pH,Substrates,Products,...,Enzyme rep,max_kcat_for_UID,Reaction ID,MW_frac,max_kcat_for_RID,difference_fp,structural_fp,MACCS FP,max_kcat_for_EC,GNN FP
0,1.14.14.43,Arabidopsis thaliana,P48421,11553739,wildtype,2.333333,28.0,7.6,O2;3-Indoleacetaldoxime;L-Cysteine;NADPH;H+,NADP+;S-(Indolylmethylthiohydroximoyl)-L-cyste...,...,"[0.003149065, -0.09315398, -0.13853829, -0.035...",2.333333,Reaction_105,1.002867,2.333333,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100111100000001001000110110010001001111111100...,0000000000000000000000001010000000000000000001...,2.333333,"[52.241016, 1.8865811, 0.122799575, 132.27585,..."
1,1.14.14.45,Arabidopsis thaliana,O65782,11158532,wildtype,0.883333,28.0,7.6,O2;NADPH;3-Indoleacetaldoxime;L-Cysteine;H+,S-(Indolylmethylthiohydroximoyl)-L-cysteine;NA...,...,"[-0.009683046, -0.10973537, -0.16049284, -0.04...",0.883333,Reaction_105,1.002867,2.333333,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100111100000001001000110110010001001111111100...,0000000000000000000000001010000000000000000001...,0.883333,"[52.241016, 1.8865811, 0.122799575, 132.27585,..."
2,3.2.1.147,Arabidopsis thaliana,P37702,19703694,wildtype,2.3,37.0,4.5,Sinigrin;H2O,beta-D-Glucose;Allyl isothiocyanate;Sulfate,...,"[0.025101406, -0.096065566, -0.06863045, 0.065...",2.3,Reaction_325,0.997329,46.0,"[0.0, 0.0, 0.0, 0.0, -20.0, 0.0, 0.0, 0.0, 0.0...",1100000100001100000000010000000001101000001100...,0000000000000000000000001000000000100001100000...,287.0,"[164.78851, 95.09568, 0.23067562, 201.57686, 1..."
3,3.2.1.147,Arabidopsis thaliana,Q8GRX1,19703694,wildtype,12.0,37.0,4.5,Sinigrin;H2O,Sulfate;Allyl isothiocyanate;beta-D-Glucose,...,"[0.027088998, -0.10607523, -0.07779723, 0.0542...",12.0,Reaction_325,0.997329,46.0,"[0.0, 0.0, 0.0, 0.0, -20.0, 0.0, 0.0, 0.0, 0.0...",1100000100001100000000010000000001101000001100...,0000000000000000000000001000000000100001100000...,287.0,"[164.78851, 95.09568, 0.23067562, 201.57686, 1..."
4,3.2.1.21,Arabidopsis thaliana,P37702,19703694,wildtype,1.2,37.0,4.5,p-Nitrophenyl-beta-D-glucoside;H2O,p-Nitrophenol;beta-D-Glucose,...,"[0.025101406, -0.096065566, -0.06863045, 0.065...",2.3,Reaction_326,1.0,235.9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100000000000000000000000000000001000001001000...,0000000000000000000000001000000000000000000001...,353.2,"[162.77708, 75.44956, 2.0250943, 214.65363, 11..."
5,3.2.1.21,Arabidopsis thaliana,Q8GRX1,19703694,wildtype,7.3,37.0,4.5,p-Nitrophenyl-beta-D-glucoside;H2O,p-Nitrophenol;beta-D-Glucose,...,"[0.027088998, -0.10607523, -0.07779723, 0.0542...",12.0,Reaction_326,1.0,235.9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100000000000000000000000000000001000001001000...,0000000000000000000000001000000000000000000001...,353.2,"[162.77708, 75.44956, 2.0250943, 214.65363, 11..."
6,6.3.2.2,Arabidopsis thaliana,P46309,15180996,wildtype,0.075,25.0,7.0,ATP;L-Cysteine;L-Glutamate,ADP;gamma-L-Glutamyl-L-cysteine;Phosphate,...,"[0.01743098, -0.03222834, 0.030855617, 0.05712...",0.113333,Reaction_464,0.993501,0.113333,"[0.0, 0.0, 0.0, 0.0, 10.0, 0.0, 0.0, 0.0, 0.0,...",1100110100000000000000110110010001000001111100...,0000000000000000000000000000000000000000000000...,0.113333,"[42.501057, 19.575014, 0.09579325, 42.458717, ..."
7,6.3.2.2,Arabidopsis thaliana,P46309,15180996,wildtype,0.101667,25.0,7.0,L-Glutamate;L-Cysteine;ATP,Phosphate;ADP;gamma-L-Glutamyl-L-cysteine,...,"[0.01743098, -0.03222834, 0.030855617, 0.05712...",0.113333,Reaction_464,0.993501,0.113333,"[0.0, 0.0, 0.0, 0.0, 10.0, 0.0, 0.0, 0.0, 0.0,...",1100110100000000000000110110010001000001111100...,0000000000000000000000000000000000000000000000...,0.113333,"[56.40395, 22.93756, 0.1915865, 56.054768, 27...."
8,6.3.2.2,Arabidopsis thaliana,P46309,15180996,wildtype,0.113333,25.0,7.0,ATP;L-Glutamate;L-Cysteine,gamma-L-Glutamyl-L-cysteine;ADP;Phosphate,...,"[0.01743098, -0.03222834, 0.030855617, 0.05712...",0.113333,Reaction_464,0.993501,0.113333,"[0.0, 0.0, 0.0, 0.0, 10.0, 0.0, 0.0, 0.0, 0.0,...",1100110100000000000000110110010001000001111100...,0000000000000000000001000010010000000010000001...,0.113333,"[219.53024, 103.0316, 2.6845355, 358.56815, 14..."
