In [1]:
import pandas as pd
import numpy as np
from os.path import join
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from ete3 import NCBITaxa
import random
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
random.seed(10)
import torch
import esm
from bioservices import *
from data_preprocessing import *
from functions_and_dicts_data_preprocessing_GNN import *
from build_GNN import *
import warnings
warnings.filterwarnings('ignore')
datasets_dir = "../../data"

CURRENT_DIR = os.getcwd()

  from .autonotebook import tqdm as notebook_tqdm


## 1. Loading in Sabio data

#### Loading Sabio data

In [2]:
organism = "Seed plants"

df_Sabio = pd.read_table(join("..", "..", "data", "Km_model_" + organism + ".tsv"))

df_Sabio["Km"] = df_Sabio["Km"].astype('float')
df_Sabio["PMID"] = df_Sabio["PMID"].astype('Int64')

df_Sabio["substrate_IDs"] = df_Sabio["substrate_IDs"].str.split('#')
df_Sabio["product_IDs"] = df_Sabio["product_IDs"].str.split('#')

print("Number of data points: %s" % len(df_Sabio))
print("Number of UniProt IDs: %s" % len(set(df_Sabio["Uniprot IDs"])))

df_Km = df_Sabio

Number of data points: 3550
Number of UniProt IDs: 833


#### Removing duplicates

In [None]:
droplist = []

for ind in df_Km.index:
    UID, Km = df_Km["Uniprot IDs"][ind], df_Km["Km"][ind]
    help_df = df_Km.loc[df_Km["Uniprot IDs"] == UID].loc[df_Km["Km"] == Km]
    
    if len(help_df) > 1:
        droplist = droplist + list(help_df.index)[1:]
        

In [None]:
df_Km.drop(list(set(droplist)), inplace = True)
print("Dropping %s data points, because they are duplicated." % len(set(droplist)))
df_Km.reset_index(inplace = True, drop = True)
df_Km

#### Removing top and bottom 3% of Km values

In [None]:
def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

find_outliers_IQR(df_Km["Km"])

print(df_Km['Km'].quantile(0.03),  df_Km['Km'].quantile(0.97))

In [None]:
df_Km = df_Km[(df_Km['Km'] > df_Km['Km'].quantile(0.03)) & (df_Km['Km'] < df_Km['Km'].quantile(0.97))]
df_Km.reset_index(inplace = True, drop = True)

In [None]:
todrop= []

for ind in df_Km.index:
    UID = df_Km["Uniprot IDs"][ind]
    if len(UID.split(';')) > 1:
        todrop.append(ind)
        print(df_Km["Uniprot IDs"][ind])
        print(todrop)
        
df_Km.drop(todrop, inplace=True)
df_Km.reset_index(inplace = True, drop = True)

In [None]:
df_Km["substrate_IDs"] = df_Km["substrate_IDs"].apply(lambda x: (set(x)))
df_Km["product_IDs"] = df_Km["product_IDs"].apply(lambda x: (set(x)))

In [3]:
# df_Km.to_pickle(join("..", "..", "data", "Km_data_merged.pkl"))
df_Km = pd.read_pickle(join("..", "..", "data", "Km_data_merged.pkl"))

## 2. Assigning IDs to every unique sequence and to every unique reaction in the dataset

#### Creating DataFrames for all sequences and for all reactions

In [None]:
df_reactions = pd.DataFrame({"substrates": df_Km["substrate_IDs"],
                            "products" : df_Km["product_IDs"]})

df_reactions = df_reactions.loc[df_reactions["substrates"] != set([])]
df_reactions = df_reactions.loc[df_reactions["products"] != set([])]


droplist = []
for ind in df_reactions.index:
    sub_IDs, pro_IDs = df_reactions["substrates"][ind], df_reactions["products"][ind]
    help_df = df_reactions.loc[df_reactions["substrates"] == sub_IDs].loc[df_reactions["products"] == pro_IDs]
    if len(help_df):
        for ind in list(help_df.index)[1:]:
            droplist.append(ind)
            
df_reactions.drop(list(set(droplist)), inplace = True)
df_reactions.reset_index(inplace = True, drop =True)

df_reactions["Reaction ID"] = ["Reaction_" + str(ind) for ind in df_reactions.index]

In [4]:
df_sequences = pd.DataFrame(data = {"Sequence" : df_Km["Sequence"].unique()})
df_sequences = df_sequences.loc[~pd.isnull(df_sequences["Sequence"])]
df_sequences.reset_index(inplace = True, drop = True)
df_sequences["Sequence ID"] = ["Sequence_" + str(ind) for ind in df_sequences.index]

df_sequences

Unnamed: 0,Sequence,Sequence ID
0,MTTGKGKILILGATGYLGKYMVKASISLGHPTYAYVMPLKKNSDDS...,Sequence_0
1,MEENGMKSKILIFGGTGYIGNHMVKGSLKLGHPTYVFTRPNSSKTT...,Sequence_1
2,MGKGGNSEDAVSGKEHGEENMAAWLLGIKTLKIQPYILPSLGPYDV...,Sequence_2
3,MANLRESSRDKSRWSLEGMTALVTGGSKGIGEAVVEELAMLGARVH...,Sequence_3
4,MAKEGGLGENSRWSLGGMTALVTGGSKGIGEAVVEELAMLGAKVHT...,Sequence_4
...,...,...
977,MSSLEDIKNETVDLEKIPIEEVFQQLKCSREGLTTQEGEDRIQIFG...,Sequence_977
978,MNARALLCSSNIHSLYTSNRPPEKTSSSRSLRNLKPSPKSLRVWIY...,Sequence_978
979,MKSFNTEGHNHSTAESGDAYTVSDPTKNVDEDGREKRTGTWLTASA...,Sequence_979
980,MDAYNNPSAVESGDAAVKSVDDDGREKRTGTFWTASAHIITAVIGS...,Sequence_980


#### Calculating minimal Km value for each reaction and sequence

In [None]:
df_reactions["min_Km_for_RID"] = np.nan
for ind in df_reactions.index:
    df_reactions["min_Km_for_RID"][ind] = min(df_Km.loc[df_Km["substrate_IDs"] == df_reactions["substrates"][ind]].loc[df_Km["product_IDs"] == df_reactions["products"][ind]]["Km"])

In [None]:
df_sequences["min_Km_for_UID"] = np.nan
for ind in df_sequences.index:
    df_sequences["min_Km_for_UID"][ind] = min(df_Km.loc[df_Km["Sequence"] == df_sequences['Sequence'][ind]]["Km"])

#### Calculating the sum of the molecular weights of all substrates and of all products

In [None]:
df_reactions["MW_frac"] = np.nan

for ind in df_reactions.index:
    substrates = list(df_reactions["substrates"][ind])
    products = list(df_reactions["products"][ind])
    
    mw_subs = mw_mets(metabolites = substrates)
    mw_pros = mw_mets(metabolites = products)
    
    if mw_subs == np.nan or mw_pros == np.nan:
        df_reactions["MW_frac"][ind] = np.inf
    if mw_pros != 0:
        df_reactions["MW_frac"][ind] = mw_subs/mw_pros
    else:
        df_reactions["MW_frac"][ind] = np.inf
        
df_reactions

#### Calculating enzyme, reaction and substrate features

In [5]:
model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")

Using cache found in C:\Users\marle/.cache\torch\hub\facebookresearch_esm_main


In [13]:
#creating model input:
df_sequences["model_input"] = [seq[:1022] for seq in df_sequences["Sequence"]]
model_input = [(df_sequences["Sequence ID"][ind], df_sequences["model_input"][ind]) for ind in df_sequences.index]
seqs = [model_input[i][1] for i in range(len(model_input))]
# loading ESM-2 model:
print(".....2(a) Loading ESM-2 model.")
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
#convert input into batches:

#Calculate ESM-2 representations
print(".....2(b) Calculating enzyme representations.")
df_sequences["Enzyme rep"] = ""

for ind in df_sequences.index:
    print(ind,"/",len(df_sequences))
    batch_labels, batch_strs, batch_tokens = batch_converter([(df_sequences["Sequence ID"][ind], df_sequences["model_input"][ind])])
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[33])
    df_sequences["Enzyme rep"][ind] = results["representations"][33][0, 1 : len(df_sequences["model_input"][ind]) + 1].mean(0).numpy()
    
df_sequences.head(5)

0 / 982
1 / 982
2 / 982
3 / 982
4 / 982
5 / 982
6 / 982
7 / 982
8 / 982
9 / 982
10 / 982
11 / 982
12 / 982
13 / 982
14 / 982
15 / 982
16 / 982
17 / 982
18 / 982
19 / 982
20 / 982
21 / 982
22 / 982
23 / 982
24 / 982
25 / 982
26 / 982
27 / 982
28 / 982
29 / 982
30 / 982
31 / 982
32 / 982
33 / 982
34 / 982
35 / 982
36 / 982
37 / 982
38 / 982
39 / 982
40 / 982
41 / 982
42 / 982
43 / 982
44 / 982
45 / 982
46 / 982
47 / 982
48 / 982
49 / 982
50 / 982
51 / 982
52 / 982
53 / 982
54 / 982
55 / 982
56 / 982
57 / 982
58 / 982
59 / 982
60 / 982
61 / 982
62 / 982
63 / 982
64 / 982
65 / 982
66 / 982
67 / 982
68 / 982
69 / 982
70 / 982
71 / 982
72 / 982
73 / 982
74 / 982
75 / 982
76 / 982
77 / 982
78 / 982
79 / 982
80 / 982
81 / 982
82 / 982
83 / 982
84 / 982
85 / 982
86 / 982
87 / 982
88 / 982
89 / 982
90 / 982
91 / 982
92 / 982
93 / 982
94 / 982
95 / 982
96 / 982
97 / 982
98 / 982
99 / 982
100 / 982
101 / 982
102 / 982
103 / 982
104 / 982
105 / 982
106 / 982
107 / 982
108 / 982
109 / 982
110 / 982


Unnamed: 0,Sequence,Sequence ID,model_input,Enzyme rep
0,MTTGKGKILILGATGYLGKYMVKASISLGHPTYAYVMPLKKNSDDS...,Sequence_0,MTTGKGKILILGATGYLGKYMVKASISLGHPTYAYVMPLKKNSDDS...,"[-0.03220587, -0.031796478, -0.051493254, 0.03..."
1,MEENGMKSKILIFGGTGYIGNHMVKGSLKLGHPTYVFTRPNSSKTT...,Sequence_1,MEENGMKSKILIFGGTGYIGNHMVKGSLKLGHPTYVFTRPNSSKTT...,"[-0.016749369, -0.048214775, -0.049711403, 0.0..."
2,MGKGGNSEDAVSGKEHGEENMAAWLLGIKTLKIQPYILPSLGPYDV...,Sequence_2,MGKGGNSEDAVSGKEHGEENMAAWLLGIKTLKIQPYILPSLGPYDV...,"[-0.0048636524, -0.069875315, -0.014163645, 0...."
3,MANLRESSRDKSRWSLEGMTALVTGGSKGIGEAVVEELAMLGARVH...,Sequence_3,MANLRESSRDKSRWSLEGMTALVTGGSKGIGEAVVEELAMLGARVH...,"[-0.0007728286, -0.061243184, 0.041369658, 0.0..."
4,MAKEGGLGENSRWSLGGMTALVTGGSKGIGEAVVEELAMLGAKVHT...,Sequence_4,MAKEGGLGENSRWSLGGMTALVTGGSKGIGEAVVEELAMLGAKVHT...,"[-0.011831594, -0.06318853, 0.038726423, 0.021..."


In [30]:
def get_metabolite_type(met):
    if is_KEGG_ID(met):
        return("KEGG")
    elif is_InChI(met):
        return("InChI")
    else:
        return("invalid")

def get_reaction_site_smarts(metabolites):
    reaction_site = ""
    for met in metabolites:
        met_type = get_metabolite_type(met)
        if met_type == "KEGG":
            try:
                Smarts = Chem.MolToSmarts(Chem.MolFromMolFile(join("..", "..", "data", "mol-files",  met + ".mol")))
            except OSError:
                return(np.nan)
        elif met_type == "InChI":
            Smarts = Chem.MolToSmarts(Chem.inchi.MolFromInchi(met))
        else:
            Smarts = "invalid"
        reaction_site = reaction_site + "." + Smarts
    return(reaction_site[1:])


def is_KEGG_ID(met):
    #a valid KEGG ID starts with a "C" or "D" followed by a 5 digit number:
    if len(met) == 6 and met[0] in ["C", "D"]:
        try:
            int(met[1:])
            return(True)
        except: 
            pass
    return(False)

def is_InChI(met):
    m = Chem.inchi.MolFromInchi(met,sanitize=False)
    if m is None:
      return(False)
    else:
      try:
        Chem.SanitizeMol(m)
      except:
        print('.......Metabolite string "%s" is in InChI format but has invalid chemistry' % met)
        return(False)
    return(True)

def convert_fp_to_array(difference_fp_dict):
    fp = np.zeros(2048)
    for key in difference_fp_dict.keys():
        fp[key] = difference_fp_dict[key]
    return(fp)

In [33]:
df_reactions["difference_fp"], df_reactions["structural_fp"],  = "", ""
#each metabolite should be either a KEGG ID, InChI string, or a SMILES:
for ind in df_reactions.index:
    if df_reactions["difference_fp"][ind] == "" or df_reactions["structural_fp"][ind] == "":
        left_site = get_reaction_site_smarts(df_reactions["substrates"][ind])
        right_site = get_reaction_site_smarts(df_reactions["products"][ind])
        if not pd.isnull(left_site) and not pd.isnull(right_site):
            rxn_forward = AllChem.ReactionFromSmarts(left_site + ">>" + right_site)
            difference_fp = Chem.rdChemReactions.CreateDifferenceFingerprintForReaction(rxn_forward)
            difference_fp = convert_fp_to_array(difference_fp.GetNonzeroElements())
            df_reactions["difference_fp"][ind] = difference_fp
            df_reactions["structural_fp"][ind] = Chem.rdChemReactions.CreateStructuralFingerprintForReaction(rxn_forward).ToBitString()

df_reactions.head(5)

Unnamed: 0,substrates,products,Reaction ID,min_Km_for_RID,MW_frac,difference_fp,structural_fp
0,{InChI=1S/C12H14O4/c1-9(13)16-7-3-4-10-5-6-11(...,"{InChI=1S/C2H4O2/c1-2(3)4/h1H3,(H,3,4)/p-1, In...",Reaction_0,7.3e-05,1.001043,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100111100000001001000110110010001001111111100...
1,{InChI=1S/C12H14O4/c1-9(13)16-7-3-4-10-5-6-11(...,"{InChI=1S/C2H4O2/c1-2(3)4/h1H3,(H,3,4)/p-1, In...",Reaction_1,0.000131,1.001043,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100111100000001001000110110010001001111111100...
2,{InChI=1S/C21H27N7O14P2/c22-17-12-19(25-7-24-1...,{InChI=1S/C21H29N7O14P2/c22-17-12-19(25-7-24-1...,Reaction_2,0.0022,0.998829,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100111100000001001000110110010001001101111100...
3,{InChI=1S/C21H29N7O14P2/c22-17-12-19(25-7-24-1...,{InChI=1S/C21H27N7O14P2/c22-17-12-19(25-7-24-1...,Reaction_3,0.0125,1.001173,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100111100000001001000110110010001001111111100...
4,"{InChI=1S/C7H12O/c1-6-3-2-4-7(8)5-6/h6H,2-5H2,...",{InChI=1S/C21H28N7O17P3/c22-17-12-19(25-7-24-1...,Reaction_4,7e-06,1.001175,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1100111100000001001000110110010001001111111100...


In [35]:
df_sequences.to_pickle(join(datasets_dir, "all_sequences_with_IDs.pkl"))
df_reactions.to_pickle(join(datasets_dir, "all_reactions_with_IDs.pkl"))

#### Mapping Sequence and Reaction IDs to Km_df

In [None]:
df_Km = df_Km.merge(df_sequences, on = "Sequence", how = "left")

In [None]:
df_reactions.rename(columns = {"substrates" : "substrate_IDs",
                              "products" : "product_IDs"}, inplace = True)

df_Km["Reaction ID"] = np.nan
df_Km["MW_frac"] = np.nan
df_Km["min_Km_for_RID"] = np.nan
df_Km["difference_fp"] = ""
df_Km["structural_fp"] = ""

for ind in df_Km.index:
    sub_set, pro_set = df_Km["substrate_IDs"][ind], df_Km["product_IDs"][ind]
    help_df = df_reactions.loc[df_reactions["substrate_IDs"] == sub_set].loc[df_reactions["product_IDs"] == pro_set]
    if len(help_df) == 1:
        df_Km["Reaction ID"][ind] = list(help_df["Reaction ID"])[0]
        df_Km["min_Km_for_RID"][ind] = list(help_df["min_Km_for_RID"])[0]
        df_Km["MW_frac"][ind] = list(help_df["MW_frac"])[0]
        df_Km["difference_fp"][ind] = list(help_df["difference_fp"])[0]
        df_Km["structural_fp"][ind] = list(help_df["structural_fp"])[0]
df_Km.head(2)

In [None]:
df_Km["MACCS FP"] = ""

for ind in df_Km.index:
    id = df_Km["Main Substrate"][ind]
    # try:
    #     id = df_Km['Substrate_IDs'][ind][df_Km["Substrates"][ind].split(';').index(substrate)]
    # except:
    #     for i,s in enumerate(df_Km["Substrates"][ind].split(';')[:-1]):
    #         if substrate in s or s in substrate:
                # id = list(df_Km['Substrate_IDs'][ind])[i]
    if id[0] == "C":
        try:
            mol = Chem.MolFromMolFile(join(datasets_dir,"mol-files", id + '.mol'))
        except OSError:
            None
    else:
        try:
            mol = Chem.inchi.MolFromInchi(id,sanitize=False)
        except OSError:
            None
    if mol is not None:
        maccs_fp = MACCSkeys.GenMACCSKeys(mol).ToBitString()
        df_Km["MACCS FP"][ind] = maccs_fp

#### Calculating the minimal Km value for every EC number in the dataset

In [None]:
df_EC_Km = pd.read_csv(join("..", "..", "data", "min_EC_" + organism + ".tsv"), sep = "\t", header=0)
# df_EC_Km = df_EC_Km.rename(columns={0: "EC", 1: "min_Km"})

for ind in df_EC_Km.index:
    try:
        Km_min = df_EC_Km[df_EC_Km["EC"] == df_Km["ECs"]]["min_Km"]
        df_EC_Km["min_Km"][ind] = Km_min
        print(ind, Km_min)
    except:
        pass
        
df_EC_Km.describe()

In [None]:
df_EC_Km = pd.read_csv(join("..", "..", "data", "min_EC_" + organism + ".tsv"), sep = "\t", header=0)

df_EC_Km.head(5)
df_Km["min_Km_for_EC"] = np.nan

for ind in df_Km.index:
    EC = df_Km["ECs"][ind]
    min_Km = 0
    try:
        print(EC)
        min_Km = df_EC_Km.loc[df_EC_Km["EC"] == EC, "min_Km"].iloc[0]
        print(min_Km)
    except:
        pass
    if min_Km != 0:
        df_Km["min_Km_for_EC"][ind] = min_Km
df_Km.to_pickle(join("..", "..", "data", "merged_and_grouped_Km_dataset.pkl"))  

## 3. Removing outliers

#### Removing non-optimally measured values

To ignore $Km$ values that were obtained under non-optimal conditions, we exclude values higher than 10000\% than the minimal $Km$ value for the same enzyme-reaction combination.

In [None]:
df_Km["frac_of_min_UID"] = np.nan
df_Km["frac_of_min_RID"] = np.nan
df_Km["frac_of_min_EC"] = np.nan

for ind in df_Km.index:
    df_Km["frac_of_min_UID"][ind] =  df_Km["min_Km_for_UID"][ind]/df_Km["Km"][ind]
    df_Km["frac_of_min_RID"][ind] =  df_Km["min_Km_for_RID"][ind]/df_Km["Km"][ind]
    df_Km["frac_of_min_EC"][ind] = df_Km["min_Km_for_EC"][ind]/df_Km["Km"][ind]

len(df_Km)

In [None]:
n = len(df_Km)

df_Km = df_Km.loc[df_Km["frac_of_min_UID"] >= 0.01]
df_Km = df_Km.loc[df_Km["frac_of_min_RID"] >= 0.01]

df_Km["frac_of_min_EC"].loc[pd.isnull(df_Km["frac_of_min_EC"])] = 1
df_Km = df_Km.loc[df_Km["frac_of_min_EC"] <= 10]
df_Km = df_Km.loc[df_Km["frac_of_min_EC"] >= 0.01]

In [None]:
print("We remove %s data points, because we suspect that these Km values were not measure for the natural reaction " \
    "of an enzyme or under non-optimal conditions." % (n-len(df_Km)))

#### Removing data points with reaction queations with uneven fraction of molecular weights

In [None]:
n = len(df_Km)

df_Km = df_Km.loc[df_Km["MW_frac"] < 3]
df_Km = df_Km.loc[df_Km["MW_frac"] > 1/3]

print("We remove %s data points because the sum of molecular weights of substrates does not match the sum of molecular" \
      "weights of the products." % (n-len(df_Km)))

In [None]:
print("Size of final Km dataset: %s" % len(df_Km))
df_Km.to_pickle(join("..", "..", "data", "final_Km_dataset_" + organism + ".pkl"))

## 4. Preparing dataset and splitting into train-test

In [5]:
df_Km = pd.read_pickle(join("..", "..", "data", "final_Km_dataset_" + organism + ".pkl"))

#### Making input for GNN

In [None]:
# for ind in df_Km.index:
#     substrate = df_Km["Main Substrate"][ind]
#     try:
#         id = list(df_Km['substrate_IDs'][ind])[df_Km["Substrates"][ind].split(';').index(substrate)]
#     except:
#         for i,s in enumerate(df_Km["Substrates"][ind].split(';')[:-1]):
#             if substrate in s or s in substrate:
#                 id = list(df_Km['substrate_IDs'][ind])[i]
#     df_Km["Main Substrate"][ind] = id

inchi_ids = {}
for i, element in enumerate(df_Km["Main Substrate"]):
    if element[0] != 'C' and element not in inchi_ids.keys():
        inchi_ids[element] = str(i)
        # mol = Chem.inchi.MolFromInchi(element)
        # if not mol is None:
        #     calculate_atom_and_bond_feature_vectors(mol, str(i))
        # Chem.rdmolfiles.MolToMolFile(Chem.inchi.MolFromInchi(element), join(datasets_dir,"mol-files", str(i) + ".mol")  )  

#### Splitting glucosinolates into validation dataset

Search UniProt for GO term related to glucosionalte metabolic process, download file as .tsv and filter dataset

In [6]:
glucosinolates = pd.read_table(join(datasets_dir,"glucosinolates.tsv"))["Entry"].tolist()
df_validation = df_Km[df_Km["Uniprot IDs"].isin(glucosinolates)]
df_validation.reset_index(inplace=True, drop = True)
df_Km = df_Km[~df_Km["Uniprot IDs"].isin(glucosinolates)]
df_Km.reset_index(inplace=True, drop = True)
split = "full"

If training-testing with only Arabidopsis data:

In [None]:
# df_Km = df_Km[df_Km["Organism"] == 'Arabidopsis thaliana']
# df_Km.reset_index(inplace=True, drop = True)
# split = "Arabidopsis"

If training-testing with only Brassicaceae data:

In [None]:
# ncbi = NCBITaxa()

# organisms = {}

# def is_brassicaceae(org):
#     try:
#         tax_id = ncbi.get_name_translator([org])[org][0]
#         lineage = ncbi.get_lineage(tax_id)
#         if 3700 not in lineage:
#             return(False)
#         else:
#             return(True)
#     except KeyError:
#         return(False)
    
# for org in df_Km["Organism"].tolist():
#     if org not in organisms.keys():
#         organisms[org] = is_brassicaceae(org)

# df_Km = df_Km[df_Km["Organism"].isin([key for key, value in organisms.items() if value is True])]
# df_Km.reset_index(inplace=True, drop = True)
# split = "Brassicaceae"

If training-testing only with wildtype data:

In [None]:
# df_Km = df_Km[df_Km["Type"].str.contains("wildtype")]
# df_Km.reset_index(inplace=True, drop = True)
# split = "wildtype"

If training-testing only with secondary metabolite data:

In [None]:
secondary = pd.read_table(join(datasets_dir,"secondary_metabolites.tsv"))["Entry"].tolist()
df_Km = df_Km[df_Km["Uniprot IDs"].isin(secondary)]
df_Km.reset_index(inplace=True, drop = True)
split = "secondary"

In [None]:
# os.mkdir(join(datasets_dir, "splits", split))

#### Calculating arithmetic mean for Km values of same enzyme-reaction-substrate combination-pH-temperature

In [None]:
# df_new = pd.DataFrame(data = {"Reaction ID" : df_Km["Reaction ID"],
#                                   "Sequence ID" : df_Km["Sequence ID"],
#                                   "Temperature" : df_Km["Temperature"],
#                                     "pH" : df_Km["pH"],
#                                  "Type": df_Km["Type"],
#                              "MACCS FP" : df_Km["MACCS FP"]})

# df_new.drop_duplicates(inplace = True)
# df_new.reset_index(inplace = True, drop = True)

# df_new["Km_values"], df_new["Uniprot IDs"], df_new["ECs"], df_new["Substrates"], df_new["Products"], df_new["ESM2"], df_new["Sequence"], df_new["difference_fp"], df_new["structural_fp"] = "", "", "", "", "", "", "", "", ""

# for ind in df_new.index:
#     RID, SID, Temp, pH, Type, MSubstrate = df_new["Reaction ID"][ind], df_new["Sequence ID"][ind], df_new["Temperature"][ind], df_new["pH"][ind], df_new["Type"][ind], df_new["MACCS FP"][ind]
#     help_df = df_Km.loc[df_Km["Reaction ID"] 
#                                  == RID].loc[df_Km["Sequence ID"] 
#                                              == SID].loc[df_Km["Temperature"] 
#                                                          == Temp].loc[df_Km["pH"] 
#                                                                       == pH].loc[df_Km["Type"] 
#                                                                                  == Type].loc[df_Km["MACCS FP"] 
#                                                                                               == MSubstrate]
#     print(help_df)
#     df_new["ECs"][ind] = list(help_df["ECs"])
#     df_new["Km_values"][ind] = list(help_df["Km"])
#     df_new["Uniprot IDs"][ind] = list(help_df["Uniprot IDs"])
#     df_new["Sequence"][ind] = help_df["Sequence"].values[0]
#     df_new["ESM2"][ind] = help_df["Enzyme rep"].values[0]
#     df_new["difference_fp"][ind], df_new["structural_fp"][ind] = help_df["difference_fp"].values[0], help_df["structural_fp"].values[0]
#     df_new["Substrates"][ind], df_new["Products"][ind] = help_df["Substrates"].values[0], help_df["Products"].values[0]

In [None]:
# df_new2 = pd.DataFrame(data = {"Reaction ID" : df_validation["Reaction ID"],
#                                   "Sequence ID" : df_validation["Sequence ID"],
#                                   "Temperature" : df_validation["Temperature"],
#                                     "pH" : df_validation["pH"],
#                                   "Type" : df_validation["Type"],
#                                   "MACCS FP" : df_validation["MACCS FP"]})

# df_new2.drop_duplicates(inplace = True)
# df_new2.reset_index(inplace = True, drop = True)

# df_new2["Km_values"], df_new2["Uniprot IDs"], df_new2["ECs"], df_new2["Organisms"], df_new2["Substrates"], df_new2["Products"], df_new2["ESM2"], df_new2["Sequence"], df_new2["difference_fp"], df_new2["structural_fp"] = "", "", "", "", "", "", "", "", "", ""

# for ind in df_new2.index:
#     RID, SID, Temp, pH, Type, MSubstrate = df_new2["Reaction ID"][ind], df_new2["Sequence ID"][ind], df_new2["Temperature"][ind], df_new2["pH"][ind], df_new2["Type"][ind], df_new2["MACCS FP"][ind]
#     help_df = df_validation.loc[df_validation["Reaction ID"] 
#                               == RID].loc[df_validation["Sequence ID"] 
#                                           == SID].loc[df_validation["Temperature"] 
#                                                       == Temp].loc[df_validation["pH"] 
#                                                                     == pH].loc[df_validation["Type"] 
#                                                                               == Type].loc[df_validation["MACCS FP"] 
#                                                                                                             == MSubstrate]
#     df_new2["ECs"][ind] = list(help_df["ECs"])
#     df_new2["Km_values"][ind] = list(help_df["Km"])
#     df_new2["Uniprot IDs"][ind] = list(help_df["Uniprot IDs"])
#     df_new2["Organisms"][ind] = list(help_df["Organism"])
#     df_new2["Type"][ind]
#     df_new2["Sequence"][ind] = help_df["Sequence"].values[0]
#     df_new2["ESM2"][ind] = help_df["Enzyme rep"].values[0]
#     df_new2["difference_fp"][ind], df_new2["structural_fp"][ind] = help_df["difference_fp"].values[0], help_df["structural_fp"].values[0]
#     df_new2["Substrates"][ind], df_new2["Products"][ind] = help_df["Substrates"].values[0], help_df["Products"].values[0]

In [None]:
# df_Km = df_new
# df_validation = df_new2

In [None]:
# df_Km["geomean_Km"] = np.nan
# for ind in df_Km.index:
#     all_Km = np.array(df_Km["Km_values"][ind]).astype(float)
    # min_Km = min(all_Km)
    # all_Km_top = [Km for Km in all_Km  if min_Km/Km >= 0.01]
    # df_arabidopsis["geomean_Km"][ind] = np.mean((all_Km_top))

In [None]:
# df_validation["geomean_Km"] = np.nan
# for ind in df_validation.index:
#     all_Km = np.array(df_validation["Km_values"][ind]).astype(float)
#     min_Km = min(all_Km)
#     all_Km_top = [Km for Km in all_Km  if min_Km/Km >= 0.01]
#     df_validation["geomean_Km"][ind] = np.mean((all_Km_top))
    
# df_validation.to_pickle(join(datasets_dir, "splits", "validation_%s.pkl" %organism))


#### Splitting into train-test

In [None]:
df = df_Km.copy()
df = df.sample(frac = 1, random_state = 123)
df.reset_index(drop= True, inplace = True)

train_df, test_df = split_dataframe_enzyme(frac = 5, df = df.copy())
print("Test set size: %s" % len(test_df))
print("Training set size: %s" % len(train_df))
print("Size of test set in percent: %s" % np.round(100*len(test_df)/ (len(test_df) + len(train_df))))

train_df.reset_index(inplace = True, drop = True)
test_df.reset_index(inplace = True, drop = True)

train_df.to_pickle(join(datasets_dir, "splits", split, "train_df_Km_%s.pkl" %organism))
test_df.to_pickle(join(datasets_dir, "splits", split, "test_df_Km_%s.pkl" %organism))

#### Splitting CV folds

In [None]:
data_train2 = train_df.copy()
data_train2["index"] = list(data_train2.index)

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=5)
indices_fold1 = list(df_fold["index"])
print(len(data_train2), len(indices_fold1))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=4)
indices_fold2 = list(df_fold["index"])
print(len(data_train2), len(indices_fold2))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=3)
indices_fold3 = list(df_fold["index"])
print(len(data_train2), len(indices_fold3))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=2)
indices_fold4 = list(df_fold["index"])
indices_fold5 = list(data_train2["index"])
print(len(data_train2), len(indices_fold4))


fold_indices = [indices_fold1, indices_fold2, indices_fold3, indices_fold4, indices_fold5]

CV_train_indices = [[], [], [], [], []]
CV_test_indices = [[], [], [], [], []]

for i in range(5):
    for j in range(5):
        if i != j:
            CV_train_indices[i] = CV_train_indices[i] + fold_indices[j]
    CV_test_indices[i] = fold_indices[i]
    
    
np.save(join(datasets_dir, "splits", split, "CV_train_indices_%s" %organism), CV_train_indices)
np.save(join(datasets_dir, "splits", split, "CV_test_indices_%s" %organism), CV_test_indices)

## 5. Building GNN for substrate representation

In [None]:
# os.mkdir(join(datasets_dir, "GNN_input_data", split))

for ind in train_df.index:
    calculate_and_save_input_matrixes(inchi_ids, sample_ID = "train_" + str(ind), df = train_df,
                                      save_folder = join(datasets_dir, "GNN_input_data", split))
    
for ind in test_df.index:
    calculate_and_save_input_matrixes(inchi_ids, sample_ID = "test_" + str(ind), df = test_df,
                                      save_folder = join(datasets_dir, "GNN_input_data", split))
    
for ind in df_validation.index:
    calculate_and_save_input_matrixes(inchi_ids, sample_ID = "val_" + str(ind), df = df_validation,
                                    save_folder = join(datasets_dir, "GNN_input_data", split))

In [None]:
train_indices = os.listdir(join(datasets_dir, "GNN_input_data", split))
train_indices = [index[:index.rfind("_")] for index in train_indices]
train_indices = list(set([index for index in train_indices if "train" in index]))

test_indices = os.listdir(join(datasets_dir, "GNN_input_data", split))
test_indices = [index[:index.rfind("_")] for index in test_indices]
test_indices = list(set([index for index in test_indices if "test" in index]))

#### Hyper-parameter optimization with CV

In [None]:
param_grid = {'batch_size': [96,96,128],
                'D': [50,100],
                'learning_rate': [0.01, 0.1],
                'epochs': [30,50,80],
                'l2_reg_fc' : [0.01, 0.1, 1],
                'l2_reg_conv': [0.01, 0.1, 1],
                'rho': [0.9, 0.95, 0.99]}

params_list = [(batch_size, D, learning_rate, epochs, l2_reg_fc, l2_reg_conv, rho) for batch_size in param_grid['batch_size'] for D in param_grid["D"] for learning_rate in param_grid['learning_rate']
                for epochs in param_grid['epochs'] for l2_reg_fc in param_grid['l2_reg_fc'] for l2_reg_conv in param_grid['l2_reg_conv'] for rho in param_grid["rho"]]

params_list = random.sample(params_list, 10)

In [None]:
# count = 0
# results=[]

# for params in params_list:

#     batch_size, D, learning_rate, epochs, l2_reg_fc, l2_reg_conv, rho = params
#     count +=1
#     MAE = []

#     for i in range(5):
#         train_index, test_index  = CV_train_indices[i], CV_test_indices[i]
#         train_index = [ind for ind in train_indices if int(ind.split("_")[1]) in train_index]
#         test_index = [ind for ind in train_indices if int(ind.split("_")[1]) in test_index]

#         train_params = {'batch_size': batch_size,
#                     'folder' :join(datasets_dir, "GNN_input_data/full"),
#                     'list_IDs' : np.array(train_index),
#                     'shuffle': True}

#         test_params = {'batch_size': min(batch_size,len(test_index)),
#                     'folder' : join(datasets_dir, "GNN_input_data/full"),
#                     'list_IDs' : np.array(test_index),
#                     'shuffle': False}

#         training_generator = DataGenerator(**train_params)
#         test_generator = DataGenerator(**test_params)


#         model = DMPNN_without_extra_features(l2_reg_conv = l2_reg_conv, l2_reg_fc = l2_reg_fc, learning_rate = learning_rate,
#                         D = D, N = N, F1 = F1, F2 = F2, F= F, drop_rate = 0.0, ada_rho = rho)
#         model.fit(training_generator, epochs= epochs, shuffle = True, verbose = 1)

#         #get test_y:
#         test_indices_y = [int(ind.split("_")[1]) for ind in train_indices if ind in test_index]
#         test_y = np.array([train_df["Km"][ind] for ind in test_indices_y])

#         pred_test = model.predict(test_generator)
#         mae = np.median(abs(pred_test - np.reshape(test_y[:len(pred_test)], (-1,1))))
#         print(mae)
#         MAE.append(mae)

#     results.append({"batch_size" : batch_size, "D" : D , "learning_rate" : learning_rate, "epochs" : epochs,
#                     "l2_reg_fc" : l2_reg_fc, "l2_reg_conv" : l2_reg_conv, "rho" : rho, "cv_mae" : np.mean(MAE)})

# params = min(results, key=lambda d: d['cv_mae'])
# print(params)

{'batch_size': 96, 'D': 50, 'learning_rate': 0.1, 'epochs': 80, 'l2_reg_fc': 0.1, 'l2_reg_conv': 0.1, 'rho': 0.99}

#### Training the model with the best set of hyperparmeters on the whole training set and validate it on the test set

In [None]:
batch_size = 96
D = 50
learning_rate = 0.1
epochs = 80
l2_reg_fc = 0.1
l2_reg_conv = 0.1
rho = 0.99

In [None]:
# train_indices = os.listdir(join(datasets_dir, "GNN_input_data/full"))
# train_indices = [index[:index.rfind("_")] for index in train_indices]
# train_indices = list(set([index for index in train_indices if "train" in index]))

# test_indices = os.listdir(join(datasets_dir, "GNN_input_data/full"))
# test_indices = [index[:index.rfind("_")] for index in test_indices]
# test_indices = list(set([index for index in test_indices if "test" in index]))


# train_params = {'batch_size': batch_size,
#               'folder' :join(datasets_dir, "GNN_input_data/full"),
#               'list_IDs' : train_indices,
#               'shuffle': True}

# test_params = {'batch_size': batch_size,
#               'folder' :join(datasets_dir, "GNN_input_data/full"),
#               'list_IDs' : test_indices,
#               'shuffle': False}

# training_generator = DataGenerator(**train_params)
# test_generator = DataGenerator(**test_params)

# model = DMPNN_without_extra_features(l2_reg_conv = l2_reg_conv, l2_reg_fc = l2_reg_fc, learning_rate = learning_rate,
#                   D = D, N = N, F1 = F1, F2 = F2, F= F, drop_rate = 0.0, ada_rho = rho)

# model.fit(training_generator, epochs= epochs, shuffle = True, verbose = 1)
# model.save_weights(join(datasets_dir, "model_weights", "saved_model_GNN_best_hyperparameters"))

# pred_test = model.predict(test_generator)
# test_indices_y = [int(ind.split("_")[1]) for ind in np.array(test_indices)]
# test_y = np.array([test_df["Km"][ind] for ind in test_indices_y])

#### Calculating substrate representation for every data point in training and test set

In [None]:
model = DMPNN_without_extra_features(l2_reg_conv = l2_reg_conv, l2_reg_fc = l2_reg_fc, learning_rate = learning_rate,
                  D = D, N = N, F1 = F1, F2 = F2, F= F, drop_rate = 0.0, ada_rho = rho)
model.load_weights(join(datasets_dir, "model_weights", "saved_model_GNN_best_hyperparameters"))

get_fingerprint_fct = K.function([model.layers[0].input, model.layers[26].input,
                                  model.layers[3].input],
                                  [model.layers[-10].output])

In [None]:
# val_indices = os.listdir(join(datasets_dir, "GNN_input_data", split))
# val_indices = [index[:index.rfind("_")] for index in val_indices]
# val_indices = list(set([index for index in val_indices if "val" in index]))

# val_params = {'batch_size': len(val_indices),
#               'folder' :join(datasets_dir, "GNN_input_data"),
#               'list_IDs' : val_indices,
#               'shuffle': False}

# val_generator = DataGenerator(**val_params)

# pred_val = model.predict(val_generator)

# val_indices_y = [int(ind.split("_")[1]) for ind in np.array(val_indices)]
# test_y = np.array([train_df["Km"][ind] for ind in val_indices_y])

# mae = np.median(abs(pred_val - np.reshape(test_y[:len(pred_val)], (-1,1))))
# print(mae)

In [None]:
input_data_folder = join(datasets_dir, "GNN_input_data", split)   

def get_representation_input(cid_list):
    XE = ();
    X = ();
    A = ();
    # Generate data
    for cid in cid_list:
        try:
            X = X + (np.load(join(input_data_folder, cid + '_X.npy')), );
            XE = XE + (np.load(join(input_data_folder, cid + '_XE.npy')), );
            A = A + (np.load(join(input_data_folder, cid + '_A.npy')), );
        except FileNotFoundError: #return zero arrays:
            X = X + (np.zeros((N,32)), );
            XE = XE + (np.zeros((N,N,F)), );
            A = A + (np.zeros((N,N,1)), );
    return(XE, X, A)

input_data_folder = join(datasets_dir, "GNN_input_data", split)   
def get_substrate_representations(df, training_set, testing_set, get_fingerprint_fct):
    df["GNN FP"] = ""
    i = 0
    n = len(df)
    
    cid_all = list(df.index)
    if training_set == True:
        prefix = "train_"
    elif testing_set == True:
        prefix = "test_"
    else:
        prefix = "val_"
    cid_all = [prefix + str(cid) for cid in cid_all]
    
    while i*96 <= n:
        if (i+1)*96  <= n:
            XE, X, A = get_representation_input(cid_all[i*96:(i+1)*96])
            representations = get_fingerprint_fct([np.array(XE), np.array(X),np.array(A)])[0]
            df["GNN FP"][i*96:(i+1)*96] = list(representations[:, :52])
        else:
            print(i)
            XE, X, A = get_representation_input(cid_all[-min(96,n):])
            representations = get_fingerprint_fct([np.array(XE), np.array(X),np.array(A)])[0]
            df["GNN FP"][-min(96,n):] = list(representations[:, :52])
        i += 1
        
    ### set all GNN FP-entries with no input matrices to np.nan:
    all_X_matrices = os.listdir(input_data_folder)
    for ind in df.index:
        if prefix +str(ind) +"_X.npy" not in all_X_matrices:
            df["GNN FP"][ind] = np.nan
    return(df)

In [None]:
#Calculating the GNN representations
train_with_rep = get_substrate_representations(df = train_df, training_set = True, testing_set = False,
                                                      get_fingerprint_fct = get_fingerprint_fct)
test_with_rep = get_substrate_representations(df = test_df, training_set = False, testing_set = True,
                                                     get_fingerprint_fct = get_fingerprint_fct)
val_with_rep = get_substrate_representations(df = df_validation, training_set = False, testing_set = False,
                                                     get_fingerprint_fct = get_fingerprint_fct)

#Saving the DataFrames:
train_with_rep.to_pickle(join(datasets_dir, "splits", split, "training_data.pkl"))
test_with_rep.to_pickle(join(datasets_dir, "splits", split, "test_data.pkl"))
val_with_rep.to_pickle(join(datasets_dir, "splits", split, "val_data.pkl"))

In [36]:
# for split in ["full", "Arabidopsis", "Brassicaceae", "wildtype", "secondary"]:
#     train_with_rep = pd.read_pickle(join(datasets_dir, "splits", split, "training_data.pkl"))
#     test_with_rep = pd.read_pickle(join(datasets_dir, "splits", split, "test_data.pkl"))
#     val_with_rep = pd.read_pickle(join(datasets_dir, "splits", split, "val_data.pkl"))
#     for ind in train_with_rep.index:
#         if train_with_rep["difference_fp"][ind] == "" or train_with_rep["structural_fp"][ind] == "":
#             train_with_rep["difference_fp"][ind] = list(df_reactions[df_reactions["Reaction ID"] == train_with_rep["Reaction ID"][ind]]["difference_fp"])[0]
#             train_with_rep["structural_fp"][ind] = list(df_reactions[df_reactions["Reaction ID"] == train_with_rep["Reaction ID"][ind]]["structural_fp"])[0]
#     for ind in test_with_rep.index:
#         if test_with_rep["difference_fp"][ind] == "" or test_with_rep["structural_fp"][ind] == "":
#             test_with_rep["difference_fp"][ind] = list(df_reactions[df_reactions["Reaction ID"] == test_with_rep["Reaction ID"][ind]]["difference_fp"])[0]
#             test_with_rep["structural_fp"][ind] = list(df_reactions[df_reactions["Reaction ID"] == test_with_rep["Reaction ID"][ind]]["structural_fp"])[0]
#     for ind in val_with_rep.index:
#         if val_with_rep["difference_fp"][ind] == "" or val_with_rep["structural_fp"][ind] == "":
#             val_with_rep["difference_fp"][ind] = list(df_reactions[df_reactions["Reaction ID"] == val_with_rep["Reaction ID"][ind]]["difference_fp"])[0]
#             val_with_rep["structural_fp"][ind] = list(df_reactions[df_reactions["Reaction ID"] == val_with_rep["Reaction ID"][ind]]["structural_fp"])[0]            

#     train_with_rep.to_pickle(join(datasets_dir, "splits", split, "training_data.pkl"))
#     test_with_rep.to_pickle(join(datasets_dir, "splits", split, "test_data.pkl"))
#     val_with_rep.to_pickle(join(datasets_dir, "splits", split, "val_data.pkl"))