In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import networkx as nx
import re
import numpy as np
import pdb
import pickle
import os
from parsing_utils import *

In [None]:
from datetime import date

# Build the date prefix (day, abbreviated month, year) used in the names of the
# files written further down.
# NOTE: `date` is rebound here from the datetime class to the formatted string.
today = date.today()
date = format(today, '%d%b%Y')
print('Date prefix:', date)

Skip to [MetaCyc](#Parse-Metacyc-Molecules)

## Parse Rhea Molecules

In [None]:
# Input locations for the Rhea section.
# Pickled graph of the existing network; a falsy value makes the loading cell skip the file.
EXISTING_GRAPH_PATH = "../../../b-o-t/19Sep2022_updated_whole_metabolic_network_labeled.pkl"
# SDF file with the Rhea compound structures.
RHEA_SDF = "rhea/rhea.sdf"
# Alternative input kept for reference:
# RHEA_SDF = "/Users/Itai/Desktop/test_2.sdf"
# Flat file holding the ENTRY / DEFINITION / EQUATION lines of the Rhea reactions.
RHEA_REACTIONS = "rhea/rhea-reactions.txt"
# Tab-separated table read without a header; column 0 is used as the key and column 1 as the name.
CHEBI_NAMES_IDS = "rhea/chebiId_name.tsv"

In [None]:
# The '> <GENERIC_COMPOUND>' data fields were causing problems when the SDF was
# read, so a copy of the file without those fields is written and used instead.
lines_to_write = []
with open(RHEA_SDF, 'r') as f:
    lines = f.readlines()

# Skip each '<GENERIC_COMPOUND>' tag line and everything after it until the next
# line that starts with '>' (another data field) or the '$$$$' record terminator.
after_generic = False
for line in lines:
    if '<GENERIC_COMPOUND>' in line:
        after_generic = True
    elif after_generic and (line.rstrip('\n') == '$$$$' or line.startswith('>')):
        # rstrip: a terminator on a final line without a newline must still end the skip
        after_generic = False
    if not after_generic:
        lines_to_write.append(line)

# Same location as the input, with '_removed_generics' inserted before the extension.
new_file_name = os.path.splitext(RHEA_SDF)[0] + '_removed_generics.sdf'
with open(new_file_name, 'w') as f:
    f.writelines(lines_to_write)

In [None]:
# Read the cleaned SDF record by record.
#  - chebi_to_smiles receives every readable molecule, keyed by its 'ACCESSION' property.
#  - rhea_smiles / rhea_names (parallel lists) only receive molecules whose SMILES
#    has no '*' character; those SMILES are passed through standardize_smiles.
rhea = Chem.SDMolSupplier(new_file_name)  # strictParsing=False, sanitize=False were tried here
rhea_smiles, rhea_names = [], []
chebi_to_smiles = {}
for record_idx, mol in enumerate(rhea):
    if not mol:
        # falsy entry (presumably a record RDKit could not read): show its position
        print (record_idx)
        continue
    prev_m = mol
    mol_smiles = Chem.MolToSmiles(mol)
    chebi_to_smiles[mol.GetProp('ACCESSION')] = mol_smiles
    if '*' in mol_smiles:
        continue
    rhea_smiles.append(standardize_smiles(mol_smiles))
    rhea_names.append(mol.GetProp('Rhea_ascii_name'))
assert len(rhea_smiles)==len(rhea_names)

print ("{} molecules in rhea".format(len(rhea_smiles)))

In [None]:
# Standardized SMILES of the nodes already present in the existing graph.
# Node names containing '>' are skipped (presumably reaction nodes — confirm).
# Defined up front so the following cells still run when no graph is configured.
smiles_in_graph = []
if EXISTING_GRAPH_PATH:
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(EXISTING_GRAPH_PATH, 'rb') as f:
        g = pickle.load(f)
    smiles_in_graph = [standardize_smiles(n) for n in g.nodes if '>' not in n]

In [None]:
# Number of distinct Rhea SMILES that are not among the SMILES of the existing graph.
rhea_only_smiles = set(rhea_smiles).difference(smiles_in_graph)
print (len(rhea_only_smiles))

In [None]:
# Map each Rhea SMILES to its name (for a repeated SMILES the last name wins),
# then collect the cleaned name and the SMILES of every molecule that is missing
# from the graph. Both lists are built from the same set, so they stay aligned.
rhea_smiles_to_name = dict(zip(rhea_smiles, rhea_names))
missing_smiles = set(rhea_smiles) - set(rhea_smiles).intersection(set(smiles_in_graph))
smiles_not_in_graph = list(missing_smiles)
names_not_in_graph = [clean_name(rhea_smiles_to_name[smi]) for smi in missing_smiles]

In [None]:
#### Uncomment to save

# pd.DataFrame({'name':["'"+x+"'" for x in names_not_in_graph],
#               'smiles':["'"+x+"'" for x in smiles_not_in_graph]}).to_csv('rhea/{}_molecules_from_rhea_cleaned_names.csv'.format(date), header=False, sep='\t', index=False)

## Parse Rhea Reactions

In [None]:
# ChEBI ids that are stripped from the reaction equations.
chebis_to_rm = [
                'CHEBI:30212', #photon
                'CHEBI:10545', #electron
               ]

# Three alternatives per id, each allowing an optional integer coefficient:
#   '+ [n ]ID '    the id follows a '+' and is followed by a space
#   '[n ]ID + '    the id is followed by a '+'
#   '[ +] [n ]ID'  the id is the very last token of the equation
# Raw string: '\+' is a regex escape, not a (non-existent) Python string escape.
patts_ls = []
for c in chebis_to_rm:
    chebi_id = re.escape(c)  # keep the id literal inside the pattern
    patts_ls.append(r'\+ ([0-9]+ )?{} |([0-9]+ )?{} \+ |( \+)? ([0-9]+ )?{}$'.format(chebi_id, chebi_id, chebi_id))

rm_patt = re.compile('|'.join(patts_ls))

# Read the Rhea reactions file line by line. Lines are recognised by their
# leading keyword and their value is taken from column 12 onwards:
#   ENTRY      -> rhea_reaction_ids
#   DEFINITION -> rhea_string_reactions
#   EQUATION   -> rhea_chebi_reactions (after removing the ids matched by rm_patt)
rhea_reaction_ids = []
rhea_string_reactions = []
rhea_chebi_reactions = []
with open(RHEA_REACTIONS, 'r') as f:
    for line in f:
        if line.startswith('DEFINITION'):
            rhea_string_reactions.append(line[12:].strip())
        elif line.startswith('EQUATION'):
            rhea_chebi_reactions.append(rm_patt.sub('', line[12:].strip()))
        elif line.startswith('ENTRY'):
            rhea_reaction_ids.append(line[12:].strip())

# The three lists are later combined position by position, so a record that
# lacks one of its lines would misalign the rows; stop here with a clear message.
if not (len(rhea_reaction_ids) == len(rhea_string_reactions) == len(rhea_chebi_reactions)):
    raise ValueError(
        'Inconsistent Rhea records: {} ENTRY, {} DEFINITION and {} EQUATION lines'.format(
            len(rhea_reaction_ids), len(rhea_string_reactions), len(rhea_chebi_reactions)))

In [None]:
# Spot-check the removal pattern on a few hand-written equations.
p = re.compile('|'.join(patts_ls))
for example_equation in (
    'CHEBI:15377 + CHEBI:30212 = CHEBI:16453',
    'CHEBI:15377 + CHEBI:30212 => CHEBI:16453',
    'CHEBI:15377 + CHEBI:30213 <=> CHEBI:30212',
    'CHEBI:30212 + CHEBI:30213 + CHEBI:302156',
):
    print (p.sub('', example_equation))

In [None]:
# Number of DEFINITION lines collected from the Rhea reactions file.
len(rhea_string_reactions)

In [None]:
# One row per Rhea record: its id, the name-based equation and the ChEBI-based equation.
rhea_reaction_df = pd.DataFrame(dict(
    ID=rhea_reaction_ids,
    reaction_string=rhea_string_reactions,
    reaction_chebi=rhea_chebi_reactions,
))
rhea_reaction_df

In [None]:
# Convert every ChEBI equation with parse_chebi_reaction. Equations whose
# result is empty are printed so they can be inspected.
parsed_reactions = []
for rxn in rhea_chebi_reactions:
    parsed = parse_chebi_reaction(rxn, chebi_to_smiles)  # parse once and reuse the result
    parsed_reactions.append(parsed)

    if not len(parsed):
        print (rxn)

In [None]:
# Store the parsed reactions next to their source equations (same order as the rows).
rhea_reaction_df['reaction_smiles'] = parsed_reactions

In [None]:
# Run make_cofactor_dict on the ChEBI equations in its "f" and "r" modes
# (presumably forward / reverse — confirm in parsing_utils) and merge the results.
# NOTE(review): the meaning of the 10 and 0.9 arguments is defined by make_cofactor_dict.
chebi_equations = rhea_reaction_df['reaction_chebi']
dic_f = make_cofactor_dict(np.array(chebi_equations[:]), 10, 0.9, "f")
dic_r = make_cofactor_dict(np.array(chebi_equations[:]), 10, 0.9, "r")
cofs = mergeDict(dic_f, dic_r)

# Clean up: for each key keep only the partners that have no SMILES or whose
# SMILES contains '*', then drop the keys that are left without partners.
for key, partners in cofs.items():
    cofs[key] = [p for p in partners if p not in chebi_to_smiles or '*' in chebi_to_smiles[p]]
cofs = {key: partners for key, partners in cofs.items() if len(partners)}

In [None]:
# Display the cleaned cofactor dictionary.
cofs

In [None]:
# Second pass over the same equations, this time asking parse_chebi_reaction
# to remove the cofactors listed in `cofs`.
parsed_reactions_no_cofs = [
    parse_chebi_reaction(equation, chebi_to_smiles, cof_dict=cofs, remove_cofs=True)
    for equation in rhea_chebi_reactions
]

In [None]:
# Store the cofactor-free reactions as a new column (same order as the rows).
rhea_reaction_df['reaction_smiles_no_cofs'] = parsed_reactions_no_cofs

In [None]:
# Turn the 'RHEA:<number>' ids into an integer RHEA_ID column and drop the text column.
# NOTE: 'ID' no longer exists afterwards, so this cell cannot be re-run on its own.
rhea_reaction_df['RHEA_ID'] = [int(entry.replace('RHEA:', '')) for entry in rhea_reaction_df['ID']]
rhea_reaction_df = rhea_reaction_df.drop(columns=['ID'])

In [None]:
# Load the Rhea-to-UniProt table and collapse it to one row per RHEA_ID: in every
# other column the distinct values of the group are joined with commas.
# NOTE(review): ','.join only accepts strings, so this assumes every aggregated
# column holds strings — confirm against the TSV.
rhea_2_uniprot = pd.read_csv('rhea/rhea2uniprot_sprot.tsv', sep='\t')


def _join_unique(values):
    """Return the sorted distinct entries of `values` as one comma-separated string."""
    return ','.join(np.unique(values))


grouped_rhea_2_uniprot = rhea_2_uniprot.groupby('RHEA_ID').agg(_join_unique)

In [None]:
# Display the raw (ungrouped) Rhea-to-UniProt table.
rhea_2_uniprot

In [None]:
# Every value of the 'ID' column of the mapping table, one per row (duplicates included).
all_uniprot_ids = rhea_2_uniprot['ID'].tolist()

In [None]:
# Attach the grouped UniProt columns by RHEA_ID; a left join keeps every reaction,
# leaving NaN where no mapping exists.
rhea_reaction_df = pd.merge(rhea_reaction_df, grouped_rhea_2_uniprot, how='left', on='RHEA_ID')

In [None]:
# Display the reactions together with the merged UniProt columns.
rhea_reaction_df

In [None]:
# Inspect the 'ID' value of a single row.
# NOTE(review): 61808 is a hard-coded row label and must exist in the index.
np.unique([rhea_reaction_df.loc[61808, 'ID']])

In [None]:
# First rows (pandas default: up to 5) of each group of reactions sharing the same cofactor-free SMILES.
rhea_reaction_df.groupby('reaction_smiles_no_cofs').head()

In [None]:
# Rows whose 'ID' is missing get an empty string instead of NaN.
rhea_reaction_df['ID'] = rhea_reaction_df['ID'].fillna('')

In [None]:
# Display the table after the missing 'ID' values were replaced.
rhea_reaction_df

In [None]:
# Merge reactions that share the same cofactor-free SMILES: every other column
# becomes the sorted list of its distinct values within the group.
grouped_df = rhea_reaction_df.groupby('reaction_smiles_no_cofs').agg(lambda col: list(np.unique(col)))
# Bring the grouping key back as an ordinary (last) column.
grouped_df['reaction_smiles_no_cofs'] = grouped_df.index
# These columns keep only the first of their distinct values.
for col_name in ('reaction_smiles', 'reaction_chebi', 'reaction_string'):
    grouped_df[col_name] = grouped_df[col_name].map(lambda values: values[0])
# Replace the key index by a plain 0..n-1 index.
grouped_df = grouped_df.reset_index(drop=True)

In [None]:
# Grouped reactions whose reaction SMILES contains no '*' character.
grouped_df[grouped_df['reaction_smiles'].map(lambda x : '*' not in x)]

In [None]:
# Save the grouped reactions whose SMILES contain no '*' as a tab-separated file with the date prefix in its name.

grouped_df[grouped_df['reaction_smiles'].map(lambda x : '*' not in x)].to_csv('rhea/{}_rhea_reaction_smiles_no_cofs.csv'.format(date), sep='\t', index=False)

In [None]:
# Sanity check: re-parse one grouped reaction (cofactors removed) and build an RDKit reaction from the result.
# NOTE(review): 23474 is a hard-coded row label and must exist in grouped_df.

rxn = grouped_df.loc[23474,'reaction_chebi']
print (rxn)
AllChem.ReactionFromSmarts(parse_chebi_reaction(rxn, chebi_to_smiles, cof_dict=cofs, remove_cofs=True), useSmiles=True)

## Sanity check on cofactors

In [None]:
# Load the ChEBI table (tab-separated, no header row) and trim the whitespace
# around the values of its first two columns.
chebi_df = pd.read_csv(CHEBI_NAMES_IDS, sep='\t', header=None)
for col in (0, 1):
    chebi_df[col] = chebi_df[col].map(lambda cell: cell.strip())

In [None]:
# Lookup from column 0 of the ChEBI table to column 1 (the name).
chebi_to_name_dict = dict(zip(chebi_df[0].values, chebi_df[1].values))


def _key_and_partners(table, key):
    """Translate `key` and each of its partners in `cofs` through `table`."""
    return table[key], [table[partner] for partner in cofs[key]]


# For every cofactor key print one line with the SMILES and one with the names.
# A missing entry in either lookup stops the output for that key.
for cof_key in list(cofs):
    try:
        for table in (chebi_to_smiles, chebi_to_name_dict):
            print ("{} : {}".format(*_key_and_partners(table, cof_key)))
    except KeyError:
        print ("Can't convert {}".format(cof_key))

## Parse Metacyc Molecules

In [None]:
# Input locations for the MetaCyc section.
# Flat file with the reaction records ('ATTRIBUTE - value' lines, '//' between records).
METACYC_REACTIONS = "metacyc/reactions.dat"
# Tab-separated compound file; field 0 is used as the name and field 2 as the SMILES.
METACYC_COMPOUND_LINKS = "metacyc/compound-links.dat"
# Directory of MOL files, one compound per file, named after the compound.
METACYC_MOL_FILES = "../../../molecule_databases/Metacyc_v26.5/data/MetaCyc-MOLfiles/"

In [None]:
# Build a lookup from lower-cased compound name to standardized SMILES.
# Only lines with at least three tab-separated fields are used: field 0 is the
# name and field 2 the SMILES. A repeated name keeps its last SMILES.
name_smiles = []
with open(METACYC_COMPOUND_LINKS, 'r') as f:
    for raw_line in f:
        fields = raw_line.split('\t')
        if len(fields) < 3:
            continue
        name_smiles.append((fields[0].strip().lower(), standardize_smiles(fields[2].strip())))

metacyc_name_to_smiles_dict = dict(name_smiles)
print (len(metacyc_name_to_smiles_dict))

In [None]:
# Add the compounds that only exist as MOL files. Names already present in the
# lookup keep their SMILES; files that cannot be read are reported.
for file_name in os.listdir(METACYC_MOL_FILES):
    compound = file_name.replace('.mol','').lower()
    parsed_mol = Chem.MolFromMolFile(os.path.join(METACYC_MOL_FILES, file_name))
    if not parsed_mol:
        print (compound, 'was not included')
        continue
    if compound in metacyc_name_to_smiles_dict:
        continue
    metacyc_name_to_smiles_dict[compound] = standardize_smiles(Chem.MolToSmiles(parsed_mol))
print (len(metacyc_name_to_smiles_dict))

## Parse MetaCyc Reactions

In [None]:
# Get Reactions
attribute_types = {'UNIQUE-ID':'string',
    'EC-NUMBER':'string',
   'ENZYMATIC-REACTION':'string',
   'GIBBS-0':'float',
   'IN-PATHWAY':'list',
   'LEFT':'list',
   'PHYSIOLOGICALLY-RELEVANT?':'string',
   'PREDECESSORS':'list',
   'REACTION-BALANCE-STATUS':'string',
   'REACTION-DIRECTION':'string',
   'RIGHT':'list',
   'RXN-LOCATIONS':'list',
   'SIGNAL':'list',
   'SPONTANEOUS?':'str',
   'STD-REDUCTION-POTENTIAL':'float',
   'SYNONYMS':'list',
   'SYSTEMATIC-NAME':'str'}

In [None]:
def _empty_metacyc_entry():
    """Return a blank record: [] for 'list' attributes, None for all others."""
    return {atr: ([] if kind == 'list' else None) for atr, kind in attribute_types.items()}


# Data lines look like 'ATTRIBUTE - value'. Raw string so the regex escapes are
# not read as (invalid) Python string escapes; compiled once, outside the loop.
attribute_patt = re.compile(r'^[A-Z\-0-9\?\^]+ (?:\- )')

# entry number -> {attribute: value or list of values}
metacyc_dict = {}
entry_num = 0
metacyc_dict[entry_num] = _empty_metacyc_entry()

with open(METACYC_REACTIONS, 'r') as f:
    for line in f:
        if line.startswith('#'):
            continue  # comment line
        if line.startswith('//'):
            # Record separator: open the next entry.
            # NOTE(review): a separator after the last record leaves one blank entry at the end.
            entry_num += 1
            metacyc_dict[entry_num] = _empty_metacyc_entry()
            continue

        attribute_match = attribute_patt.match(line)
        if attribute_match is None:
            print ('Could not find the attribute in line : {}'.format(line))
            continue

        attribute = attribute_match.group(0)[:-3]  # strip the trailing ' - '
        attribute_value = line[attribute_match.end():].strip()

        kind = attribute_types.get(attribute)
        if kind is None:
            continue  # attribute is not tracked
        if kind == 'list':
            metacyc_dict[entry_num][attribute].append(attribute_value)
        else:
            # single-valued attribute: a later line of the same record overwrites an earlier one
            metacyc_dict[entry_num][attribute] = attribute_value

In [None]:
# One row per entry number, one column per tracked attribute.
metacyc_raw_df = pd.DataFrame.from_dict(metacyc_dict, orient='index')
metacyc_raw_df

In [None]:
# Lower-cased compound names handed to make_cofactor_dict through its `ignore` argument.
IGNORE = [name.lower() for name in ('WATER', 'PROTON', 'CARBON-DIOXIDE', 'OXYGEN-MOLECULE', 'PPI')]

In [None]:
# Participants dropped from both sides of every reaction (compared in lower case).
to_remove = ['light', 'e-']

for side in ('LEFT', 'RIGHT'):
    metacyc_raw_df[side] = metacyc_raw_df[side].map(
        lambda names: [name for name in names if name.lower() not in to_remove])

# Lower-case equation of the form 'a + b = c + d', built from the two sides.
left_side = metacyc_raw_df['LEFT'].map(' + '.join)
right_side = metacyc_raw_df['RIGHT'].map(' + '.join)
metacyc_raw_df['reaction_str'] = (left_side + ' = ' + right_side).str.lower()

In [None]:
# Run make_cofactor_dict on the name-based MetaCyc equations in its "f" and "r"
# modes, skipping the IGNORE compounds, and merge the two results.
reaction_strings = metacyc_raw_df['reaction_str']
dic_f = make_cofactor_dict(np.array(reaction_strings[:]), 10, 0.9, "f", ignore=IGNORE)
dic_r = make_cofactor_dict(np.array(reaction_strings[:]), 10, 0.9, "r", ignore=IGNORE)
cofs = mergeDict(dic_f, dic_r)

# For each key keep only the partners that have no SMILES or whose SMILES
# contains '*', then drop the keys that are left without partners.
for key, partners in cofs.items():
    cofs[key] = [p for p in partners
                 if p not in metacyc_name_to_smiles_dict or '*' in metacyc_name_to_smiles_dict[p]]
cofs = {key: partners for key, partners in cofs.items() if len(partners)}

cofs

In [None]:
# Convert every MetaCyc equation with the name -> SMILES lookup, removing the
# cofactors listed in `cofs`.
parsed_reactions = [
    parse_chebi_reaction(equation, metacyc_name_to_smiles_dict, cof_dict=cofs, remove_cofs=True)
    for equation in metacyc_raw_df['reaction_str'].values
]

In [None]:
# Store the parsed reactions as a new column (same order as the rows).
metacyc_raw_df['reaction_smiles'] = parsed_reactions

In [None]:
# Keep only the reactions for which a non-empty reaction SMILES was produced.
metacyc_df = metacyc_raw_df[metacyc_raw_df['reaction_smiles'].map(lambda x: len(x) > 0)]
print (metacyc_df['REACTION-DIRECTION'].unique())

# .copy(): rev_df is edited below, so it must be an independent frame rather
# than a filtered slice of metacyc_df (avoids pandas' SettingWithCopyWarning).
rev_df = metacyc_df[metacyc_df['REACTION-DIRECTION']=='REVERSIBLE'].copy()
# Reactions written right-to-left; pd.concat already returns a new frame.
flipped_df = pd.concat([metacyc_df[metacyc_df['REACTION-DIRECTION']=='PHYSIOL-RIGHT-TO-LEFT'],
                        metacyc_df[metacyc_df['REACTION-DIRECTION']=='RIGHT-TO-LEFT']])

rev_df['reaction_smiles'] = rev_df['reaction_smiles'].map(flip_reaction)
flipped_df['reaction_smiles'] = flipped_df['reaction_smiles'].map(flip_reaction)

# Right-to-left rows are replaced by their flipped version; reversible rows are
# kept as written and additionally appended in flipped form.
metacyc_df = metacyc_df.drop(index=flipped_df.index)
# reset_index() keeps the previous row labels in an 'index' column.
metacyc_df = pd.concat([metacyc_df, rev_df, flipped_df]).reset_index()

In [None]:
# Save the MetaCyc reactions as a tab-separated file with the date prefix in its name, then report the path.
metacyc_df.to_csv('metacyc/{}_metacyc_reaction_smiles_no_cofs.csv'.format(date), sep='\t', index=False)
print ('Saved to metacyc/{}_metacyc_reaction_smiles_no_cofs.csv'.format(date))

In [None]:
# Reactions whose IN-PATHWAY list contains pathway PWY-7040.
# (`show_metacyc_df` was never defined; the frame being filtered is metacyc_df.)
metacyc_df[metacyc_df['IN-PATHWAY'].map(lambda x: 'PWY-7040' in x)]

In [None]:
# Inspect the reaction SMILES of a single row.
# NOTE(review): 8815 is a hard-coded row label and must exist in metacyc_df.
metacyc_df.loc[8815, 'reaction_smiles']