In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import networkx as nx
import re
import numpy as np
import pdb
import pickle
from parsing_utils import *

In [None]:
from datetime import date

# Build the date prefix used by the (commented-out) save cells further down.
# NOTE: `date` is deliberately rebound from the datetime class to the formatted
# string here, because the save cells call `.format(date)` with that string.
today = date.today()
date = f"{today:%d%b%Y}"
print ('Date prefix:', date)

## Parse Molecules

In [None]:
# Input locations (all relative to the notebook's working directory).
# EXISTING_GRAPH_PATH: pickled graph that is loaded further down when this value is truthy.
EXISTING_GRAPH_PATH = "../../../b-o-t/19Sep2022_updated_whole_metabolic_network_labeled.pkl"
# RHEA_SDF: SD file read with Chem.SDMolSupplier.
RHEA_SDF = "rhea/rhea.sdf"
# RHEA_REACTIONS: text file scanned for ENTRY / DEFINITION / EQUATION lines.
RHEA_REACTIONS = "rhea/rhea-reactions.txt"
# CHEBI_NAMES_IDS: header-less, tab-separated file read with pandas.
CHEBI_NAMES_IDS = "rhea/chebiId_name.tsv"

In [None]:
# Walk the Rhea SD file once and collect:
#   chebi_to_smiles - ACCESSION property -> SMILES, for every readable record
#   rhea_smiles     - standardized SMILES of records without a '*' wildcard
#   rhea_names      - 'Rhea_ascii_name' of those same records (parallel list)
rhea = Chem.SDMolSupplier(RHEA_SDF)
chebi_to_smiles = {}
rhea_smiles, rhea_names = [], []
for mol in rhea:
    if not mol:
        # falsy entries are skipped (presumably records RDKit could not parse)
        continue
    mol_smiles = Chem.MolToSmiles(mol)
    chebi_to_smiles[mol.GetProp('ACCESSION')] = mol_smiles
    if '*' in mol_smiles:
        # wildcard structures stay in the lookup table but not in the molecule list
        continue
    rhea_smiles.append(standardize_smiles(mol_smiles))
    rhea_names.append(mol.GetProp('Rhea_ascii_name'))
assert len(rhea_smiles) == len(rhea_names)

print ("{} molecules in rhea".format(len(rhea_smiles)))

In [None]:
# Standardized SMILES of the molecule nodes already present in the existing network.
# Defaults to an empty list so the later cells still run (treating every molecule
# as new) when no existing graph is configured.
smiles_in_graph = []
if EXISTING_GRAPH_PATH:
    # SECURITY: pickle.load can execute arbitrary code; only point
    # EXISTING_GRAPH_PATH at a file you trust.
    with open(EXISTING_GRAPH_PATH, 'rb') as f:
        g = pickle.load(f)
    # node labels containing '>' are skipped (presumably reaction nodes - confirm)
    smiles_in_graph = [standardize_smiles(n) for n in g.nodes if '>' not in n]

In [None]:
# Number of distinct Rhea SMILES that are absent from the existing graph.
print (len(set(rhea_smiles).difference(smiles_in_graph)))
# set(rhea_smiles).difference(smiles_in_graph)

In [None]:
# Map each standardized SMILES to its Rhea name (for a repeated SMILES the last name wins),
# then build two parallel lists for the molecules that the existing graph lacks.
# The order follows set iteration and is therefore arbitrary.
rhea_smiles_to_name = dict(zip(rhea_smiles, rhea_names))
smiles_not_in_graph = list(set(rhea_smiles).difference(smiles_in_graph))
names_not_in_graph = [clean_name(rhea_smiles_to_name[smi]) for smi in smiles_not_in_graph]

In [None]:
#### Uncomment to save

# pd.DataFrame({'name':["'"+x+"'" for x in names_not_in_graph],
#               'smiles':["'"+x+"'" for x in smiles_not_in_graph]}).to_csv('rhea/{}_molecules_from_rhea_cleaned_names.csv'.format(date), header=False, sep='\t', index=False)

## Parse Reactions

In [None]:
# Scan the Rhea reaction file and collect, for every record, its ENTRY id,
# its DEFINITION line and its EQUATION line. The payload of each line starts
# at column 12. The three lists are filled independently and combined
# position-by-position later, so they must end up with the same length.
rhea_reaction_ids = []
rhea_string_reactions = []
rhea_chebi_reactions = []
with open(RHEA_REACTIONS, 'r') as f:
    # iterate the handle directly instead of materialising every line first
    for line in f:
        if line.startswith('DEFINITION'):
            rhea_string_reactions.append(line[12:].strip())
        elif line.startswith('EQUATION'):
            rhea_chebi_reactions.append(line[12:].strip())
        elif line.startswith('ENTRY'):
            rhea_reaction_ids.append(line[12:].strip())

# Fail early and clearly if some record lacks one of the three fields;
# otherwise ids, definitions and equations would be paired with the wrong rows.
if not (len(rhea_reaction_ids) == len(rhea_string_reactions) == len(rhea_chebi_reactions)):
    raise ValueError(
        "Misaligned records in {}: {} ENTRY, {} DEFINITION, {} EQUATION lines".format(
            RHEA_REACTIONS,
            len(rhea_reaction_ids),
            len(rhea_string_reactions),
            len(rhea_chebi_reactions),
        )
    )

In [None]:
# Number of DEFINITION lines collected from RHEA_REACTIONS.
len(rhea_string_reactions)

In [None]:
# One row per Rhea record: id, textual definition and ChEBI-id equation.
rhea_reaction_df = pd.DataFrame(
    dict(
        ID=rhea_reaction_ids,
        reaction_string=rhea_string_reactions,
        reaction_chebi=rhea_chebi_reactions,
    )
)
rhea_reaction_df

In [None]:
# Translate every ChEBI-id equation with parse_chebi_reaction and the
# ChEBI -> SMILES table built from the SD file.
parsed_reactions = [
    parse_chebi_reaction(chebi_equation, chebi_to_smiles)
    for chebi_equation in rhea_chebi_reactions
]

In [None]:
# Attach the parsed reactions as a new column (same order as rhea_chebi_reactions).
rhea_reaction_df['reaction_smiles'] = parsed_reactions

In [None]:
# Build one cofactor dictionary per direction flag ("f" and "r") from the ChEBI
# equations, then merge them. Each call receives its own fresh array.
# NOTE(review): the meaning of the 10 and 0.9 arguments is defined in parsing_utils - confirm.
reaction_chebi_col = rhea_reaction_df['reaction_chebi']
dic_f, dic_r = (
    make_cofactor_dict(np.array(reaction_chebi_col[:]), 10, 0.9, direction)
    for direction in ("f", "r")
)

cofs = mergeDict(dic_f, dic_r)

In [None]:
# Same parsing as before, this time asking parse_chebi_reaction to remove
# the cofactors listed in `cofs`.
parsed_reactions_no_cofs = [
    parse_chebi_reaction(chebi_equation, chebi_to_smiles, cof_dict=cofs, remove_cofs=True)
    for chebi_equation in rhea_chebi_reactions
]

In [None]:
# Attach the cofactor-free reactions as a new column (same order as rhea_chebi_reactions).
rhea_reaction_df['reaction_smiles_no_cofs'] = parsed_reactions_no_cofs

In [None]:
# Collapse rows that share the same cofactor-free reaction. Every other column
# becomes the list of its unique values within the group.
grouped_df = rhea_reaction_df.groupby('reaction_smiles_no_cofs').agg(
    lambda column: list(np.unique(column))
)
# Bring the grouping key back as an ordinary column.
grouped_df['reaction_smiles_no_cofs'] = grouped_df.index
# Keep a single representative (the first unique value) for these columns;
# the remaining column(s), e.g. 'ID', stay as lists.
for representative_col in ('reaction_smiles', 'reaction_chebi', 'reaction_string'):
    grouped_df[representative_col] = grouped_df[representative_col].map(lambda values: values[0])
grouped_df.index = range(len(grouped_df))

In [None]:
# Show only the grouped reactions whose SMILES contain no '*' wildcard.
has_no_wildcard = grouped_df['reaction_smiles'].map(lambda rxn_smiles: '*' not in rxn_smiles)
grouped_df[has_no_wildcard]

In [None]:
# grouped_df[grouped_df['reaction_smiles'].map(lambda x : '*' not in x)].to_csv('rhea/{}_rhea_reaction_smiles_no_cofs.csv'.format(date), sep='\t', index=False)

In [None]:
# Sanity check: re-parse one grouped reaction without cofactors and let RDKit
# build a reaction object from it.
# NOTE: 23474 is a hard-coded row label of grouped_df and must exist in the current data.
rxn = grouped_df.loc[23474, 'reaction_chebi']
print (rxn)
rxn_smiles_no_cofs = parse_chebi_reaction(rxn, chebi_to_smiles, cof_dict=cofs, remove_cofs=True)
AllChem.ReactionFromSmarts(rxn_smiles_no_cofs, useSmiles=True)

## Sanity check on cofactors

In [None]:
# Load the header-less ChEBI table and trim surrounding whitespace in its
# first two columns (column 0 is used as the lookup key below, column 1 as the value).
chebi_df = pd.read_csv(CHEBI_NAMES_IDS, sep='\t', header=None)
for column_label in (0, 1):
    chebi_df[column_label] = chebi_df[column_label].map(lambda cell: cell.strip())

In [None]:
# Lookup table: column 0 of chebi_df -> column 1 of chebi_df.
chebi_to_name_dict = pd.Series(chebi_df[1].values, index=chebi_df[0].values).to_dict()

# Print every entry of `cofs` with ids replaced by names; entries that contain
# an id missing from the lookup table are reported instead.
for cof_id in list(cofs.keys()):
    try:
        cof_L = chebi_to_name_dict[cof_id]
        cof_R = [chebi_to_name_dict[paired_id] for paired_id in cofs[cof_id]]
    except KeyError:
        print ("Can't convert {}".format(cof_id))
    else:
        print ("{} : {}".format(cof_L, cof_R))
        
# for v in list(dic_f.values()) + list(dic_r.values()):
#     for entry in v:
#         try:
#             print (chebi_to_name_dict[entry])
#         except KeyError:
#             print ("Can't convert {}".format(entry))

In [None]:
# Scratch comparison of two literal reaction strings. The operands differ: the
# right-hand one starts with 'hebi' (no leading 'c') and uses '=' / '=>' where
# the left-hand one uses '<=>' / '=', so this expression evaluates to False.
'chebi:67210 + 28 chebi:66915 <=> chebi:67212 + 28 chebi:15378 + 28 chebi:58223;chebi:67210 + 28 chebi:66915 = chebi:67212 + 28 chebi:15378 + 28 chebi:58223' == 'hebi:67210 + 28 chebi:66915 = chebi:67212 + 28 chebi:15378 + 28 chebi:58223;chebi:67210 + 28 chebi:66915 => chebi:67212 + 28 chebi:15378 + 28 chebi:58223'

In [None]:
# Display the representative ChEBI equation of every grouped reaction as an array.
np.array(grouped_df['reaction_chebi'][:])

## Check Metacyc

In [None]:
# MetaCyc input locations (relative paths).
# METACYC_REACTIONS: header-less, tab-separated file; column 1 holds the reaction SMILES.
METACYC_REACTIONS = "metacyc/atom-mappings-smiles.dat"
# METACYC_COMPOUND_LINKS: tab-separated text file; fields 0 and 2 of each line are used below.
METACYC_COMPOUND_LINKS = "metacyc/compound-links.dat"

In [None]:
# Load the MetaCyc reactions; without a header the columns are labelled 0, 1, ...
metacyc_reactions = pd.read_csv(METACYC_REACTIONS, sep='\t', header=None)

In [None]:
# Collect the standardized, atom-map-free SMILES of every molecule that appears
# in a MetaCyc reaction. Fragments containing 'R' or a space are skipped, as are
# fragments RDKit cannot turn into a molecule.
metacyc_smiles = set()

for reaction in metacyc_reactions[1].values:
    try:
        # Unpacking into exactly two sides raises ValueError when the string does
        # not contain exactly one '>>'; such reactions are reported and skipped.
        reactants, products = reaction.split('>>')

        for mapped_smiles in reactants.split('.') + products.split('.'):
            if 'R' in mapped_smiles or ' ' in mapped_smiles:
                continue
            mol = Chem.MolFromSmiles(mapped_smiles)
            if not mol:
                continue
            # drop the atom-map numbers before writing the SMILES back out
            for atom in mol.GetAtoms():
                atom.SetAtomMapNum(0)
            metacyc_smiles.add(standardize_smiles(Chem.MolToSmiles(mol)))

    except ValueError:
        print("Couldn't parse {}".format(reaction))

In [None]:
# Number of distinct MetaCyc molecules. A bare expression is only displayed when
# it is the last line of a cell, so it is printed explicitly here.
print (len(metacyc_smiles))
# Number of distinct Rhea molecules that are absent from the existing graph.
print (len(set(rhea_smiles) - set(smiles_in_graph)))

In [None]:
# MetaCyc molecules found neither in the existing graph nor in Rhea.
metacyc_not_in_net = metacyc_smiles.difference(smiles_in_graph, rhea_smiles)
print (len(metacyc_not_in_net))

In [None]:
# Read every line of the compound-links file (line endings kept) into `a`.
with open(METACYC_COMPOUND_LINKS, 'r') as links_file:
    a = list(links_file)

In [None]:
# Break each raw line into its tab-separated fields.
split_lines = [raw_line.split('\t') for raw_line in a]


In [None]:
# Peek at the fields of the first line to check the column layout.
split_lines[0]

In [None]:
# (standardized field 2, stripped field 0) for every line of the compound-links file.
# NOTE(review): field 2 is treated as a SMILES string and field 0 as the compound name - confirm.
smiles_name = [
    (standardize_smiles(fields[2].strip()), fields[0].strip())
    for fields in split_lines
]

In [None]:
# NOTE: despite its name, this dict is keyed by the FIRST tuple element of
# smiles_name (the standardized SMILES) and maps to the second one (the name);
# for repeated keys the last pair wins.
metacyc_name_to_smiles_dict = dict(smiles_name)

In [None]:
# Give every new MetaCyc molecule a cleaned name: the one from the
# compound-links table when available, otherwise the SMILES itself.
# `count` tallies how often the SMILES fallback had to be used.
count = 0
metacyc_names = []
for smi in metacyc_not_in_net:
    has_known_name = smi in metacyc_name_to_smiles_dict
    label_source = metacyc_name_to_smiles_dict[smi] if has_known_name else smi
    metacyc_names.append(clean_name(label_source))
    if not has_known_name:
        count += 1

In [None]:
# Fraction of new MetaCyc molecules that had no name in the compound-links table.
# Raises ZeroDivisionError when metacyc_names is empty.
count / len(metacyc_names)

In [None]:
#### Uncomment to save

# pd.DataFrame({'name':["'"+x+"'" for x in metacyc_names],
#               'smiles':["'"+x+"'" for x in metacyc_not_in_net]}).to_csv('metacyc/{}_molecules_from_metacyc_cleaned_names.csv'.format(date), header=False, sep='\t', index=False)

In [None]:
# Show the reactions whose SMILES contain more than one '>>' separator.
# str.count gives the same non-overlapping count as the former regex, without
# the invalid '\>' escape sequence in a non-raw string literal.
metacyc_reactions[metacyc_reactions[1].map(lambda x : x.count('>>')) > 1]

In [None]:
# Keep only reactions whose SMILES contain no space, no 'R' and no '&'.
# `.copy()` makes this an independent frame, so the column added in a later
# cell does not trigger SettingWithCopyWarning on a slice of metacyc_reactions.
is_nongeneric = metacyc_reactions[1].map(lambda x : ' ' not in x and 'R' not in x and '&' not in x)
nongeneric_reactions = metacyc_reactions[is_nongeneric].copy()

In [None]:
def unmap(smiles):
    """Return `smiles` with its atom-map numbers removed.

    Strings that contain neither 'R' nor a space are parsed with RDKit, every
    atom's map number is reset to 0 and the molecule is written back as SMILES.
    Strings that contain 'R' or a space, or that RDKit cannot parse, fall back
    to a textual removal of every ':<digits>' occurrence.

    Parameters
    ----------
    smiles : str
        A (possibly atom-mapped) SMILES string.

    Returns
    -------
    str
        The SMILES without atom-map numbers.
    """
    if 'R' not in smiles and ' ' not in smiles:
        m = Chem.MolFromSmiles(smiles)
        if m:
            for a in m.GetAtoms():
                a.SetAtomMapNum(0)
            return Chem.MolToSmiles(m)
    # Raw string: the former '\:' was an invalid escape sequence in a normal
    # string literal; ':' needs no escaping in a regular expression.
    return re.sub(r':[0-9]+', '', smiles)

In [None]:
# Investigate why some MetaCyc reactions contain more than one '>>':
# for every non-generic reaction, everything after the first '>>' must unmap
# to exactly one distinct string, i.e. the extra segments only repeat the product side.
check = nongeneric_reactions[1].map(
    lambda rxn_smiles: len(set([unmap(part.strip()) for part in rxn_smiles.split('>>')][1:])) == 1
)

assert check.all()

In [None]:
# Rebuild each non-generic reaction as "<reactants>>><products>": unmap and
# standardize every '>>'-separated segment, then keep only the first two.
single_product_reactions = []
for raw_reaction in nongeneric_reactions[1]:
    standardized_sides = [
        standardize_smiles(unmap(side.strip())) for side in raw_reaction.split('>>')
    ]
    single_product_reactions.append('>>'.join(standardized_sides[:2]))

In [None]:
# Display the rebuilt reactions (the whole list is rendered as cell output).
single_product_reactions

In [None]:
# Add the rebuilt reactions as a new column. `assign` returns a new frame, which
# avoids SettingWithCopyWarning when nongeneric_reactions is a filtered slice
# and keeps the cell safe to re-run.
nongeneric_reactions = nongeneric_reactions.assign(standardized_smiles=single_product_reactions)

In [None]:
# Spot check: show the row(s) whose column 0 equals this reaction ID.
nongeneric_reactions[nongeneric_reactions[0]=='+-BORNEOL-DEHYDROGENASE-RXN']

In [None]:
## UNCOMMENT TO SAVE

# nongeneric_reactions.rename(columns={0:'Metacyc_ID',1:'raw_smiles'}).to_csv('metacyc/{}_metacyc_reaction_smiles.csv'.format(date))