# Part 1: Parsing of MNX metabolites

In [1]:
%%bash
# Print the current working directory (presumably the base for the relative paths used in the cells below; confirm).
pwd

/home/jovyan/work/src/generated


In [2]:
import pandas as pnd
import pickle 
import copy
import os

In [3]:
# Column names of chem_prop.tsv, in file order (the file is read without a header row).
header_chem_prop = ['ID', 'name', 'reference', 'formula', 'charge', 'mass', 'InChI', 'InChIKey', 'SMILES']
chem_prop_path = 'metanetx_4_4/chem_prop.tsv'

# Locate the '#'-prefixed header/comment lines ourselves instead of passing
# comment='#' to read_csv: pandas applies `comment` to the whole line, so any
# '#' occurring inside the data (e.g. the triple bond in a SMILES such as 'C#N')
# would silently truncate the rest of that line.
with open(chem_prop_path, encoding='utf-8') as handle:
    comment_rows = [i for i, line in enumerate(handle) if line.startswith('#')]

# Index the table by MNX ID; verify_integrity=True raises if an ID is duplicated.
chem_prop = pnd.read_csv(
    chem_prop_path,
    sep='\t',
    header=None,
    names=header_chem_prop,
    skiprows=comment_rows,
).set_index('ID', drop=True, verify_integrity=True)

In [4]:
# Column names of chem_xref.tsv, in file order (the file is read without a header row).
header_chem_xref = ['source', 'ID', 'description']
chem_xref = pnd.read_csv(
    'metanetx_4_4/chem_xref.tsv',
    sep='\t',
    comment='#',
    header=None,
    names=header_chem_xref,
)
# No index is set here: this table is only scanned row by row in the cells below.

In [5]:
# Make sure the folder receiving the pickled dictionaries exists
# (exist_ok=True keeps the cell re-runnable).
os.makedirs(name='mnx_dicts_M/', exist_ok=True)

## mnx_to_something dict creation

In [6]:
def create_mnx_to_something_dict(chem_xref, key):
    """Map each MNX ID to the set of its identifiers in one external database.

    Parameters
    ----------
    chem_xref : pandas.DataFrame
        Cross-reference table with at least the columns 'source' (prefixed
        external identifier, e.g. 'kegg.compound:C00031') and 'ID' (MNX ID).
    key : str
        Prefix selecting the external database, e.g. 'kegg.compound:'.

    Returns
    -------
    dict
        ``{mnx_id: set_of_external_ids}`` with the prefix stripped from the
        external identifiers. MNX IDs without a matching row are absent.

    Side effects
    ------------
    Pickles the dictionary to 'mnx_dicts_M/<key without trailing colon>.pickle';
    the 'mnx_dicts_M/' directory must already exist.
    """
    mnx_to_something = {}

    # Walk only the two columns that are needed: iterrows() would build a
    # full Series for every row, which is needlessly slow on a large table.
    for source, mnx_id in zip(chem_xref['source'], chem_xref['ID']):
        # A missing 'source' is read as NaN (a float) and cannot match any prefix.
        if not isinstance(source, str) or not source.startswith(key):
            continue
        something_id = source[len(key):]  # identifier inside the specific database
        mnx_to_something.setdefault(mnx_id, set()).add(something_id)

    # File name is the prefix without its trailing ':' (only stripped when present).
    name = key[:-1] if key.endswith(':') else key
    with open(f'mnx_dicts_M/{name}.pickle', 'wb') as handle:
        pickle.dump(mnx_to_something, handle)

    return mnx_to_something


In [7]:

# One dictionary (and one pickle on disk) per external database; `key` is the
# prefix that database uses in the 'source' column of chem_xref.

# KEGG
mnx_to_kegg_compound = create_mnx_to_something_dict(chem_xref, key='kegg.compound:')
mnx_to_kegg_drug = create_mnx_to_something_dict(chem_xref, key='kegg.drug:')
mnx_to_kegg_glycan = create_mnx_to_something_dict(chem_xref, key='kegg.glycan:')

# other metabolite databases
mnx_to_metacyc = create_mnx_to_something_dict(chem_xref, key='metacyc.compound:')
mnx_to_hmdb = create_mnx_to_something_dict(chem_xref, key='hmdb:')
mnx_to_bigg = create_mnx_to_something_dict(chem_xref, key='bigg.metabolite:')
mnx_to_seed = create_mnx_to_something_dict(chem_xref, key='seed.compound:')
mnx_to_chebi = create_mnx_to_something_dict(chem_xref, key='chebi:')
mnx_to_sabiork = create_mnx_to_something_dict(chem_xref, key='sabiork.compound:')
mnx_to_lipidmaps = create_mnx_to_something_dict(chem_xref, key='lipidmaps:')
mnx_to_envipath = create_mnx_to_something_dict(chem_xref, key='envipath:')
mnx_to_reactome = create_mnx_to_something_dict(chem_xref, key='reactome:')

# Rhea (two distinct prefixes) and SwissLipids
mnx_to_rhea_g = create_mnx_to_something_dict(chem_xref, key='rheaG:')
mnx_to_rhea_p = create_mnx_to_something_dict(chem_xref, key='rheaP:')
mnx_to_swisslipids = create_mnx_to_something_dict(chem_xref, key='slm:')


In [8]:
# create auxiliary dictionaries for the attributes recorded in chem_prop
def create_aux_dict(chem_prop, key):
    """Map each MNX ID to the set holding its value in one chem_prop column.

    Parameters
    ----------
    chem_prop : pandas.DataFrame
        Property table whose index holds the MNX IDs.
    key : str
        Name of the column to extract (e.g. 'InChI', 'InChIKey', 'SMILES').

    Returns
    -------
    dict
        ``{mnx_id: set_of_values}``. Every MNX ID in the index gets an entry;
        the set stays empty when the cell is not a string (e.g. NaN).

    Raises
    ------
    KeyError
        If `key` is not a column of `chem_prop`.

    Side effects
    ------------
    Pickles the dictionary to 'mnx_dicts_M/<key lower-cased>.pickle';
    the 'mnx_dicts_M/' directory must already exist.
    """
    mnx_to_something = {}

    # Only one column is needed, so iterate that Series directly instead of
    # materialising every full row with iterrows().
    for mnx_id, something in chem_prop[key].items():
        annotations = mnx_to_something.setdefault(mnx_id, set())
        # Missing values are read as NaN (a float): keep the entry, but empty.
        if isinstance(something, str):
            annotations.add(something)

    # write to disk
    name = key.lower()
    with open(f'mnx_dicts_M/{name}.pickle', 'wb') as handle:
        pickle.dump(mnx_to_something, handle)

    return mnx_to_something


In [9]:
# Structural descriptors stored as columns of chem_prop (one pickle each).
mnx_to_inchi = create_aux_dict(chem_prop, key='InChI')
mnx_to_inchikey = create_aux_dict(chem_prop, key='InChIKey')
mnx_to_smiles = create_aux_dict(chem_prop, key='SMILES')

In [10]:
# keys pointing to themselves, so that the MNX ID itself is also available as an annotation
def create_mnx_to_mnx_dict(chem_prop): 
    """Build the identity mapping ``{mnx_id: {mnx_id}}`` for every index label.

    `chem_prop` is a DataFrame whose index holds the MNX IDs. The result is
    pickled to 'mnx_dicts_M/mnx_to_mnx.pickle' (the directory must already
    exist) and returned.
    """
    # Each MNX ID maps to a one-element set containing itself.
    identity = {label: {label} for label in chem_prop.index}

    # Persist the mapping next to the other dictionaries.
    with open(f'mnx_dicts_M/mnx_to_mnx.pickle', 'wb') as out_file:
        pickle.dump(identity, out_file)

    return identity

In [11]:
# Identity mapping over all MNX IDs in chem_prop (also pickled to disk).
mnx_to_mnx = create_mnx_to_mnx_dict(chem_prop=chem_prop)

## crossrefs creation

In [12]:
# Registry of every per-database dictionary built above, keyed by the label
# under which its annotations are reported.
crossrefs = {
    # KEGG
    'kegg.compound': mnx_to_kegg_compound,
    'kegg.drug': mnx_to_kegg_drug,
    'kegg.glycan': mnx_to_kegg_glycan,
    # note: the 'metacyc.compound:' cross-references are stored under 'biocyc'
    'biocyc': mnx_to_metacyc,
    'hmdb': mnx_to_hmdb,
    'bigg.metabolite': mnx_to_bigg,
    'seed.compound': mnx_to_seed,
    'chebi': mnx_to_chebi,
    'sabiork': mnx_to_sabiork,
    'lipidmaps': mnx_to_lipidmaps,
    'envipath': mnx_to_envipath,
    'reactome': mnx_to_reactome,
    'rhea_g': mnx_to_rhea_g,
    'rhea_p': mnx_to_rhea_p,
    'swisslipids': mnx_to_swisslipids,
    # attributes taken from chem_prop
    'inchi': mnx_to_inchi,
    'inchikey': mnx_to_inchikey,
    'smiles': mnx_to_smiles,
    # the MNX ID itself
    'metanetx.chemical': mnx_to_mnx,
}

# Persist the whole registry as a single pickle.
with open(f'mnx_dicts_M/crossrefs.pickle', mode='wb') as crossrefs_file:
    pickle.dump(crossrefs, crossrefs_file)

# Part 2: bigg/seed to others dict creation

In [13]:

# Reload the registry written at the end of Part 1.
with open(f'mnx_dicts_M/crossrefs.pickle', mode='rb') as crossrefs_file:
    crossrefs = pickle.load(crossrefs_file)

In [14]:
def create_something_to_others_dict(chem_xref, key, crossrefs, name,  ):
    """Translate the IDs of one database into the annotations of all databases.

    Parameters
    ----------
    chem_xref : pandas.DataFrame
        Cross-reference table with at least the columns 'source' (prefixed
        external identifier, e.g. 'bigg.metabolite:glc__D') and 'ID' (MNX ID).
    key : str
        Prefix selecting the starting database, e.g. 'bigg.metabolite:'.
    crossrefs : dict
        ``{database_label: {mnx_id: set_of_annotations}}``.
    name : str
        Base name of the pickle written to 'mnx_dicts_M/<name>.pickle'.

    Returns
    -------
    dict
        ``{external_id: {database_label: set_of_annotations}}``. Every
        external ID carries one (possibly empty) set per label in `crossrefs`.
        When an external ID appears on several rows (i.e. is linked to several
        MNX IDs), the annotations of all those MNX IDs are merged.

    Side effects
    ------------
    Pickles the dictionary; the 'mnx_dicts_M/' directory must already exist.
    """
    something_to_others = {}

    for source, mnx_id in zip(chem_xref['source'], chem_xref['ID']):
        # A missing 'source' is read as NaN (a float) and cannot match any prefix.
        if not isinstance(source, str) or not source.startswith(key):
            continue
        something_id = source[len(key):]
        per_database = something_to_others.setdefault(something_id, {})

        for crossref, mnx_to_annots in crossrefs.items():
            # setdefault (rather than assigning a fresh set) keeps what earlier
            # rows for the same external ID already collected; .get() covers
            # MNX IDs that have no annotation in this database.
            per_database.setdefault(crossref, set()).update(mnx_to_annots.get(mnx_id, ()))

    # write to disk
    with open(f'mnx_dicts_M/{name}.pickle', 'wb') as handle:
        pickle.dump(something_to_others, handle)

    return something_to_others

In [15]:
# BiGG metabolite IDs -> annotations in every database of `crossrefs`
bigg_to_others = create_something_to_others_dict(
    chem_xref, key='bigg.metabolite:', crossrefs=crossrefs, name='bigg_to_others',
)

# SEED compound IDs -> annotations in every database of `crossrefs`
seed_to_others = create_something_to_others_dict(
    chem_xref, key='seed.compound:', crossrefs=crossrefs, name='seed_to_others',
)


## testing area

In [16]:
# Spot-check: show every annotation collected for the BiGG metabolite ID 'glc__D'.
bigg_to_others['glc__D']

{'kegg.compound': {'C00031'},
 'kegg.drug': {'D00009'},
 'kegg.glycan': set(),
 'biocyc': {'Glucopyranose',
  'Hederagenin-Monoglucosides',
  'Soyasapogenol-B-Monoglucosides',
  'Soyasapogenol-E-Monoglucosides'},
 'hmdb': {'HMDB0000122',
  'HMDB0000516',
  'HMDB0003340',
  'HMDB0006564',
  'HMDB00122',
  'HMDB00516',
  'HMDB0062170',
  'HMDB03340',
  'HMDB06564',
  'HMDB62170'},
 'bigg.metabolite': {'glc__D'},
 'seed.compound': {'cpd00027'},
 'chebi': {'4167'},
 'sabiork': {'1406', '1407'},
 'lipidmaps': set(),
 'envipath': set(),
 'reactome': set(),
 'rhea_g': set(),
 'rhea_p': set(),
 'swisslipids': set(),
 'inchi': {'InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1'},
 'inchikey': {'InChIKey=WQZGKKKJIJFFOK-GASJEMHNSA-N'},
 'smiles': {'OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O'},
 'metanetx.chemical': {'MNXM1137670'}}

In [18]:
# Spot-check: show every annotation collected for the SEED compound ID 'cpd00027'.
seed_to_others['cpd00027']

{'kegg.compound': {'C00031'},
 'kegg.drug': {'D00009'},
 'kegg.glycan': set(),
 'biocyc': {'Glucopyranose',
  'Hederagenin-Monoglucosides',
  'Soyasapogenol-B-Monoglucosides',
  'Soyasapogenol-E-Monoglucosides'},
 'hmdb': {'HMDB0000122',
  'HMDB0000516',
  'HMDB0003340',
  'HMDB0006564',
  'HMDB00122',
  'HMDB00516',
  'HMDB0062170',
  'HMDB03340',
  'HMDB06564',
  'HMDB62170'},
 'bigg.metabolite': {'glc__D'},
 'seed.compound': {'cpd00027'},
 'chebi': {'4167'},
 'sabiork': {'1406', '1407'},
 'lipidmaps': set(),
 'envipath': set(),
 'reactome': set(),
 'rhea_g': set(),
 'rhea_p': set(),
 'swisslipids': set(),
 'inchi': {'InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-11H,1H2/t2-,3-,4+,5-,6?/m1/s1'},
 'inchikey': {'InChIKey=WQZGKKKJIJFFOK-GASJEMHNSA-N'},
 'smiles': {'OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O'},
 'metanetx.chemical': {'MNXM1137670'}}