In [74]:
# Read the HMDB database
"""
Useful lxml concepts:
    - event: signals either the opening ('start') or the closing ('end') of an XML tag
    - element: the parsed node itself, i.e. the 'content' of the XML
    - .tag gives the tag of the element, e.g. <xyz></xyz>
    - .find returns the first occurrence of a particular tag, e.g. <name></name>
    - .findtext returns the text content of a particular tag, e.g. 'xyz' for <name>xyz</name>
    - .findall returns all occurrences of a particular tag
    
"""

def _hmdb_descendants(node, ns):
    """Return the <descendant> children of ``node``'s first <descendants> child, or [] if it has none."""
    holder = node.find(f"{ns}descendants")
    if holder is None:
        return []
    return holder.findall(f"{ns}descendant")


def _hmdb_descendant_terms(node, ns):
    """Return the <term> text of every direct ontology descendant of ``node``."""
    return [child.findtext(f"{ns}term") for child in _hmdb_descendants(node, ns)]


def _hmdb_ontology_fields(metabolite, ns):
    """Collect the ontology-derived fields of one <metabolite> element.

    Returns a dict that may contain 'biological_process', 'related_disease'
    and 'short_term_effects'; a key is only present when the matching
    ontology branch exists in the record.
    """
    fields = {}
    ontology = metabolite.find(f"{ns}ontology")
    roots = [] if ontology is None else ontology.findall(f"{ns}root")

    for root in roots:
        for branch in _hmdb_descendants(root, ns):
            branch_term = branch.findtext(f"{ns}term")

            if branch_term == 'Naturally occurring process':
                children = _hmdb_descendants(branch, ns)
                if children:
                    # Accumulate over all children so that a sibling listed
                    # after 'Biological process' cannot wipe the result.
                    fields['biological_process'] = [
                        leaf.findtext(f"{ns}term")
                        for child in children
                        if child.findtext(f"{ns}term") == 'Biological process'
                        for group in _hmdb_descendants(child, ns)
                        for leaf in _hmdb_descendants(group, ns)
                    ]

            elif branch_term == 'Health effect':
                diseases = []
                effects = []
                for child in _hmdb_descendants(branch, ns):
                    child_term = child.findtext(f"{ns}term")
                    if child_term == 'Health condition':
                        diseases.extend(_hmdb_descendant_terms(child, ns))
                    elif child_term == 'Observation':
                        effects.extend(_hmdb_descendant_terms(child, ns))
                fields['related_disease'] = diseases
                fields['short_term_effects'] = effects

    return fields


def hmdb_to_dict(directory, output_dir):
    """Parse an HMDB metabolite XML file into a dict and dump it as JSON.

    Parameters
    ----------
    directory : path or file object
        The HMDB XML source handed to ``ET.iterparse`` (a file, despite the name).
    output_dir : str
        Output path without extension; the result is written to
        ``f'{output_dir}.json'``.

    Returns
    -------
    dict
        Maps each metabolite's <accession> text to a dict with the keys
        'name', 'neutral_mass', 'tax_kingdom', 'tax_superclass', 'tax_class',
        'tax_subclass', 'tax_direct_parent', 'synonyms' and, when the
        corresponding ontology branch exists, 'biological_process',
        'related_disease' and 'short_term_effects'.

    Notes
    -----
    - 'neutral_mass' is the tag text as a string, or 0.0 when the tag is
      missing or empty.
    - Each taxonomy field falls back to '' on its own, so one missing level
      no longer blanks the others.
    - Records without <synonyms>, <ontology> or nested <descendants> are
      handled instead of raising.
    """
    pf = "{http://www.hmdb.ca}"
    taxonomy_fields = (
        ('tax_kingdom', 'kingdom'),
        ('tax_superclass', 'super_class'),
        ('tax_class', 'class'),
        ('tax_subclass', 'sub_class'),
        ('tax_direct_parent', 'direct_parent'),
    )

    outdict = {}
    counter = 0

    # Only 'end' events are needed: an element is complete once its end tag is seen.
    for _, elem in ET.iterparse(directory, events=("end",)):
        if elem.tag != f"{pf}metabolite":
            continue

        counter += 1
        if counter % 10000 == 0:
            print(f"Processing {counter}-th mass")

        tmpdict = {}
        tmpdict['name'] = elem.findtext(f"{pf}name")

        # NOTE(review): 'monisotopic' is the spelling the original code queries;
        # presumably it mirrors the HMDB export's own tag name -- confirm before changing.
        # findtext gives None for a missing tag and '' for an empty one; both mean "no mass".
        tmpdict['neutral_mass'] = elem.findtext(f"{pf}monisotopic_molecular_weight") or 0.0

        taxonomy = elem.find(f"{pf}taxonomy")
        for key, tag in taxonomy_fields:
            value = None if taxonomy is None else taxonomy.findtext(f"{pf}{tag}")
            tmpdict[key] = value or ''

        synonyms = elem.find(f"{pf}synonyms")
        tmpdict['synonyms'] = [] if synonyms is None else [syn.text for syn in synonyms]

        tmpdict.update(_hmdb_ontology_fields(elem, pf))

        outdict[elem.findtext(f"{pf}accession")] = tmpdict

        # Everything needed has been copied out as plain values; drop the
        # subtree so memory does not grow with the number of metabolites.
        elem.clear()

    with open(f'{output_dir}.json', "w") as out_file:
        json.dump(outdict, out_file, indent = 6)

    return outdict

In [None]:
def _drb_child_text(node, tag):
    """Return ``.text`` of the first ``tag`` child of ``node``, or None when that child is absent."""
    child = node.find(tag)
    return None if child is None else child.text


def drb_to_dict(db_directory, output_dir):
    """Parse a DrugBank XML file into a dict of small-molecule drugs and dump it as JSON.

    Parameters
    ----------
    db_directory : path or file object
        The DrugBank XML source handed to ``ET.iterparse`` (a file, despite the name).
    output_dir : str
        Output path without extension; the result is written to
        ``f"{output_dir}.json"``.

    Returns
    -------
    dict
        Maps the text of each drug's first <drugbank-id> to a dict with:
        'name', 'neutral_mass' (text of <monoisotopic-mass>), 'application'
        (text of <indication>), 'int_brands' (list of brand names) and
        'target' (list of ``[name, action]`` pairs for targets whose
        <known-action> is 'yes').

    Notes
    -----
    - Only <drug> elements whose ``type`` attribute is 'small molecule' are
      kept; <drug> elements without that attribute are skipped without
      relying on an exception.
    - A missing optional child yields None (text fields) or [] (lists)
      rather than silently dropping the whole drug.
    - A drug with no <drugbank-id> child is skipped because it cannot be keyed.
    """
    xmlns = "{http://www.drugbank.ca}"
    outdict = {}

    # Only 'end' events are needed: an element is complete once its end tag is seen.
    for _, elem in ET.iterparse(db_directory, events=("end",)):
        if elem.tag != f'{xmlns}drug' or elem.get('type') != 'small molecule':
            continue

        id_node = elem.find(f"{xmlns}drugbank-id")
        if id_node is None:
            continue
        dg_id = id_node.text

        tmpdict = {}
        tmpdict['name'] = _drb_child_text(elem, f"{xmlns}name")
        tmpdict['neutral_mass'] = _drb_child_text(elem, f"{xmlns}monoisotopic-mass")
        tmpdict['application'] = _drb_child_text(elem, f"{xmlns}indication")

        brands = elem.find(f"{xmlns}international-brands")
        tmpdict['int_brands'] = [] if brands is None else [
            i.findtext(f'{xmlns}name')
            for i in brands.findall(f"{xmlns}international-brand")
        ]

        target = []
        targets = elem.find(f'{xmlns}targets')
        for i in ([] if targets is None else targets.findall(f'{xmlns}target')):
            if i.findtext(f'{xmlns}known-action') == 'yes':
                actions = i.find(f'{xmlns}actions')
                t_action = None if actions is None else actions.findtext(f'{xmlns}action')
                target.append([i.findtext(f'{xmlns}name'), t_action])
        tmpdict['target'] = target

        outdict[dg_id] = tmpdict

        # All needed values are copied out; release the processed subtree.
        elem.clear()

    with open(f"{output_dir}.json", "w") as out_file:
        json.dump(outdict, out_file, indent = 6)

    print(f'number of drugs in dictionary = {len(outdict)}')

    return outdict

In [75]:
from lxml import etree as ET
import pandas as pd
import argparse
import re
import json

# Input XML and output path stem for the sweat-metabolite conversion.
hmdb_dir, hmdb_output_dir = 'sweat_metabolites.xml', 'sweat_full'

# Run the conversion and keep the returned dictionary for the cells below.
hmdb_dict = hmdb_to_dict(directory=hmdb_dir, output_dir=hmdb_output_dir)

In [42]:
# Scratch cell: print every synonym next to its metabolite name.
pf = "{http://www.hmdb.ca}"
counter = 0
tree = ET.iterparse("sweat_metabolites.xml", events=("start", "end"))

for event, elem in tree:
    tmpdict = {}

    # Skip everything except the closing tag of a <metabolite> record.
    if event != 'end' or elem.tag != f"{pf}metabolite":
        continue

    counter += 1
    if counter % 10 == 0:
        print(f"Processing {counter}-th mass")

    n_name = elem.findtext(f"{pf}name")
    n_acc = elem.findtext(f"{pf}accession")

    # One output line per child of <synonyms>.
    for x in elem.find(f"{pf}synonyms"):
        print(n_name, x.text)
            

2-Hydroxybutyric acid (S)-2-Hydroxybutanoic acid
2-Hydroxybutyric acid 2-Hydroxybutyrate
2-Hydroxybutyric acid 2-Hydroxybutyric acid
2-Hydroxybutyric acid L-2-Hydroxybutanoic acid
2-Hydroxybutyric acid L-2-Hydroxybutyric acid
2-Hydroxybutyric acid L-alpha-Hydroxybutanoic acid
2-Hydroxybutyric acid L-alpha-Hydroxybutyric acid
2-Hydroxybutyric acid (S)-2-Hydroxybutanoate
2-Hydroxybutyric acid L-2-Hydroxybutanoate
2-Hydroxybutyric acid L-2-Hydroxybutyrate
2-Hydroxybutyric acid L-a-Hydroxybutanoate
2-Hydroxybutyric acid L-a-Hydroxybutanoic acid
2-Hydroxybutyric acid L-alpha-Hydroxybutanoate
2-Hydroxybutyric acid L-Α-hydroxybutanoate
2-Hydroxybutyric acid L-Α-hydroxybutanoic acid
2-Hydroxybutyric acid L-a-Hydroxybutyrate
2-Hydroxybutyric acid L-a-Hydroxybutyric acid
2-Hydroxybutyric acid L-alpha-Hydroxybutyrate
2-Hydroxybutyric acid L-Α-hydroxybutyrate
2-Hydroxybutyric acid L-Α-hydroxybutyric acid
2-Hydroxybutyric acid (S)-2-Hydroxybutyrate
2-Hydroxybutyric acid 2-Hydroxybutanoate
2-Hydroxy