In [1]:
def get_mass(sample_directory, tolr):
    """
    Read the sample spreadsheet and return its distinct masses in ascending order.
    ----
    Inputs:
    1. sample_directory = path of the Excel workbook; the first sheet is read and
       must contain a 'Mass' column
    2. tolr = ppm tolerance; currently unused (kept so existing callers still work)

    Output:
    sorted list of the unique, non-missing values of the 'Mass' column
    """
    raw_df = pd.read_excel(sample_directory, sheet_name = 0)

    # Drop missing values before de-duplicating: NaN never compares equal to
    # itself, so every blank cell would otherwise count as its own "unique"
    # mass and make the sort order undefined.
    unique_masses = sorted(raw_df['Mass'].dropna().unique())

    print(f'number of unique masses = {len(unique_masses)}')

    # NOTE: an earlier draft also merged masses lying within `tolr` ppm of each
    # other; that step was disabled, so no tolerance-based filtering happens here.
    return unique_masses

In [2]:
def db_hit(sample, db_dict, tolr, output_dir):
    """
    For each mass found from LCMS, match it with the members of the database
    dictionary whose neutral mass lies within the ppm tolerance.
    ----
    Inputs:
    1. sample = list containing neutral masses of unknown molecules
    2. db_dict = dictionary {db_id: entry}; every entry must provide a
       'neutral_mass' value that float() can parse
    3. tolr = error to be considered mass a 'hit' in ppm
    4. output_dir = output path WITHOUT extension; '.json' is appended and the
       result is written there

    Output:
    allhits = dictionary containing all the hits of each mass -
              {mass: {db_id: db_dict[db_id]}}; masses without a hit map to {}
    """
    unique_masses = set(sample)

    # Parse every database mass once instead of once per sample mass.
    # Skipped for an empty sample so that case still touches no database entry.
    db_masses = (
        [(db_id, float(entry['neutral_mass'])) for db_id, entry in db_dict.items()]
        if unique_masses else []
    )

    allhits = {}
    for count, m in enumerate(unique_masses, start=1):
        if count%100 == 0:
            print(f'Processing {count}-th mass')

        mass = float(m)
        window = tolr*mass/1e6  # absolute tolerance for this mass, derived from ppm
        allhits[m] = {
            db_id: db_dict[db_id]
            for db_id, db_mass in db_masses
            if abs(db_mass - mass) <= window
        }

    # The context manager closes the file even if serialisation fails.
    with open(f"{output_dir}.json", "w") as out_file:
        json.dump(allhits, out_file, indent = 6)

    return allhits

In [3]:
from lxml import etree as ET
import pandas as pd
import re
import json
import numpy as np

In [4]:
# Input workbook with the LCMS sample; get_mass reads the 'Mass' column of its first sheet.
s_dir = "raw/ww_ut_metabolomics.xlsx"
# Mass tolerance in ppm, reused below when matching against the database.
tolr = 10
sample = get_mass(s_dir, tolr) # sorted unique masses found in the sample

number of unique masses = 1761


In [5]:
# Load the HMDB metabolite dictionary from JSON; the context manager makes
# sure the file handle is closed once parsing finishes.
with open('databases/hmdb_metabolites/hmdb_full.json') as hmdb_file:
    hmdb_dict = json.load(hmdb_file)
print(f'number of metabolite in dictionary = {len(hmdb_dict)}')

number of metabolite in dictionary = 217920


In [6]:
# Path stem for the raw match results; db_hit appends ".json" to it when writing.
hitoutput_dir = 'match_result/mass_to_hmdb'
# {mass: {hmdb_id: entry}} for every sample mass, using the ppm tolerance set above.
allhit_hmdb = db_hit(sample, hmdb_dict, tolr, hitoutput_dir)

Processing 100-th mass
Processing 200-th mass
Processing 300-th mass
Processing 400-th mass
Processing 500-th mass
Processing 600-th mass
Processing 700-th mass
Processing 800-th mass
Processing 900-th mass
Processing 1000-th mass
Processing 1100-th mass
Processing 1200-th mass
Processing 1300-th mass
Processing 1400-th mass
Processing 1500-th mass
Processing 1600-th mass
Processing 1700-th mass


In [7]:
def count_multi_hits(matchdict):
    """
    Count how many values of ``matchdict`` hold more than one key.

    ``matchdict`` maps each mass to a dictionary of its hits, so the result is
    the number of masses that matched two or more database entries.
    """
    return sum(1 for hits in matchdict.values() if len(hits.keys()) > 1)

In [15]:
# Remove matches that lack a description, an ontology, or a taxonomy.
initial_multi = count_multi_hits(allhit_hmdb)
# Guard the percentage so an empty result set reports 0.0 instead of raising.
initial_pct = initial_multi/len(allhit_hmdb)*100 if allhit_hmdb else 0.0
print(f'Initial dict number of multiple hits = {initial_multi}, {initial_pct}%')

newallhit_dict = {}
for mass, hits in allhit_hmdb.items():
    # Keep a shallow copy of each surviving entry so that later edits to the
    # filtered dictionary do not write through to the raw hits it came from.
    newallhit_dict[mass] = {
        db_id: dict(entry)
        for db_id, entry in hits.items()
        if entry['desc'] != '' and entry['ontology'] != {} and entry['taxonomy'] != {}
    }

filtered_multi = count_multi_hits(newallhit_dict)
filtered_pct = filtered_multi/len(newallhit_dict)*100 if newallhit_dict else 0.0
print(f'Filtered dict number of multiple hits = {filtered_multi}, {filtered_pct}%')

Initial dict number of multiple hits = 771, 43.781942078364565%
Filtered dict number of multiple hits = 439, 24.9290176036343%


coding-based ontology determination
ontology structure:
source
    chem_ont_class
    chem_ont_subclass
    chem_ont_reldiet
    chem_ont_reldrug
    chem_ont_reldisease (manual)
    
list of sources: synthetic, human-derived, plant-derived, animal-derived, fungi-derived, environment-derived, ubiquitous

!!synthetic:

    chem_ont_class - drugs
        chem_ont_subclass - anticonvulsants
        chem_ont_subclass - antiparkinsons
        chem_ont_subclass - antipsychotics
        chem_ont_subclass - psychostimulants
        chem_ont_subclass - addiction-recovery drugs

        chem_ont_subclass - common analgesics and antipyretics
        chem_ont_subclass - opioid analgesics
        chem_ont_subclass - benzodiazepines, sedatives, barbiturates, and anesthetics
        chem_ont_subclass - NSAIDs
        chem_ont_subclass - antihistamines
        chem_ont_subclass - asthma-related drugs
        
        chem_ont_subclass - cardiovascular drugs
        chem_ont_subclass - proton pump inhibitors
        chem_ont_subclass - orphan drugs
        chem_ont_subclass - antimuscarinics and anticholinergics
        chem_ont_subclass - antineoplastics
        chem_ont_subclass - immunosuppresants
        chem_ont_subclass - opthalmic drugs
        
        chem_ont_subclass - antidiabetic
        chem_ont_subclass - statins and diet management drugs
        chem_ont_subclass - anabolic steroids
        
        chem_ont_subclass - contraceptives
        chem_ont_subclass - hormone therapy drugs
        
        chem_ont_subclass - antivirals
        chem_ont_subclass - antibiotics
        chem_ont_subclass - antifungals
        chem_ont_subclass - antiparasitics
        
        chem_ont_subclass - diagnostics agents and contrast media
        
    chem_ont_class - toiletries-related molecules
    chem_ont_class - food additives and dietary-related molecules
        
    chem_ont_class - plastics and synthetic polymer
    chem_ont_class - agricultural-related chemical
        chem_ont_subclass - insecticide and insecticide-related metabolites
        chem_ont_subclass - herbicide and herbicide-related metabolites
        chem_ont_subclass - fungicide and fungicide-related metabolites
    
!!human-derived:

    chem_ont_class - inflammation-related molecules
        chem_ont_subclass - prostaglandin and prostaglandin-related molecules
        chem_ont_subclass - leukotriene and leukotriene-related molecules
        chem_ont_subclass - thromboxane and thromboxane-related molecules
    chem_ont_class - reproductive system metabolites
        chem_ont_subclass - androgens and androgen-related molecules
        chem_ont_subclass - estrogens and estrogen-related molecules
        chem_ont_subclass - progesterones and progesterone-related molecules
    chem_ont_class - circadian rhythm related metabolites
    chem_ont_class - bile acids and bile acids metabolites
    chem_ont_class - human related lipids and lipid metabolites
    chem_ont_class - drug metabolites
        chem_ont_subclass - ((refer to synthetic drug subclasses))
        chem_ont_reldrug - ((check desc))

!!plant-derived

    chem_ont_class - plant metabolites
    chem_ont_class - plant hormones
    chem_ont_reldiet - ((see below))

!!animal-derived

    chem_ont_class - animal metabolites
    chem_ont_reldiet - ((see below))

!!fungi-derived

    chem_ont_class - fungi metabolites
    chem_ont_reldiet - ((see below))
    
!!environment-derived

    chem_ont_class - environment metabolites
    chem_ont_reldrug - ((check desc))

!!ubiquitous

    chem_ont_class - nucleic acids
    chem_ont_class - carbohydrates
    chem_ont_class - vitamins
    chem_ont_class - lipids and lipid metabolites
        chem_ont_subclass - glycerolipids and energy storage lipid molecules
        chem_ont_subclass - glycerophospholipids, sphingolipids, sterols and membrane lipid molecules
    chem_ont_class - organic compounds
    
**related_diet :

    coffee and coffee products
    cereals and cereal products
    vegetables and vegetable products
    alcoholic beverages
    cocoa and cocoa products
    tea and tea products
    cooking oils
    fruits and fruit products
    legumes and legume products
    tobacco and tobacco products
    
    seafood and seafood products
    meats and meat products
    cheese, dairy, and dairy products
    
    fungi and fungi products

In [16]:
# Keyword tables used by find_ontology. They are built once at definition time
# instead of on every call. Dictionary order and list order are significant:
# the first keyword that matches decides the label.

# source label -> keywords searched in the FIRST sentence of the description.
# 'synthetic_1' and 'synthetic_2' are both reported as 'synthetic'; they are
# split only so they can sit at different priorities in the search order.
_SOURCE_KEYWORDS = {
    'fungus_derived' : ['mushroom'],
    'environment_derived' : ['soil metabolite'],
    'animal_derived' : ['crustacean', 'mollusk', 'milk', 'animal food', 'chicken', 'poultry', 'rat', 'mammal', 'sea ', 'EPA'],
    'synthetic_1' : ['agricultural', 'insecticide', 'herbicide', 'fungicide', 'nutritional supplement',
                     'that have used or taken this drug', 'perhexiline', 'ampicillin'],
    'plant_derived' : ['plant', 'biomarker for the consumption of', 'coffee',
                       'cereals', 'wheat', 'oat',
                       'vegetable', 'vegetables', 'tomato', 'brassicas', 'onion', 'corn', 'butterbur', 'cucumber', 'potato', 'carrot', 'gourd',
                       'cocoa', 'chocolate', 'carob', 'coca',
                       'tea',
                       'fats and oils', 'borage', 'olive',
                       'fruit', 'pomes', 'avocado', 'bilberry', 'currant', ' plum', 'citrus', 'muskmelon', 'banana', 'lemon', 'apple', 'date',
                       'nuts', 'pulses', 'bean', 'chickpea', ' pea', 'ginkgo', 'isoflavone',
                       'tobacco', 'alcoholic beverages',
                       'herbs and spices', 'dill', 'cumin', 'cloves', 'anise', 'parsley', 'burdock', 'ginger', 'coriander',
                       'turmeric', 'angelica', 'rhubarb', 'sage', 'mugwort', 'chili', 'marjoram', 'dock',
                       'gibberellin', 'auxin'],
    'human_derived' : ['human', 'metabolite of ', 'have used or taken ', 'metabolite byproduct of ', 'patients',
                       'urine', 'epidermis', 'feces', 'neutrophil', 'renal', 'eosinophil', 'basophil', 'semen', 'liver', 'kidney',
                       'female', 'male', 'androgen', 'estrogen',
                       'phosphatidylinositol', 'sphingolipid', 'phosphatidylcholine',
                       'prostaglandin', 'PG', 'lipoxin', 'leukotriene', 'eicosanoid', 'arachidonic acid', 'epoxyeicosatrienoic', 'prostanoid', 'isoprostane',
                       'hepoxilin',
                       'bile acid', ' bile', 'carnitine',
                       'gut microbio', 'urobilin',
                       'dopamine', 'dihydroxyphenethylamine', 'aminobutyric acid', 'serotonin', 'hydroxytryptamine',
                       'hydroxybutyric acid', 'noradrenaline', 'norepinephrine', 'acetylcholine', 'kynurenic acid',
                       'neuropeptide'],
    'synthetic_2' : ['synthetic', 'food additive', 'flavorin', 'flavourin', 'stabiliser', 'emulsifier', 'confectionery', 'additiv', 'beverage',
                     'drug', 'medicine', 'antagonist', 'agonist', 'amphetamine', 'antilipemic', 'blocker', 'anticonvulsant', 'antibiotic', 'analgesic', 'angiostensin-receptor blocker', 'medication', 'anabolic steroid', 'therapeutic', 'cephem', 'cocaine', 'dideoxynucleoside',
                     'cosmetics', 'codeine', 'is metabolized', 'sleep agent', 'morphin', 'contrast',
                     'BPA', 'plasticiser', 'glycol', 'polymer', ' plastic', ' anti', 'methylenedioxyamphetamine', 'sedative',
                     'alkylating agent', 'pharmacolo', 'a derivative of', 'antibacteria'],
    'ubiquitous' : ['nucleoside', 'vitamin', 'acetic', 'naphtha', 'uracil', 'adenine', 'guanine', 'thymine', 'cytosine', 'indole',
                    'glycoside', 'phosphatidic acid', 'ODE', 'steroid', 'lipid', 'phosphatidylserine', 'phosphatidylethanolamine', 'phosphatidylglycerol', 'phosphatidylglycerophosphate', 'cardiolipin', 'furan fatty acid',
                    'galactose', 'glucose', 'fructose', 'hexose', 'monosaccharide', 'organic compound',
                    'imidazol', 'imidazole', 'tocopherol', 'carotenoid', 'ethanolamide', 'alanine', 'arginine', 'tryptophan', 'asparagine',
                    'aspartic acid', 'cysteine', 'ganglioside', 'glutamine', 'glutamic acid', 'glycine',
                    'lysine', 'leucine', 'proline', 'methionine', 'serine', 'threonine',
                    'tryptophan', 'tyrosine', 'valine', 'histidine', 'phenylalanine', 'quinol', 'nitro', 'benzene', 'enol ', 'acetate', 'acid', 'inositol']
}

# source label -> candidate classes, tried in this order.
_SOURCE_CLASSES = {
    'fungus_derived' : ['fungi metabolite'],
    'environment_derived' : ['environmental metabolite'],
    'animal_derived' : ['animal metabolite'],
    'plant_derived' : ['plant metabolite', 'plant hormones'],
    'synthetic' : ['agricultural-related chemical', 'drugs', 'toiletries-related molecules', 'food additives and dietary-related molecules',
                   'plastics and synthetic polymer'],
    'human_derived' : ['reproductive system metabolites', 'circadian rhythm related metabolites', 'bile acids and bile acids metabolites',
                       'human related lipids and lipid metabolites', 'gut microbiota metabolite', 'drug metabolites', 'neuromodulation metabolites', 'inflammation-related molecules',
                       'amino acids and amino acids metabolites'],
    'ubiquitous' : ['nucleic acids', 'amino acids and amino acids metabolites',
                    'carbohydrates', 'vitamins', 'lipids and lipid metabolites', 'organic compounds']
}

# class -> keywords searched in the whole description.
_CLASS_KEYWORDS = {
    'fungi metabolite' : ['mushroom'],
    'environmental metabolite' : ['soil metabolite'],
    'animal metabolite' : ['crustacean', 'mollusk', 'milk', 'animal food', 'chicken'],
    'plant metabolite' : ['coffee',
                          'cereals', 'wheat', 'oat',
                          'vegetable', 'tomato', 'brassicas', 'onion', 'corn', 'butterbur', 'cucumber', 'potato', 'carrot', 'gourd',
                          'cocoa', 'chocolate', 'carob', 'coca',
                          'tea',
                          'fats and oils', 'borage', 'olives',
                          'fruit', 'pomes', 'currant', ' plum', 'avocado', 'bilberry', 'citrus', 'muskmelon', 'lemon', 'apple', 'date', 'banana',
                          'nuts', 'pulses', 'bean', ' pea', 'chickpea', 'ginkgo', 'isoflavone',
                          'tobacco', 'alcoholic beverages',
                          # A comma was missing after 'coriander', which silently fused it
                          # with 'turmeric' into one keyword that could never match.
                          'herbs and spices', 'dill', 'cumin', 'cloves', 'anise', 'parsley', 'burdock', 'ginger', 'dock', 'coriander',
                          'turmeric', 'angelica', 'rhubarb', 'sage', 'mugwort', 'chili', 'marjoram'
                          ],
    'plant hormones' : ['gibberellin', 'auxin'],
    'drugs' : ['drug', 'prodrug', 'inhibitor', 'agonist', 'antagonist', 'blocker',
               'inflammatory bowel disease', 'antiemetic', 'gastrointestinal tract', 'antiulcer',
               'glaucoma', ' eye', 'opthalmic',
               'antineoplastic',
               'anesthetic', 'barbiturate', 'sedative', 'sleep agent', 'morphin', 'benzodiazepine',
               'hypotension', 'antihypertensive', 'renin inhibitor', 'vasodilator', 'dilate blood vessel', 'antianginal', 'cardiac arrhythmia', 'skeletal muscle',
               'bronchodilator', 'asthma',
               'narcotic analgesic', 'opiate analgesic',
               'antitubercular', 'antibiotic', 'antimicrobial', 'antibacterial', 'beta-lactamase', 'bactericidal', 'tetracycline',
               'HIV', 'codeine', 'viral', 'virus',
               'antifungal', 'fungus', 'fungal',
               'antidepressant', 'depression', 'schizophrenia', 'psychotic', 'treat addiction', 'stimulant',
               'cocaine', 'heroin', 'cannabis',
               'dermato', 'psoriasis', ' skin', 'itching',
               'antimuscarinic', 'muscarinic antagonist',
               'immunosuppressive',
               'contrast',
               'statin', 'cholesterol',
               'antidiabetic',
               'NSAIA', 'NSAID',
               'parkinson',
               'estrogen', 'androgen steroid', 'hormone replacement',
               'nutritional supplement',
               'contraceptive',
               'anticonvulsant'
               ],
    'toiletries-related molecules' : ['cosmetic', 'perfume', 'shampoo', 'soap', 'toiletries'],
    'food additives and dietary-related molecules' : ['flavourin', 'flavorin', 'food additive', 'EAFUS',
                                                      'beverages', 'stabiliser', 'emulsifier', 'confectionery'],
    'plastics and synthetic polymer' : ['BPA', ' plastic', 'plasticizer', 'glycol', 'polymer', 'trimethylobenzene'],
    'agricultural-related chemical' : ['agricultural', 'insecticide', 'herbicide', 'fungicide'],
    'inflammation-related molecules' : ['prostaglandin', 'lipoxin', 'PG', 'leukotriene', 'eicosanoid', 'arachidonic acid',
                                        'epoxyeicosatrienoic', 'prostanoid', 'isoprostane', 'hepoxilin'],
    'reproductive system metabolites' : ['androgen', 'testosterone', 'androsterone', 'estrogen', 'estetrol', 'estriol', 'estradiol',
                                         'progestogens', 'progesterone'],
    'circadian rhythm related metabolites' : ['melatonin', 'cortisol'],
    'bile acids and bile acids metabolites' : ['urobilin', 'bile acid', ' bile', 'chenodeoxycholic', 'cholic acid'],
    'human related lipids and lipid metabolites' : ['phosphatidylinositol', 'sphingolipid', 'phosphatidylcholine', 'carnitine'],
    'gut microbiota metabolite' : ['gut microbio'],
    'neuromodulation metabolites' : ['dopamine', 'dihydroxyphenethylamine', 'aminobutyric acid', 'serotonin',
                                     'hydroxytryptamine', 'hydroxybutyric acid', 'noradrenaline', 'norepinephrine',
                                     'acetylcholine', 'kynurenic acid', 'neuropeptide'],
    'drug metabolites' : ['is a metabolite of', 'drug', 'prodrug',
                          'inflammatory bowel disease', 'antiemetic', 'gastrointestinal tract', 'antiulcer',
                          'glaucoma', ' eye', 'opthalmic',
                          'antineoplastic',
                          'anesthetic', 'barbiturate', 'sedative', 'sleep agent', 'benzodiazepine',
                          'hypotension', 'antihypertensive', 'renin inhibitor', 'vasodilator', 'dilate blood vessel', 'antianginal', 'cardiac arrhythmia', 'skeletal muscle',
                          'bronchodilator', 'asthma',
                          'narcotic analgesic', 'opiate analgesic',
                          'antitubercular', 'antibiotic', 'antimicrobial', 'antibacterial', 'beta-lactamase', 'bactericidal', 'tetracycline',
                          'HIV', 'viral', 'virus',
                          'antifungal', 'fungus', 'fungal',
                          'antidepressant', 'depression', 'schizophrenia', 'psychotic', 'treat addiction', 'stimulant',
                          'cocaine', 'heroin', 'cannabis',
                          'dermato', 'psoriasis', ' skin', 'itching',
                          'antimuscarinic', 'muscarinic antagonist',
                          'immunosuppressive',
                          'contrast',
                          'statin', 'cholesterol',
                          'antidiabetic',
                          'NSAIA', 'NSAID',
                          'parkinson',
                          'estrogen', 'androgen steroid', 'hormone replacement',
                          'nutritional supplement',
                          'contraceptive',
                          'anticonvulsant'
                          ],
    'nucleic acids' : ['nucleoside', 'uracil', 'adenine', 'guanine', 'thymine', 'cytosine'],
    'carbohydrates' : ['galactose', 'glucose', 'fructose', 'hexose', 'monosaccharide'],
    'lipids and lipid metabolites' : ['lipid', 'ODE', 'ganglioside', 'steroid', 'phosphatidylinositol', 'glycoside', 'phosphatidic acid', 'phosphatidylserine', 'phosphatidylethanolamine', 'phosphatidylglycerol',
                                      'phosphatidylglycerophosphate', 'cardiolipin', 'furan fatty acid'],
    'vitamins' : ['vitamin'],
    'amino acids and amino acids metabolites' : ['imidazol', 'alanine', 'arginine', 'tryptophan', 'asparagine',
                                                 'aspartic acid', 'cysteine', 'glutamine', 'glutamic acid', 'glycine',
                                                 'lysine', 'leucine', 'proline', 'methionine', 'serine', 'threonine',
                                                 'tryptophan', 'tyrosine', 'valine', 'histidine', 'phenylalanine'],
    'organic compounds' : ['organic compound', 'indole', 'naphtha', 'acetic', 'quinol', 'nitro',
                           'benzene', 'enol ', 'carotenoid', 'tocopherol', 'ethanolamide', 'acetate', 'acid', 'inositol']
}

# Drugs and drug metabolites share the same ordered list of subclasses.
_DRUG_SUBCLASSES = ['antiparkinsons', 'antipsychotics', 'psychostimulants', 'addiction-recovery drugs', 'recreational drugs',
                    'common analgesics and antipyretics', 'opioid analgesics', 'benzodiazepines, sedatives, barbiturates, and anesthetics',
                    'NSAIDs', 'antihistamines', 'asthma-related drugs', 'cardiovascular drugs', 'gastrointestinal drugs',
                    'orphan drugs', 'antimuscarinics and anticholinergics', 'immunosuppresants',
                    'opthalmic drugs', 'topical dermatology drugs', 'antidiabetic', 'statins and diet management drugs', 'anabolic steroids', 'contraceptives',
                    'hormone therapy drugs', 'antivirals', 'antibiotics', 'antifungals', 'antiparasitics',
                    'diagnostics agents and contrast media', 'anticonvulsants', 'antineoplastics']

# class -> candidate subclasses; classes missing here never get a subclass.
_CLASS_SUBCLASSES = {
    'drugs' : _DRUG_SUBCLASSES,
    'drug metabolites' : _DRUG_SUBCLASSES,
    'agricultural-related chemical' : ['insecticide and insecticide-related metabolites', 'herbicide and herbicide-related metabolites',
                                       'fungicide and fungicide-related metabolites'],
    'inflammation-related molecules' : ['prostaglandin and prostaglandin-related molecules', 'leukotriene and leukotriene-related molecules',
                                        'thromboxane and thromboxane-related molecules'],
    'reproductive system metabolites' : ['androgens and androgen-related molecules', 'estrogens and estrogen-related molecules',
                                         'progesterones and progesterone-related molecules'],
    'lipids and lipid metabolites' : ['glycerophospholipids, sphingolipids, sterols and membrane lipid molecules',
                                      'glycerolipids and energy storage lipid molecules'],
    'neuromodulation metabolites' : ['noradrenaline system related neurotransmitter and metabolites',
                                     'dopamine system related neurotransmitter and metabolites',
                                     'serotonin system related neurotransmitter and metabolites',
                                     'cholinergic system related neurotransmitter and metabolites']
}

# subclass -> keywords searched in the whole description.
_SUBCLASS_KEYWORDS = {
    'insecticide and insecticide-related metabolites' : ['insecticide'],
    'herbicide and herbicide-related metabolites' : ['herbicide'],
    'fungicide and fungicide-related metabolites' : ['fungicide'],
    'prostaglandin and prostaglandin-related molecules' : ['prostaglandin', 'PG', 'prostanoid', 'isoprostane'],
    'leukotriene and leukotriene-related molecules' : ['leukotriene'],
    'thromboxane and thromboxane-related molecules' : ['thromboxane'],
    'androgens and androgen-related molecules' : ['androgen', 'testosterone', 'androsterone'],
    'estrogens and estrogen-related molecules' : ['estrogen', 'estetrol', 'estriol', 'estradiol'],
    'progesterones and progesterone-related molecules' : ['progestogens', 'progesterone'],
    'glycerolipids and energy storage lipid molecules' : [' glycerol'],
    'glycerophospholipids, sphingolipids, sterols and membrane lipid molecules' : ['phosphatidic acid', 'phosphatidylserine', 'phosphatidylethanolamine',
                                                                                   'phosphatidylglycerol', 'phosphatidylglycerophosphate', 'cardiolipin'],
    'anticonvulsants' : ['anticonvulsant'],
    'antiparkinsons' : ['antiparkinson', 'anti-parkinson', 'parkinson'],
    'antipsychotics' : ['antipsychotic', 'anti-psychotic', 'schizophrenia'],
    'psychostimulants' : ['psychostimulants'],
    'addiction-recovery drugs' : ['treat addiction'],
    'recreational drugs' : ['cocaine', 'heroin', 'cannabis', 'hallucinogen'],
    'common analgesics and antipyretics' : ['acetaminophen', 'codeine'],
    'opioid analgesics' : ['opioid analgesic', 'narcotic'],
    'benzodiazepines, sedatives, barbiturates, and anesthetics' : ['morphin', 'anesthetic', 'sleep agent', 'barbiturate', 'sedative', 'benzodiazepine'],
    'NSAIDs' : ['NSAIA', 'NSAID'],
    'antihistamines' : ['antihistamine', 'anti-histamine'],
    'asthma-related drugs' : ['bronchodilator', 'asthma'],
    'cardiovascular drugs' : ['hypotension', 'plasma renin', 'antihypertensive', 'hypertension', 'renin inhibitor', 'vasodilator', 'dilate blood vessel',
                              'antianginal', 'anti-anginal', 'cardiac arrhythmia', 'skeletal muscle', 'candesartan', 'vasoconstrictive'],
    'gastrointestinal drugs' : ['inflammatory bowel disease', 'antiemetic', 'gastrointestinal tract', 'antiulcer'],
    'orphan drugs' : ['orphan drug'],
    'antimuscarinics and anticholinergics' : ['antimuscarinic', 'muscarinic antagonist'],
    'antineoplastics' : ['antineoplastic', 'cancer', 'alkylating agent'],
    'immunosuppresants' : ['immunosuppressive', 'immunosuppresant'],
    'opthalmic drugs' : ['glaucoma', ' eye', 'opthalmic'],
    'topical dermatology drugs' : ['dermato', 'psoriasis', ' skin', 'itching'],
    'antidiabetic' : ['diabetic', 'diabetes', 'insulin'],
    'statins and diet management drugs' : ['statin', 'cholesterol', 'LDL', 'HDL', 'diet'],
    'anabolic steroids' : ['anabolic steroid'],
    'contraceptives' : ['contraceptive'],
    'hormone therapy drugs' : ['hormone replacement', 'estrogen', 'menopause'],
    'antivirals' : ['virus', 'antiviral', 'HIV'],
    'antibiotics' : ['antitubercular', 'antibiotic', 'antimicrobial', 'antibacterial', 'beta-lactamase', 'bactericidal', 'tetracycline'],
    'antifungals' : ['fungi', 'fungal', 'antifungal'],
    'antiparasitics' : ['helmint', 'worm', 'parasite'],
    'diagnostics agents and contrast media' : ['contrast'],
    'noradrenaline system related neurotransmitter and metabolites' : ['noradrenaline', 'norepinephrine'],
    'dopamine system related neurotransmitter and metabolites' : ['dopamine', 'dihydroxyphenethylamine'],
    'serotonin system related neurotransmitter and metabolites' : ['serotonin', 'hydroxytryptamine'],
    'cholinergic system related neurotransmitter and metabolites' : ['acetylcholine']
}

# related-diet label -> keywords searched in the whole description.
_RELDIET_KEYWORDS = {
    'coffee and coffee products' : ['coffee'],
    'cereals and cereal products' : ['cereal', 'wheat'],
    'alcoholic beverages' : ['alcoholic beverage'],
    'cocoa and cocoa products' : ['cocoa', 'chocolate', 'carob'],
    'tea and tea products' : ['tea'],
    'cooking oils' : ['fats and oils', 'borage', 'olives'],
    'fruits and fruit products' : ['fruit', 'currant', ' plum', 'pomes', 'avocado', 'bilberry', 'citrus', 'melon', 'lemon', 'apple', 'date', 'banana'],
    'legumes and legume products' : ['soy', ' pea', 'nuts', 'pulses', 'bean', 'isoflavone', 'chickpea', 'ginkgo'],
    'tobacco and tobacco products' : ['tobacco'],
    'herbs and spices' : ['herbs and spices', 'coriander', 'dock', ' dill', ' cumin', 'cloves', 'anise', 'parsley', 'burdock', 'ginger', 'turmeric', 'angelica', 'rhubarb', ' sage', 'mugwort', ' chili'],

    'seafood and seafood products' : ['EPA', ' sea', 'crustacean', 'mollusk', 'fish', 'octopus', 'salmon', 'tuna', 'squid', 'prawn', 'crab'],
    'meats and meat products' : ['chicken', 'poultry', 'raw meat', 'pork'],
    'cheese, dairy, and dairy products' : ['cheese', 'dairy', 'milk'],

    'fungi and fungi products' : ['edible mush', 'lion'],
    'vegetables and vegetable products' : ['vegetable', 'tomato', 'brassicas', 'onion', 'corn', 'butterbur', 'cucumber', 'potato', 'carrot', 'gourd']
}

# related-disease label -> keywords searched in the whole description.
_RELDISEASE_KEYWORDS = {
    'inflammatory bowel disease' : ['inflammatory bowel disease'],
    'asthma' : ['asthma'],
    'hypertension' : ['hypertension'],
    'depression and mood disorders' : ['antidepressant'],
    'schizophrenia' : ['schizo'],
    'parkinsons disease' : ['parkinson'],
    'diabetes' : ['diabetes', 'diabetic', 'glycemic'],
    'cancer' : ['cancer', 'neoplastic'],
    'skin diseases' : ['dermato', 'psoriasis', 'itching'],
    'cardiac arrhythmia' : ['cardiac arrhythmia'],
    'organic acids imbalance' : ['aciduria', 'acidemia'],
    'hyperprolinemia' : ['hyperprolinemia'],
    'seizures' : ['seizure', 'convuls'],
    'HIV' : ['HIV'],
    'infectious diseases' : ['bacterial infection', 'viral infection', 'fungal infection']
}

# Phrases that introduce the parent drug of a drug metabolite, by priority.
_RELDRUG_MARKERS = ('have used or taken ', 'a metabolite of ', 'a derivative of ')


def _first_label(text, keyword_map, labels=None, default='NA'):
    """Return the first label whose keyword list has a substring hit in text."""
    for label in (keyword_map if labels is None else labels):
        for keyword in keyword_map[label]:
            # A keyword matches as written or in its str.capitalize() form.
            if keyword in text or keyword.capitalize() in text:
                return label
    return default


def _related_drug(desc):
    """Return the text following the first marker phrase found, up to the next '.'."""
    for marker in _RELDRUG_MARKERS:
        pieces = desc.split(marker)
        if len(pieces) > 1:
            return pieces[1].split('.')[0]
    return 'NA'


def find_ontology(name, desc):
    """
    Derive a keyword-based ontology for one database entry from its description.
    ----
    Inputs:
    1. name = compound name; reported as the related drug when the class is 'drugs'
    2. desc = free-text description of the compound

    Output:
    tuple (source, class, subclass, related diet, related disease, related drug)
    - source is searched in the first sentence only (text before the first '. ')
      and falls back to 'ubiquitous' when no keyword matches
    - class is searched among the classes allowed for that source, and subclass
      among the subclasses allowed for that class
    - every field other than source is 'NA' when nothing matches

    Matching is plain substring search, so short keywords can also hit inside
    longer, unrelated words.
    """
    desc_first_sentence = desc.split(". ")[0]

    desc_source = _first_label(desc_first_sentence, _SOURCE_KEYWORDS, default='ubiquitous')
    if desc_source in ('synthetic_1', 'synthetic_2'):
        desc_source = 'synthetic'

    desc_class = _first_label(desc, _CLASS_KEYWORDS, _SOURCE_CLASSES.get(desc_source, ()))
    desc_subclass = _first_label(desc, _SUBCLASS_KEYWORDS, _CLASS_SUBCLASSES.get(desc_class, ()))
    desc_reldiet = _first_label(desc, _RELDIET_KEYWORDS)
    desc_reldisease = _first_label(desc, _RELDISEASE_KEYWORDS)

    if desc_class == 'drug metabolites':
        desc_reldrug = _related_drug(desc)
    elif desc_class == 'drugs':
        desc_reldrug = name
    else:
        desc_reldrug = 'NA'

    return desc_source, desc_class, desc_subclass, desc_reldiet, desc_reldisease, desc_reldrug

In [17]:
# Replace each kept hit's ontology with the keyword-derived one.
# Name and description are read from the entry being annotated, so this cell
# depends only on newallhit_dict.
for hits in newallhit_dict.values():
    for entry in hits.values():
        # find_ontology returns disease before drug; the stored key order below
        # is kept as before so the JSON layout does not change.
        source, ont_class, ont_subclass, reldiet, reldisease, reldrug = find_ontology(
            entry['name'], entry['desc']
        )
        entry['ontology'] = {
            'source': source,
            'chem_ont_class': ont_class,
            'chem_ont_subclass': ont_subclass,
            'chem_ont_reldiet': reldiet,
            'chem_ont_reldrug': reldrug,
            'chem_ont_reldisease': reldisease,
        }

In [18]:
# Give every mass that has no hit left a single placeholder record, so each
# mass in the output carries at least one entry.
TAXONOMY_LEVELS = ["kingdom", "superclass", "class", "subclass", "direct_parent"]
ONTOLOGY_FIELDS = ["source", "chem_ont_class", "chem_ont_subclass",
                   "chem_ont_reldiet", "chem_ont_reldrug", "chem_ont_reldisease"]

for mass, hits in newallhit_dict.items():
    if hits:
        continue  # this mass still has at least one real hit

    hits['unknown compound'] = {
        "name": f"unknown_{mass}",
        "neutral_mass": "0",
        "desc": "na",
        "taxonomy": dict.fromkeys(TAXONOMY_LEVELS, "unknown"),
        "ontology": dict.fromkeys(ONTOLOGY_FIELDS, "unknown"),
    }

In [19]:
# Write the filtered hits to disk; the context manager closes the file even
# if serialisation raises.
with open("match_result/filtered_hit.json", "w") as out_file:
    json.dump(newallhit_dict, out_file, indent = 6)