In [1]:
import re
from itertools import chain
import networkx
import requests
from mydisease.utils import read_obo
from mydisease.utils.common import list2dict

from networkx.readwrite import json_graph
import json
from collections import defaultdict
from typing import List

from pymongo import MongoClient
client = MongoClient()
db = client.mydisease.DO

In [2]:
def graph_to_d(graph):
    """
    :param graph: A networkx graph made from reading ontology
    :type graph: networkx.classes.multidigraph.MultiDiGraph
    :return:
    """
    node_link_data = json_graph.node_link_data(graph)
    nodes = node_link_data['nodes']

    idx_id = {idx: node['id'] for idx,node in enumerate(nodes)}
    for link in node_link_data['links']:
        # store the edges (links) within the graph
        key = link['key']
        source = link['source']
        target = link['target']

        if key not in nodes[source]:
            nodes[source][key] = set()
        nodes[source][key].add(idx_id[target])

    # for mongo insertion
    for node in nodes:
        node['_id'] = node['id'].lower()
        if "alt_id" in node:
            node['alt_id'] = [x.lower() for x in node['alt_id']]
        if "is_a" in node:
            node['is_a'] = [x.lower() for x in node['is_a']]
        if "property_value" in node:
            del node['property_value']
        del node['id']
        for k,v in node.items():
            if isinstance(v, set):
                node[k] = list(v)
    d = {node['_id']: node for node in nodes}

    return d


def parse_synonym(line: str):
    # line = "synonym: \"The other white meat\" EXACT MARKETING_SLOGAN [MEAT:00324, BACONBASE:03021]"
    return line[line.index("\"")+1:line.rindex("\"")] if line.count("\"") == 2 else line


def parse_def(line: str):
    """
    Parse definition field.
    Returns a tuple(definition, list of crosslink urls)
    
    >>> parse_def("\"A description.\" [url:http://www.ncbi.goc/123, url:http://www.ncbi.nlm.nih.gov/pubmed/15318016]")
    ('A description.', ['url:http\\://www.ncbi.goc/123', 'url:http\\://www.ncbi.nlm.nih.gov/pubmed/1531801'])
    
    """
    definition = line[line.index("\"")+1:line.rindex("\"")] if line.count("\"") == 2 else line
    if line.endswith("]") and line.count("["):
        left_bracket = [m.start() for m in re.finditer('\[', line)]
        right_bracket = [m.start() for m in re.finditer('\]', line)]
        endliststr = line[left_bracket[-1]+1:right_bracket[-1]]
        endlist = [x.strip().replace("\\\\","").replace("\\","") for x in endliststr.split(", ")]
        return definition, endlist
    else:
        return definition, None


def parse_xref(xrefs: List[str]):
    """
    Parse xref field. Input is list of strings (xref IDs)
    Normalizes prefix strings (MSH -> MESH, ORDO -> Orphanet) and converts prefix to lowercase
    Returns dict[ID prefix: list of IDs without prefix]
    
    >>> parse_xref(['MSH:D006954',  'SNOMEDCT_US_2016_03_01:190781009',  'SNOMEDCT_US_2016_03_01:34349009',  'UMLS_CUI:C0020481'])
    {'MESH': ['D006954'],
     'SNOMEDCT_US_2016_03_01': ['190781009', '34349009'],
     'UMLS_CUI': ['C0020481']}
    
    """
    
    xrefs = [x for x in xrefs if ":" in x]
    xrefs = [x.split(":",1)[0].lower() + ":" + x.split(":",1)[1] for x in xrefs]
    for n,xref in enumerate(xrefs):
        if xref.startswith("msh:"):
            xrefs[n] = "mesh:" + xref.split(":",1)[1]
        if xref.startswith("ordo:"):
            xrefs[n] = "orphanet:" + xref.split(":",1)[1]
    return list2dict(xrefs)

In [3]:
graph = read_obo(open("/home/gstupp/projects/biothings/mydisease/mydisease/data/doid.obo").readlines())
d = graph_to_d(graph)

for value in d.values():
    if 'xref' in value:
        value['xref'] = parse_xref(value['xref'])
    if 'synonym' in value:
        value['synonym'] = list(map(parse_synonym, value['synonym']))
    if 'def' in value:
        value['def'],ref = parse_def(value['def'])
        if ref:
            if 'xref' in value:
                value['xref'].update(parse_xref(ref))
            else:
                value['xref'] = parse_xref(ref)

In [5]:
d['doid:1171']

{'_id': 'doid:1171',
 'comment': 'OMIM mapping confirmed by DO. [SN].',
 'is_a': ['doid:1168'],
 'name': 'hyperlipoproteinemia type V',
 'synonym': ['familial hyperlipoproteinemia type V',
  'familial type 5 hyperlipoproteinemia (disorder)',
  'Fredrickson type V lipaemia'],
 'xref': {'mesh': ['D006954'],
  'nci': ['C35645'],
  'omim': ['144650'],
  'snomedct_us_2016_03_01': ['190781009', '34349009'],
  'umls_cui': ['C0020481']}}

In [7]:
from collections import Counter
Counter(chain(*[x.get('xref',[]) for x in d.values()])).most_common(100)

[('umls_cui', 5776),
 ('snomedct_us_2016_03_01', 4188),
 ('nci', 4117),
 ('mesh', 2873),
 ('url', 2635),
 ('icd10cm', 2417),
 ('icd9cm', 2124),
 ('omim', 1433),
 ('orphanet', 471),
 ('efo', 131),
 ('pmid', 101),
 ('kegg', 39),
 ('csp', 37),
 ('ls', 35),
 ('hp', 31),
 ('nci2009_04d', 29),
 ('sn', 16),
 ('snomedct', 13),
 ('meddra', 11),
 ('efopat_id', 10),
 ('isbn', 8),
 ('ja', 7),
 ('icd10', 5),
 ('https', 5),
 ('pdf', 4),
 ('do', 4),
 ('ctv3', 3),
 ('snomedct_us_2015_03_01', 2),
 ('stedman', 2),
 ('omm', 2),
 ('http', 1),
 ('ndfrt', 1),
 ('utl', 1),
 ('dermo', 1),
 ('who', 1),
 ('url`', 1),
 ('umls', 1),
 ('ic10cm', 1),
 ('icd9', 1),
 ('nord', 1),
 ('mth', 1)]

In [8]:
db.drop()
db.insert_many(d.values())

<pymongo.results.InsertManyResult at 0x7f637acda3a8>

In [9]:
db.find_one('doid:1171')

{'_id': 'doid:1171',
 'comment': 'OMIM mapping confirmed by DO. [SN].',
 'is_a': ['doid:1168'],
 'name': 'hyperlipoproteinemia type V',
 'synonym': ['familial hyperlipoproteinemia type V',
  'familial type 5 hyperlipoproteinemia (disorder)',
  'Fredrickson type V lipaemia'],
 'xref': {'mesh': ['D006954'],
  'nci': ['C35645'],
  'omim': ['144650'],
  'snomedct_us_2016_03_01': ['190781009', '34349009'],
  'umls_cui': ['C0020481']}}

In [10]:
# how many ICD10CM xrefs does each DO item have?
xrefs = list(db.find({'xref':{'$exists': True}},{'xref':1}))
Counter([[y.split(":")[0] for y in x['xref']].count("icd10cm") for x in xrefs])

Counter({0: 4380, 1: 2417})

In [11]:
jsonld = {"doid": {
            "@context": {
                "is_a": "https://www.w3.org/2000/01/rdf-schema#subClassOf",
                "name": "http://www.w3.org/2000/01/rdf-schema#label",
                "synonym": "http://www.geneontology.org/formats/oboInOwl#hasExactSynonym",
                "xref": "http://www.geneontology.org/formats/oboInOwl#hasDbXref"
                }
            },
          "doid/xref": {
                "@context": {
                    "mesh": "http://identifers.org/mesh/",
                    "orphanet": "http://identifiers.org/orphanet.ordo/",
                    "umls_cui": "http://identifiers.org/umls/",
                    "snomedct_us_2016_03_01": "http://identifiers.org/snomedct/",
                    "nci": "",
                    "icd10cm": "http://identifiers.org/icd/",
                    "icd9cm": "",
                    "omim": "http://identifiers.org/omim/",
                    "efo": "http://identifiers.org/efo/",
                    "kegg": "http://identifiers.org/kegg.disease/",
                    "url": ""
                }
            }
          }

In [12]:
jsonld

{'doid': {'@context': {'is_a': 'https://www.w3.org/2000/01/rdf-schema#subClassOf',
   'name': 'http://www.w3.org/2000/01/rdf-schema#label',
   'synonym': 'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym',
   'xref': 'http://www.geneontology.org/formats/oboInOwl#hasDbXref'}},
 'doid/xref': {'@context': {'efo': 'http://identifiers.org/efo/',
   'icd10cm': 'http://identifiers.org/icd/',
   'icd9cm': '',
   'kegg': 'http://identifiers.org/kegg.disease/',
   'mesh': 'http://identifers.org/mesh/',
   'nci': '',
   'omim': 'http://identifiers.org/omim/',
   'orphanet': 'http://identifiers.org/orphanet.ordo/',
   'snomedct_us_2016_03_01': 'http://identifiers.org/snomedct/',
   'umls_cui': 'http://identifiers.org/umls/',
   'url': ''}}}