# MED-RT parser
Tenzen Rabgang and Romana Pernisch

In [None]:
import xmltodict
import os
from owlready2 import *
from collections import OrderedDict 

In [2]:
file_name_xml = 'Core_MEDRT_2020.10.05_XML.xml'

xml_dict = None
with open('./data_medrt/{}'.format(file_name_xml), 'r') as xml:
    xml_dict = xmltodict.parse(xml.read())

In [3]:
namespace = xml_dict['terminology']['namespace']
ref_namespaces = xml_dict['terminology']['referencedNamespace']

proptypes = xml_dict['terminology']['proptype']
assntypes = xml_dict['terminology']['assntype'] # relations e.g. may_treat
qualtypes = xml_dict['terminology']['qualtype']

terms = xml_dict['terminology']['term'] # name of concepts in MEDRT
concepts = xml_dict['terminology']['concept'] # detailed description of concept

associations = xml_dict['terminology']['association'] # detailed description of relations

In [23]:
may_treat_assoc = []
parent_of_assoc = []

for assoc in associations:
    if assoc['name'] == "may_treat":
        may_treat_assoc.append(assoc)
    elif assoc['name'] == "Parent Of":
        parent_of_assoc.append(assoc)

In [24]:
parent_child_map = {}

for pc_assoc in parent_of_assoc:
    parent_child_map[pc_assoc['from_code']] = pc_assoc

In [25]:
may_treat_assoc_inferred = may_treat_assoc.copy()

for mt_assoc in may_treat_assoc:
    from_code = mt_assoc['from_code']
    to_code = mt_assoc['to_code']

    while from_code in parent_child_map:
        assoc = parent_child_map[from_code]
        
        new_mt_assoc = OrderedDict()
        new_mt_assoc['from_namespace'] = assoc['to_namespace']
        new_mt_assoc['from_code'] = assoc['to_code']
        new_mt_assoc['to_namespace'] = mt_assoc['to_namespace']
        new_mt_assoc['to_code'] = mt_assoc['to_code']
        may_treat_assoc_inferred.append(new_mt_assoc)

        from_code = assoc['to_code']
    
    while to_code in parent_child_map:
        assoc = parent_child_map[to_code]
        
        new_mt_assoc = OrderedDict()
        new_mt_assoc['to_namespace'] = assoc['to_namespace']
        new_mt_assoc['to_code'] = assoc['to_code']
        
        new_mt_assoc['from_namespace'] = mt_assoc['from_namespace']
        new_mt_assoc['from_code'] = mt_assoc['from_code']
        may_treat_assoc_inferred.append(new_mt_assoc)

        to_code = assoc['to_code']

In [26]:
code_map = {}
relations = []

for mt_assoc in may_treat_assoc_inferred:
    code_map[mt_assoc['from_code']] = { 'code': mt_assoc['from_code'], 'ns': mt_assoc['from_namespace'], 'type': 'drug' }
    code_map[mt_assoc['to_code']] = { 'code': mt_assoc['to_code'], 'ns': mt_assoc['to_namespace'], 'type': 'disease' }
    
    relations.append((mt_assoc['from_code'], mt_assoc['to_code']))

In [12]:
drugs = [d for d in code_map.values() if d['type'] == 'drug']
diseases = [d for d in code_map.values() if d['type'] == 'disease']

# sort by namespace
drugs = sorted(drugs, key=lambda k: k['ns'])
diseases = sorted(diseases, key=lambda k: k['ns']) 

In [13]:
code_to_id_map = {}
with open('./output/node_list.txt', 'w') as node_list:
    for idx, drug in enumerate(drugs):
        node_list.write('{} {} {} {}\n'.format(idx, drug['code'], drug['type'], drug['ns']))
        code_to_id_map[drug['code']] = idx
        
    for idx, disease in enumerate(diseases):
        cont_id = idx + len(drugs)
        node_list.write('{} {} {} {}\n'.format(cont_id, disease['code'], disease['type'], disease['ns']))
        code_to_id_map[disease['code']] = cont_id

In [14]:
with open('./output/medrt.edgelist', 'w') as edge_file:
    for rel in relations:
        drug_id = code_to_id_map[rel[0]]
        disease_id = code_to_id_map[rel[1]]
        
        edge_file.write("{} {}\n".format(drug_id, disease_id))

In [27]:
len(may_treat_assoc_inferred)

14476