# Generate graphs from UIMA XML files

In [16]:
import networkx as nx, lxml, lxml.etree, glob, subprocess, re, joblib, os, networkx, collections, traceback
from IPython.display import display
from PIL.Image import Image

%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Preprocess PMC texts

In [2]:
# Section markers used by the PMC plain-text dumps (matched case-insensitively,
# each on a line of its own).
INTRODUCTION_RE = re.compile(r'^==== Body$', re.I | re.MULTILINE)
REFERENCES_RE = re.compile(r'^==== Refs$', re.I | re.MULTILINE)


def clean_pmc_text(txt):
    """Return only the article body of a PMC text.

    Everything up to and including the first ``==== Body`` marker line is
    dropped, then everything from the first following ``==== Refs`` marker
    line onwards is dropped.  A missing marker leaves that side untouched.
    """
    body_marker = INTRODUCTION_RE.search(txt)
    body_start = body_marker.end() if body_marker else 0
    body = txt[body_start:]

    refs_marker = REFERENCES_RE.search(body)
    if refs_marker is None:
        return body
    return body[:refs_marker.start()]


def preprocess_text(in_file, out_file):
    """Read ``in_file``, strip it with ``clean_pmc_text`` and write the result to ``out_file``."""
    with open(in_file, 'r') as src:
        raw_text = src.read()

    cleaned_text = clean_pmc_text(raw_text)

    with open(out_file, 'w') as dst:
        dst.write(cleaned_text)


def preprocess_texts(files, out_dir, n_jobs=1):
    """Run ``preprocess_text`` over ``files`` via joblib.

    Each output is written into ``out_dir`` under the base name of its input
    file; ``n_jobs`` is passed straight to ``joblib.Parallel``.
    """
    tasks = (
        joblib.delayed(preprocess_text)(
            src_path,
            os.path.join(out_dir, os.path.basename(src_path)),
        )
        for src_path in files
    )
    joblib.Parallel(n_jobs=n_jobs)(tasks)

In [3]:
# preprocess_texts(glob.glob('./data/0_sources/*.txt'),
#                  './data/1_preprocessed/',
#                  n_jobs=1)

## Execute cTAKES

In [4]:
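# UMLS credentials are taken from the UMLS_USER / UMLS_PASS environment variables; never store them in the notebook.
# !/notebook/ctakes/apache-ctakes-4.0.0/bin/runClinicalPipeline.sh -i /notebook/data/1_preprocessed/ --xmiOut /notebook/data/2_xmi/ --user "$UMLS_USER" --pass "$UMLS_PASS" &>/dev/null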

## Load TypeSystem

What kind of information will we have in training data?

In [5]:
# Matches names that start with the literal prefix ``uima.cas.``.
# The final dot is escaped so that e.g. ``uima.casX...`` is not accepted.
BUILTIN_TYPES_RE = re.compile(r'^uima\.cas\.')

# Matches ``uima.cas.*`` names ending in ``Array`` or ``List``; used to flag
# a feature's range type as a collection.
LIST_TYPES_RE = re.compile(r'uima\.cas\..*(Array|List)$')

# Supertypes at which the walk up the type hierarchy stops when the
# properties of a type are flattened.
TOP_TYPES = {
    'uima.cas.TOP',
    'uima.cas.AnnotationBase',
    'uima.tcas.Annotation',
    'uima.cas.EmptyFSList',
    'uima.cas.NonEmptyFSList',
    'uima.cas.NULL',
    'uima.tcas.DocumentAnnotation',
}


def parse_bool(s):
    """Return ``True`` only when ``s`` spells ``true`` (in any letter case)."""
    lowered = s.lower()
    return 'true' == lowered


# Parser for the string form of each primitive UIMA type.
SIMPLE_TYPES_MAPPING = {
    'uima.cas.Integer': int,
    'uima.cas.Float': float,
    'uima.cas.String': str,
    'uima.cas.Boolean': parse_bool,
    'uima.cas.Byte': int,
    'uima.cas.Short': int,
    'uima.cas.Long': int,
    'uima.cas.Double': float,
}

# Prefix -> namespace URI bindings used by the XPath queries on TypeSystem.xml.
TYPESYSTEM_NAMESPACES = {'uima': 'http://uima.apache.org/resourceSpecifier'}

IGNORED_TYPES = re.compile(r'\.(Sofa|NULL|View|DocumentAnnotation|.*FSList|NULL)$')
ANNOTATION_CUE_SLOT_NAME = 'ANN_CUE'


def query_xpath(root, xpath):
    """Evaluate ``xpath`` on ``root`` with the ``TYPESYSTEM_NAMESPACES`` prefixes bound."""
    matches = root.xpath(xpath, namespaces=TYPESYSTEM_NAMESPACES)
    return matches


def get_xpath_str(root, xpath):
    """Concatenate all string results of ``xpath`` on ``root`` (empty string when there are none)."""
    fragments = query_xpath(root, xpath)
    return ''.join(fragments)


def load_feature_info(node):
    """Turn one ``featureDescription`` element into a ``(name, info)`` pair.

    ``info`` always carries ``'type'`` (the text of ``rangeTypeName``) and
    ``'is_list'`` (whether that type matches ``LIST_TYPES_RE``); when the
    element has a non-empty ``elementType`` it is stored as ``'value_type'``.
    """
    range_type = get_xpath_str(node, './uima:rangeTypeName/text()')
    element_type = get_xpath_str(node, './uima:elementType/text()')

    info = {
        'type': range_type,
        'is_list': LIST_TYPES_RE.search(range_type) is not None,
    }
    if element_type:
        info['value_type'] = element_type

    feature_name = get_xpath_str(node, './uima:name/text()')
    return (feature_name, info)


def load_type_info(node):
    """Turn one ``typeDescription`` element into a dict.

    The result has the keys ``'name'``, ``'supertype'`` and ``'properties'``
    (feature name -> feature info, for the features declared on this type).
    """
    feature_nodes = query_xpath(node, './uima:features/uima:featureDescription')
    properties = dict(load_feature_info(feat_node) for feat_node in feature_nodes)
    return {
        'name': get_xpath_str(node, './uima:name/text()'),
        'supertype': get_xpath_str(node, './uima:supertypeName/text()'),
        'properties': properties,
    }


def build_full_properties(all_types, typename):
    """Collect the features of ``typename`` together with those it inherits.

    The supertype chain is followed until a type has no supertype or its
    supertype is one of ``TOP_TYPES``.  Properties are merged starting from
    the most distant ancestor, so a more specific type overrides a feature of
    the same name declared higher up.
    """
    lineage = [typename]
    current = all_types[typename]
    while True:
        parent_name = current.get('supertype', '')
        if not parent_name or parent_name in TOP_TYPES:
            break
        current = all_types[parent_name]
        lineage.append(current['name'])

    merged = {}
    for ancestor_name in reversed(lineage):
        merged.update(all_types[ancestor_name]['properties'])
    return merged


def load_typesystem(in_fname):
    """Load a UIMA type system description and flatten its inheritance.

    Args:
        in_fname: path of the TypeSystem XML file.

    Returns:
        dict mapping each type name to its full feature table
        (feature name -> feature info), including inherited features as
        computed by ``build_full_properties``.
    """
    # Binary mode lets the XML parser honour the encoding declared in the
    # document instead of depending on the locale's default text encoding.
    with open(in_fname, 'rb') as f:
        tree = lxml.etree.parse(f)

    # The parsed tree is fully in memory, so the file can be closed before
    # the type table is built.
    type_nodes = query_xpath(tree, '/uima:typeSystemDescription/uima:types/uima:typeDescription')
    types_hierarchy = {t['name']: t
                       for t in (load_type_info(type_node) for type_node in type_nodes)}
    return {typename: build_full_properties(types_hierarchy, typename)
            for typename in types_hierarchy}

In [6]:
# Flattened feature table of the cTAKES type system (type name -> features).
typesystem = load_typesystem(
    '/notebook/ctakes/apache-ctakes-4.0.0/resources/org/apache/ctakes/typesystem/types/TypeSystem.xml'
)

In [7]:
# Number of types described by the loaded type system.
print('Total types: {}'.format(len(typesystem)))

Total types: 161


In [8]:
# Dump every type followed by its features, one "name<TAB>info" line per
# feature, with a blank line after each type.
for tname, tinfo in typesystem.items():
    print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!',
          tname,
          '\n'.join(['{}\t{}'.format(feat_name, feat_info)
                     for feat_name, feat_info in tinfo.items()]),
          '',
          sep='\n')

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
org.apache.ctakes.typesystem.type.syntax.WordToken
tokenNumber	{'type': 'uima.cas.Integer', 'is_list': False}
normalizedForm	{'type': 'uima.cas.String', 'is_list': False}
partOfSpeech	{'type': 'uima.cas.String', 'is_list': False}
lemmaEntries	{'type': 'uima.cas.FSList', 'is_list': True, 'value_type': 'org.apache.ctakes.typesystem.type.syntax.Lemma'}
capitalization	{'type': 'uima.cas.Integer', 'is_list': False}
numPosition	{'type': 'uima.cas.Integer', 'is_list': False}
suggestion	{'type': 'uima.cas.String', 'is_list': False}
canonicalForm	{'type': 'uima.cas.String', 'is_list': False}

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
org.apache.ctakes.typesystem.type.syntax.BaseToken
tokenNumber	{'type': 'uima.cas.Integer', 'is_list': False}
normalizedForm	{'type': 'uima.cas.String', 'is_list': False}
partOfSpeech	{'type': 'uima.cas.String', 'is_list': False}
lemmaEntries	{'type': 'uima.cas.FSList', 'is_list': True, 'value_type': 'org.apache.ctakes.typesystem.type.sy

## XMI -> Graphs

In [9]:
# "{namespace}local" names as produced by lxml: group 1 is the namespace URI,
# group 2 the local name.
NS_RE = re.compile(r'^\{([^}]+)\}(.*)$')


def rmns(s):
    """Strip a leading ``{namespace}`` from ``s``; names without one are returned unchanged."""
    return NS_RE.sub(r'\2', s)


# Namespace URIs of the form "http:///some/package.ecore"; group 1 is the
# slash-separated package path.  The dot before "ecore" is escaped so that
# only a literal ".ecore" suffix is accepted.
NS_TYPE_RE = re.compile(r'^http:///(.*)\.ecore$')


def ns2fqtn(s):
    """Convert a ``{namespace}local`` tag into a fully-qualified type name.

    When the namespace looks like ``http:///a/b.ecore`` the result is
    ``a.b.local``; for any other namespace only the local name is returned.
    A tag without a namespace is returned unchanged (the same way ``rmns``
    treats it) instead of raising ``AttributeError``.
    """
    match = NS_RE.match(s)
    if match is None:
        return s

    ns, local_name = match.groups()
    ns_type_match = NS_TYPE_RE.match(ns)
    if ns_type_match is None:
        return local_name
    package = ns_type_match.group(1).replace('/', '.')
    return package + '.' + local_name


# Keys of an element dict that are left as-is when feature values are parsed.
SPECIAL_FEATURES = {'_type', 'id'}

# Feature descriptions used for attributes that are looked up here rather
# than in the per-type feature table.
PREDEFINED_FEATURES = {
    feature: {'type': 'uima.cas.Integer', 'is_list': False}
    for feature in ('begin', 'end')
}

# Attribute names dropped while reading an element.
ATTRIBS_TO_IGNORE = {'sofa'}

# Elements whose type name ends with one of these suffixes are skipped.
TAGS_TO_IGNORE_RE = re.compile(r'(Sofa|View|NULL|DocumentIdPrefix|FSList)$', re.I)

# Type-name suffixes of the elements that become graph nodes.
NODE_TYPES_RE = re.compile(r'(Token|Concept|Sentence)$')

# Type-name suffixes of the elements chained by "succ" edges; the matched
# suffix is the chain the element belongs to.
ORDER_LINK_CLS_RE = re.compile(r'(Token|Sentence)$')

def xmi2graph(in_file, types):
    """Parse one XMI file into a table of elements and a directed graph.

    Every child of the XMI root whose type name is not matched by
    ``TAGS_TO_IGNORE_RE`` and that has an integer ``id`` attribute becomes a
    dict of its namespace-stripped attributes plus ``'_type'`` (the
    fully-qualified type name) and ``'id'`` (as ``int``).  The remaining
    attribute values are parsed according to ``types``.

    Args:
        in_file: path of the XMI file.
        types: mapping ``type name -> {feature name -> feature info}`` in the
            shape returned by ``load_typesystem``.

    Returns:
        ``(elems_by_id, graph)`` where ``elems_by_id`` maps element id to its
        parsed attribute dict and ``graph`` is an ``nx.DiGraph`` holding the
        elements matched by ``NODE_TYPES_RE`` as nodes, with ``type='succ'``
        edges between consecutive elements (ordered by ``begin``) of each
        ``ORDER_LINK_CLS_RE`` class.

    Raises:
        KeyError: if an element's type, or one of its attributes, is not
            described in ``types`` / ``PREDEFINED_FEATURES``.
        ValueError: if an attribute value cannot be parsed as its declared type.
    """
    # Binary mode lets the XML parser honour the encoding declared in the
    # document instead of depending on the locale's default text encoding.
    with open(in_file, 'rb') as f:
        tree = lxml.etree.parse(f)

    elems_by_id = {}

    # Iterate the root element directly; Element.getchildren() is deprecated.
    for node in tree.getroot():
        tag = ns2fqtn(node.tag)
        if TAGS_TO_IGNORE_RE.search(tag):
            continue

        attribs = {}
        for raw_name, value in node.attrib.items():
            norm_name = rmns(raw_name)
            if norm_name not in ATTRIBS_TO_IGNORE:
                attribs[norm_name] = value
        attribs['_type'] = tag
        try:
            attribs['id'] = int(attribs['id'])
        except (KeyError, ValueError):
            # Elements without a usable integer id are skipped on purpose;
            # any other kind of error is allowed to propagate.
            continue

        elems_by_id[attribs['id']] = attribs

    # process attributes: parse values, collapse lists to arrays
    for elem in elems_by_id.values():
        type_info = types[elem['_type']]
        # Snapshot the items: values are replaced in place while looping.
        for feat_name, src_value in list(elem.items()):
            if feat_name in SPECIAL_FEATURES:
                continue

            if feat_name in PREDEFINED_FEATURES:
                feat_type = PREDEFINED_FEATURES[feat_name]
            else:
                feat_type = type_info[feat_name]

            simple_parser = SIMPLE_TYPES_MAPPING.get(feat_type['type'])
            if simple_parser is not None:
                elem[feat_name] = simple_parser(src_value)
            elif feat_type['is_list']:
                if src_value:
                    # A list feature may have no declared element type; it is
                    # then parsed as ints, like any non-primitive element type.
                    lst_value_parser = SIMPLE_TYPES_MAPPING.get(feat_type.get('value_type'), int)
                    elem[feat_name] = [lst_value_parser(v) for v in src_value.split(' ')]
                else:
                    elem[feat_name] = []
            else:
                # Non-primitive, non-list value; presumably the id of another
                # element - TODO confirm against the XMI files.
                elem[feat_name] = int(src_value)

    # create nodes in graph
    graph = nx.DiGraph()
    for elem_id, elem in elems_by_id.items():
        if NODE_TYPES_RE.search(elem['_type']):
            graph.add_node(elem_id, **elem)

    # link tokens and sentences by order
    elems_by_order_link_cls = collections.defaultdict(list)
    for elem in elems_by_id.values():
        match = ORDER_LINK_CLS_RE.search(elem['_type'])
        if match:
            elems_by_order_link_cls[match.group(1)].append(elem)

    for elems in elems_by_order_link_cls.values():
        elems.sort(key=lambda d: d['begin'])
        graph.add_edges_from(((prev['id'], nxt['id'])
                              for prev, nxt in zip(elems, elems[1:])),
                             type='succ')
    # TODO: link nodes by id
    # TODO: link nodes by text overlap
    # TODO: save graph to disk
    return elems_by_id, graph


def make_graphs_in_memory(in_files, types, n_jobs=15):
    """Run ``xmi2graph`` on every file in ``in_files`` via joblib.

    Returns the list of ``xmi2graph`` results, one per input file.
    """
    jobs = (joblib.delayed(xmi2graph)(fname, types) for fname in in_files)
    runner = joblib.Parallel(n_jobs=n_jobs)
    return runner(jobs)


def xmi2graph_to_file(in_file, types, out_file):
    """Convert one XMI file to a graph and write it to ``out_file`` as GraphML.

    Args:
        in_file: path of the XMI file.
        types: flattened type system, passed through to ``xmi2graph``.
        out_file: destination passed to ``nx.write_graphml``.
    """
    # xmi2graph returns (elems_by_id, graph); only the graph is written.
    _elems_by_id, graph = xmi2graph(in_file, types)
    # NOTE(review): xmi2graph stores list-valued node attributes; the GraphML
    # writer may reject non-scalar values - confirm before relying on this.
    nx.write_graphml(graph, out_file)


def make_graphs_to_files(in_files, types, out_dir, n_jobs=15):
    """Convert every XMI file in ``in_files`` and write the graphs into ``out_dir``.

    Each output file keeps the base name of its input file.

    Args:
        in_files: iterable of XMI file paths.
        types: flattened type system, passed through to ``xmi2graph_to_file``.
        out_dir: directory that receives the output files.
        n_jobs: number of joblib workers.  The original body read an
            undefined ``n_jobs``; the default matches ``make_graphs_in_memory``.
    """
    joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(xmi2graph_to_file)(fname,
                                          types,
                                          os.path.join(out_dir, os.path.basename(fname)))
        for fname in in_files)

In [10]:
# Sort before slicing: glob returns paths in arbitrary order, so an unsorted
# "[:10]" could select a different sample of files on another run or machine.
graphs = make_graphs_in_memory(sorted(glob.glob('/notebook/data/2_xmi/*.xmi'))[:10],
                               typesystem)

In [11]:
# Frequency of each element type across all loaded files, most common first.
collections.Counter(
    elem['_type']
    for elems_by_id, _graph in graphs
    for elem in elems_by_id.values()
).most_common()

[('org.apache.ctakes.typesystem.type.syntax.WordToken', 39119),
 ('org.apache.ctakes.typesystem.type.syntax.Chunk', 31238),
 ('org.apache.ctakes.typesystem.type.syntax.PunctuationToken', 9399),
 ('org.apache.ctakes.typesystem.type.refsem.UmlsConcept', 9233),
 ('org.apache.ctakes.typesystem.type.textsem.SemanticArgument', 7619),
 ('org.apache.ctakes.typesystem.type.syntax.NumToken', 4813),
 ('org.apache.ctakes.typesystem.type.textsem.Predicate', 4766),
 ('org.apache.ctakes.typesystem.type.textspan.Sentence', 3643),
 ('org.apache.ctakes.typesystem.type.syntax.NewlineToken', 2056),
 ('org.apache.ctakes.typesystem.type.syntax.SymbolToken', 1898),
 ('org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode', 868),
 ('org.apache.ctakes.typesystem.type.syntax.ContractionToken', 28),
 ('org.apache.ctakes.typesystem.type.textsem.SemanticRoleRelation', 10)]