# Process scientific articles with cTAKES, generate graphs, and then build a training dataset for BigARTM

In [1]:
import networkx as nx, lxml, lxml.etree, glob, subprocess, \
    re, joblib, os, networkx, collections, traceback, numpy, \
    bisect, itertools, json, tqdm

from IPython.display import display
from PIL.Image import Image

%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Preprocess PMC texts

In [2]:
# Section markers searched for by clean_pmc_text: a line consisting of
# "==== Body" or "==== Refs" (case-insensitive).
INTRODUCTION_RE = re.compile(r'^==== Body$', re.I | re.MULTILINE)
REFERENCES_RE = re.compile(r'^==== Refs$', re.I | re.MULTILINE)


def clean_pmc_text(txt):
    """Return the part of ``txt`` between the "==== Body" and "==== Refs" lines.

    Everything up to and including the first "==== Body" marker is dropped,
    then everything from the first following "==== Refs" marker onwards.
    A marker that is absent leaves the corresponding side untouched.
    """
    body_marker = INTRODUCTION_RE.search(txt)
    body = txt[body_marker.end():] if body_marker else txt

    refs_marker = REFERENCES_RE.search(body)
    return body[:refs_marker.start()] if refs_marker else body


def preprocess_text(in_file, out_file):
    """Read ``in_file``, strip it with clean_pmc_text and write the result to ``out_file``."""
    with open(in_file, 'r') as src:
        cleaned = clean_pmc_text(src.read())
    with open(out_file, 'w') as dst:
        dst.write(cleaned)


def preprocess_texts(files, out_dir, n_jobs=1):
    """Run preprocess_text over ``files`` with joblib, writing into ``out_dir``.

    Each output keeps the base name of its input file.
    """
    tasks = (joblib.delayed(preprocess_text)(src_path,
                                             os.path.join(out_dir, os.path.basename(src_path)))
             for src_path in files)
    joblib.Parallel(n_jobs=n_jobs)(tasks)

In [3]:
# preprocess_texts(glob.glob('./data/0_sources/*.txt'),
#                  './data/1_preprocessed/',
#                  n_jobs=1)

## Execute cTAKES

In [4]:
!/notebook/ctakes/apache-ctakes-4.0.0/bin/runClinicalPipeline.sh -i /notebook/data/1_preprocessed/ --xmiOut /notebook/data/2_xmi/ --user ****  --pass "****" # &>/dev/null

log4j: reset attribute= "false".
log4j: Threshold ="null".
log4j: Retreiving an instance of org.apache.log4j.Logger.
log4j: Setting [ProgressAppender] additivity to [false].
log4j: Level value for ProgressAppender is  [INFO].
log4j: ProgressAppender level set to INFO
log4j: Class name: [org.apache.log4j.ConsoleAppender]
log4j: Parsing layout of class: "org.apache.log4j.PatternLayout"
log4j: Setting property [conversionPattern] to [%m].
log4j: Adding appender named [noEolAppender] to category [ProgressAppender].
log4j: Retreiving an instance of org.apache.log4j.Logger.
log4j: Setting [ProgressDone] additivity to [false].
log4j: Level value for ProgressDone is  [INFO].
log4j: ProgressDone level set to INFO
log4j: Class name: [org.apache.log4j.ConsoleAppender]
log4j: Parsing layout of class: "org.apache.log4j.PatternLayout"
log4j: Setting property [conversionPattern] to [%m%n].
log4j: Adding appender named [eolAppender] to category [ProgressDone].
log4j: Level value for root is  [INFO].
l

13 Dec 2017 15:16:05  INFO SentenceDetector - Starting processing.
13 Dec 2017 15:16:05  INFO TokenizerAnnotatorPTB - process(JCas) in org.apache.ctakes.core.ae.TokenizerAnnotatorPTB
13 Dec 2017 15:16:05  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 15:16:05  INFO POSTagger - process(JCas)
13 Dec 2017 15:16:05  INFO Chunker -  process(JCas)
13 Dec 2017 15:16:05  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:16:05  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:16:05  INFO AbstractJCasTermAnnotator - Starting processing
13 Dec 2017 15:16:05  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:16:05  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:16:05  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
Dec 13, 2017 3:16:05 PM org.apache.uima.util.MessageReport decreasingWithTrace(51)
13 Dec 2017 15:16:05  INFO SentenceDetector - Starting processing.
13 Dec 2017 15:16:05  INFO TokenizerAnnotatorPTB - process(J

13 Dec 2017 15:20:59  INFO Chunker -  process(JCas)
13 Dec 2017 15:21:00  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:21:00  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:21:00  INFO AbstractJCasTermAnnotator - Starting processing
13 Dec 2017 15:21:00  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:21:00  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:21:00  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
13 Dec 2017 15:21:27  INFO SentenceDetector - Starting processing.
13 Dec 2017 15:21:27  INFO TokenizerAnnotatorPTB - process(JCas) in org.apache.ctakes.core.ae.TokenizerAnnotatorPTB
13 Dec 2017 15:21:27  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 15:21:27  INFO POSTagger - process(JCas)
13 Dec 2017 15:21:27  INFO Chunker -  process(JCas)
13 Dec 2017 15:21:28  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:21:28  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:21:28  INFO AbstractJCasTer

13 Dec 2017 15:24:36  INFO POSTagger - process(JCas)
13 Dec 2017 15:24:36  INFO Chunker -  process(JCas)
13 Dec 2017 15:24:37  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:24:37  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:24:37  INFO AbstractJCasTermAnnotator - Starting processing
13 Dec 2017 15:24:37  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:24:37  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:24:37  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
13 Dec 2017 15:24:59  INFO SentenceDetector - Starting processing.
13 Dec 2017 15:24:59  INFO TokenizerAnnotatorPTB - process(JCas) in org.apache.ctakes.core.ae.TokenizerAnnotatorPTB
13 Dec 2017 15:24:59  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 15:24:59  INFO POSTagger - process(JCas)
13 Dec 2017 15:24:59  INFO Chunker -  process(JCas)
13 Dec 2017 15:24:59  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:24:59  INFO ChunkAdjuster -  pro

13 Dec 2017 15:29:26  INFO POSTagger - process(JCas)
13 Dec 2017 15:29:26  INFO Chunker -  process(JCas)
13 Dec 2017 15:29:26  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:29:26  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:29:26  INFO AbstractJCasTermAnnotator - Starting processing
13 Dec 2017 15:29:26  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:29:26  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:29:26  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
13 Dec 2017 15:29:44  INFO SentenceDetector - Starting processing.
13 Dec 2017 15:29:44  INFO TokenizerAnnotatorPTB - process(JCas) in org.apache.ctakes.core.ae.TokenizerAnnotatorPTB
13 Dec 2017 15:29:44  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 15:29:44  INFO POSTagger - process(JCas)
13 Dec 2017 15:29:44  INFO Chunker -  process(JCas)
13 Dec 2017 15:29:44  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:29:44  INFO ChunkAdjuster -  pro

13 Dec 2017 15:32:05  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:32:05  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:32:05  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
13 Dec 2017 15:32:10  INFO SentenceDetector - Starting processing.
13 Dec 2017 15:32:10  INFO TokenizerAnnotatorPTB - process(JCas) in org.apache.ctakes.core.ae.TokenizerAnnotatorPTB
13 Dec 2017 15:32:10  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 15:32:10  INFO POSTagger - process(JCas)
13 Dec 2017 15:32:10  INFO Chunker -  process(JCas)
13 Dec 2017 15:32:11  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:32:11  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:32:11  INFO AbstractJCasTermAnnotator - Starting processing
13 Dec 2017 15:32:11  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:32:11  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:32:11  INFO ClearNLPSemanticRoleLabelerAE

13 Dec 2017 15:38:13  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:38:13  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:38:13  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
13 Dec 2017 15:38:59  INFO SentenceDetector - Starting processing.
13 Dec 2017 15:38:59  INFO TokenizerAnnotatorPTB - process(JCas) in org.apache.ctakes.core.ae.TokenizerAnnotatorPTB
13 Dec 2017 15:38:59  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 15:38:59  INFO POSTagger - process(JCas)
13 Dec 2017 15:38:59  INFO Chunker -  process(JCas)
13 Dec 2017 15:38:59  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:38:59  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:38:59  INFO AbstractJCasTermAnnotator - Starting processing
13 Dec 2017 15:38:59  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:38:59  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:39:00  INFO ClearNLPSemanticRoleLabelerAE

13 Dec 2017 15:43:48  INFO Chunker -  process(JCas)
13 Dec 2017 15:43:48  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:43:48  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:43:48  INFO AbstractJCasTermAnnotator - Starting processing
13 Dec 2017 15:43:48  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:43:48  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:43:48  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
13 Dec 2017 15:43:51  INFO SentenceDetector - Starting processing.
13 Dec 2017 15:43:51  INFO TokenizerAnnotatorPTB - process(JCas) in org.apache.ctakes.core.ae.TokenizerAnnotatorPTB
13 Dec 2017 15:43:51  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 15:43:51  INFO POSTagger - process(JCas)
13 Dec 2017 15:43:51  INFO Chunker -  process(JCas)
13 Dec 2017 15:43:51  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:43:51  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:43:51  INFO AbstractJCasTer

13 Dec 2017 15:50:14  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:50:14  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
13 Dec 2017 15:50:14  INFO SentenceDetector - Starting processing.
13 Dec 2017 15:50:14  INFO TokenizerAnnotatorPTB - process(JCas) in org.apache.ctakes.core.ae.TokenizerAnnotatorPTB
13 Dec 2017 15:50:14  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 15:50:14  INFO POSTagger - process(JCas)
13 Dec 2017 15:50:14  INFO Chunker -  process(JCas)
13 Dec 2017 15:50:14  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:50:14  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:50:14  INFO AbstractJCasTermAnnotator - Starting processing
13 Dec 2017 15:50:14  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:50:14  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:50:14  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
13 Dec 2017 15:50:17  INFO SentenceDetector - Starti

13 Dec 2017 15:54:52  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 15:54:52  INFO POSTagger - process(JCas)
13 Dec 2017 15:54:52  INFO Chunker -  process(JCas)
13 Dec 2017 15:54:52  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:54:52  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:54:52  INFO AbstractJCasTermAnnotator - Starting processing
13 Dec 2017 15:54:52  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:54:53  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:54:53  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
13 Dec 2017 15:55:15  INFO SentenceDetector - Starting processing.
13 Dec 2017 15:55:15  INFO TokenizerAnnotatorPTB - process(JCas) in org.apache.ctakes.core.ae.TokenizerAnnotatorPTB
13 Dec 2017 15:55:16  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 15:55:16  INFO POSTagger - process(JCas)
13 Dec 2017 15:55:16  INFO Chunker -  process(JCas)
13 Dec 2017 15:55:16  INFO 

13 Dec 2017 15:59:56  INFO Chunker -  process(JCas)
13 Dec 2017 15:59:56  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:59:56  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 15:59:56  INFO AbstractJCasTermAnnotator - Starting processing
13 Dec 2017 15:59:56  INFO AbstractJCasTermAnnotator - Finished processing
13 Dec 2017 15:59:57  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 15:59:57  INFO ClearNLPSemanticRoleLabelerAE - Finished processing
13 Dec 2017 16:00:06  INFO SentenceDetector - Starting processing.
13 Dec 2017 16:00:06  INFO TokenizerAnnotatorPTB - process(JCas) in org.apache.ctakes.core.ae.TokenizerAnnotatorPTB
13 Dec 2017 16:00:06  INFO ContextDependentTokenizerAnnotator - process(JCas)
13 Dec 2017 16:00:06  INFO POSTagger - process(JCas)
13 Dec 2017 16:00:06  INFO Chunker -  process(JCas)
13 Dec 2017 16:00:06  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 16:00:06  INFO ChunkAdjuster -  process(JCas)
13 Dec 2017 16:00:06  INFO AbstractJCasTer

13 Dec 2017 16:02:08  INFO ClearNLPSemanticRoleLabelerAE - Starting processing
13 Dec 2017 16:02:08  INFO ClearNLPSemanticRoleLabelerAE - Finished processing


## Load UIMA TypeSystem

What kind of information will we have in the training data?

In [5]:
# Names that live in the built-in "uima.cas." namespace.
# The trailing dot is escaped so that e.g. "uima.casX.Foo" is not matched.
BUILTIN_TYPES_RE = re.compile(r'^uima\.cas\.')
# Built-in collection range types: any "uima.cas." name ending in Array or List.
LIST_TYPES_RE = re.compile(r'uima\.cas\..*(Array|List)$')
# Supertypes at which build_full_properties stops walking up the hierarchy.
TOP_TYPES = {'uima.cas.TOP',
             'uima.cas.AnnotationBase',
             'uima.tcas.Annotation',
             'uima.cas.EmptyFSList',
             'uima.cas.NonEmptyFSList',
             'uima.cas.NULL',
             'uima.tcas.DocumentAnnotation' }


def parse_bool(s):
    """Return True only when ``s`` spells 'true' in any letter case."""
    lowered = s.lower()
    return lowered == 'true'


# Parsers for primitive UIMA range types, keyed by fully-qualified type name.
# A feature whose range type is listed here is converted from its XMI string
# form with the mapped callable.
SIMPLE_TYPES_MAPPING = {'uima.cas.Integer' : int,
                        'uima.cas.Float' : float,
                        'uima.cas.String' : str,
                        'uima.cas.Boolean' : parse_bool,
                        'uima.cas.Byte' : int,
                        'uima.cas.Short' : int,
                        'uima.cas.Long' : int,
                        'uima.cas.Double' : float }
# Prefix -> namespace URI map passed to every XPath query on the type system XML.
TYPESYSTEM_NAMESPACES = { 'uima' : 'http://uima.apache.org/resourceSpecifier' }
# NOTE(review): 'NULL' is listed twice in the alternation (redundant), and this
# pattern is not referenced in the visible part of the notebook.
IGNORED_TYPES = re.compile(r'\.(Sofa|NULL|View|DocumentAnnotation|.*FSList|NULL)$')
# NOTE(review): not referenced in the visible part of the notebook.
ANNOTATION_CUE_SLOT_NAME = 'ANN_CUE'


def query_xpath(root, xpath):
    """Evaluate ``xpath`` on ``root`` with the prefixes from TYPESYSTEM_NAMESPACES bound."""
    matches = root.xpath(xpath, namespaces=TYPESYSTEM_NAMESPACES)
    return matches


def get_xpath_str(root, xpath):
    """Concatenate every string returned by ``xpath`` ('' when nothing matches)."""
    pieces = query_xpath(root, xpath)
    return ''.join(pieces)


def load_feature_info(node):
    """Describe one ``featureDescription`` node.

    Returns a ``(feature_name, info)`` pair where ``info`` holds the range type
    under 'type', a flag 'is_list' (range type matches LIST_TYPES_RE) and,
    only when the node declares an element type, that type under 'value_type'.
    """
    range_type = get_xpath_str(node, './uima:rangeTypeName/text()')
    element_type = get_xpath_str(node, './uima:elementType/text()')

    info = {'type': range_type,
            'is_list': LIST_TYPES_RE.search(range_type) is not None}
    if element_type:
        info['value_type'] = element_type

    feature_name = get_xpath_str(node, './uima:name/text()')
    return (feature_name, info)


def load_type_info(node):
    """Describe one ``typeDescription`` node.

    Returns a dict with the type's 'name', its 'supertype' name and
    'properties': the features declared directly on this type, keyed by name.
    """
    type_name = get_xpath_str(node, './uima:name/text()')
    supertype_name = get_xpath_str(node, './uima:supertypeName/text()')

    own_features = {}
    for feat_node in query_xpath(node, './uima:features/uima:featureDescription'):
        feat_name, feat_info = load_feature_info(feat_node)
        own_features[feat_name] = feat_info

    return {'name': type_name,
            'supertype': supertype_name,
            'properties': own_features}


def build_full_properties(all_types, typename):
    """Collect the features of ``typename`` including those inherited from supertypes.

    The supertype chain is followed until a type has no supertype or its
    supertype is one of TOP_TYPES. Features are merged from the most distant
    ancestor down to ``typename``, so a subtype's feature overrides an
    ancestor's feature of the same name.
    """
    current = all_types[typename]
    lineage = [typename]
    while True:
        parent_name = current.get('supertype', '')
        if not parent_name or parent_name in TOP_TYPES:
            break
        current = all_types[parent_name]
        lineage.append(current['name'])

    merged = {}
    for ancestor_name in reversed(lineage):
        merged.update(all_types[ancestor_name]['properties'])
    return merged


def load_typesystem(in_fname):
    """Load a UIMA type system descriptor and flatten its inheritance.

    Args:
        in_fname: path to the type system XML file.

    Returns:
        dict mapping each declared type name to its full feature dict
        (own plus inherited features, as built by build_full_properties).
    """
    # lxml expects file objects in binary mode: the XML parser then decodes the
    # document itself instead of relying on the locale's default text encoding.
    with open(in_fname, 'rb') as f:
        tree = lxml.etree.parse(f)

    types_hierarchy = { t['name'] : t for t in
                       (load_type_info(type_node) for type_node in
                        query_xpath(tree, '/uima:typeSystemDescription/uima:types/uima:typeDescription')) }
    flattened_types = { typename : build_full_properties(types_hierarchy, typename)
                       for typename in types_hierarchy.keys() }
    return flattened_types

In [6]:
# Flattened cTAKES type system: type name -> {feature name -> feature info}.
typesystem = load_typesystem('/notebook/ctakes/apache-ctakes-4.0.0/resources/org/apache/ctakes/typesystem/types/TypeSystem.xml')

In [7]:
# Number of types declared in the loaded type system.
print('Total types:', len(typesystem))

Total types: 161


In [8]:
# Dump every type with its flattened features, one "name<TAB>info" line per
# feature, separated by a banner line.
for tname, tinfo in typesystem.items():
    print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    print(tname)
    print('\n'.join('{}\t{}'.format(feat_name, feat_info) for feat_name, feat_info in tinfo.items()))
    print()

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
org.apache.ctakes.typesystem.type.syntax.WordToken
tokenNumber	{'type': 'uima.cas.Integer', 'is_list': False}
normalizedForm	{'type': 'uima.cas.String', 'is_list': False}
partOfSpeech	{'type': 'uima.cas.String', 'is_list': False}
lemmaEntries	{'type': 'uima.cas.FSList', 'is_list': True, 'value_type': 'org.apache.ctakes.typesystem.type.syntax.Lemma'}
capitalization	{'type': 'uima.cas.Integer', 'is_list': False}
numPosition	{'type': 'uima.cas.Integer', 'is_list': False}
suggestion	{'type': 'uima.cas.String', 'is_list': False}
canonicalForm	{'type': 'uima.cas.String', 'is_list': False}

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
org.apache.ctakes.typesystem.type.syntax.BaseToken
tokenNumber	{'type': 'uima.cas.Integer', 'is_list': False}
normalizedForm	{'type': 'uima.cas.String', 'is_list': False}
partOfSpeech	{'type': 'uima.cas.String', 'is_list': False}
lemmaEntries	{'type': 'uima.cas.FSList', 'is_list': True, 'value_type': 'org.apache.ctakes.typesystem.type.sy

## XMI -> Graphs

In [9]:
# Splits an ElementTree-style qualified name "{namespace}local" into
# group 1 (namespace) and group 2 (local name).
NS_RE = re.compile(r'^\{([^}]+)\}(.*)$')
def rmns(s):
    """Strip a leading "{namespace}" from ``s``; names without one are returned unchanged."""
    return NS_RE.sub(r'\2', s)


# Namespaces of the form "http:///some/package.ecore"; group 1 is the
# slash-separated package path. The dot before "ecore" is matched literally.
NS_TYPE_RE = re.compile(r'^http:///(.*)\.ecore$')
def ns2fqtn(s):
    """Turn a "{namespace}Local" tag into a dotted fully-qualified type name.

    For an ".ecore" namespace the package path becomes the dotted prefix, e.g.
    "{http:///uima/cas.ecore}Sofa" -> "uima.cas.Sofa". For any other namespace
    only the local name is returned. A name without a "{namespace}" part is
    returned unchanged instead of raising AttributeError.
    """
    match = NS_RE.match(s)
    if match is None:
        return s
    ns, local_name = match.groups()
    ns_type_match = NS_TYPE_RE.match(ns)
    prefix = (ns_type_match.group(1).replace('/', '.') + '.') if ns_type_match else ''
    return prefix + local_name


def serialize_attribs(attribs):
    """Return a copy of ``attribs`` without 'id' and with lists flattened.

    List values become a single space-separated string of their items; all
    other values are copied as they are. Raises KeyError when ``attribs`` has
    no 'id' key. The input dict is not modified.
    """
    flat = {key: (' '.join(map(str, value)) if isinstance(value, list) else value)
            for key, value in attribs.items()}
    del flat['id']
    return flat


def overlapping_id_pairs(elems_by_id, from_type_re, to_type_re):
    """Yield ``(from_id, to_id)`` for every pair of elements whose spans overlap.

    Only elements carrying both 'begin' and 'end' are considered. The first id
    comes from elements whose '_type' matches ``from_type_re``, the second from
    elements matching ``to_type_re``. An element is never paired with itself,
    and spans that merely touch (one ends where the other begins) do not count.
    """
    sources = []
    targets = []
    for candidate in elems_by_id.values():
        if 'begin' not in candidate or 'end' not in candidate:
            continue
        if from_type_re.search(candidate['_type']):
            sources.append(candidate)
        if to_type_re.search(candidate['_type']):
            targets.append(candidate)

    for source in sources:
        source_id, source_begin, source_end = source['id'], source['begin'], source['end']
        for target in targets:
            target_id = target['id']
            if target_id == source_id:
                continue
            if target['begin'] < source_end and source_begin < target['end']:
                yield (source_id, target_id)


# Attributes added by xmi2graph itself: skipped when parsing feature values
# and always copied onto graph nodes.
SPECIAL_FEATURES = {'id', '_type', 'text'}
# Type info for the span offsets, used instead of a type system lookup.
PREDEFINED_FEATURES = {
    'begin' : { 'type': 'uima.cas.Integer', 'is_list': False },
    'end' : { 'type': 'uima.cas.Integer', 'is_list': False },
}
# Raw attribute names dropped before namespace stripping: an un-namespaced
# 'id' is discarded so that only a namespaced id normalizes to 'id'.
STRANGE_XMI_ATTRIBS = { 'id' }
# Attribute names (after namespace stripping) that are dropped.
ATTRIBS_TO_IGNORE = { 'sofa' }
# Elements whose type name ends in one of these are not loaded at all.
TAGS_TO_IGNORE_RE = re.compile(r'(Sofa|View|NULL|DocumentIdPrefix|FSList|DocumentAnnotation|Segment|NewlineToken)$', re.I)

# Types that become graph nodes; currently matches every non-empty type name.
NODE_TYPES_RE = re.compile(r'.') # re.compile(r'(Token|Concept|Sentence|UmlsConcept)$')
# Elements are grouped by the matched suffix and chained with 'succ' edges
# in order of their 'begin' offset.
ORDER_LINK_CLS_RE = re.compile(r'(Token|Sentence)$')

# (from-type regex, to-type regex, edge type) triples: overlapping spans are
# linked from the first kind of element to the second.
OVERLAP_LINKS = (
    (re.compile(r'Token$'), re.compile(r'^(?!.*Token).*$'), 'annot'),
)

def xmi2graph(in_file, types):
    """Convert one XMI file into annotation dicts and a directed graph.

    Args:
        in_file: path to the XMI file.
        types: flattened type system as returned by load_typesystem
            (type name -> {feature name -> feature info}).

    Returns:
        ``(elems_by_id, graph)`` where ``elems_by_id`` maps the integer id of
        every loaded element to its parsed attributes (plus '_type', 'id' and,
        for elements with a span, 'text'), and ``graph`` is a networkx DiGraph
        with one node per element and three kinds of edges: 'succ' (order of
        tokens / sentences), overlap links from OVERLAP_LINKS, and one edge
        per feature that references another element, typed by feature name.

    Raises:
        KeyError: when an element's type, or one of its attributes, is not
            described in ``types``.
    """
    # lxml expects file objects in binary mode so that the XML parser decodes
    # the document itself rather than the locale's default text encoding.
    with open(in_file, 'rb') as f:
        tree = lxml.etree.parse(f)

    doc_text = lxml.etree.ETXPath('//{http:///uima/cas.ecore}Sofa/@sofaString')(tree)[0]

    elems_by_id = {}

    # Iterate over the root directly: Element.getchildren() is deprecated.
    for node in tree.getroot():
        if not isinstance(node.tag, str):
            # comments and processing instructions carry a non-string tag
            continue
        tag = ns2fqtn(node.tag)
        if TAGS_TO_IGNORE_RE.search(tag):
            continue

        attribs = { norm_a : v
                   for a, v in node.attrib.items()
                   if not a in STRANGE_XMI_ATTRIBS
                   for norm_a in (rmns(a),)
                   if not norm_a in ATTRIBS_TO_IGNORE }
        attribs['_type'] = tag
        try:
            attribs['id'] = int(attribs['id'])
        except (KeyError, ValueError):
            # element without a usable numeric id: skip it, but let any other
            # error (and KeyboardInterrupt) propagate
            continue

        begin = attribs.get('begin', None)
        end = attribs.get('end', None)
        if (not begin is None) and (not end is None):
            attribs['text'] = doc_text[int(begin):int(end)]

        elems_by_id[attribs['id']] = attribs

    # process attributes: parse values, collapse lists to arrays
    for elem_id, elem in elems_by_id.items():
        type_name = elem['_type']

        type_info = types[type_name]
        # only values are replaced below, so iterating over the keys is safe
        for feat_name in elem.keys():
            if feat_name in SPECIAL_FEATURES:
                continue

            if feat_name in PREDEFINED_FEATURES:
                feat_type = PREDEFINED_FEATURES[feat_name]
            else:
                feat_type = type_info[feat_name]
            src_value = elem[feat_name]

            simple_parser = SIMPLE_TYPES_MAPPING.get(feat_type['type'], None)
            if not simple_parser is None:
                elem[feat_name] = simple_parser(src_value)
            elif feat_type['is_list']:
                if src_value:
                    # 'value_type' is only present when the type system declares
                    # an element type; without it the items are parsed as ids
                    lst_value_parser = SIMPLE_TYPES_MAPPING.get(feat_type.get('value_type'), int)
                    lst_values = src_value.split(' ')
                    elem[feat_name] = [lst_value_parser(v) for v in lst_values]
                else:
                    elem[feat_name] = []
            else:
                # any other range type is a reference to another element's id
                elem[feat_name] = int(src_value)

    # create nodes in graph
    graph = nx.DiGraph()
    for elem in elems_by_id.values():
        if not NODE_TYPES_RE.search(elem['_type']):
            continue
        type_info = types[elem['_type']]
        # nodes only carry special features and primitive-typed features
        simple_attribs = { name : value for name, value in elem.items()
                          if name in SPECIAL_FEATURES
                          or (name in type_info
                              and type_info[name]['type'] in SIMPLE_TYPES_MAPPING) }
        graph.add_node(elem['id'], **serialize_attribs(simple_attribs))

    # link tokens and sentences by order
    elems_by_order_link_cls = collections.defaultdict(list)
    for elem in elems_by_id.values():
        match = ORDER_LINK_CLS_RE.search(elem['_type'])
        if not match:
            continue
        link_cls = match.group(1)
        elems_by_order_link_cls[link_cls].append(elem)

    for link_cls, elems in elems_by_order_link_cls.items():
        elems.sort(key=lambda d: d['begin'])
        graph.add_edges_from(((elems[i]['id'], elems[i+1]['id'])
                              for i in range(len(elems) - 1)),
                             type='succ')

    # link nodes by text overlap
    for from_type_re, to_type_re, link_type in OVERLAP_LINKS:
        overlap_edge_endpoints = list(overlapping_id_pairs(elems_by_id,
                                                           from_type_re,
                                                           to_type_re))
        graph.add_edges_from(overlap_edge_endpoints,
                             type=link_type)

    # link nodes by id
    for elem in elems_by_id.values():
        type_info = types[elem['_type']]
        elem_id = elem['id']
        for feat_name, feat_value in elem.items():
            feat_info = type_info.get(feat_name, None)

            # skip special features and anything holding primitive values
            if (feat_info is None
                or feat_info['type'] in SIMPLE_TYPES_MAPPING
                or feat_info.get('value_type', '') in SIMPLE_TYPES_MAPPING):
                continue
            if not feat_info['is_list']:
                feat_value = [feat_value]
            for ref in feat_value:
                # references to elements that were not loaded are dropped
                if graph.has_node(ref):
                    graph.add_edge(elem_id, ref, type=feat_name)

    return elems_by_id, graph


def make_graphs_in_memory(in_files, types, n_jobs=8):
    """Run xmi2graph on every file with joblib and return the list of its results."""
    runner = joblib.Parallel(n_jobs=n_jobs)
    jobs = (joblib.delayed(xmi2graph)(xmi_path, types) for xmi_path in in_files)
    return runner(jobs)


def xmi2graph_to_file(in_file, types, out_file):
    """Build the graph for ``in_file`` and store it as GraphML in ``out_file``."""
    graph = xmi2graph(in_file, types)[1]
    nx.write_graphml(graph, out_file)


def make_graphs_to_files(in_files, types, out_dir, n_jobs=8):
    """Convert every XMI file to a GraphML file in ``out_dir`` using joblib.

    The output name is the input's base name with its last extension replaced
    by '.graphml'.
    """
    def graphml_path(xmi_path):
        stem = os.path.splitext(os.path.basename(xmi_path))[0]
        return os.path.join(out_dir, stem + '.graphml')

    jobs = (joblib.delayed(xmi2graph_to_file)(xmi_path, types, graphml_path(xmi_path))
            for xmi_path in in_files)
    joblib.Parallel(n_jobs=n_jobs)(jobs)

In [10]:
# All XMI files found in the cTAKES output directory.
test_files = list(glob.glob('/notebook/data/2_xmi/*.xmi'))
# test_files = ['/notebook/data/2_xmi/AMB_Express_2013_Oct_3_3_58.txt.xmi']
# print(test_files)

In [11]:
# Write one GraphML file per XMI file (uses the default n_jobs=8).
make_graphs_to_files(test_files,
                     typesystem,
                     '/notebook/data/3_graphs/')

In [12]:
# graphs = make_graphs_in_memory(test_files[:1],
#                                typesystem,
#                                n_jobs=1)

In [13]:
# collections.Counter(n['_type']
#                     for es, g in graphs
#                     for n in es.values()).most_common()

## Graph -> Flat Contexts

In [14]:
# Node types for which a training sample is generated.
CUR_VERTEX_RE = re.compile('^.*Token$')
# Edge types that contribute context (any non-empty type).
CTX_EDGE_RE = re.compile('.')
# Neighbour node types that contribute context: everything not ending in "Sentence".
CTX_VERTEX_RE = re.compile('(?<!Sentence)$')
# Attributes left out of the samples. Node attributes are tested in the form
# "<type prefix>@<attribute>" (the format add_features builds), so the
# separator here must be "@"; with ":" none of these alternatives could match.
# The bare "type" alternative also covers the edge attribute "type".
EXCLUDE_ATTRIB_RE = re.compile('''(
    @id
    | @begin
    | @end
    | type
    | @_type
    | @tokenNumber
    | @numPosition
    | @feats
    | @cpostag
    | @form
    | @pdeprel
    | @score
    | Chunk@text
    | ConllDependencyNode@text
    | @confidence
)$''', re.X)
# (node-type regex, edge-type regex) pairs: for a matching neighbour node its
# outgoing edges with a matching type are followed one step further.
VERTEX2RELATIONS = (
#     'org.apache.ctakes.typesystem.type.syntax.Chunk' : { 'annot' },
    (re.compile('.'), re.compile('^(?!(succ|annot))')),
)


def get_adj_edges(graph, need_out_edges, start_node, prefix):
    """Yield ``(prefix, neighbour, edge_data)`` for edges adjacent to ``start_node``.

    With ``need_out_edges`` true the outgoing edges are used and the neighbour
    is the edge's target; otherwise the incoming edges are used and the
    neighbour is the edge's source.
    """
    if need_out_edges:
        edge_getter = graph.out_edges
        take_node = 1
    else:
        edge_getter = graph.in_edges
        take_node = 0

    for edge_tuple in edge_getter(start_node, data=True):
        yield (prefix, edge_tuple[take_node], edge_tuple[-1])


def add_features(graph, train_sample, ctx_name_prefix, neigh_node, edge_data,
                 ctx_edge_re, ctx_vertex_re, exclude_attrib_re, vertex2relations,
                 node_prefix_nesting, add_ctx_gens=None):
    """Add the features of one (edge, neighbour node) pair to ``train_sample``.

    The modality is named ``ctx_name_prefix + edge type``. Every edge attribute
    and every node attribute not rejected by ``exclude_attrib_re`` increments
    the counter "<name>=<value>" in that modality; node attribute names are
    prefixed with the last ``node_prefix_nesting`` components of the node type
    and "@". Nothing is added when the edge type fails ``ctx_edge_re`` or the
    node type fails ``ctx_vertex_re``.

    Args:
        train_sample: mapping modality name -> mapping feature -> count; both
            levels must create missing entries on access (e.g. defaultdict).
        vertex2relations: iterable of (node-type regex, edge-type regex) pairs.
        add_ctx_gens: optional list; when given, for every pair whose node
            regex matches this neighbour, one iterable of
            ``(prefix, node, edge_data)`` triples is appended, covering the
            neighbour's outgoing edges whose type matches the edge regex.
    """
    edge_type = edge_data['type']
    if not ctx_edge_re.search(edge_type):
        return

    node_data = graph.nodes[neigh_node]
    node_type = node_data['_type']
    if not ctx_vertex_re.search(node_type):
        return

    cur_modality_name = ctx_name_prefix + edge_type
    cur_modality = train_sample[cur_modality_name]

    for attrib, val in edge_data.items():
        if not exclude_attrib_re.search(attrib):
            cur_modality['{}={}'.format(attrib, val)] += 1

    node_type_prefix = '.'.join(node_type.split('.')[-node_prefix_nesting:])
    for attrib, val in node_data.items():
        full_attrib_name = '{}@{}'.format(node_type_prefix, attrib)
        if not exclude_attrib_re.search(full_attrib_name):
            cur_modality['{}={}'.format(full_attrib_name, val)] += 1

    if not add_ctx_gens is None:
        for rel_node_re, ctx_rels_re in vertex2relations:
            if rel_node_re.search(node_type):
                # Build the list right away. A lazy generator expression would
                # evaluate its filter only when consumed, by which time
                # ``ctx_rels_re`` refers to the last pair of the loop, so every
                # generator would filter with the same (wrong) regex.
                add_ctx_gens.append([(ctx2_prefix, ctx2_node, ctx2_edge_data)
                                     for ctx2_prefix, ctx2_node, ctx2_edge_data
                                     in get_adj_edges(graph,
                                                      True,
                                                      neigh_node,
                                                      '{}>{}>'.format(cur_modality_name, node_type_prefix))
                                     if ctx_rels_re.search(ctx2_edge_data['type'])])


def gen_contexts_edgetypes2modalities(graph,
                                      cur_vertex_re=CUR_VERTEX_RE,
                                      ctx_edge_re=CTX_EDGE_RE,
                                      ctx_vertex_re=CTX_VERTEX_RE,
                                      exclude_attrib_re=EXCLUDE_ATTRIB_RE,
                                      vertex2relations=VERTEX2RELATIONS,
                                      include_in_edges=True,
                                      add_edge_dir_prefix=True,
                                      node_prefix_nesting=2):
    """Yield one training sample per graph node whose type matches ``cur_vertex_re``.

    A sample maps modality name -> feature -> count. Features come from the
    node itself (pseudo edge '_current'), from its outgoing edges and, when
    ``include_in_edges`` is set, from its incoming edges; modality names get an
    'out_' / 'in_' prefix when ``add_edge_dir_prefix`` is set. Second-hop
    contexts requested by add_features are processed afterwards.
    """
    out_prefix = 'out_' if add_edge_dir_prefix else ''
    in_prefix = 'in_' if add_edge_dir_prefix else ''
    shared_args = (ctx_edge_re, ctx_vertex_re, exclude_attrib_re,
                   vertex2relations, node_prefix_nesting)

    for node_id in graph.nodes.keys():
        if not cur_vertex_re.search(graph.nodes[node_id]['_type']):
            continue

        train_sample = collections.defaultdict(lambda: collections.defaultdict(float))

        # first hop: the node itself, then its adjacent edges
        first_hop = [[('', node_id, {'type' : '_current'})],
                     get_adj_edges(graph, True, node_id, out_prefix)]
        if include_in_edges:
            first_hop.append(get_adj_edges(graph, False, node_id, in_prefix))

        second_hop = []
        for prefix, neighbour, edge_data in itertools.chain(*first_hop):
            add_features(graph, train_sample, prefix, neighbour, edge_data,
                         *shared_args, add_ctx_gens=second_hop)

        # second hop: contexts collected while processing the first hop
        for prefix, neighbour, edge_data in itertools.chain(*second_hop):
            add_features(graph, train_sample, prefix, neighbour, edge_data,
                         *shared_args)

        yield train_sample

In [15]:
# Load one previously written graph to try the context generation on.
g = nx.read_graphml('/notebook/data/3_graphs/AMB_Express_2013_Oct_3_3_58.txt.graphml')

In [16]:
# Materialize every training sample of that graph with the default settings.
g_samples = list(gen_contexts_edgetypes2modalities(g))

In [17]:
# Save the samples as indented JSON for manual inspection.
with open('/notebook/data/4_vw/sample.json', 'w') as f:
    json.dump(g_samples, f, indent=4)

## Flat Contexts -> VW for BigARTM

In [18]:
def sample2vw_str(ctx):
    """Serialize one sample as a single Vowpal Wabbit line ending in a newline.

    Args:
        ctx: mapping modality name -> mapping feature name -> weight.

    Returns:
        "|modality feat:weight feat:weight |modality ..." followed by a
        newline; just a newline for an empty sample.

    Whitespace, ':' and '|' are structural in the Vowpal Wabbit format, so each
    such character inside a modality or feature name is replaced by '_'.
    Without this, names built from attribute values (multi-word text, a ':'
    punctuation token) would be split into several bogus features.
    Note that distinct names differing only in those characters collapse.
    """
    def vw_safe(name):
        return re.sub(r'[\s:|]', '_', str(name))

    return ' '.join('|{} {}'.format(vw_safe(modality_name),
                                    ' '.join('{}:{}'.format(vw_safe(feat_name), feat_val)
                                             for feat_name, feat_val in modality.items()))
                    for modality_name, modality in ctx.items()) + '\n'

In [19]:
# Build the training file: one Vowpal Wabbit line per sample, over every
# GraphML file in the graphs directory.
with open('/notebook/data/4_vw/data.vw', 'w') as f:
    for fname in tqdm.tqdm(glob.glob('/notebook/data/3_graphs/*.graphml')):
        g = nx.read_graphml(fname)
        for sample in gen_contexts_edgetypes2modalities(g):
            f.write(sample2vw_str(sample))

100%|██████████| 101/101 [06:38<00:00,  3.95s/it]
