# Preprocessing based on original annotation file - not very accurate

In [1]:
import sys
from os import listdir, makedirs
from os.path import join, exists

import numpy as np
import torch
import spacy
import codecs
import xmltodict
import pickle
from tqdm import tqdm
from allennlp.modules.elmo import Elmo, batch_to_ids

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Directory that is scanned for the raw `.txt` notes and their `.xml` annotation files
WORK_DIR = "../Data/bioc_FH_training/"
# Directory that receives one pickled ELMo output per note
PROCESSED_DIR = "../Data/processed/"

# Cap the number of CPU threads PyTorch uses
torch.set_num_threads(8)

In [3]:
def read_xml(path, f):
    """
    Get Observation and FamilyMember spans from an annotation xml file.

    path: directory that contains the annotation file.
    f:    name of the annotation xml file inside `path`.

    Returns {'OBS': [...], 'FAM': [...]}. Every item is a [start, end] pair
    of offsets, kept as the strings found in the xml. A discontinuous
    annotation ("a,b;c,d") contributes one pair per fragment. Entities of any
    other type are ignored, and a file without annotations yields two empty
    lists.
    """

    # A separate handle name keeps the `f` argument (the file name) intact.
    with open(join(path, f), 'r') as xml_file:
        content = xmltodict.parse(xml_file.read())

    # xmltodict maps an empty element to None and a child that occurs only
    # once to a single mapping instead of a list; normalise both cases so the
    # loop below always sees a list of entity mappings.
    annotations = content['data'].get('annotations') or {}
    entities = annotations.get('entity') or []
    if not isinstance(entities, list):
        entities = [entities]

    Obs, Fam = [], []
    for entity in entities:
        spans = [span.split(',') for span in entity['span'].split(';')]

        if entity['type'] == 'Observation':
            Obs.extend(spans)
        elif entity['type'] == 'FamilyMember':
            Fam.extend(spans)

    return {'OBS': Obs, 'FAM': Fam}

In [4]:
! python -m spacy download en_core_web_lg
# Load the large English spaCy model
nlp = spacy.load('en_core_web_lg')

# Location of the ELMo options/weights files: a machine-specific local directory;
# the trailing comment gives a remote alternative for the same two files.
prefix = "/home/texuanw/tools/allenNLP/" # or = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/"
options_file = prefix + "elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = prefix + "elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
# NOTE(review): the positional `2` is presumably the number of output
# representations - confirm against the AllenNLP Elmo signature. Dropout is disabled.
elmo = Elmo(options_file, weight_file, 2, dropout=0)

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


In [5]:
def get_token_ent_type(token, entities):
    """
    Look up the annotation type that covers a token.

    `entities` maps a type name to a list of [start, end] offset pairs. The
    first type (in dict order) owning a span that fully contains the token is
    returned; None is returned when no span contains it.
    """

    tok_start = token.idx
    tok_end = tok_start + len(token)

    for label, spans in entities.items():
        # the token must lie completely inside the annotated span
        if any(int(span[0]) <= tok_start and tok_end <= int(span[1]) for span in spans):
            return label

    return None

In [6]:
def get_doc_label(doc, entities):
    """
    Label every token of a document, sentence by sentence.

    Returns a list (one item per sentence) of lists (one item per token) of
    [entity_id, POS_id, dependency_id]. The ids come from the module-level
    dictionaries `entd`, `posd` and `depd`, which are extended in place
    whenever an unseen value is met.

    A token covered by an annotation in `entities` has its `ent_type_`
    overwritten with the annotation type; any other token keeps the
    `ent_type_` it already carries. Each token is also printed together with
    its annotation type ('O' when it has none).
    """

    labelled_doc = []

    for sentence in doc.sents:
        labelled_sentence = []

        for tok in sentence:
            annotated_type = get_token_ent_type(tok, entities)
            if annotated_type:
                tok.ent_type_ = annotated_type
            print(tok, annotated_type if annotated_type else 'O')

            # setdefault assigns the next free id only when the key is new
            ent_id = entd.setdefault(tok.ent_type_, len(entd))
            pos_id = posd.setdefault(tok.pos_, len(posd))
            dep_id = depd.setdefault(tok.dep_, len(depd))

            labelled_sentence.append([ent_id, pos_id, dep_id])

        labelled_doc.append(labelled_sentence)

    return labelled_doc

In [None]:
# Every `.txt` note in WORK_DIR is paired with an annotation file that has the
# same name but an `.xml` extension.
txtfiles = [f for f in listdir(WORK_DIR) if f.endswith('.txt')]
xmlfiles = [f[:-3]+'xml' for f in txtfiles]

corpus = []  # per document: list of sentences, each a list of token strings
labels = []  # per document: list of sentences, each a list of [ent, pos, dep] ids

# value -> integer id vocabularies; id 0 is reserved for the padding symbol
entd = {'<PAD>': 0}
posd = {'<PAD>': 0}
depd = {'<PAD>': 0}

# Make sure the output directory exists before the first pickle is written.
makedirs(PROCESSED_DIR, exist_ok=True)

for findex, txtfile in enumerate(txtfiles):

    entities = read_xml(WORK_DIR, xmlfiles[findex])

    print(txtfile)
    with open(join(WORK_DIR, txtfile),'r') as f:
        content = f.read()
    doc = nlp(content)

    labels.append(get_doc_label(doc, entities))

    # sentences as a list of lists of token strings
    doc_sent = [[token.text for token in sent] for sent in doc.sents]
    corpus.append(doc_sent)

    # Embeddings are only stored, never back-propagated through here, so skip
    # autograd bookkeeping while running ELMo.
    with torch.no_grad():
        doc_emb = elmo(batch_to_ids(doc_sent))  # dict with 'elmo_representations' and 'mask'

    # Sanity check: the mask must mark exactly one position per token of the document.
    assert doc_emb['mask'].sum().item() == len(doc)
    with open(join(PROCESSED_DIR, txtfile[:-3] + 'pkl'), 'wb') as f:
        pickle.dump(doc_emb, f)

In [8]:
# Inspect the first sentence of the first document: its label triples and its tokens
print('labels:', labels[0][0])
print('\ncorpus:', corpus[0][0])

labels: [[1, 1, 1], [1, 2, 2], [1, 2, 2], [1, 3, 3], [1, 3, 4], [1, 4, 5], [1, 4, 6], [1, 5, 7], [1, 1, 1], [1, 3, 8], [1, 5, 7], [1, 1, 1], [1, 3, 8], [2, 3, 9], [1, 6, 10], [1, 7, 11]]

corpus: ['A', 'detailed', 'comprehensive', 'family', 'history', 'was', 'obtained', 'from', 'the', 'patient', 'during', 'the', 'visit', 'today', '.', ' ']


In [9]:
OBJECTS_DIR = '../objects/'
# Make sure the output directory exists before anything is written into it.
makedirs(OBJECTS_DIR, exist_ok=True)

# `labels` is ragged (documents and sentences differ in length), so it must be
# stored as an object array; recent NumPy versions refuse to build a ragged
# array unless dtype=object is requested explicitly.
np.save(join(OBJECTS_DIR, 'labeled_corpus'), np.array(labels, dtype=object))

# The three value -> id dictionaries, in the same order as the label triples.
with open(join(OBJECTS_DIR, 'label_dict.pkl'),'wb') as f:
    pickle.dump([entd, posd, depd], f)

In [10]:
# Size of one embedding vector from the last processed document
# (index 0 of 'elmo_representations', then indices [0][0] - presumably first sentence, first token)
doc_emb['elmo_representations'][0][0][0].size()

torch.Size([1024])

In [11]:
# Number of entity labels collected (the '<PAD>' entry included) and the label -> id mapping
print(len(entd))
print(entd)
# Uncomment to inspect the POS and dependency dictionaries as well
# print(posd)
# print(depd)

17
{'<PAD>': 0, '': 1, 'DATE': 2, 'PERSON': 3, 'FAM': 4, 'OBS': 5, 'CARDINAL': 6, 'GPE': 7, 'ORG': 8, 'QUANTITY': 9, 'PERCENT': 10, 'ORDINAL': 11, 'TIME': 12, 'NORP': 13, 'LOC': 14, 'LAW': 15, 'PRODUCT': 16}
