# Preprocessing based on original annotation file

In [1]:
from os.path import join

import torch
import spacy
import pickle as pkl
from allennlp.modules.elmo import Elmo, batch_to_ids


# Filesystem locations used by this notebook (all relative paths).
# WORK_DIR is not referenced by any of the cells visible here.
WORK_DIR = "../Data/bioc_FH_training/"
# Pickled inputs (texts, entities) are read from, and label dictionaries /
# corpora are written to, this directory.
OBJECTS_DIR = '../objects/'
# Per-document ELMo embedding pickles are stored here (train / test).
PROCESSED_TRAIN_DATA_DIR = "../Data/processed/"
PROCESSED_TEST_DATA_DIR = "../Data/processed_test_data/"

# Thread count handed to PyTorch; 30 presumably matches the original
# machine's core count -- adjust for the local hardware.
torch.set_num_threads(30)

In [2]:
# Notebook shell escape: fetch the large English spaCy model, then load it.
# `nlp` is used by the later cells to tokenize and parse every document.
! python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

# ELMo option/weight files are read from a machine-specific local directory;
# the trailing comment gives a remote location as an alternative.
prefix = "/home/texuanw/tools/allenNLP/" # or = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/"
options_file = prefix + "elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = prefix + "elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# NOTE(review): the positional 2 is presumably `num_output_representations`
# -- confirm against the installed allennlp version. Dropout is set to 0.
elmo = Elmo(options_file, weight_file, 2, dropout=0)

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
def load_pkl(path, file_name):
    """
        Unpickle and return the object stored in the file `file_name`
        located in directory `path`.
    """
    full_path = join(path, file_name)
    with open(full_path, 'rb') as handle:
        return pkl.load(handle)

def pkl_dump(path, file_name, object_):
    """
        Pickle `object_` into the file `file_name` inside directory `path`,
        overwriting any existing file. Returns nothing.
    """
    target = join(path, file_name)
    with open(target, 'wb') as sink:
        pkl.dump(object_, sink)

# Train set

In [4]:
def _living_status_label(attributes):
    """Map a LivingStatus entity's 'Alive'/'Healthy' attributes to an LS-* label."""
    alive, healthy = attributes['Alive'], attributes['Healthy']

    if 'No' in [alive, healthy]:
        return "LS-0"
    if alive == healthy == 'NA':
        return "LS-1"
    if alive == healthy == 'Yes':
        return "LS-4"
    return "LS-2"    # one Yes, one NA


def get_token_ent_type(token, entities, bio=False):
    """
        Return the entity label of `token` given the annotated `entities`.

        entity: [start, end, ent_type, content, attributes, id]

        A token belongs to an entity when its character offset `token.idx`
        lies in [start, end). If several entities cover the token, the last
        one that yields a label wins. Tokens covered by no entity (or an
        empty `entities` list) are labelled "O".

        Labels:
            Observation   -> "OBS"
            FamilyMember  -> "FM" when SideOfFamily is 'NA'/None, otherwise
                             the SideOfFamily value itself; the literal
                             words "paternal"/"maternal" inside such an
                             entity are left unlabelled.
            LivingStatus  -> "LS-0" / "LS-1" / "LS-2" / "LS-4"
        Unknown entity types are printed and otherwise ignored.

        With bio=True a labelled token is prefixed with "B-" when it starts
        exactly at the entity start and with "I-" otherwise; "O" is never
        prefixed. The prefix is taken from the entity that supplied the
        label, not from whichever entity happens to be last in the list.
    """

    ret_type = "O"
    prefix = ""    # defined up front so an empty `entities` list is safe

    for entity in entities:
        start, end, ent_kind = entity[0], entity[1], entity[2]
        if not start <= token.idx < end:
            continue

        label = None
        if ent_kind == 'Observation':
            label = "OBS"

        elif ent_kind == 'FamilyMember':
            side = entity[4]['SideOfFamily']
            if side in ['NA', None]:
                label = "FM"
            elif str(token).strip().lower() not in ["paternal", "maternal"]:
                label = side  # Paternal or Maternal

        elif ent_kind == 'LivingStatus':
            label = _living_status_label(entity[4])

        else:
            print(ent_kind)

        if label is None:
            # This entity contributes nothing; keep any earlier label/prefix.
            continue

        ret_type = label
        if bio:
            prefix = 'B-' if token.idx == start else 'I-'

    return prefix + ret_type

In [5]:
def get_doc_labels(doc, entities, count, bio=False):
    """
        Get labels of all tokens in a document.
        Each token is labeled as [entity_type, POS_type, dependency]

        doc:      parsed document exposing `.sents`; every token must offer
                  `ent_type_` (writable), `pos_` and `dep_`.
        entities: annotated entities of this document, passed through to
                  get_token_ent_type.
        count:    running entity counter; the updated value is returned.
        bio:      forwarded to get_token_ent_type. When True only tokens
                  whose label starts with "B-" are counted (one per entity);
                  when False every non-"O" token is counted.

        Side effects: writes the label into `token.ent_type_` and registers
        unseen labels / POS tags / dependency tags in the module-level
        dictionaries `entd`, `posd` and `depd`.

        Returns (doc_label, count), where doc_label holds one list per
        sentence containing an [ent_id, pos_id, dep_id] triple per token.
    """

    doc_label = []

    for sent in doc.sents:
        sent_label = []

        for token in sent:
            # `bio` must be forwarded, otherwise BIO mode never yields B-/I-.
            ent_type = get_token_ent_type(token, entities, bio)

            if bio:
                # Count an entity once, at its first token only.
                if ent_type.startswith('B-'):
                    print(token, ent_type)
                    count += 1
            elif ent_type != 'O':
                count += 1

            token.ent_type_ = ent_type

            # Unseen keys receive the next free integer id.
            ent_id = entd.setdefault(token.ent_type_, len(entd))
            pos_id = posd.setdefault(token.pos_, len(posd))
            dep_id = depd.setdefault(token.dep_, len(depd))

            sent_label.append([ent_id, pos_id, dep_id])

        doc_label.append(sent_label)

    return doc_label, count

In [6]:
# Module-level vocabularies mapping label strings to integer ids. They are
# filled in as a side effect of get_doc_labels; id 0 is reserved for '<PAD>'.
entd = dict()  # entity types dictionary
posd = dict()  # part of speech dictionary
depd = dict()  # dependency dictionary
entd['<PAD>'] = len(entd)
posd['<PAD>'] = len(posd)
depd['<PAD>'] = len(depd)

# Both pickles are indexed by document title (see the loop below).
train_texts = load_pkl(OBJECTS_DIR, 'train_texts.pkl')
entities = load_pkl(OBJECTS_DIR, 'entities_corrected.pkl')

# Label every training document. `count` is threaded through the calls so it
# accumulates over the whole corpus. NOTE: `title` and `doc` stay bound after
# the loop and are reused by later cells.
train_corpus_labels, count = {}, 0
for title, doc in train_texts.items():
    doc = nlp(doc)
    doc_labels, count = get_doc_labels(doc, entities[title], count)
    
    train_corpus_labels[title] = doc_labels

# With bio left at its default, `count` is the number of non-"O" tokens.
print("number of entities in IO:", count)
pkl_dump(OBJECTS_DIR, 'label_dict.pkl', [entd, posd, depd])
print(entd)
# print(posd)
# print(depd)

pkl_dump(OBJECTS_DIR, 'train_corpus_labels.pkl', train_corpus_labels)

number of entities in IO: 3164
{'<PAD>': 0, 'FM': 1, 'O': 2, 'LS-2': 3, 'OBS': 4, 'LS-0': 5, 'Maternal': 6, 'Paternal': 7, 'LS-4': 8}


In [7]:
# Sanity check: `title` is whatever key the most recent `for title, ...` loop
# ended on, so this prints the first sentence's label triples of that document.
print('train_corpus_labels:\n', train_corpus_labels[title][0])

train_corpus_labels:
 [[2, 10, 16], [2, 4, 4], [2, 6, 6], [2, 6, 17], [2, 7, 18], [2, 7, 1], [2, 5, 5], [2, 10, 16], [2, 6, 7], [2, 6, 15], [2, 2, 2], [2, 11, 19]]


In [8]:
def embedding_docs(texts, path2store):
    """
        Tokenize every document in `texts` (title -> raw text), compute its
        ELMo output and pickle that output to `<path2store>/<title>.emb.pkl`.

        Returns a dict mapping each title to its sentences, each sentence
        being a list of token strings.
    """
    tokenized_corpus = {}

    for name, raw_text in texts.items():
        parsed = nlp(raw_text)

        # One list of token strings per sentence.
        sentences = [[tok.text for tok in sentence] for sentence in parsed.sents]
        tokenized_corpus[name] = sentences

        # Run ELMo over the whole document, one sentence per batch row.
        embedded = elmo(batch_to_ids(sentences))

        # The mask must flag exactly as many positions as the document has tokens.
        n_masked_in = embedded['mask'].sum().item()
        assert n_masked_in == len(parsed)

        pkl_dump(path2store, name + '.emb.pkl', embedded)

    return tokenized_corpus

# Embed the training documents (one embedding pickle per document is written
# to PROCESSED_TRAIN_DATA_DIR) and keep the tokenized sentences as one pickle.
train_corpus = embedding_docs(train_texts, PROCESSED_TRAIN_DATA_DIR)
pkl_dump(OBJECTS_DIR, 'train_corpus.pkl', train_corpus)
# `title` is still bound from an earlier module-level loop; show the first
# tokenized sentence of that document.
print('corpus:\n', train_corpus[title][0])

corpus:
 ['A', 'detailed', 'family', 'history', 'was', 'taken', 'during', 'the', 'visit', 'today', '.', ' ']


# Test set

In [9]:
# Same embedding pipeline as for the training set, applied to the test
# documents; embeddings go to PROCESSED_TEST_DATA_DIR.
test_texts = load_pkl(OBJECTS_DIR, 'test_texts.pkl')
test_corpus = embedding_docs(test_texts, PROCESSED_TEST_DATA_DIR)
pkl_dump(OBJECTS_DIR, 'test_corpus.pkl', test_corpus)

In [10]:
def get_fake_doc_labels(doc):
    """
        Build placeholder labels for a test document.

        The result has one list per sentence in `doc.sents`, holding a fresh
        [1, 1, 1] triple (entity_type, POS_type, dependency) for every token.
    """
    return [[[1, 1, 1] for _ in sentence] for sentence in doc.sents]

In [11]:
# Give every test document placeholder labels, one [1, 1, 1] triple per
# token, presumably so test data has the same layout as the training labels.
# NOTE: the loop rebinds the module-level names `title`, `doc`, `doc_labels`.
test_corpus_labels = {}
for title, doc in test_texts.items():
    doc = nlp(doc)
    doc_labels = get_fake_doc_labels(doc)
    
    test_corpus_labels[title] = doc_labels

pkl_dump(OBJECTS_DIR, 'test_corpus_labels.pkl', test_corpus_labels)