# Annotation errors found:

- doc_50: "full" annotated as FM
- doc_169: first char missing at idx 10
- doc_59: 3 lost last

In [1]:
from os import listdir

# Directory that receives the pickled outputs produced by this notebook.
OBJECTS = './objects/'
# Training corpus: listed for .txt documents; the entity-extraction cell
# additionally parses a same-named .xml file from this directory.
train_data_directory = './Data/bioc-FH-training/'
# Test corpus: listed for .txt documents.
test_data_directory = './Data/testRelease-0805/'

In [2]:
def get_file_names(path):
    """Return the base names of the ``.txt`` files located directly in *path*.

    Args:
        path: Directory to list (not searched recursively).

    Returns:
        A set of file names with the trailing ``.txt`` extension removed.
    """
    suffix = '.txt'
    # Cut off only the trailing extension: splitting on the first dot would
    # truncate names such as "doc.v2.txt" to "doc", and the file could then
    # no longer be found when the name is turned back into a path.
    return {name[:-len(suffix)] for name in listdir(path) if name.endswith(suffix)}

# Base names (no extension) of the .txt documents in each corpus directory.
train_file_names = get_file_names(path=train_data_directory)
test_file_names = get_file_names(path=test_data_directory)

# Read files

In [3]:
def read_files(directory, file_names, encoding=None):
    """Read ``<name>.txt`` from *directory* for every name in *file_names*.

    Args:
        directory: Folder holding the text files. A trailing path separator
            is optional.
        file_names: Iterable of base names without the ``.txt`` extension.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A dict mapping each base name to the file content, with every
        newline character replaced by a single space.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    # Function-scope import so this definition does not depend on imports
    # made elsewhere in the notebook.
    from os.path import join

    texts = {}

    for file_name in file_names:
        # join() inserts the separator only when it is missing, so the
        # directory may be given with or without a trailing slash.
        file_path = join(directory, file_name + '.txt')

        with open(file_path, 'r', encoding=encoding) as f:
            # One-for-one character swap: the text length is unchanged.
            texts[file_name] = f.read().replace('\n', ' ')

    return texts

# Load both corpora as {base name: text} dictionaries.
train_texts = read_files(directory=train_data_directory, file_names=train_file_names)
test_texts = read_files(directory=test_data_directory, file_names=test_file_names)

# Tokenization and POS

In [4]:
import pickle
from os.path import join

def pkl_dump(path, file_name, object_):
    """Pickle *object_* into the file *file_name* inside directory *path*.

    An existing file of the same name is overwritten.
    """
    target = join(path, file_name)
    with open(target, mode='wb') as sink:
        pickle.dump(object_, sink)


In [5]:
import nltk

def preprocessing(texts, path2store, purpose_of_data="train"):
    """Split every document into sentences, tokens and POS tags with NLTK.

    The input texts and the three derived dictionaries are pickled into
    *path2store* as ``<purpose_of_data>_texts.pkl``, ``..._sents.pkl``,
    ``..._tokens.pkl`` and ``..._pos.pkl``.

    Args:
        texts: Dict mapping a document name to its text.
        path2store: Directory the pickle files are written to.
        purpose_of_data: Prefix used for the pickle file names.

    Returns:
        None; the results are only written to disk.
    """
    sents, tokens, pos = {}, {}, {}

    for doc_name, doc_text in texts.items():
        doc_sentences = nltk.sent_tokenize(doc_text)
        # One token list per sentence, then one tagged list per token list.
        doc_tokens = [nltk.word_tokenize(sentence) for sentence in doc_sentences]
        doc_tags = [nltk.pos_tag(words) for words in doc_tokens]

        sents[doc_name] = doc_sentences
        tokens[doc_name] = doc_tokens
        pos[doc_name] = doc_tags

    # Same four files, written in the same order.
    outputs = (
        ("_texts.pkl", texts),
        ("_sents.pkl", sents),
        ("_tokens.pkl", tokens),
        ("_pos.pkl", pos),
    )
    for suffix, payload in outputs:
        pkl_dump(path2store, purpose_of_data + suffix, payload)

# Tokenize, tag and pickle both corpora into the objects directory.
preprocessing(texts=train_texts, path2store=OBJECTS, purpose_of_data="train")
preprocessing(texts=test_texts, path2store=OBJECTS, purpose_of_data="test")

# For training data: Get annotated entities

In [6]:
import xml.etree.ElementTree

# Annotated entities of every training document, keyed by file name.
# Each value is a list of tuples, ordered by end offset:
#   (start, end, entity type, covered text, attributes dict, entity id)
entities = {}
for file_name in train_texts:
    file_path = train_data_directory + file_name + '.xml'

    root = xml.etree.ElementTree.parse(file_path).getroot()
    annotations = root.findall('annotations')[0]

    entities_ = []  # one tuple per annotated span
    for entity in annotations.findall('entity'):
        # Named entity_id (not id) so the builtin id() is not overwritten
        # for the rest of the notebook session.
        entity_id = entity.find('id').text
        typee = entity.find('type')

        # Only these three types are collected; other options: Age, PHI.
        if typee.text not in ('FamilyMember', 'Observation', 'LivingStatus'):
            continue

        properties = entity.find('properties')
        # The <span> text holds one or more "start,end" pairs separated by
        # ';'; every pair becomes its own entry.
        for spans in entity.findall('span')[0].text.split(';'):
            span = list(map(int, spans.split(',')))
            attributes = {}  # fresh dict per span, so entries never share one

            if typee.text == 'FamilyMember':
                attributes['SideOfFamily'] = properties.find('SideOfFamily').text
            elif typee.text == 'Observation':
                attributes['Negation'] = properties.find('Negation').text
            elif typee.text == 'LivingStatus':
                attributes['Alive'] = properties.find('Alive').text
                attributes['Healthy'] = properties.find('Healthy').text
                attributes['ID'] = properties.find('ID').text

            entities_.append((span[0], span[1],
                              typee.text,
                              train_texts[file_name][span[0]:span[1]],
                              attributes,
                              entity_id))

    # Order by end offset (tuple index 1).
    entities_.sort(key=lambda tup: tup[1])
    entities[file_name] = entities_

In [7]:
# Persist the extracted training entities as a pickle in the objects directory.
pkl_dump(path=OBJECTS, file_name="train_entities.pkl", object_=entities)

# Entity statistics

In [None]:
# Print every entity per document, count them all, and collect the covered
# text of each LivingStatus entity.
living_status = []
num_entities = 0
for file_name in train_texts:
    print('\n', file_name)

    for entity in entities[file_name]:
        num_entities = num_entities + 1
        entity_type, covered_text = entity[2], entity[3]
        if entity_type == 'LivingStatus':
            living_status.append(covered_text)
        print(entity)

In [10]:
# Report the total number of entities counted above.
print(f"there are {num_entities} enties in all.")

there are 2202 enties in all.


In [9]:
from collections import Counter
# Frequency of each distinct LivingStatus span text. The strings are counted
# exactly as sliced from the document, so variants that differ only by
# leading/trailing whitespace (e.g. 'died' vs 'died ') are tallied separately.
Counter(living_status)

Counter({'died ': 39,
         'living': 5,
         'healthy ': 10,
         'alive': 7,
         'died': 123,
         'good general health': 11,
         'healthy': 118,
         'deceased': 3,
         'otherwise healthy': 1,
         'living and well': 7,
         'passed away ': 17,
         'stillborn': 1,
         'passed away': 11,
         'good health': 32,
         'alive ': 4,
         'well': 5,
         'alive and well': 1,
         'health': 2,
         'good general health ': 1,
         'doing well': 1,
         'death': 1,
         'living ': 1,
         ' living and well': 3,
         ' healthy': 2,
         ' health': 1,
         'ied ': 1,
         'without problems': 1,
         'dead': 2,
         'living and well ': 1,
         'generally healthy': 1,
         'health ': 1,
         ' alive': 1})