In [1]:
import numpy as np
import os
import re

In [2]:
root_txt = 'data/cadec/text/'
root_ann = 'data/cadec/original/'

In [3]:
def read_files(fn):
    med, i = re.findall(r'(\w+)\.(\d+)\.txt', fn)[0]
    print(med, i)
    i = int(i)

    with open(os.path.join(root_txt, fn), 'r') as infile:
        text = infile.readlines()
        text = ''.join(text)
    with open(os.path.join(root_ann, fn.replace('txt', 'ann')), 'r') as infile:
        annotations = infile.readlines()
        annotations = [l.strip() for l in annotations if not l.startswith('#')]
    return i, med, text, annotations
    

def parse_annotations(lines):
    annots = {}
    for i in range(len(lines)):
        annots[i] = {}
        entity = re.findall(r'(Finding|ADR|Drug|Disease|Symptom) ([\d; ]+)\t(.*)$', 
                            lines[i])[0]
        annots[i]['ner'] = entity[0]
        boundaries = entity[1].split(';')
        boundaries = [[int(bb) for bb in b.split()] for b in boundaries]
        annots[i]['boundaries'] = boundaries
        annots[i]['text'] = entity[2]
    return annots


def get_current_annot(annots, idx, start):
    if idx > len(annots):
        return idx - 1
    boundaries = annots[idx]['boundaries']
    if start > boundaries[-1][-1]:
        return get_current_annot(annots, idx+1, start)
    return idx

In [4]:
def get_IOB_tags(text: str, annotations: dict):
    tokens = re.findall(r'\w+|[^\w\s]', text)
    if len(annotations) == 0:
        return tokens, ['O' for _ in tokens]
    offset = 0
    idx = 0
    tags = []
    text_tmp = text
    for token in tokens:
        if token == '.':
            tags.append('O')
            continue
        span = np.asarray(re.search(token, text_tmp).span())
        idx = get_current_annot(annotations, idx, span[0] + offset)
        boundaries = annotations[idx]['boundaries']
        found = False
        for i, (start, end) in enumerate(boundaries):
            if (span[0] + offset >= start) and (span[1] + offset <= end):
                prefix = 'B-'
                if i > 0 or span[0] + offset > start:
                    prefix = 'I-'
                tags.append(prefix + annotations[idx]['ner'])
                found = True
                break
        if not found:
            tags.append('O')
        offset += span[1]
        text_tmp = text_tmp[span[1]:]
    return tokens, tags

In [5]:
data = {}

for fn in os.listdir(root_txt):
    i, med, text, annotations = read_files(fn)
    annots = parse_annotations(annotations)
    tokens, tags = get_IOB_tags(text, annots)
    if med not in data.keys():
        data[med] = {}
    data[med][i] = {
        'tokens': tokens, 'ner': tags
    }
    

LIPITOR 86


KeyError: 7

In [36]:
print(data['LIPITOR'][977]['ner'])

['B-ADR']


In [37]:
def print_tags_tokens(data):
    tokens = data['tokens']
    tags = data['ner']
    line1 = ""
    line2 = ""
    for word, label in zip(tokens, tags):
        max_length = max(len(word), len(label))
        line1 += word + " " * (max_length - len(word) + 1)
        line2 += label + " " * (max_length - len(label) + 1)
    print(line1)
    print(line2)


In [38]:
data['LIPITOR'].keys()

dict_keys([86, 92, 952, 946, 45, 775, 761, 51, 79, 991, 749, 985, 588, 577, 211, 205, 563, 239, 403, 365, 371, 417, 359, 198, 826, 832, 167, 601, 615, 173, 629, 628, 614, 172, 166, 600, 833, 199, 827, 358, 370, 416, 402, 364, 238, 204, 562, 576, 210, 589, 984, 78, 748, 990, 760, 50, 44, 774, 947, 953, 93, 87, 91, 979, 85, 945, 789, 951, 52, 762, 776, 46, 986, 992, 560, 206, 212, 574, 548, 399, 414, 372, 366, 400, 428, 819, 831, 825, 170, 616, 602, 164, 158, 159, 603, 165, 171, 617, 824, 830, 818, 429, 367, 401, 415, 373, 398, 549, 213, 575, 561, 207, 993, 987, 777, 47, 53, 763, 950, 788, 944, 84, 90, 978, 798, 940, 954, 94, 80, 968, 983, 997, 767, 57, 43, 773, 559, 203, 565, 571, 217, 388, 439, 377, 411, 405, 363, 834, 820, 808, 149, 613, 175, 161, 607, 160, 606, 612, 174, 148, 809, 821, 835, 404, 362, 376, 410, 438, 389, 570, 216, 202, 564, 558, 42, 772, 766, 56, 996, 982, 81, 969, 95, 955, 1000, 941, 799, 957, 943, 83, 97, 994, 68, 980, 758, 770, 40, 54, 764, 599, 228, 214, 9, 572, 5

In [39]:
print_tags_tokens(data['LIPITOR'][155])

stomach 
B-ADR   


In [40]:
print(data['LIPITOR'][155])

{'tokens': ['stomach', 'pain', ',', 'dizzy', 'spells', ',', 'hairloss', ',', 'fatigue', ',', 'dry', 'eyes', ',', 'joint', 'pain', '.', 'When', 'I', 'stopped', 'taking', 'Lipitor', ',', 'I', 'was', 'amazed', 'at', 'how', 'much', 'better', 'I', 'began', 'to', 'feel', '.', 'While', 'taking', 'Lipitor', 'I', 'had', 'stomach', 'cramps', 'every', 'day', '.', 'I', 'was', 'tired', ',', 'and', 'could', 'not', 'function', 'normal', '.'], 'ner': ['B-ADR']}
