This file is parsing i2b2 training data and annotating it with the CoNLL BIO scheme, which has this form:

[record#] [word] [POS tag] [chunk tag] [NER tag]

In [1]:
import os
from nltk import pos_tag, RegexpParser
import pandas as pd
import numpy as np

In [2]:
dir_name = "./data/annotations/"  #  "entries" or "annotations"
test = os.listdir(dir_name)

for filename in test:
    noExtension = filename.split(".")[0]
    #print (filename)

    if filename[0] != ".":
        os.rename(dir_name+filename, dir_name+noExtension)

In [3]:
a_ids = []
e_ids = []

for filename in os.listdir("./data/annotations"):
    if filename[0] != ".":  # ignore hidden files
        a_ids.append(int(filename))
for filename in os.listdir("./data/entries"):
    if filename[0] != ".": 
        e_ids.append(int(filename))
    
a_ids = tuple(sorted(a_ids)) 
e_ids = tuple(sorted(e_ids))

intersection = list(set(a_ids) & set(e_ids))
if len(intersection) == len(a_ids):
    print("Success: all anotations have a corresponding entry.", len(intersection))

Success: all anotations have a corresponding entry. 259


In [4]:
print(a_ids[-1])

995723


## Build corpora

In [5]:
# build annotation and entry corpora

a_corpus = []
e_corpus = []

# only annotations and corresponding files
for file in a_ids:
    path = "./data/annotations/" + str(file)
    with open(path) as f:
        content = f.read().splitlines()
        a_corpus.append(content)

    path = "./data/entries/" + str(file)
    with open(path) as f:
        #content = f.readlines()
        content = f.read().splitlines()
        e_corpus.append(content)
    

In [6]:
print(a_corpus[1])

['m="cefpodoxime" 48:0 48:0|| do="nm"|| mo="nm"|| f="nm"|| du="nm"|| r="nm"|| e="start"|| t="past"|| c="factual"|| ln="narrative"', 'm="aspart" 68:2 68:2|| do="4 units" 68:3 68:4|| mo="subcutaneous" 68:5 68:5|| f="before dinner" 68:6 68:7|| du="nm"|| r="nm"|| e="nm"|| t="nm"|| c="nm"|| ln="list"', 'm="lantus" 32:9 32:9|| do="7 units" 32:6 32:7|| mo="nm"|| f="q.a.m....q.p.m." 32:10 32:10|| du="nm"|| r="nm"|| e="stop"|| t="past"|| c="factual"|| ln="list"', 'm="cefpodoxime" 48:6 48:6|| do="nm"|| mo="nm"|| f="after dialysis on monday-wednesday-friday." 49:1 49:4|| du="nm"|| r="nm"|| e="nm"|| t="nm"|| c="nm"|| ln="narrative"', 'm="tylenol" 74:3 74:3|| do="650 mg" 74:4 74:5|| mo="nm"|| f="p.r.n." 74:6 74:6|| du="nm"|| r="pain" 74:7 74:7|| e="nm"|| t="nm"|| c="nm"|| ln="list"', 'm="aspart" 70:2 70:2|| do="0 units" 71:8 71:9|| mo="nm"|| f="nm"|| du="nm"|| r="blood sugar" 71:4 71:5|| e="nm"|| t="nm"|| c="nm"|| ln="list"', 'm="cefpodoxime" 24:2 24:2|| do="nm"|| mo="nm"|| f="nm"|| du="nm"|| r="nm

## Set up dataframe

In [7]:
#  ["id", "row", "offset", "word", "POS", "chunk", "NER"]
entries_cols = ["id", "row", "offset", "word"]
entries_df = pd.DataFrame(columns=entries_cols)

In [8]:
entries_df.head()

Unnamed: 0,id,row,offset,word


In [9]:
annotations_cols = ["id", "NER_tag", "row", "offset", "length"]
annotations_df = pd.DataFrame(columns=annotations_cols)

In [10]:
annotations_df.head()

Unnamed: 0,id,NER_tag,row,offset,length


## Number of annotations

In [11]:
med_count = 0
dosage_count = 0
mode_count = 0
freq_count = 0
dur_count = 0
reason_count = 0

for document in a_corpus:
    for line in document:
        if "m=\"nm\"" not in line:
            med_count += 1
        if "do=\"nm\"" not in line:
            dosage_count += 1
        if "mo=\"nm\"" not in line:
            mode_count += 1
        if "f=\"nm\"" not in line:
            freq_count += 1
        if "du=\"nm\"" not in line:
            dur_count += 1
        if "r=\"nm\"" not in line:
            reason_count += 1
        
print("Medication annotations: ", med_count)
print("Dosage annotations: ", dosage_count)
print("Mode annotations: ", mode_count)
print("Frequency annotations: ", freq_count)
print("Duration annotations: ", dur_count)
print("Reason annotations: ", reason_count)


Medication annotations:  9220
Dosage annotations:  4626
Mode annotations:  3477
Frequency annotations:  4186
Duration annotations:  570
Reason annotations:  1662


## Build annotations data frame

In [12]:
annotations_df = pd.DataFrame(columns=annotations_cols)  # reset df
tmp_list = []

for i, document in enumerate(a_corpus):
    
    for row in document:
        row = row.split("||")
        #print(row, "\n")
        
        for tag in row: 
            tag = tag.split("=")
            if len(tag) > 1:
                if ":" in tag[1]:
                    tag_label = tag[0].lstrip(" ")
                    tag_row_a = tag[1].split(" ")[-2:][0].split(":")[0]
                    tag_row_b = tag[1].split(" ")[-2:][1].split(":")[0]
                    #print(tag_label, tag_row_a, tag_row_b)
                
                    # some annotations have non-standard formatting (losing 64 instances)
                    try:
                        tag_offset_a = int(tag[1].split(" ")[-2:][0].split(":")[1])
                        tag_offset_b = int(tag[1].split(" ")[-2:][1].split(":")[1])
                        length = tag_offset_b - tag_offset_a + 1
                        #print(tag_offset_a, tag_offset_b, length)

                        # 1 row = 1 token with a tag
                        first = True
                        BIO_tag = "B-"
                        if length > 1 and tag_row_a == tag_row_b:
                            for offset in range(tag_offset_a, tag_offset_b+1):
                                if first: 
                                    tag_label = BIO_tag + tag_label
                                    first = False
                                else:
                                    tag_label = tag_label.replace("B-", "I-")
                                tmp_list.append([a_ids[i], tag_label, tag_row_a, offset, 1])
                                #if "I-" in tag_label:
                                #    print(row, tag_label, tag)
                        # TODO: tags over line breaks
                        else:
                            tmp_list.append([a_ids[i], BIO_tag + tag_label, tag_row_a, tag_offset_a, length])
                    except:
                        pass
                else:
                    pass

annotations_df = pd.DataFrame(tmp_list, columns=annotations_cols)
annotations_df.reset_index(inplace=True)
                        

In [13]:
annotations_df["NER_tag"].unique()

array(['B-m', 'B-do', 'I-do', 'B-mo', 'B-f', 'I-f', 'I-m', 'B-du', 'I-du',
       'B-r', 'I-r', 'I-mo'], dtype=object)

In [14]:
annotations_df = annotations_df.drop(columns=["index", "length"])
annotations_df.shape

(36665, 4)

In [15]:
annotations_df.head()

Unnamed: 0,id,NER_tag,row,offset
0,11995,B-m,37,12
1,11995,B-do,37,13
2,11995,I-do,37,14
3,11995,B-mo,38,0
4,11995,B-f,38,1


In [16]:
entries_df.head()

Unnamed: 0,id,row,offset,word


In [17]:
e_corpus[0][0].split(" ")

['RECORD', '#11995']

## Build entries data frame

In [18]:
entries_df = pd.DataFrame(columns=entries_cols)  # reset df
tmp_list = []

for doc_i, document in enumerate(e_corpus):
    
    tmp_list.append([0, 0, 0, "-DOCSTART-"])
    tmp_list.append([0, 0, 0, "-EMPTYLINE-"])
    
    for row_i, row in enumerate(document):
        row_split = row.split(" ")
        for word_i, word in enumerate(row_split):
            word = word.rstrip(".")  # strip "." from end of word
            word = word.replace("\t", "")
            word_id = a_ids[doc_i]
            word_row = row_i+1  # 1-based indexing 
            word_offset = word_i # 0-based indexing
            
            if len(word) > 0 and "|" not in word:
                tmp_list.append([word_id, word_row, word_offset, word])
        
    tmp_list.append([0, 0, 0, "-EMPTYLINE-"])

entries_df = pd.DataFrame(tmp_list, columns=entries_cols)


In [19]:
entries_df.head()

Unnamed: 0,id,row,offset,word
0,0,0,0,-DOCSTART-
1,0,0,0,-EMPTYLINE-
2,11995,1,0,RECORD
3,11995,1,1,#11995
4,11995,2,0,785297081


In [20]:
annotations_df.head()

Unnamed: 0,id,NER_tag,row,offset
0,11995,B-m,37,12
1,11995,B-do,37,13
2,11995,I-do,37,14
3,11995,B-mo,38,0
4,11995,B-f,38,1


In [21]:
ner_counter = [1 for i in annotations_df["NER_tag"] if "B-" in i]
print(len(ner_counter), "named entities")

23676 named entities


## Joing entries and annotations

In [22]:
# ensure correct dtypes
annotations_df[['id', 'row', 'offset']] = annotations_df[['id', 'row', 'offset']].apply(pd.to_numeric)
annotations_df['NER_tag'] = annotations_df["NER_tag"].astype(str)
entries_df[['id', 'row', 'offset']] = entries_df[['id', 'row', 'offset']].apply(pd.to_numeric)
entries_df["word"] = entries_df["word"].astype(str)


In [23]:
result_df = pd.merge(entries_df, annotations_df, how="left", on=['id', 'row', 'offset'])

In [24]:
# replace NaNs with "O"
print("columns with missing data:\n", result_df.isna().any())
result_df = result_df.fillna("O")

columns with missing data:
 id         False
row        False
offset     False
word       False
NER_tag     True
dtype: bool


In [25]:
print("columns with missing data:\n", result_df.isna().any())

columns with missing data:
 id         False
row        False
offset     False
word       False
NER_tag    False
dtype: bool


In [26]:
result_df = result_df.drop(columns=["id", "row", "offset"])
result_df.head()

Unnamed: 0,word,NER_tag
0,-DOCSTART-,O
1,-EMPTYLINE-,O
2,RECORD,O
3,#11995,O
4,785297081,O


In [27]:
result_df.shape

(292887, 2)

In [28]:
# 71 fewer annotations than expected as annotations over line breaks are not included
ner_counter = [1 for i in result_df["NER_tag"] if "B-" in i]
print(len(ner_counter), "named entities")

23670 named entities


# POS tagger

In [29]:
from nltk.chunk.regexp import RegexpChunkParser, ChunkRule, RegexpParser
from nltk.tree import Tree

In [30]:
text = result_df["word"].tolist()
text_pos = pos_tag(text)
text_pos_list = [i[1] for i in text_pos]

In [31]:
len(text_pos_list)

292887

In [32]:
result_df.columns

Index(['word', 'NER_tag'], dtype='object')

In [33]:
result_df["POS_tag"] = text_pos_list

In [34]:
result_df.head()

Unnamed: 0,word,NER_tag,POS_tag
0,-DOCSTART-,O,JJ
1,-EMPTYLINE-,O,NN
2,RECORD,O,NNP
3,#11995,O,VBZ
4,785297081,O,CD


# CoNLL chunk tagger

In [35]:
text_test = "EU rejects German call to boycott British lamb.".split(" ")
text_pos_test = pos_tag(text_test)

In [36]:
text_pos_test

[('EU', 'NNP'),
 ('rejects', 'VBZ'),
 ('German', 'JJ'),
 ('call', 'NN'),
 ('to', 'TO'),
 ('boycott', 'VB'),
 ('British', 'JJ'),
 ('lamb.', 'NN')]

used for building regex 
grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN.*>+} # noun phrase
    PP: {<IN><NP>}               # prepositional phrase
    VP: {<MD>?<VB.*><NP|PP>}     # verb phrase
    CLAUSE: {<NP><VP>}           # full clause
"""

### Noun phrases

In [37]:
rule_0 = ChunkRule("<DT>?<JJ.*>*<NN.*>+", "More complete chunk NP sequences")

chunk_parser_np = RegexpChunkParser([rule_0],chunk_label='NP')

chunk_result_tree_np = chunk_parser_np.parse(text_pos)


In [38]:
chunk_tag_np = []

for i in chunk_result_tree_np:
    if isinstance(i, Tree):
        for j in range(0, len(i)):
            if j == 0:
                # print("B-" + i.label())
                chunk_tag_np.append("B-" + i.label())
            else:
                chunk_tag_np.append("I-" + i.label())
                # print("I-" + i.label())
    else:
        # print("O")
        chunk_tag_np.append("O")


In [39]:
len(chunk_tag_np) == result_df.shape[0]  # check that chunk col has same length

True

In [40]:
print(chunk_tag_np[1])

I-NP


### Verb phrases

In [41]:
rule_1 = ChunkRule("<VBD|IN|\.>", "Verb phrases")

chunk_parser_vp = RegexpChunkParser([rule_1],chunk_label='VP')

chunk_result_tree_vp = chunk_parser_vp.parse(text_pos)

In [42]:
chunk_tag_vp = []

for i in chunk_result_tree_vp:
    if isinstance(i, Tree):
        for j in range(0, len(i)):
            if j == 0:
                # print("B-" + i.label())
                chunk_tag_vp.append("B-" + i.label())
            else:
                chunk_tag_vp.append("I-" + i.label())
                # print("I-" + i.label())
    else:
        # print("O")
        chunk_tag_vp.append("O")


In [43]:
len(chunk_tag_np) == result_df.shape[0] == len(chunk_tag_vp)

True

In [44]:
# augment chunk tags with verb phrase tags
for i, entry in enumerate(chunk_tag_np):
    if entry == "O":
        chunk_tag_np[i] = chunk_tag_vp[i]

There are not prepositional phrases.

In [45]:
result_df["chunk_tag"] = chunk_tag_np

In [46]:
result_df = result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']]  # order columns

In [47]:
result_df.shape

(292887, 4)

In [48]:
result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']] = result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']].astype(str)
result_df.dtypes

word         object
POS_tag      object
chunk_tag    object
NER_tag      object
dtype: object

### Data split

In [49]:
result_df.shape

(292887, 4)

In [50]:
result_df = result_df.reindex()

In [51]:
# find indices of new documents
result_df[result_df["word"] == "-DOCSTART-"].index.values.tolist()

[0,
 858,
 1697,
 3431,
 4962,
 5695,
 6844,
 7849,
 9098,
 10547,
 11288,
 12705,
 14868,
 15652,
 17041,
 17999,
 19023,
 19868,
 21178,
 22324,
 23121,
 24181,
 24926,
 25972,
 27166,
 28662,
 29917,
 30987,
 31920,
 32863,
 33791,
 35066,
 36329,
 37519,
 38954,
 40043,
 40990,
 43497,
 44199,
 45649,
 46329,
 48408,
 49194,
 50338,
 51388,
 52032,
 53184,
 54281,
 55002,
 56258,
 58451,
 59389,
 60584,
 62185,
 63048,
 63493,
 64114,
 64933,
 65746,
 66883,
 67573,
 68806,
 69511,
 71456,
 71980,
 73313,
 74016,
 74737,
 76888,
 77523,
 78329,
 79608,
 81095,
 81602,
 81920,
 83001,
 83871,
 84583,
 85146,
 88984,
 90103,
 90815,
 91146,
 92126,
 92975,
 94225,
 96274,
 97001,
 97738,
 98920,
 100363,
 101251,
 101944,
 102590,
 104309,
 105007,
 105843,
 106819,
 107723,
 108346,
 110127,
 111580,
 112989,
 114300,
 116003,
 117855,
 118664,
 120439,
 121889,
 123318,
 124041,
 125058,
 127179,
 128125,
 128825,
 129411,
 130542,
 131363,
 132208,
 134810,
 135398,
 137238,
 1383

In [52]:
train = 202062  
dev = 247618
result_train_df = result_df.iloc[:train]
result_dev_df = result_df.iloc[train:dev]
result_test_df = result_df.iloc[dev:]

In [55]:
result_test_df.tail()

Unnamed: 0,word,POS_tag,chunk_tag,NER_tag
292787,8,CD,O,O
292788,Prednisone,NNP,B-NP,B-m
292789,Prednisone,NNP,I-NP,B-m
292790,5,CD,O,B-do
292791,mg,NN,B-NP,I-do
...,...,...,...,...
292882,6/10,CD,O,O
292883,T:,NNP,B-NP,O
292884,1/22,CD,O,O
292885,[report_end],NNP,B-NP,O


In [54]:
print("train shape ", result_train_df.shape)
print("dev shape ", result_dev_df.shape)
print("test shape ", result_test_df.shape)

train shape  (202062, 4)
dev shape  (45556, 4)
test shape  (45269, 4)


In [133]:
result_df.to_csv("result_df_NER_POS_chunk.csv")

# Write to txt

In [134]:
np.savetxt("train.txt", result_train_df.values, fmt="%s")
np.savetxt("valid.txt", result_dev_df.values, fmt="%s")
np.savetxt("test.txt", result_test_df.values, fmt="%s")