In [1]:
import glob  # Finds all the pathnames matching a specified pattern, 
             # typically specified with regex (re) 
import re
import pandas as pd
from nltk import pos_tag
from nltk.chunk.regexp import RegexpChunkParser, ChunkRule, RegexpParser
from nltk.tree import Tree
import numpy as np
import os.path 
from nltk import tokenize

In [3]:
data_dir = 'train_data/beth/'  # Path to directory containing .con and .txt files
output_dir = data_dir + 'output/'

a_corpus = glob.glob(data_dir+'concept/*.con')  # Make list of concept files
e_corpus = glob.glob(data_dir+'txt/*.txt')  # Make list of documents

base_str = "record-"
# base_str = ['_DH', '_ELMVH', '_WGH', '_PUMC', '_RWH', '_SC', '_a', '_EH', 
#            '_YC','']

In [4]:
a_ids = []
e_ids = []

# Use regex to create doc id 

for con in a_corpus:
    f_id = re.findall(r'\d+', con)[0]
    a_ids.append(f_id)

for doc in e_corpus:
    f_id = re.findall(r'\d+', doc)[0]
    e_ids.append(f_id)

a_ids = tuple(sorted(a_ids)) 
e_ids = tuple(sorted(e_ids))
intersection = list(set(a_ids) & set(e_ids))
if len(intersection) == len(a_ids):
    print("Count of concept files with corresponding doc:", len(intersection))

Count of concept files with corresponding doc: 73


In [6]:
a_corpus = []
e_corpus = []

# for f_id in a_ids:
#     for i in base_str:
#         path = data_dir + "concept/" + f_id + i +".con"
#         if os.path.isfile(path) == True:
#             with open(path) as f:
#                 content = f.read().splitlines()
#                 a_corpus.append(content)

#         path = data_dir + "txt/" + f_id + i +".txt"
#         if os.path.isfile(path) == True:
#             with open(path) as f:
#                 content = f.read().splitlines()
#                 e_corpus.append(content)

for f_id in a_ids:
    path = data_dir + "concept/" + base_str + f_id +".con"
    with open(path) as f:
        content = f.read().splitlines()
        a_corpus.append(content)

    path = data_dir + "txt/" + base_str + f_id +".txt"
    with open(path) as f:
        content = f.read().splitlines()
        e_corpus.append(content)

In [7]:
entries_cols = ["id", "row", "offset", "word"]
entries_df = pd.DataFrame(columns=entries_cols)

annotations_cols = ["id", "NER_tag", "row", "offset", "length"]
annotations_df = pd.DataFrame(columns=annotations_cols)

In [8]:
annotations_df = pd.DataFrame(columns=annotations_cols)  # Reset df
tmp_list = []  # Set up variable to hold row info

for i, document in enumerate(a_corpus):
    
    for row in document:
        row = row.split("||")
        text_info = row[0]
        type_info = row[1]
        
        text = text_info.split('"')[1]
        
        offset_start = text_info.split(' ')[-2]
        offset_end = text_info.split(' ')[-1]
        
        line = offset_start.split(':')[0] # Given one sentence to line, 
                                          # line number will be the same for offset_start and offset_end
        
        word_offset_start = int(offset_start.split(':')[1])
        word_offset_end = int(offset_end.split(':')[1])
        length = word_offset_end-word_offset_start +1
        
        a_type = type_info.split('"')[-2]
        
        # Split text into tokens with IOB tags
        first = True  # Set up flag to id start of text
#         BIO_tag = "B-"
        if length > 1:  # Isolate text with multiple tokens 
            for offset in range(word_offset_start, word_offset_end+1):
                if first:
                    tag_label = a_type # Set tag for first word to start with B-
                    first = False  # Change flag
                else:
                    tag_label = tag_label
                tmp_list.append([a_ids[i], tag_label, line, offset, 1])                
        else:
            tmp_list.append([a_ids[i], a_type, line, word_offset_start, length])
        
annotations_df = pd.DataFrame(tmp_list, columns=annotations_cols)
annotations_df = annotations_df.drop(columns=["length"])

annotations_df.head(50)

Unnamed: 0,id,NER_tag,row,offset
0,105,problem,55,6
1,105,problem,55,7
2,105,problem,55,8
3,105,problem,143,1
4,105,problem,143,2
5,105,problem,26,0
6,105,problem,68,1
7,105,problem,68,2
8,105,problem,68,3
9,105,test,21,0


In [9]:
annotations_df.head(100)

Unnamed: 0,id,NER_tag,row,offset
0,105,problem,55,6
1,105,problem,55,7
2,105,problem,55,8
3,105,problem,143,1
4,105,problem,143,2
...,...,...,...,...
95,105,problem,56,1
96,105,problem,56,2
97,105,problem,89,3
98,105,problem,89,4


In [10]:
entries_df = pd.DataFrame(columns=entries_cols)  # Reset df
tmp_list = []

for doc_i, document in enumerate(e_corpus):
    
    tmp_list.append([0, 0, 0, "-DOCSTART-"])
    tmp_list.append([0, 0, 0, "-EMPTYLINE-"])
    
    for row_i, row in enumerate(document):
        row_split = row.split(" ")
        for word_i, word in enumerate(row_split):
            word = word.replace("\t", "")
            word_id = a_ids[doc_i]
            word_row = row_i+1  # 1-based indexing 
            word_offset = word_i # 0-based indexing
            
            if len(word) > 0 and "|" not in word:
                tmp_list.append([word_id, word_row, word_offset, word])
        
    tmp_list.append([0, 0, 0, "-EMPTYLINE-"])

entries_df = pd.DataFrame(tmp_list, columns=entries_cols)

ner_counter = [1 for i in annotations_df["NER_tag"] if "B-" in i]
print(len(ner_counter), "named entities")

0 named entities


In [11]:
annotations_df[['id', 'row', 'offset']] = annotations_df[['id', 'row', 'offset']].apply(pd.to_numeric)
annotations_df['NER_tag'] = annotations_df["NER_tag"].astype(str)
entries_df[['id', 'row', 'offset']] = entries_df[['id', 'row', 'offset']].apply(pd.to_numeric)
entries_df["word"] = entries_df["word"].astype(str)

result_df = pd.merge(entries_df, annotations_df, how="left", on=['id', 'row', 'offset'])

# Check for NaNs (should be only in NER_tag, where NaNs will be replaced with "O" (outside))
print("Columns with missing data:\n", result_df.isna().any())

Columns with missing data:
 id         False
row        False
offset     False
word       False
NER_tag     True
dtype: bool


In [12]:
result_df = result_df.fillna("O")
result_df = result_df.drop(columns=["id", "row", "offset"])

ner_counter = [1 for i in result_df["NER_tag"] if "B-" in i]
print(len(ner_counter), "named entities and", result_df.shape[0], "tokens")

0 named entities and 88942 tokens


In [13]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Sylvia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [14]:
text = result_df["word"].tolist()
text_pos = pos_tag(text)
text_pos_list = [i[1] for i in text_pos]
print(len(text_pos_list), 'tokens')

88942 tokens


In [15]:
rule_0 = ChunkRule("<DT>?<JJ.*>*<NN.*>+", "More complete chunk NP sequences")
chunk_parser_np = RegexpChunkParser([rule_0],chunk_label='NP')
chunk_result_tree_np = chunk_parser_np.parse(text_pos)

chunk_tag_np = []

for i in chunk_result_tree_np:
    if isinstance(i, Tree):
        for j in range(0, len(i)):
            if j == 0:
                # print("B-" + i.label())
                chunk_tag_np.append("B-" + i.label())
            else:
                chunk_tag_np.append("I-" + i.label())
                # print("I-" + i.label())
    else:
        # print("O")
        chunk_tag_np.append("O")

len(chunk_tag_np) == result_df.shape[0]  # check that chunk col has same length

True

In [16]:
rule_1 = ChunkRule("<VBD|IN|\.>", "Verb phrases")
chunk_parser_vp = RegexpChunkParser([rule_1],chunk_label='VP')
chunk_result_tree_vp = chunk_parser_vp.parse(text_pos)

chunk_tag_vp = []

for i in chunk_result_tree_vp:
    if isinstance(i, Tree):
        for j in range(0, len(i)):
            if j == 0:
                # print("B-" + i.label())
                chunk_tag_vp.append("B-" + i.label())
            else:
                chunk_tag_vp.append("I-" + i.label())
                # print("I-" + i.label())
    else:
        # print("O")
        chunk_tag_vp.append("O")
        
len(chunk_tag_np) == result_df.shape[0] == len(chunk_tag_vp)

True

In [17]:
for i, entry in enumerate(chunk_tag_np):
    if entry == "O":
        chunk_tag_np[i] = chunk_tag_vp[i]
        
result_df["POS_tag"] = text_pos_list
result_df["chunk_tag"] = chunk_tag_np

result_df = result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']]  # Reorder columns
result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']] = result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']].astype(str)

In [18]:
output_df = result_df.copy()

# Identify idx at end of sentences
idx = output_df[output_df['word'] == '.'].index.values.tolist()
print(len(idx), 'total sentences')

4585 total sentences


In [19]:
df_new = pd.DataFrame(index=[el+0.5 for el in idx]) 
output_df = pd.concat([output_df, df_new])
output_df = output_df.sort_index()
output_df = output_df.reset_index(drop=True)
output_df.fillna("", inplace=True)
output_df[output_df['word'] == ""].head()

Unnamed: 0,word,POS_tag,chunk_tag,NER_tag
92,,,,
108,,,,
130,,,,
148,,,,
155,,,,


In [20]:
output_df['Sentences #'] = (output_df['word'] == ".").cumsum() + 1
output_df['Sentences #'] = output_df['Sentences #'].shift(1)
output_df['Sentences #'].iloc[0] = 1
output_df['Sentences #'] = output_df['Sentences #'].astype(int)
output_df['Sentences #'] = 'Sentence:' + output_df['Sentences #'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [21]:
output_df

Unnamed: 0,word,POS_tag,chunk_tag,NER_tag,Sentences #
0,-DOCSTART-,JJ,B-NP,O,Sentence:1
1,-EMPTYLINE-,NNP,I-NP,O,Sentence:1
2,Admission,NNP,I-NP,O,Sentence:1
3,Date,NNP,I-NP,O,Sentence:1
4,:,:,O,O,Sentence:1
...,...,...,...,...,...
93522,End,NNP,B-NP,O,Sentence:4586
93523,of,IN,B-VP,O,Sentence:4586
93524,Report,NNP,B-NP,O,Sentence:4586
93525,),),O,O,Sentence:4586


In [22]:
np.savetxt(output_dir+"beth.txt", output_df.values, fmt="%s")

In [23]:
# train = int(round(output_df.shape[0]*.7,0))  
# dev = int(round(output_df.shape[0]*.85,0))
# result_train_df = output_df.iloc[:train]
# result_dev_df = output_df.iloc[train:dev]
# result_test_df = output_df.iloc[dev:]

In [24]:
# np.savetxt(output_dir+"train.txt", result_train_df.values, fmt="%s")
# np.savetxt(output_dir+"valid.txt", result_dev_df.values, fmt="%s")
# np.savetxt(output_dir+"test.txt", result_test_df.values, fmt="%s")