In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): force_remount=True presumably re-mounts even when the drive
# is already mounted -- confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Switch the working directory to the shared project folder; the relative
# paths used later in this notebook (e.g. 'data/split_data') are resolved
# against it.
%cd '/content/drive/Shareddrives/NLP Project'

Mounted at /content/drive
/content/drive/Shareddrives/NLP Project


In [None]:
import os
from eval_script import RecordTrack1
from nltk.tokenize import WordPunctTokenizer
import json
from nltk.tokenize.punkt import PunktSentenceTokenizer

In [None]:
def process_n2c2_files(base_dir, data_split, tokenizer):
    '''
    Convert the .txt/.ann pairs of one data split into sentence-level NER
    examples and write them out as JSON lines.

    base_dir: (str) base directory (abs or rel to current)
    data_split: (str) train, test, or dev
    tokenizer: tokenizer instance of nltk library; it must provide both
        tokenize(text) and span_tokenize(text)

    Every sentence of every note becomes one record with the keys
    "tokens", "ner_tags" ("O" / "B-MED" / "I-MED"), "token_spans"
    (document-level character offsets of each token) and "note_id".
    The records are written, one JSON object per line, to
    {base_dir}/input/ner_input/ner_input_{data_split}.json; the output
    directory is created when it does not exist. Returns None.
    '''
    data_dir = os.path.join(base_dir, data_split)
    # Match on the real extension only, and sort so that the output order
    # does not depend on the arbitrary order returned by os.listdir.
    root_names = sorted(os.path.splitext(file)[0] for file in
                        os.listdir(data_dir) if file.endswith(".ann"))

    json_output = []
    sent_tokenizer = PunktSentenceTokenizer()
    for root_name in root_names:
        ann_file = os.path.join(data_dir, root_name + ".ann")
        txt_file = os.path.join(data_dir, root_name + ".txt")
        with open(txt_file, "r") as txt:
            doc_text = txt.read()

        # span_tokenize yields lazily in current NLTK releases, so it has to
        # be materialised before use. Sentences are sliced from these spans,
        # which keeps sentence text and offsets aligned by construction.
        sent_spans = list(sent_tokenizer.span_tokenize(doc_text))

        # Process annotation file for same file
        # Only keep medical mentions for now; sort by start pos
        annotation = RecordTrack1(ann_file)
        all_tags = annotation.annotations['tags'].values()
        meds = sorted([tag for tag in all_tags if tag.ttype == "Drug"],
                      key=lambda item: item.start)
        med_idx = 0
        # True once a token of meds[med_idx] has been labelled. It is kept
        # across sentences because a mention may straddle a sentence break.
        med_started = False

        for sent_start, sent_end in sent_spans:
            sentence = doc_text[sent_start:sent_end]
            tokens = tokenizer.tokenize(sentence)
            token_spans = list(tokenizer.span_tokenize(sentence))
            labels = ["O"] * len(tokens)

            # save character pos in document for later processing
            doc_tok_spans = [(start + sent_start, end + sent_start)
                             for (start, end) in token_spans]

            for i, (tok_start, tok_end) in enumerate(doc_tok_spans):
                # Discard every mention that ends at or before this token so
                # that a stale mention can never tag an unrelated token.
                while med_idx < len(meds) and meds[med_idx].end <= tok_start:
                    med_idx += 1
                    med_started = False
                if med_idx == len(meds):
                    break
                if tok_end <= meds[med_idx].start:
                    # Token lies entirely before the next mention.
                    continue
                # Token overlaps the mention: the first such token opens it.
                labels[i] = "I-MED" if med_started else "B-MED"
                med_started = True

            json_output.append({"tokens": tokens, "ner_tags": labels,
                                "token_spans": doc_tok_spans,
                                "note_id": root_name})

    out_path = f"{base_dir}/input/ner_input/ner_input_{data_split}.json"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w') as fp:
        # One JSON object per line; an empty split yields an empty file.
        fp.writelines(json.dumps(record) + '\n' for record in json_output)
    

In [None]:
# Generate the NER input file of every data split with one shared
# word/punctuation tokenizer.
base_dir = 'data/split_data'
tokenizer = WordPunctTokenizer()

for data_split in ('train', 'dev', 'test'):
    process_n2c2_files(base_dir=base_dir,
                       data_split=data_split,
                       tokenizer=tokenizer)