In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd '/content/drive/Shareddrives/NLP Project'

Mounted at /content/drive
/content/drive/Shareddrives/NLP Project


In [2]:
import os
from eval_script import RecordTrack1
from nltk.tokenize import WordPunctTokenizer
import json
from nltk.tokenize.punkt import PunktSentenceTokenizer

In [3]:
def process_n2c2_files_task3(base_dir, data_split, annotation_type, from_ner=False, test_ann_dir=None):
    '''
    base_dir: (str) base directory (abs or rel to current)
    data_split: (str) train, test, or dev
    tokenizer: tokenizer instance of nltk library
    '''
    text_dir = os.path.join(base_dir, data_split)
    # If we are testing, use the annotations generated in Task 2 Event
    if from_ner:
        ann_dir = os.path.join(base_dir, 'output', test_ann_dir)
    else:
        ann_dir = text_dir
    root_names = [file.replace(".ann", "") for file in 
                  os.listdir(ann_dir) if ".ann" in file]
    counts = {}
    json_output = []
    sent_tokenizer = PunktSentenceTokenizer()
    all_mentions = []
    for root_name in root_names:
        ann_file = os.path.join(ann_dir, root_name + ".ann")
        txt_file = os.path.join(text_dir, root_name + ".txt")
        txt = open(txt_file, "r")
        doc_text = txt.read()
        sentences = sent_tokenizer.tokenize(doc_text)
        spans = sent_tokenizer.span_tokenize(doc_text)
        # Process annotation file for same file
        # Only keep medical mentions for now; sort by start pos
        annotation = RecordTrack1(ann_file)
        all_ann = annotation.annotations['attributes'].values()
        anns = sorted([ann for ann in all_ann if ann.rtype == annotation_type],
                       key = lambda item: item.arg.start)
        

        all_mentions.extend(anns)
        for sent_start, sentence in enumerate(sentences):
            start_idx = spans[sent_start][0]
            end_idx = spans[sent_start][1]
            for ann in anns:
                if ann.arg.start >= start_idx and ann.arg.start <= end_idx:
                    if (sent_start > 0) and (sent_start < (len(sentences) - 1)):
                        # all 3 sentences
                        text = sentences[sent_start - 1] + sentence + sentences[sent_start + 1]
                    elif sent_start > 0:
                        # prev_sentence + sentence
                        text = sentences[sent_start - 1] + sentence
                    elif sent_start < (len(sentences) - 1):
                        # sentence + next_sentence
                        text = sentence + sentences[sent_start + 1]
                    label = ann.rval
                    if label not in counts:
                        counts[label] = 0
                    counts[label] += 1
                    json_output.append({"text": text, 
                                        "label": label, 
                                        "note_id": root_name, 
                                        "tid": ann.arg.rid, 
                                        "rid": ann.rid}) 
        assert len(all_mentions) == len(json_output)
    print(counts)
    print(json_output)

    if from_ner:
        # test file starting from NER output is labeled differently
        filepath = f"{base_dir}/input/context_input/{annotation_type.lower()}_{data_split}_from_ner.json"
    else:
        filepath = f"{base_dir}/input/context_input/{annotation_type.lower()}_{data_split}.json"

    with open(filepath, 'w') as fp:
        fp.write('\n'.join(json.dumps(i) for i in json_output) +'\n')

In [4]:
base_dir = 'data/split_data'
test_ann_dir = 'event_predicted_annotations'

# produce files from gold standard (all dispositions) and from our pipleine "from ner"
for from_ner in [False, True]:
  for ann_type in ["Certainty", "Negation", "Temporality", "Action", "Actor"]:
    print(ann_type)
    process_n2c2_files_task3(base_dir, 'test', ann_type, from_ner, test_ann_dir)


Certainty
{'Certain': 127, 'Hypothetical': 14, 'Conditional': 14}
[{'text': 'The \npatient was found unconscious on the floor, having a generalized \ntonic clonic seizure.The patient was intubated and given 4 mg of \nAtivan prior to arrival in the Emergency Department with the \nparamedics.On arrival in the Emergency Department, the patient \nwas noted to be comatose, intubated, and clearly in acute distress.', 'label': 'Certain', 'note_id': '357-01', 'tid': 'E1', 'rid': 'A1'}, {'text': 'The patient has been increased his activity and prior to this time had been extremely active with no erythema, tenderness or drainage from his leg.My concern is that he has increased warmth with blanching and erythema on the medial aspect of his knee consistent with a superficial cellulitis and I believe that he would benefit from hospital admission with a course of IV antibiotics, bedrest and elevate.We will tentatively plan to admit him for localized wound care measures, treatment with IV Nafcillin o