In [None]:
import urllib.request
import os
import codecs
import zipfile

In [None]:
import pandas as pd

In [None]:
from IPython.display import display, HTML

In [None]:
import sklearn.metrics

In [None]:
class Annotation(object):
    """A single typed character span read from a BRAT annotation file.

    The accessor methods exist so that pyConText's HTML markup helpers can
    work with these objects seamlessly.
    """

    def __init__(self):
        # -1 and '' are placeholders until a parser fills the fields in
        self.start_index, self.end_index = -1, -1
        self.type, self.spanned_text = '', ''

    def getSpan(self):
        """Return the character offsets as a (start_index, end_index) tuple."""
        span = (self.start_index, self.end_index)
        return span

    def getCategory(self):
        """Return the annotation type wrapped in a one-element list.

        pyConText graph objects actually expect a list here.
        """
        categories = [self.type]
        return categories

class AnnotatedDocument(object):
    """Holds one document: its raw text, its annotations and a document label."""

    def __init__(self):
        # empty text and no annotations until a reader populates them
        self.text, self.annotations = '', list()
        # -1 means "no label assigned yet"
        self.positive_label = -1
        
def read_brat_annotations(lines):
    """Parse the lines of a BRAT standoff (.ann) file into Annotation objects.

    Text-bound annotation lines have the form
        ID[TAB]TYPE[SPACE]START_INDEX[SPACE]END_INDEX[TAB]SPANNED_TEXT
    and a discontinuous annotation lists several ';'-separated fragments
    (e.g. ``0 5;16 23``); for those, the start of the first fragment and the
    end of the last fragment are stored.

    Blank lines and lines that carry no character offsets (relations, events,
    attributes, normalizations, notes, equivalences) are skipped.

    Args:
        lines: iterable of lines (str, or utf8-encoded bytes), with or
            without their trailing line ending.

    Returns:
        list of Annotation objects, in input order.

    Raises:
        IndexError / ValueError: if a text-bound line is malformed.
    """
    # ID prefixes BRAT uses for annotation kinds that have no START/END offsets
    non_span_prefixes = ('R', 'E', 'A', 'M', 'N', '#', '*')

    annotations = []
    for line in lines:
        if isinstance(line, bytes):
            # str(bytes) would yield the "b'...'" repr, so decode instead
            line = line.decode('utf8')
        # drop the line ending so it does not end up inside spanned_text
        line = str(line).rstrip('\r\n')
        if not line.strip():
            continue

        # at most 3 fields: any further tab belongs to the spanned text
        tab_tokens = line.split('\t', 2)
        if tab_tokens[0].startswith(non_span_prefixes):
            continue

        type_and_offsets = tab_tokens[1].split(None, 1)
        fragments = type_and_offsets[1].split(';')

        anno = Annotation()
        anno.spanned_text = tab_tokens[2] if len(tab_tokens) > 2 else ''
        anno.type = type_and_offsets[0]
        anno.start_index = int(fragments[0].split()[0])
        anno.end_index = int(fragments[-1].split()[-1])
        annotations.append(anno)
    return annotations
        
def read_annotations(archive_file, force_redownload = False):
    """Load the annotated documents stored in a zip archive of BRAT files.

    The archive is downloaded into the current working directory (under the
    last path component of ``archive_file``) unless a file of that name is
    already there. Each ``.txt`` entry supplies a document's text and the
    ``.ann`` entry with the same path stem supplies its annotations.

    Args:
        archive_file: URL of the zip archive.
        force_redownload: when True, download even if the local file exists.

    Returns:
        list of AnnotatedDocument, one per distinct entry stem, with
        ``positive_label`` set to 1 when the document carries a
        'DOCUMENT_PNEUMONIA_YES' annotation and 0 otherwise.
    """
    print('Reading annotations from file : ' + archive_file)
    filename = archive_file.split('/')[-1]

    if force_redownload or not os.path.isfile(filename):
        print('Downloading remote file : '+ archive_file)
        urllib.request.urlretrieve(archive_file, filename)

    annotated_doc_map = {}

    print('Opening local file : ' + filename)
    # the context manager closes the archive handle even if parsing fails
    with zipfile.ZipFile(filename, "r") as z:
        for name in z.namelist():
            is_text = name.endswith('.txt')
            if not (is_text or name.endswith('.ann')):
                continue
            # splitext removes only the final extension, so entries such as
            # 'a.1.txt' and 'a.2.txt' are not merged into one document
            basename = os.path.splitext(name)[0]
            if basename not in annotated_doc_map:
                annotated_doc_map[basename] = AnnotatedDocument()
            anno_doc = annotated_doc_map[basename]
            # handle text and BRAT annotation files (.ann) differently
            with z.open(name) as f1:
                if is_text:
                    anno_doc.text = f1.read().decode('utf8')
                else:
                    # handle this as utf8 or we get back byte arrays
                    anno_doc.annotations = read_brat_annotations(codecs.iterdecode(f1, 'utf8'))

    # now let's finally assign a 0 or 1 to each document based on whether we see our expected type for the pneumonia label
    for anno_doc in annotated_doc_map.values():
        anno_doc.positive_label = int(any(anno.type == 'DOCUMENT_PNEUMONIA_YES'
                                          for anno in anno_doc.annotations))

    return list(annotated_doc_map.values())

def calculate_prediction_metrics(gold_docs, prediction_function):
    """Score a document classifier against the gold labels and print the results.

    Args:
        gold_docs: documents exposing ``text`` and ``positive_label``.
        prediction_function: callable taking a document's text and returning
            the predicted label.

    Prints precision, recall, F1 and a confusion matrix; returns nothing.
    """
    gold_labels = [doc.positive_label for doc in gold_docs]
    pred_labels = [prediction_function(doc.text) for doc in gold_docs]

    # scikit-learn provides the scalar metrics
    metrics = sklearn.metrics
    precision = metrics.precision_score(gold_labels, pred_labels)
    recall = metrics.recall_score(gold_labels, pred_labels)
    f1 = metrics.f1_score(gold_labels, pred_labels)

    # Pandas cross-tabulates actual vs. predicted labels into a confusion matrix
    actual = pd.Series(gold_labels, name = 'Actual')
    predicted = pd.Series(pred_labels, name = 'Predicted')
    confusion_matrix_df = pd.crosstab(actual, predicted)

    report = (('Precision : {0}', precision),
              ('Recall : {0}', recall),
              ('F1: {0}', f1))
    for template, value in report:
        print(template.format(value))

    print('Confusion Matrix : ')
    print(confusion_matrix_df)
    
# load every annotated document from the zipped BRAT test set
annotated_docs = read_annotations(
    'https://github.com/burgersmoke/DeCART_2017_rulebased_NLP/raw/master/data/BRAT/BratTestArchive.zip')
print('Total Annotated Documents : {0}'.format(len(annotated_docs)))

# count the documents whose pneumonia label is truthy (i.e. 1)
total_positives = sum(1 for doc in annotated_docs if doc.positive_label)

print('Total Positive Pneumonia Documents : {0}'.format(total_positives))

In [None]:
# helper functions to highlight annotations from BRAT
def mark_text(txt,nodes,colors = {"name":"red","pet":"blue"},default_color="black"):
    """Color the span of every node inside ``txt`` and return the marked-up text.

    This function had to be copied and modified from
    pyConTextNLP.display.html.mark_text so that ``default_color`` is applied
    to every node (the original did not pass it through).

    Args:
        txt: the text to mark up.
        nodes: sequence of objects exposing ``getSpan()`` and ``getCategory()``
            (the latter returning a list whose first item is the category).
            Nodes are processed from last to first; the sequence itself is
            left unmodified.
        colors: mapping from category to color. It is only read, never
            mutated, so sharing the default dict between calls is harmless.
        default_color: color used for categories missing from ``colors``.

    Returns:
        ``txt`` after pyConTextNLP's ``__insert_color`` has been applied once
        per node, or ``txt`` unchanged when ``nodes`` is empty.
    """
    if not nodes:
        return txt

    # imported lazily: only needed when there is actually something to mark
    from pyConTextNLP.display.html import __insert_color

    # Iterating instead of recursing avoids hitting the recursion limit on long
    # node lists, and reversed() keeps the original last-to-first order without
    # popping items off the caller's list.
    # NOTE(review): last-to-first presumably keeps earlier offsets valid while
    # text is inserted, which assumes nodes are sorted by span - confirm.
    for n in reversed(nodes):
        txt = __insert_color(txt,
                             n.getSpan(),
                             colors.get(n.getCategory()[0],default_color))
    return txt
    
def pneumonia_html_markup(anno_doc):
    """Build an HTML paragraph of a document's text with its annotations colored.

    This mimics 'mark_document_with_html' from pyConTextNLP.display.html.

    Args:
        anno_doc: object exposing ``text`` and ``annotations``.

    Returns:
        the marked-up text wrapped in a ``<p>`` element, as a string.
    """
    from pyConTextNLP.display.html import __sort_by_span

    # one color per annotation type; any other type falls back to default_color
    colors = {
        'DOCUMENT_PNEUMONIA_YES': 'red',
        'DOCUMENT_PNEUMONIA_NO': 'green',
        'SPAN_POSITIVE_PNEUMONIA_EVIDENCE': 'orange',
    }
    default_color = 'red'

    ordered_annotations = __sort_by_span(anno_doc.annotations)
    marked_text = mark_text(anno_doc.text,
                            ordered_annotations,
                            colors=colors,
                            default_color=default_color)
    return """<p> {0} </p>""".format(marked_text)

In [None]:
# let's find the document with the most annotations
most_annotated_doc = None
for anno_doc in annotated_docs:

    # show the category of every annotation we come across
    for anno in anno_doc.annotations:
        print(anno.getCategory())

    # only a strictly larger annotation count replaces the current leader
    is_new_leader = (most_annotated_doc is None
                     or len(most_annotated_doc.annotations) < len(anno_doc.annotations))
    if not is_new_leader:
        continue

    most_annotated_doc = anno_doc
    print('Most Annotations so far : {}'.format(len(most_annotated_doc.annotations)))

In [None]:
# let's display one of our documents in HTML
markup_html = pneumonia_html_markup(most_annotated_doc)
# swap newlines for <br> tags before handing the markup to the HTML display
display(HTML(markup_html.replace('\n', '<br>')))

In [None]:
# let's first illustrate a naive baseline by always predicting NO pneumonia (i.e. 0)
def naive_negative_pneumonia_prediction(text):
    """Baseline that ignores ``text`` and always answers 0 (no pneumonia)."""
    no_pneumonia = 0
    return no_pneumonia
    
print('Predicting and validating the naive baseline of always predicting NO')
calculate_prediction_metrics(gold_docs=annotated_docs,
                             prediction_function=naive_negative_pneumonia_prediction)

In [None]:
# now the opposite naive baseline: always predicting YES pneumonia (i.e. 1)
def naive_positive_pneumonia_prediction(text):
    """Baseline that ignores ``text`` and always answers 1 (pneumonia)."""
    pneumonia = 1
    return pneumonia
    
print('Predicting and validating the naive baseline of always predicting YES')
calculate_prediction_metrics(gold_docs=annotated_docs,
                             prediction_function=naive_positive_pneumonia_prediction)

In [None]:
# now let's try a very naive simulated baseline to assign positive Pneumonia anytime the word "pneumonia" appears in a document
def naive_pneumonia_keyword_prediction(text):
    """Predict 1 when the word "pneumonia" occurs anywhere in ``text``, else 0.

    The match is case-insensitive so that capitalized mentions such as
    "Pneumonia" or "PNEUMONIA" are found as well.

    Args:
        text: the document text.

    Returns:
        1 if the keyword is present, 0 otherwise.
    """
    return 1 if 'pneumonia' in text.lower() else 0
    
print('Predicting and validating the naive PNEUMONIA keyword baseline')
calculate_prediction_metrics(gold_docs=annotated_docs,
                             prediction_function=naive_pneumonia_keyword_prediction)

In [None]:
# let's try another one where we use a class to store some "keywords"
class KeywordClassifier(object):
    """Document classifier that answers 1 when any stored keyword occurs in the text.

    Keywords are kept in the ``keywords`` set, which starts out empty and is
    meant to be filled by the caller.
    """

    def __init__(self):
        self.keywords = set()

    def predict(self, text):
        """Return 1 if at least one keyword is a substring of ``text``, else 0.

        Matching is case-insensitive on both sides, so the keyword 'positive'
        also matches 'Positive'. With no keywords stored the answer is 0.
        """
        lowered_text = text.lower()
        # any() stops at the first keyword that matches
        return int(any(keyword.lower() in lowered_text for keyword in self.keywords))
    
keyword_classifier = KeywordClassifier()
# let's load in some manual keywords...
keyword_classifier.keywords.update(('positive', 'confirmed', 'probable'))

print('Predicting and validating a classifier which uses some keywords above')
calculate_prediction_metrics(gold_docs=annotated_docs,
                             prediction_function=keyword_classifier.predict)

# NOTE : Below this point will likely be split into another notebook

In [None]:
import pyConTextNLP
import pyConTextNLP.pyConTextGraph as pyConText
import pyConTextNLP.itemData as itemData
import nltk

In [None]:
# make sure that we have downloaded NLTK tokenizer data
# Older NLTK releases load 'punkt' for sent_tokenize while newer releases look
# for 'punkt_tab' instead, so request both to work across NLTK versions.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

In [None]:
# locations of the modifier and target definition files in the pyConTextNLP repository
modifiers_kb_url = "https://raw.githubusercontent.com/chapmanbe/pyConTextNLP/master/KB/lexical_kb_05042016.tsv"
targets_kb_url = "https://raw.githubusercontent.com/chapmanbe/pyConTextNLP/master/KB/utah_crit.tsv"

modifiers = itemData.instantiateFromCSVtoitemData(modifiers_kb_url)
targets = itemData.instantiateFromCSVtoitemData(targets_kb_url)

print('Total Modifiers Loaded : [{0}]'.format(len(modifiers)))
print('Total Targets Loaded : [{0}]'.format(len(targets)))

In [None]:
def markup_sentence(s, modifiers, targets, prune_inactive=True):
    """Build a pyConText ConTextMarkup object for a single sentence.

    A fresh ConTextMarkup is created, given ``s`` as its raw text, and then
    run through the pyConText processing calls in the order shown below.

    Args:
        s: the sentence text, passed to ``setRawText``.
        modifiers: items passed to ``markItems`` with mode="modifier".
        targets: items passed to ``markItems`` with mode="target".
        prune_inactive: when true, ``dropInactiveModifiers()`` is called as
            the final step.

    Returns:
        the populated ConTextMarkup object.
    """
    markup = pyConText.ConTextMarkup()
    markup.setRawText(s)
    markup.cleanText()
    # mark modifiers first, then targets, each with its own mode
    markup.markItems(modifiers, mode="modifier")
    markup.markItems(targets, mode="target")
    # NOTE(review): pruneMarks presumably resolves overlapping marks - confirm in pyConTextNLP
    markup.pruneMarks()
    # remove every mark in the 'Exclusion' category
    markup.dropMarks('Exclusion')
    # apply modifiers to any targets within the modifiers scope
    markup.applyModifiers()
    markup.pruneSelfModifyingRelationships()
    # optional clean-up step, controlled by the caller
    if prune_inactive:
        markup.dropInactiveModifiers()
    return markup

In [None]:
# once again, let's grab the most human annotated document
# let's find the document with the most annotations
most_annotated_doc = None
for anno_doc in annotated_docs:

    # show the category of every annotation we come across
    for anno in anno_doc.annotations:
        print(anno.getCategory())

    # only a strictly larger annotation count replaces the current leader
    is_new_leader = (most_annotated_doc is None
                     or len(most_annotated_doc.annotations) < len(anno_doc.annotations))
    if not is_new_leader:
        continue

    most_annotated_doc = anno_doc
    print('Most Annotations so far : {}'.format(len(most_annotated_doc.annotations)))

In [None]:
context = pyConText.ConTextDocument()

# the sentence tokenizer is fed the lower-cased document text
sentences = nltk.sent_tokenize(most_annotated_doc.text.lower())
print('About to process {0} sentences...'.format(len(sentences)))

# mark up every sentence first, then attach the markups to the document in order
results = [markup_sentence(s, modifiers=modifiers, targets=targets) for s in sentences]
for r in results:
    context.addMarkup(r)

print('Document marked up')

In [None]:
# prepare some colors for displaying any markup we might see
# NOTE(review): values are assumed to end up as CSS color names in the rendered
# HTML, so each must be a valid CSS name ("golden" is not one; "gold" is).
clrs = {
    "pneumonia": "blue",
    "pneumothorax": "blue",
    "definite_negated_existence": "red",
    "probable_negated_existence": "indianred",
    "ambivalent_existence": "orange",
    "probable_existence": "forestgreen",
    "definite_existence": "green",
    "historical": "goldenrod",
    "indication": "pink",
    "acute": "gold"
}

In [None]:
# Import the submodule explicitly: a bare `import pyConTextNLP` is not guaranteed
# to make `pyConTextNLP.display.html` available, and this cell should not depend
# on another cell having imported it as a side effect.
import pyConTextNLP.display.html

display(HTML(pyConTextNLP.display.html.mark_document_with_html(context,colors = clrs, default_color="black")))

In [None]:
# let's see what this looks like in XML
context_xml = context.getXML()
print(context_xml)