In [None]:
# we will definitely need pyConText
import pyConTextNLP
from pyConTextNLP import pyConTextGraph
from pyConTextNLP.itemData import itemData
from pyConTextNLP.display.html import mark_document_with_html
print(pyConTextNLP.__version__)

In [None]:
# useful utilities in RadNLP as well
import radnlp
import radnlp.view as rview
from radnlp.data import classrslts

In [None]:
# we will need a few other packages
import urllib
import urllib.request

import nltk
import pandas as pd

In [None]:
# packages for interaction
# `IPython.html.widgets` is a deprecated compatibility shim left over from before the
# widgets moved into the standalone `ipywidgets` package, so the helpers are imported
# from ipywidgets directly.
import ipywidgets
from ipywidgets import interact, interactive, fixed

In [None]:
from nlp_pneumonia_utils import Annotation
from nlp_pneumonia_utils import AnnotatedDocument
from nlp_pneumonia_utils import read_brat_annotations
from nlp_pneumonia_utils import read_annotations
from nlp_pneumonia_utils import calculate_prediction_metrics
from nlp_pneumonia_utils import mark_text
from nlp_pneumonia_utils import pneumonia_html_markup

print('Imported pneumonia nlp utilities...')

In [None]:
from IPython.display import display, HTML, Image

In [None]:
%matplotlib inline

In [None]:
# First thing, let's load in our dataset
# `read_annotations` comes from the local nlp_pneumonia_utils module; the archive is
# presumably a zipped brat annotation export, judging by its file name.

# TODO : Update this to use a training set
annotated_docs = read_annotations('pneumonia_brat_full_set1.zip')

# sanity check: report how many annotated documents were loaded
print('Total Annotated Documents : {0}'.format(len(annotated_docs)))

In [None]:
def markup_sentence(s, modifiers, targets, prune_inactive=True, verbose = False):
    """
    Build a pyConText markup object for one sentence.

    A fresh ``ConTextMarkup`` is loaded with the sentence text, cleaned, tagged
    with the target items and then the modifier items, pruned, stripped of
    'Exclusion' marks, and finally has its modifiers applied to the targets.

    Parameters
    ----------
    s : the raw sentence text
    modifiers : items marked with mode="modifier"
    targets : items marked with mode="target"
    prune_inactive : when truthy, dropInactiveModifiers() is called last
    verbose : accepted for call compatibility; not used by this function

    Returns
    -------
    The populated ``ConTextMarkup`` object.
    """
    sentence_markup = pyConTextGraph.ConTextMarkup()

    # load and normalise the text
    sentence_markup.setRawText(s)
    sentence_markup.cleanText()

    # tag targets first, then modifiers
    sentence_markup.markItems(targets, mode="target")
    sentence_markup.markItems(modifiers, mode="modifier")

    # tidy up the marks before relating modifiers to targets
    sentence_markup.pruneMarks()
    sentence_markup.dropMarks('Exclusion')

    # attach each modifier to the targets that fall inside its scope
    sentence_markup.applyModifiers()
    sentence_markup.pruneSelfModifyingRelationships()

    if prune_inactive:
        sentence_markup.dropInactiveModifiers()

    return sentence_markup

In [None]:
# let us set up an example document to work with
# (a short multi-sentence chest report that is reused by later cells)
example_document = """
PORTABLE CHEST:  Comparison made to prior film from X:XX a.m. the same day.
     
The ET tube and nasogastric tube remain in good position. Cardiac and
mediastinal contours are stable. No acute changes are seen within the lung
parenchyma; specifically, there is no evidence of new infiltrate (skin folds
do project over the right lung). No consolidation on either side.

IMPRESSION: No evidence of pneumonia."""

# the last line of the report above on its own, for single-sentence markup
example_sentence = """IMPRESSION: No evidence of pneumonia."""

# Before we continue, note that any itemData in pyConText has 4 parts:
1. The literal (e.g. "pneumonia", "pneumoniathorax", "can rule out", "cannot be excluded", etc)
2. The category (e.g. "EVIDENCE_OF_PNEUMONIA")
3. The regular expression (optional) used to capture the literal in the text. If no regular expression is provided, a regular expression is generated literally from the literal.
4. The rule (optional). If the itemData is being used as a modifier, the rule states what direction the modifier operates in the sentence: current valid values are: "forward", the item can modify objects following it in the sentence; "backward", the item can modify objects preceding it in the sentence; or "bidirectional", the item can modify objects preceding and following it in the sentence.

In [None]:
# Now let's set up some rules for pyConText for EVIDENCE_OF_PNEUMONIA
# At this moment, we will just set up these "concepts" and we'll handle modifiers for them after that
# (no modifiers yet, so the modifier collection is left empty)
modifiers1 = []

# remember from above that each target entry looks like this:
# ["literal", "CATEGORY", "regular expression(s)", "empty or forward or backward or bidirectional"]

# so now let's set this up for "pneumonia" with the category "EVIDENCE_OF_PNEUMONIA"
# (empty regular expression and empty rule)
targets1 = itemData(["pneumonia", "EVIDENCE_OF_PNEUMONIA", "", ""])

# let's go ahead and use this now on one single example sentence:
markup = markup_sentence(example_sentence, modifiers1, targets1)
# prettier display with IPython display
display(markup.nodes(data = True))

In [None]:
# this works on entire documents, combining all sentence-level markups into
# one object that we can then graph
def markup_context_document(report_text, modifiers, targets):
    """
    Mark up every sentence of a report and collect the results in one document.

    The report is split into sentences with nltk; each sentence is passed to
    ``markup_sentence`` with the given modifiers and targets, and the resulting
    markup is added to a single ``ConTextDocument``, which is returned.
    """
    document = pyConTextGraph.ConTextDocument()

    # nltk handles the sentence splitting
    for sentence_text in nltk.sent_tokenize(report_text):
        document.addMarkup(
            markup_sentence(sentence_text, modifiers=modifiers, targets=targets)
        )

    return document

In [None]:
# a second example sentence; it contains the plural "pneumonias" and the phrase "cannot be excluded"
example_sentence_2 = """Findings consistent with CHF, although underlying bilateral lower lobe pneumonias cannot be excluded."""

In [None]:
# let's see how things look on this sentence, using the first (single-target) rule set
# NOTE: `verbose` is accepted by markup_sentence but is not used in its body
markup_sentence_2 = markup_sentence(example_sentence_2, modifiers1, targets1, verbose = True)
# show the nodes of the resulting markup together with their data
display(markup_sentence_2.nodes(data = True))

# So we didn't mark up a target for "pneumonias" since we only had the singular variant "pneumonia".  Let's add that as we augment our target concepts

In [None]:
# Our first attempt used a single, very simple target, so now let's add some additional concepts
# (still no modifiers, so the modifier collection is left empty)
modifiers2 = []

# remember from above that each target entry looks like this:
# ["literal", "CATEGORY", "regular expression(s)", "empty or forward or backward or bidirectional"]

# before we continue, let's clear the mapping of compiled regular expressions which pyConText uses
# (presumably so the new pattern for "pneumonia" is not shadowed by one compiled earlier -- TODO confirm)
if pyConTextGraph.compiledRegExprs:
    print('Clearing pyConText compiled regular expressions')
    pyConTextGraph.compiledRegExprs = {}

# so now let's set this up with more variants of "EVIDENCE_OF_PNEUMONIA"
# the first entry supplies an explicit regular expression with an optional trailing "s"
targets2 = itemData(["pneumonia", "EVIDENCE_OF_PNEUMONIA", r"pneumonia[s]?", ""],
                   ["consolidation", "EVIDENCE_OF_PNEUMONIA", "", ""],
                   ["infiltrate", "EVIDENCE_OF_PNEUMONIA", "", ""])

# let's go ahead and use this again on our updated targets, this time on the whole document
context = markup_context_document(example_document, modifiers2, targets2)
# prettier display with IPython display
display(context.getDocumentGraph().nodes(data = True))

In [None]:
# Render the marked-up document as colored HTML.
# Only the evidence category gets its own color here; everything else falls back to black.
evidence_only_colors = {"evidence_of_pneumonia": "orange"}

context_html = mark_document_with_html(
    context,
    colors=evidence_only_colors,
    default_color="black",
)
display(HTML(context_html))

In [None]:
# let's also look again to see if our regular expression for "pneumonia" and "pneumonias" worked properly
# by re-running the second example sentence against the expanded targets
markup_sentence_2_check = markup_sentence(example_sentence_2, modifiers2, targets2)
# show the target definitions that were used, then the nodes that were marked
print(targets2)
display(markup_sentence_2_check.nodes(data = True))

In [None]:
# With some pyConText targets in place, wire them into a classifier so that we
# can see how adding targets can raise Recall even if Precision suffers.
# Precision is addressed later, once ConText Modifiers come into play.
class ConTextTargetOnlyClassifier(object):
    """
    Document classifier that only checks for EVIDENCE_OF_PNEUMONIA categories.

    The modifiers and targets given at construction are passed unchanged to
    ``markup_context_document`` for every prediction.
    """

    def __init__(self, modifiers, targets):
        self.modifiers = modifiers
        self.targets = targets

    def predict(self, text):
        """
        Return True when at least one node of the document graph carries the
        category EVIDENCE_OF_PNEUMONIA (compared case-insensitively), else False.
        """
        # sentence-wise markup of the whole text, merged into a single graph
        document_graph = markup_context_document(
            text, self.modifiers, self.targets
        ).getDocumentGraph()

        # a single matching category on any node is enough for a positive call
        return any(
            label.upper() == 'EVIDENCE_OF_PNEUMONIA'
            for graph_node in document_graph.nodes()
            for label in graph_node.getCategory()
        )
           
# this one has only one target
classifier1 = ConTextTargetOnlyClassifier(modifiers1, targets1)
# this one has 3...
classifier2 = ConTextTargetOnlyClassifier(modifiers2, targets2)

# and now we can assess their performance on the annotated documents
print('****************')
print('Performance for Classifier 1 : One total Target')
calculate_prediction_metrics(annotated_docs, classifier1.predict)

print('****************')
# the label names classifier 2, whose predictions are the ones evaluated here
print('Performance for Classifier 2 : 3 total Targets')
calculate_prediction_metrics(annotated_docs, classifier2.predict)

In [None]:
# So we have improved recall, but what are we going to do about Precision?
# Since Precision and Recall are weighted equally in our F1 measure, we need to address it.

# The solution to this is to improve our classification pipeline with ConText Modifiers.

# Developing modifiers takes time and objective measurement.  Luckily, many of them have already
# been developed by Dr. Wendy Chapman and others on various research efforts.
# Let's see what kind of data they contain.
# URL of a tab-separated (.tsv) lexical knowledge base in the pyConTextNLP GitHub repository
context_modifiers_url = "https://raw.githubusercontent.com/chapmanbe/pyConTextNLP/master/KB/lexical_kb_05042016.tsv"

In [None]:
# Download the modifier knowledge base. The response is opened in a `with` block so the
# connection is closed as soon as pandas has finished reading from it.
with urllib.request.urlopen(context_modifiers_url, data=None) as modifier_file:
    # now let's load this in directly into a DataFrame with Pandas (the file is tab-separated)
    modifier_df = pd.read_csv(modifier_file, delimiter = "\t")

# take a look at the first and last ten rows
display(modifier_df.head(10))
display(modifier_df.tail(10))

In [None]:
# build the pyConText modifier items for pipeline #3 directly from the knowledge-base URL
modifiers3 = pyConTextNLP.itemData.instantiateFromCSVtoitemData(context_modifiers_url)
# let's just use the same targets as above for our third pipeline
# (targets3 refers to the same object as targets2; it is not a copy)
targets3 = targets2

# report how many modifier and target items pipeline #3 will use
print('Total Modifiers Loaded for pipeline #3 : [{0}]'.format(len(modifiers3)))
print('Total Targets Loaded for pipeline #3 : [{0}]'.format(len(targets3)))

In [None]:
# Now we can leverage both Targets and Modifiers to properly take context into account.
# Let's see what this looks like in HTML with our document.
# Prepare some colors for displaying any markup category we might see.
# NOTE(review): the values are presumably emitted as HTML/CSS color names, so each one must be
# a valid named color -- "golden" is not one, hence "gold" for the "acute" category.
colors = {
    "evidence_of_pneumonia": "orange",
    "definite_negated_existence": "red",
    "probable_negated_existence": "indianred",
    "ambivalent_existence": "orange",
    "probable_existence": "forestgreen",
    "definite_existence": "green",
    "historical": "goldenrod",
    "indication": "pink",
    "acute": "gold"
}

# let's mark up a new context object for our pipeline #3
context3 = markup_context_document(example_document, modifiers3, targets3)

# render the marked-up document; categories missing from `colors` fall back to black
display(HTML(pyConTextNLP.display.html.mark_document_with_html(context3, colors = colors, default_color="black")))

In [None]:
# now let's take a closer look at the XML to see how this is working behind the scenes
# (the XML is printed as plain text rather than rendered)
print(context3.getXML())

In [None]:
%%time
# NOTE : This is a "magic" command to Jupyter to time the execution of this entire cell

# OK, so now that we've got some decent Targets and Modifiers to start from, let's process all of the documents
# and then visualize the relationships between Targets and Modifiers for some of these documents
# One result object is collected per annotated document, in the same order as annotated_docs.
report_results = []
print('Marking up all documents...')
for anno_doc in annotated_docs:
    # sentence-split the report text and mark up every sentence with pipeline #3's rules
    report_context = markup_context_document(anno_doc.text, modifiers3, targets3)
    # package this up into a class that the RadNLP utilities can use
    # (no classification is performed here, hence the 'N/A' placeholder)
    results = classrslts(context_document=report_context, exam_type="Chest X-Ray", report_text=anno_doc.text, classification_result='N/A')
    report_results.append(results)

print('DONE Marking up all documents...')

In [None]:
# This function lets us iterate through all documents and view the markup
def view_markup(contexts, colors):
    """
    Show an integer slider for browsing marked-up reports one at a time.

    Parameters
    ----------
    contexts : indexable collection of result objects; each one is passed to
        ``rview.markup_to_pydot`` and must expose a ``context_document`` attribute
    colors : mapping of markup category to color, passed to
        ``mark_document_with_html`` for the HTML rendering

    Returns
    -------
    None. If ``contexts`` is empty a message is printed and no slider is created.
    """
    if len(contexts) == 0:
        # an empty collection would give the slider max=-1, below its min of 0
        print('No marked-up documents to display')
        return

    @interact(i=ipywidgets.IntSlider(min=0, max=len(contexts)-1))
    def _view_markup(i):
        markup = contexts[i]
        # NOTE(review): markup_to_pydot is expected to write "tmp.png" into the working
        # directory, which is the image displayed next -- confirm against radnlp.view
        rview.markup_to_pydot(markup)
        display(Image("tmp.png"))

        # use the palette supplied by the caller rather than a notebook-level global
        report_html = pyConTextNLP.display.html.mark_document_with_html(markup.context_document, colors = colors, default_color="black")

        display(HTML(report_html))

In [None]:
# launch the interactive viewer over all of the marked-up reports
view_markup(report_results, colors)