# Step 1 - 6
### Steps 1–6 are covered in [the MedExtractionPipeline notebook](./MedExtractionPipeline.ipynb)

# Step 7: Processing Context

The template of this notebook is largely based on the `MultiSentenceDocuments.ipynb` example notebook from pyConTextNLP.

In [1]:
import sys
sys.path.append('../') 
sys.path.append('../modules') # Append Map with modules to the source tree
import numpy as np
import pandas as pd
import re
from textblob import TextBlob
import EMR_functions as em

import modules.pyConTextNLP2.pyConTextNLP.pyConText as pyConText

import modules.pyConTextNLP2 #import itemData
#from textblob import TextBlob
import modules.pyConTextNLP2.pyConTextNLP.display.html as html
from IPython.display import display, HTML

def read_csv(f_name):
    """Read the pipe-delimited file `f_name` into a pandas DataFrame."""
    frame = pd.read_csv(f_name, sep="|")
    return frame

## Open Dataset

In [2]:
# Load the exploration dataset: pipe-delimited, first column used as the index,
# with the string 'NA' listed as a missing-value marker.
df_explore = pd.read_csv(
    r'../output_files/DF_explore.csv',
    sep="|",
    index_col=0,
    na_values=['NA'],
)


In [3]:
# Draw a reproducible sample (fixed random_state) sized at half of the dataset.
# replace=True samples with replacement, so the same row can appear more than once.
df_annotated = df_explore.sample(
    frac=0.5,
    replace=True,
    random_state=777,
)

Comments: 01/02/2019 vb1 7.5 mg per wk
Direction: bidirectional
Lex: weken geleden
Regex: (\d+(?:\.\d+)?)\s?(x|mg|g)?\s?(?:\S+)?\s?(weken|wk|week|wkn|maand|mnd|maanden|jr|jaar|jaren)
Type: FREQUENCY, CONCENTRATION
Capture: $strength_nr $strength_unit $freq_unit


Comments: 01/02/2019 vb1 dagelijkse dosis 5 mg
Direction: bidirectional
Lex: dagelijkse dosis mg
Regex: (d|dd|dagelijkse dosis) (\d+(?:\.\d+)?) (x|mg|g)?
Type: FREQUENCY, CONCENTRATION
Capture: $freq_unit $strength_nr $strength_unit


Comments: 01/02/2019 vb1 dagelijkse dosis 5 mg
Direction: bidirectional
Lex: [nr] dagelijkse dosis mg
Regex: (\d+(?:\.\d+)?)\s(?:\S+)?\s?(d|dd|dagelijkse dosis)\s(\d+(?:\.\d+)?)\s(x|mg|g)?
Type: FREQUENCY, CONCENTRATION
Capture: $freq_nr $freq_unit $strength_nr $strength_unit


Comments: 01/02/2019 vb1 dagelijkse dosis 5 mg
Direction: bidirectional
Lex: [nr] dagelijkse dosis mg
Regex: (nr|naar)\s(?:\S+)?\s?(d|dd|dagelijkse dosis)\s(\d+(?:\.\d+)?)\s?(x|mg|g)?
Type: FREQUENCY, CONCENTRATION
Capture

### Explore Data - Pattern searching

In [38]:
# Explore the free-text answers: capture one token, a number (optionally with a
# decimal part), and the four tokens that follow. Rows where nothing matched
# (all groups NaN) are dropped.
# The pattern is a raw string so `\S` / `\d` reach the regex engine unchanged,
# and the decimal point is escaped so it only matches a literal '.'.
df_annotated['XANTWOORD'].str.extract(
    r'(\S+) (\d+(?:\.\d+)?) (\S+) (\S+) (\S+) (\S+)'
).dropna(thresh=1)

Unnamed: 0,0,1,2,3,4,5
30214,lab,1,x,p,mnd,dorogaan
43472,^prednisolon,10,mg,door,tot,raz
20045,naar,1,d,200,mg,^uitleg
26176,^co,6,mnd,met,lab,en
36642,lab,3,mnden,^,-,controle
26804,;,40,mg,ongecompliceerd,^injectie,linker
9632,",",10,:,15,h,-
29529,naar,75,mg,/,dg,^tp
17849,mtx,15,^advies,probeer,diclo,alleen
2444,naar,2,dd,500,mg,^


### Initialize list of modifiers & targets

- Targets = other words for entities
- Modifiers = terms that influence the meaning of the sentence (like negation terms)

In [1]:
import sys
sys.path.append('../') 
sys.path.append('../modules') 

import modules.pyConTextNLP2.pyConTextNLP.pyConText as pyConText
import modules.pyConTextNLP2.pyConTextNLP.itemData as itemData
from IPython.display import display, HTML
import EMR_functions as em




# Load the context rules from the local corpus folder: the regex-based
# modifiers and the target terms.
modifiers = itemData.get_items('../corpus/featureExtractionRegex.yml', url=False)
targets = itemData.get_items('../corpus/targets_simple.yml', url=False)

print(modifiers)
# TODO: maybe use different modifiers / entities for different steps.

[literal<<7 mg per week>>; category<<['frequency', 'concentration']>>; re<<(\d+(?:\.\d+)?)\s?(mg|g)?\s?(?:\S+)?\s?(weken|wk|week|wkn|maand|mnd|maanden|jr|jaar|jaren)>>; rule<<bidirectional>>, literal<<1 x elke week>>; category<<['frequency', 'concentration']>>; re<<(\d+(?:\.\d+)?)\s?(?:\S+\s){0,3}\s?(weken|wk|week|wkn|maand|mnd|maanden|jr|jaar|jaren)>>; rule<<bidirectional>>, literal<<dagelijkse dosis mg>>; category<<['frequency', 'concentration']>>; re<<(d|dd|dagelijkse dosis) (\d+(?:\.\d+)?) (x|mg|g)?>>; rule<<bidirectional>>, literal<<conc dagelijkse dosis mg>>; category<<['frequency', 'concentration']>>; re<<(\d+(?:\.\d+)?)\s(?:\S+)?\s?(d|dd|dagelijkse dosis)\s(\d+(?:\.\d+)?)\s(x|mg|g)?>>; rule<<bidirectional>>, literal<<nr dagelijkse dosis mg>>; category<<['frequency', 'concentration']>>; re<<(?:nr|naar)?\s?(\d+(?:\.\d+)?)?\s?(?:\S+)?\s?(d|dd|dagelijkse dosis)\s(\d+(?:\.\d+)?)\s?(x|mg|g)?>>; rule<<bidirectional>>, literal<<X mg>>; category<<['concentration']>>; re<<(\d+(?:\.\d+)?)

### Markup sentence with the modifiers / targets

In [2]:
def markup_sentence(s, modifiers, targets, prune_inactive=True):
    """Build a pyConText markup for a single sentence.

    The sentence `s` is loaded into a fresh ``ConTextMarkup`` and cleaned, then
    marked with `modifiers` (mode "modifier") followed by `targets`
    (mode "target"). Marks are pruned, marks named 'Exclusion' are dropped,
    the modifiers are applied to the targets, and self-modifying relationships
    are pruned. When `prune_inactive` is true, inactive modifiers are dropped
    as a final step.

    Returns the resulting ``ConTextMarkup`` object.
    """
    sentence_markup = pyConText.ConTextMarkup()
    sentence_markup.setRawText(s)
    sentence_markup.cleanText()

    sentence_markup.markItems(modifiers, mode="modifier")
    sentence_markup.markItems(targets, mode="target")
    sentence_markup.pruneMarks()
    sentence_markup.dropMarks('Exclusion')

    # Apply modifiers to any targets within the modifiers' scope.
    sentence_markup.applyModifiers()
    sentence_markup.pruneSelfModifyingRelationships()

    if not prune_inactive:
        return sentence_markup
    sentence_markup.dropInactiveModifiers()
    return sentence_markup

## Split the document into sentences and process each sentence

pyConTextNLP comes with a simple sentence splitter in ``helper.py``. I have not been maintaining this and have recently been using TextBlob to split sentences. A known problem with either sentence splitting solution is enumerated lists that don't use periods.

Important: TextBlob's sentence splitting relies on NLTK, so you might need to download the 'punkt' tokenizer data first (e.g. `nltk.download('punkt')`).

In [22]:
# Colour per category, passed as `colors` when rendering marked-up text as HTML.
clrs = {
    # entity being described
    "target": "blue",
    # existence / negation modifiers
    "definite_negated_existence": "red",
    "probable_negated_existence": "indianred",
    "ambivalent_existence": "orange",
    "probable_existence": "forestgreen",
    "definite_existence": "green",
    "historic": "goldenrod",
    # dosage-related modifiers
    "frequency": "pink",
    "concentration": "green",
    "change": "violet",
}

In [32]:
import xml.etree.ElementTree
import matplotlib.pyplot as plt
import modules.pyConTextNLP2.pyConTextNLP.display.html as html
from textblob import TextBlob
import modules.pyConTextNLP2
from IPython.display import display, HTML


def readContext(report, verbose=True):
    """Mark up every sentence of `report` and collect the results in one document.

    The report is lower-cased and split into sentences with TextBlob. Each
    sentence is passed to `markup_sentence` together with the notebook-level
    `modifiers` and `targets`, and the resulting markups are added, in order,
    to a new ``pyConText.ConTextDocument``.

    Parameters
    ----------
    report : str
        Text to process.
    verbose : bool, default True
        When true, print the list of per-sentence markups before they are
        added to the document (this is the original behaviour). Pass False
        to suppress the output.

    Returns
    -------
    pyConText.ConTextDocument
        Document holding one markup per sentence.
    """
    context = pyConText.ConTextDocument()
    blob = TextBlob(report.lower())

    sentence_markups = [
        markup_sentence(sentence.raw, modifiers=modifiers, targets=targets)
        for sentence in blob.sentences
    ]

    if verbose:
        print(sentence_markups)

    for sentence_markup in sentence_markups:
        context.addMarkup(sentence_markup)
    return context

# Sample phrases, each mentioning 'mtx', used to try out the dosage/frequency rules.
units = [
    'ophogen van 20 tot 3.5 g per mnd mtx.',
    'nr 2 d 100 mg mtx',
    'naar 10 mg mtx',
    '1x elke week mtx',
    'mtx hervatten lagere dosis : 7.5 mg / wk per os ',
    'van 20 naar 3.5 mg mtx',
]

# Mark up each phrase and render the result as colour-coded HTML.
for x in units:
    context = readContext(x)
    display(
        HTML(
            html.mark_document_with_html(context, colors=clrs, default_color="black")
        )
    )






[__________________________________________
rawText: ophogen van 20 tot 3.5 g per mnd mtx.
cleanedText: ophogen van 20 tot 3.5 g per mnd mtx.
********************************
TARGET: <id> 273322116129484881588993682792637350921 </id> <phrase> mtx </phrase> <category> ['medication'] </category> <capture> {'target': 'mtx'} </capture> 
----MODIFIED BY: <id> 273322115733344069015107194837130101769 </id> <phrase> 3.5 g per mnd </phrase> <category> ['frequency', 'concentration'] </category> <capture> {'strength_nr': '3.5', 'strength_unit': 'g', 'freq_unit': 'mnd'} </capture> 
__________________________________________
]


[__________________________________________
rawText: nr 2 d 100 mg mtx
cleanedText: nr 2 d 100 mg mtx
********************************
TARGET: <id> 273326869740007574935613491459108455433 </id> <phrase> mtx </phrase> <category> ['medication'] </category> <capture> {'target': 'mtx'} </capture> 
----MODIFIED BY: <id> 273326077775295082350004223335091749897 </id> <phrase> nr 2 d 100 mg </phrase> <category> ['frequency', 'concentration'] </category> <capture> {'freq_nr': '2', 'freq_unit': 'd', 'strength_nr': '100', 'strength_unit': 'mg'} </capture> 
__________________________________________
]


[__________________________________________
rawText: naar 10 mg mtx
cleanedText: naar 10 mg mtx
********************************
TARGET: <id> 273345094198089918589162225689717457929 </id> <phrase> mtx </phrase> <category> ['medication'] </category> <capture> {'target': 'mtx'} </capture> 
----MODIFIED BY: <id> 273345094118861756072977384330076701705 </id> <phrase> 10 mg </phrase> <category> ['concentration'] </category> <capture> {'strength': '10', 'unit': 'mg'} </capture> 
__________________________________________
]


[__________________________________________
rawText: 1x elke week mtx
cleanedText: 1x elke week mtx
********************************
TARGET: <id> 273347471359877996576153727598426407945 </id> <phrase> mtx </phrase> <category> ['medication'] </category> <capture> {'target': 'mtx'} </capture> 
----MODIFIED BY: <id> 273347471280649834059574539796414022665 </id> <phrase> 1x elke week </phrase> <category> ['frequency', 'concentration'] </category> <capture> {'freq_nr': '1', 'freq_unit': 'week'} </capture> 
__________________________________________
]


[__________________________________________
rawText: mtx hervatten lagere dosis : 7.5 mg / wk per os
cleanedText: mtx hervatten lagere dosis : 7.5 mg / wk per os
********************************
TARGET: <id> 273350640961747542236431037844445807625 </id> <phrase> mtx </phrase> <category> ['medication'] </category> <capture> {'target': 'mtx'} </capture> 
----MODIFIED BY: <id> 273350640724063054690428928854335767561 </id> <phrase> 7.5 mg / wk </phrase> <category> ['frequency', 'concentration'] </category> <capture> {'strength_nr': '7.5', 'strength_unit': 'mg', 'freq_unit': 'wk'} </capture> 
__________________________________________
]


[__________________________________________
rawText: van 20 naar 3.5 mg mtx
cleanedText: van 20 naar 3.5 mg mtx
********************************
TARGET: <id> 273354602845242230538845512157464970249 </id> <phrase> mtx </phrase> <category> ['medication'] </category> <capture> {'target': 'mtx'} </capture> 
----MODIFIED BY: <id> 273354602766014068023294833920353321993 </id> <phrase> van 20 naar 3.5 mg </phrase> <category> ['concentration', 'change'] </category> <capture> {'strength_nr': '3.5', 'strength_unit': 'mg'} </capture> 
__________________________________________
]


## Build the Pipeline