# Introduction
Implementation of the cTAKES bag-of-words (BoW) method extended with relation tokens (e.g. CUI-Relationship-CUI), which are added to the original cTAKES BoW tokens (Polarity-CUI). The classifiers are evaluated against the annotations from:
> Gehrmann, Sebastian, et al. "Comparing deep learning and concept extraction based methods for patient phenotyping from clinical narratives." PloS one 13.2 (2018): e0192360.

## Import Packages

In [None]:
# imported packages
import multiprocessing
import collections
import itertools
import re
import os

# xml and xmi
from lxml import etree

# arrays and dataframes
import pandas
import numpy
from pandasql import sqldf

# classifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# plotting
import matplotlib 
# Select the non-interactive Agg backend so that figures can be rendered on a
# server without a display. This has to happen before pyplot is imported.
matplotlib.use('Agg') # server
try:
    # `get_ipython` only exists when running under IPython/Jupyter; anywhere
    # else this bare name lookup raises NameError and we fall through.
    get_ipython
    # jupyter notebook: show the figures inline
    %matplotlib inline 
except:
    # any failure (e.g. NameError outside IPython) is ignored on purpose;
    # the Agg backend selected above stays active
    pass
import matplotlib.pyplot as plt

In [None]:
# import custom modules
import context # set search path to one level up
from src import evaluation  # method for evaluation of classifiers

## Define variables and parameters

In [None]:
# variables and parameters
# filenames
# directory that is scanned for the cTAKES output files (*.xmi)
input_directory = '../data/interim/cTAKES_output'
# csv file with the annotations (read with pandas; the 'subject.id' column
# and the condition columns listed below are used)
input_filename = '../data/raw/annotations.csv'
# output files: evaluation results (csv) and the two box plots (png)
results_filename = '../reports/ctakes_relationgram_bow_tfidf_results.csv'
plot_filename_1 = '../reports/ctakes_relationgram_bow_tfidf_boxplot_1.png'
plot_filename_2 = '../reports/ctakes_relationgram_bow_tfidf_boxplot_2.png'

# number of splits and repeats for cross validation
# (both are passed on to evaluation.run_evaluation)
n_splits = 5
n_repeats = 10
# n_repeats = 1  # for testing

# number of workers (defaults to one per available CPU core)
n_workers=multiprocessing.cpu_count()
# n_workers = 1  # for testing

# keep the conditions for which results are reported in the publication
# (the commented-out entries are deliberately excluded; each remaining name
# is used as a column name of the annotations file)
conditions = [  
#     'cohort',
    'Obesity',
#     'Non.Adherence',
#     'Developmental.Delay.Retardation',
    'Advanced.Heart.Disease', 
    'Advanced.Lung.Disease', 
    'Schizophrenia.and.other.Psychiatric.Disorders',
    'Alcohol.Abuse', 
    'Other.Substance.Abuse',
    'Chronic.Pain.Fibromyalgia', 
    'Chronic.Neurological.Dystrophies', 
    'Advanced.Cancer',
    'Depression',
#     'Dementia',
#     'Unsure',
]

## Load and prepare data

### Load and parse xmi data

In [None]:
%load_ext ipycache

In [None]:
%%cache --read 2.6-JS-ctakes-relationgram-bow-tfidf_cache.pkl X  

# fully qualified name of the ``xmi:id`` attribute as reported by lxml
_XMI_ID = '{http://www.omg.org/XMI}id'


def _select_attribs(root, attributes):
    """Return the attribute dicts of all direct children of *root* that carry
    an ``xmi:id`` and every attribute named in *attributes*."""
    path = '*[@' + _XMI_ID + ']' + ''.join('[@' + name + ']' for name in attributes)
    return [dict(element.attrib) for element in root.iterfind(path)]


def _to_frame(records, required_columns):
    """Build a DataFrame from *records* and add every missing column of
    *required_columns* (filled with None), so that the joins below also work
    for documents that lack a whole kind of annotation."""
    frame = pandas.DataFrame(records)
    for column in required_columns:
        if column not in frame.columns:
            frame[column] = None
    return frame


def ctakes_xmi_to_df(xmi_path):
    """Turn one cTAKES XMI file into a whitespace separated token string.

    Despite its name the function returns a string, not a DataFrame. The
    string consists of

    * one token ``polarity1 + cui1 + category + polarity2 + cui2`` for every
      distinct relation between two concepts, followed by
    * one token ``polarity + cui`` for every mention/concept pair that is
      linked to an event and its event properties.

    Parameters
    ----------
    xmi_path : str
        Path of the XMI file to parse.

    Returns
    -------
    str
        The tokens joined by single spaces; the empty string if the document
        yields no token at all.

    Errors raised while reading or parsing the file are not caught here.
    """
    root = etree.parse(xmi_path).getroot()

    # mentions: elements with a typeID and a polarity. A mention that refers
    # to several ontology concepts is expanded to one row per concept id.
    mentions = []
    for attribs in _select_attribs(root, ['typeID', 'polarity']):
        if 'ontologyConceptArr' in attribs:
            for concept_id in attribs['ontologyConceptArr'].split(" "):
                row = dict(attribs)
                row['ontologyConceptArr'] = concept_id
                mentions.append(row)
        else:
            mentions.append(attribs)
    mentions_df = _to_frame(
        mentions,
        [_XMI_ID, 'typeID', 'polarity', 'ontologyConceptArr', 'event', 'begin', 'end'])

    concepts_df = _to_frame(
        _select_attribs(root, ['cui', 'tui']), [_XMI_ID, 'cui', 'tui'])
    events_df = _to_frame(
        _select_attribs(root, ['properties']), [_XMI_ID, 'properties'])
    eventproperties_df = _to_frame(
        _select_attribs(root, ['docTimeRel']), [_XMI_ID, 'docTimeRel'])

    # mention -> concept -> event -> event properties (inner joins)
    merged_df = mentions_df.add_suffix('_1')\
        .merge(right=concepts_df, left_on='ontologyConceptArr_1', right_on=_XMI_ID)\
        .merge(right=events_df, left_on='event_1', right_on=_XMI_ID)\
        .merge(right=eventproperties_df, left_on='properties', right_on=_XMI_ID)

    # merge polarity of the mention and the cui
    merged_df = merged_df.dropna(subset=['cui'])  # remove any NaN
    merged_df['polaritycui'] = merged_df['polarity_1'] + merged_df['cui']

    # relations and their arguments
    textrelations_df = _to_frame(
        _select_attribs(root, ['category', 'arg1', 'arg2']),
        [_XMI_ID, 'category', 'arg1', 'arg2'])
    relationarguments_df = _to_frame(
        _select_attribs(root, ['argument', 'role']),
        [_XMI_ID, 'argument', 'role'])

    # tables visible to the SQL queries; the xmi:id column gets a name that
    # can be written in SQL without quoting
    id_alias = {_XMI_ID: 'xmiid'}
    tables = {
        'tdf': textrelations_df.rename(columns=id_alias),
        'rdf': relationarguments_df.rename(columns=id_alias),
        'mdf': mentions_df.rename(columns=id_alias),
        'cdf': concepts_df.rename(columns=id_alias),
    }

    # relation category together with the text span (begin/end) of the
    # mention behind each of its two arguments
    relation_spans_query = """
        SELECT
            r.category,
            m1.begin AS begin1,
            m1.end AS end1,
            m2.begin AS begin2,
            m2.end AS end2
        FROM
            tdf r
        INNER JOIN
            rdf a1
            ON r.arg1 = a1.xmiid
        INNER JOIN
            rdf a2
            ON r.arg2 = a2.xmiid
        INNER JOIN
            mdf m1
            ON a1.argument = m1.xmiid
        INNER JOIN
            mdf m2
            ON a2.argument = m2.xmiid
    """

    # text span, polarity and cui of every mention/concept pair
    concept_spans_query = """
        SELECT
            m.begin,
            m.end,
            m.polarity,
            c.cui
        FROM
            mdf m
        INNER JOIN
            cdf c
            ON m.ontologyConceptArr = c.xmiid
    """

    sq1 = sqldf(relation_spans_query, tables)
    sq2 = sqldf(concept_spans_query, tables)

    # Pair every relation with the concepts whose span lies inside the span
    # of its first / second argument. The offsets come from XML attributes
    # and therefore are text; they have to be cast to integers, otherwise
    # they are compared lexicographically (e.g. '20' >= '1990').
    relation_concepts_query = """
        SELECT
            sq1.category,
            sq21.cui AS cui1,
            sq22.cui AS cui2,
            sq21.polarity AS polarity1,
            sq22.polarity AS polarity2
        FROM
            sq1 sq1
        INNER JOIN
            sq2 sq21
            ON CAST(sq21.begin AS INTEGER) >= CAST(sq1.begin1 AS INTEGER)
            AND CAST(sq21.end AS INTEGER) <= CAST(sq1.end1 AS INTEGER)
        INNER JOIN
            sq2 sq22
            ON CAST(sq22.begin AS INTEGER) >= CAST(sq1.begin2 AS INTEGER)
            AND CAST(sq22.end AS INTEGER) <= CAST(sq1.end2 AS INTEGER)
    """

    res = sqldf(relation_concepts_query, {'sq1': sq1, 'sq2': sq2})

    # remove duplicates
    res = res.drop_duplicates(subset=['cui1', 'cui2', 'category', 'polarity1', 'polarity2'])

    res['string'] = res['polarity1'] + res['cui1'] + res['category'] + res['polarity2'] + res['cui2']

    # relation tokens first, then the polarity-cui tokens
    return ' '.join(list(res['string']) + list(merged_df['polaritycui']))

X = []


def key_fn(x):
    """Sort key for file names: the integer in front of the first ".".

    Returns None when nothing precedes the first "." (or the name is empty).
    """
    prefix, _, _ = x.partition(".")
    return int(prefix) if prefix else None

# Only the *.xmi files are parsed. They are selected before sorting so that
# other files in the directory never reach the integer sort key.
xmi_files = [f for f in os.listdir(input_directory) if f.endswith(".xmi")]

for f in sorted(xmi_files, key=key_fn):  # numeric file name order
    fpath = os.path.join(input_directory, f)
    # parse the file and append its token string to X
    try:
        X.append(ctakes_xmi_to_df(fpath))
    except Exception as e:
        # Best effort: report the failing file and keep a placeholder, so
        # that X still has one entry per file (presumably needed to stay
        # aligned with the rows of the annotations - TODO confirm).
        print('%s: %s' % (fpath, e))
        X.append('NaN')

X = numpy.array(X)

### Load annotations and classification data 

In [None]:
# read and parse csv file (the annotations, see input_filename)
data = pandas.read_csv(input_filename)
# uncomment both lines to work on the first 100 rows only
# data = data[0:100]  # for testing
# X = X[0:100]  # for testing
# show the first rows in the notebook
data.head()

In [None]:
# groups: the subject ids
# used in order to ensure that 
# "patients’ notes stay within the set, so that all discharge notes in the 
# test set are from patients not previously seen by the model." Gehrmann17.
groups_df = data.filter(items=['subject.id'])
# `.values` replaces the deprecated `as_matrix()` (removed in pandas 1.0);
# it returns the same 2-d numpy array
groups = groups_df.values
# y: the annotated classes, one column per entry of `conditions`
y_df = data.filter(items=conditions)  # filter the conditions
y = y_df.values

In [None]:
# sanity check: show the shapes of the features, the groups and the labels
# (the first dimensions are presumably expected to agree - TODO confirm)
print(X.shape, groups.shape, y.shape)

## Define classifiers

In [None]:
# dictionary of classifiers (sklearn estimators), keyed by classifier name;
# it is filled in a later cell and handed to evaluation.run_evaluation.
# An OrderedDict is used so the classifiers keep their insertion order.
classifiers = collections.OrderedDict()

In [None]:
def tokenizer(text):
    """Split *text* into lower-cased tokens at runs of whitespace.

    Leading and trailing whitespace does not produce empty tokens, and an
    empty or whitespace-only text yields an empty list. Previously such input
    produced '' tokens, which ended up as a spurious vocabulary entry.
    """
    return [token for token in re.split(r'[\s]+', text.lower()) if token]

In [None]:
# prediction models as (name, estimator) pairs; the random state is fixed
# wherever the estimator accepts one
prediction_models = [
    ('logistic_regression', LogisticRegression(random_state=0)),
    ("random_forest", RandomForestClassifier(random_state=0)),
    ("naive_bayes", MultinomialNB()),
    ("svm_linear", SVC(kernel="linear", random_state=0, probability=True)),
    ("gradient_boosting", GradientBoostingClassifier(random_state=0)),
]

# representation models as (name, transformer) pairs: tf-idf weighted BoW
# built with the custom whitespace tokenizer
# IDEA: Use Tfidf on the normal BoW model as well?
representation_models = [
    ('ctakes_relationgram_bow_tfidf', TfidfVectorizer(tokenizer=tokenizer)),
]

# Register one pipeline (representation model -> prediction model) for every
# combination of the two lists, named '<representation>_<prediction>'.
# NOTE(review): the pipelines of one representation model share the same
# transformer instance - fine if the evaluation clones the estimators; confirm.
for rep_model, pred_model in itertools.product(representation_models, prediction_models):
    classifiers[
        '{rep_model}_{pred_model}'.format(rep_model=rep_model[0], pred_model=pred_model[0])
    ] = Pipeline([rep_model, pred_model])

## Run and evaluate

In [None]:
# Evaluate all classifier pipelines with the project's evaluation helper
# (src/evaluation.py).
# NOTE(review): presumably this runs a repeated, group-wise cross validation
# per condition and returns one record per run with at least the fields
# 'name', 'condition' and 'AUROC' (these are used for plotting below) -
# confirm against src/evaluation.py.
results = evaluation.run_evaluation(X=X, 
                                    y=y, 
                                    groups=groups,
                                    conditions=conditions,
                                    classifiers=classifiers,
                                    n_splits=n_splits, 
                                    n_repeats=n_repeats, 
                                    n_workers=n_workers)

## Save and plot results

In [None]:
# save results: convert them to a DataFrame and write it to results_filename
# (csv; the DataFrame index is written as the first column)
results_df = pandas.DataFrame(results)
results_df.to_csv(results_filename)

In [None]:
# show the first 100 result rows in the notebook
results_df.head(100)

In [None]:
## load results for plotting
# import pandas
# results = pandas.read_csv('output/results.csv')

In [None]:
# plot and save: AUROC box plots grouped by classifier ('name'), with the
# boxes split by condition
axs = results_df.groupby('name').boxplot(column='AUROC', by='condition', rot=90, figsize=(10,10))
# use the full [0, 1] range on every axis so the plots are comparable
for ax in axs:
    ax.set_ylim(0,1)

# NOTE(review): savefig writes the current figure only - confirm that all
# groups are drawn into one figure
plt.savefig(plot_filename_1)

In [None]:
# plot and save: AUROC box plots grouped by condition, with the boxes split
# by classifier ('name')
axs = results_df.groupby('condition').boxplot(column='AUROC', by='name', rot=90, figsize=(10,10))
# use the full [0, 1] range on every axis so the plots are comparable
for ax in axs:
    ax.set_ylim(0,1)

# NOTE(review): savefig writes the current figure only - confirm that all
# groups are drawn into one figure
plt.savefig(plot_filename_2)