# Introduction
Reimplementation of the bag-of-words (BoW) baseline methods with n-grams, as presented in, and evaluated against the annotations from, the following publication:
> Gehrmann, Sebastian, et al. "Comparing deep learning and concept extraction based methods for patient phenotyping from clinical narratives." PloS one 13.2 (2018): e0192360.

## Import Packages

In [None]:
# imported packages
import multiprocessing
import collections
import itertools
import re

# arrays and dataframes
import pandas
import numpy

# classifier
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC

# plotting
import matplotlib 
# Default to the non-interactive 'Agg' backend (server use).
matplotlib.use('Agg')  # server
try:
    # `get_ipython` only exists as a name when running under IPython/Jupyter;
    # in a plain Python interpreter this lookup raises NameError.
    get_ipython
except NameError:
    # not in a notebook: keep the 'Agg' backend selected above
    pass
else:
    # jupyter notebook: show figures inline.
    # This is the plain-Python spelling of the `%matplotlib inline` magic.
    # The magic syntax itself is a SyntaxError outside IPython, which would
    # defeat the purpose of the guard above.
    get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

In [None]:
# import custom modules
import context # set search path to one level up
from src import evaluation  # method for evaluation of classifiers

## Define variables and parameters

In [None]:
# ---- variables and parameters ----

# input file with the annotated notes, and output files for results / plots
input_filename = '../data/raw/annotations.csv'
results_filename = '../reports/ngrams_bow_results.csv'
plot_filename_1, plot_filename_2 = (
    '../reports/ngrams_bow_boxplot_1.png',
    '../reports/ngrams_bow_boxplot_2.png',
)

# cross validation: number of splits and number of repeats
# (for testing, set n_repeats = 1)
n_splits, n_repeats = 5, 10

# number of workers: one per CPU in the system
# (for testing, set n_workers = 1)
n_workers = multiprocessing.cpu_count()

# Keep only the conditions for which results are reported in the publication.
# Entries deliberately left out: 'cohort', 'Non.Adherence',
# 'Developmental.Delay.Retardation', 'Dementia', 'Unsure'.
conditions = [
    'Obesity',
    'Advanced.Heart.Disease',
    'Advanced.Lung.Disease',
    'Schizophrenia.and.other.Psychiatric.Disorders',
    'Alcohol.Abuse',
    'Other.Substance.Abuse',
    'Chronic.Pain.Fibromyalgia',
    'Chronic.Neurological.Dystrophies',
    'Advanced.Cancer',
    'Depression',
]

## Load and prepare data

In [None]:
# read and parse the annotations csv file into a dataframe
data = pandas.read_csv(input_filename)
# data = data[0:100]  # for testing: keep only the first 100 rows
data.head()  # preview the first rows

In [None]:
# assign and clean the X and y variables
# X: the clinical notes (the 'text' column)
X_df = data.filter(items=['text'])
X = X_df['text']
# groups: the subject ids
# used in order to ensure that
# "patients’ notes stay within the set, so that all discharge notes in the
# test set are from patients not previously seen by the model."
# (Gehrmann et al., 2018)
groups_df = data.filter(items=['subject.id'])
# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and was
# removed in pandas 1.0; it yields the same 2-D numpy array.
groups = groups_df.values
# y: the annotated classes, one column per entry of `conditions`
y_df = data.filter(items=conditions)  # filter the conditions
y = y_df.values

In [None]:
# sanity check: shapes of the dataframes (first line) and of the
# series/arrays derived from them (second line)
print(X_df.shape, groups_df.shape, y_df.shape)
print(X.shape, groups.shape, y.shape)

In [None]:
# preview the first rows of the notes dataframe
X_df.head()

In [None]:
# preview the first rows of the subject-id dataframe
groups_df.head()

In [None]:
# preview the first rows of the annotation (label) dataframe
y_df.head()

## Define classifiers

In [None]:
# dictionary of classifiers (sklearn estimators), keyed by classifier name;
# an OrderedDict is used so that entries keep their insertion order
classifiers = collections.OrderedDict()

In [None]:
def tokenizer(text):
    r"""Split `text` into lower-cased word tokens.

    Every run of non-word characters (anything not matched by ``\w``) acts
    as a single separator between tokens.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        The lower-cased tokens in order of appearance. The list never
        contains empty strings; it is empty when `text` is empty or consists
        of separators only.
    """
    pattern = r'[^\w]+'  # match any run of non-alphanumerical characters
    repl = r' '  # replace with a single space
    temp_text = re.sub(pattern, repl, text)
    # split() without an argument (instead of split(' ')) drops the empty
    # strings that leading/trailing separators would otherwise produce and
    # that would end up as a bogus '' feature in the bag of words
    return temp_text.lower().split()

In [None]:
# (name, estimator) pairs for the prediction step;
# random_state is fixed wherever the estimator accepts one
prediction_models = [
    ('logistic_regression', LogisticRegression(random_state=0)),
    ("random_forest", RandomForestClassifier(random_state=0)),
    ("naive_bayes", MultinomialNB()),
    # probability=True turns on probability estimates for the SVM
    ("svm_linear", SVC(kernel="linear", random_state=0, probability=True)),
    ("gradient_boosting", GradientBoostingClassifier(random_state=0)),
]

# 1-gram, 1-gram + 2-gram ..., 1-gram to 5-gram (word grams) -> BoW
representation_models = [
    ('{n}gram_bow'.format(n=i), CountVectorizer(ngram_range=(1, i), tokenizer=tokenizer))
    for i in range(1, 5 + 1)
]

# cross product of representation models and prediction models
# save to classifiers as pipelines of rep. model into pred. model
for rep_model, pred_model in itertools.product(representation_models, prediction_models):
    rep_name, rep_estimator = rep_model
    pred_name, pred_estimator = pred_model
    classifier_name = '{rep_model}_{pred_model}'.format(rep_model=rep_name, pred_model=pred_name)
    # Every estimator above appears in several (rep., pred.) combinations and
    # Pipeline does not copy its steps. Clone each step so that every pipeline
    # owns its own unfitted instances and fitting one pipeline cannot
    # overwrite the fitted state of another.
    classifiers[classifier_name] = Pipeline([
        (rep_name, clone(rep_estimator)),
        (pred_name, clone(pred_estimator)),
    ])

## Run and evaluate

In [None]:
# Run the evaluation of all classifiers; `run_evaluation` comes from the
# project's `src.evaluation` module and receives the data, the pipelines and
# the cross-validation / parallelism settings defined in this notebook.
results = evaluation.run_evaluation(
    X=X,
    y=y,
    groups=groups,
    conditions=conditions,
    classifiers=classifiers,
    n_splits=n_splits,
    n_repeats=n_repeats,
    n_workers=n_workers,
)

## Save and plot results

In [None]:
# save results: wrap them in a dataframe and write it to `results_filename`
results_df = pandas.DataFrame(results)
results_df.to_csv(results_filename)

In [None]:
# preview the first rows of the results dataframe
results_df.head()

In [None]:
## load previously saved results for plotting (instead of re-running the evaluation)
# import pandas
# results_df = pandas.read_csv(results_filename, index_col=0)

In [None]:
# plot and save
# group the results by 'name' (presumably the classifier name -- confirm
# against evaluation.run_evaluation) and draw boxplots of the AUROC values
# per condition
axs = results_df.groupby('name').boxplot(column='AUROC', by='condition', rot=90, figsize=(10,10))
# use the same fixed y-range (0 to 1) on all returned axes
for ax in axs:
    ax.set_ylim(0,1)

# write the current matplotlib figure to plot_filename_1
plt.savefig(plot_filename_1)

In [None]:
# plot and save
# group the results by condition and draw boxplots of the AUROC values
# per 'name' (presumably the classifier name -- confirm against
# evaluation.run_evaluation)
axs = results_df.groupby('condition').boxplot(column='AUROC', by='name', rot=90, figsize=(10,10))
# use the same fixed y-range (0 to 1) on all returned axes
for ax in axs:
    ax.set_ylim(0,1)

# write the current matplotlib figure to plot_filename_2
plt.savefig(plot_filename_2)