### Inputs:

* Training Set: 'labeled-data-2019-07-18_14-22.csv': sentence level training dataset.
* Test Set: 'gold_standard_HF_150.csv': sentence level test set.
* Label: dyspnea.

Reasons for working at the sentence level:
* ELMo requires a lot of computational resources and a long training time. Training time increases exponentially with the length of the notes.
* This serves as a starting point; I will look for more computational resources to train at the whole-note level.

### Outputs:

* model_elmo_weights_dyspnea_sentences_unbalanced.h5
* model_elmo_weights_dyspnea_sentences_balanced.h5
* model_elmo_weights_dyspnea_sentences_unbalanced.png
* model_elmo_weights_dyspnea_sentences_balanced.png
* predicts_unbalanced.csv
* predicts_balanced.csv
* confusion_matrix_unbalanced.csv
* confusion_matrix_balanced.csv

## 1. Setting Up

### 1.1. Import Packages

In [28]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
# Loading packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow.keras as keras
from keras.layers import Input, Lambda, Dense
from keras.models import Model
import keras.backend as K
from numpy import asarray
from numpy import savetxt
from sklearn.metrics import confusion_matrix, f1_score

Using TensorFlow backend.


### 1.2. Setting Directories

In [59]:
# Switch into the project folder first: every path below is built from the
# current working directory, so this call has to run before them.
os.chdir("/Users/huynguyen/Desktop/cumc_research/Task1_BERT_or_Elmo")

# Input data sets (sentence-level training set and gold-standard test set).
df_train_path = "".join((os.getcwd(), "/labeled-data-2019-07-18_14-22.csv"))
df_test_path = "".join((os.getcwd(), "/gold_standard_HF_150.csv"))

# Location of the ELMo module handed to tensorflow_hub.
elmo_module_path = "".join((os.getcwd(), "/module/module_elmo3"))

### Balanced

In [60]:
# Output files of the balanced run, all written directly under the current working directory.
elmo_model_path_balanced = "".join((os.getcwd(), "/model_elmo_weights_dyspnea_sentences_balanced.h5"))
model_plot_path_balanced = "".join((os.getcwd(), "/model_elmo_weights_dyspnea_sentences_balanced.png"))
elmo_predict_path_balanced = "".join((os.getcwd(), "/predicts_balanced.csv"))

### Unbalanced

In [61]:
# Output files of the unbalanced run, all written directly under the current working directory.
elmo_model_path_unbalanced = "".join((os.getcwd(), "/model_elmo_weights_dyspnea_sentences_unbalanced.h5"))
model_plot_path_unbalanced = "".join((os.getcwd(), "/model_elmo_weights_dyspnea_sentences_unbalanced.png"))
elmo_predict_path_unbalanced = "".join((os.getcwd(), "/predicts_unbalanced.csv"))

### 1.3. Helper Functions

In [32]:
# Helper functions
def replace_contraction(text):
    """Expand common English contractions and a few informal spellings.

    The substitutions are applied one after another, in the order listed, so
    the specific forms ("won't", "can't") are handled before the generic
    "<word>n't" rule. Matching is case-insensitive: callers such as
    `cleanText` lowercase the text only afterwards, and a case-sensitive match
    would turn "Won't" into "Wo not" and leave "I'm" unexpanded.

    Args:
        text: Input string.

    Returns:
        The string with every listed pattern replaced. Fixed replacements
        ("will not", "i am", ...) are emitted in lowercase; the captured word
        of the generic rules keeps its original casing.
    """
    # Raw strings throughout: '\g<1>' in a normal string literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    contraction_patterns = [(r"won't", 'will not'),
                            (r"can't", 'can not'),
                            (r"i'm", 'i am'),
                            (r"ain't", 'is not'),
                            (r"(\w+)'ll", r'\g<1> will'),
                            (r"(\w+)n't", r'\g<1> not'),
                            (r"(\w+)'ve", r'\g<1> have'),
                            (r"(\w+)'s", r'\g<1> is'),
                            (r"(\w+)'re", r'\g<1> are'),
                            (r"(\w+)'d", r'\g<1> would'),
                            (r'&', 'and'),
                            (r'dammit', 'damn it'),
                            (r'dont', 'do not'),
                            (r'wont', 'will not')]
    for regex, repl in contraction_patterns:
        # re caches compiled patterns internally, so no explicit compile step is needed.
        text = re.sub(regex, repl, text, flags=re.IGNORECASE)
    return text


def replace_links(text, filler=' '):
    """Substitute every URL-like token in `text` with `filler`.

    Args:
        text: Input string.
        filler: Replacement inserted for each match (a single space by default).

    Returns:
        The substituted string with leading and trailing whitespace stripped.
    """
    link_regex = r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
    substituted = re.sub(link_regex, filler, text)
    return substituted.strip()

def remove_numbers(text):
    """Return `text` with every character for which `str.isdigit()` is true removed."""
    kept_chars = []
    for ch in text:
        if ch.isdigit():
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

def str_len(text):
    """Count the whitespace-separated tokens in `text`."""
    tokens = text.split()
    return len(tokens)

def cleanText(text):
    """Normalise one note for the model.

    In order: trim the ends and turn line breaks into spaces, expand
    contractions, replace URL-like tokens with the word "link", drop digits,
    delete the listed punctuation characters, and lowercase the result.

    Args:
        text: Raw note text.

    Returns:
        The cleaned, lowercased string.
    """
    single_line = text.strip().replace("\n", " ").replace("\r", " ")
    expanded = replace_contraction(single_line)
    without_links = replace_links(expanded, "link")
    without_digits = remove_numbers(without_links)
    without_punct = re.sub(r'[,!@#$%^&*)(|/><";:.?\'\\}{]', "", without_digits)
    return without_punct.lower()

## 2. Loading Data

### 2.1. Loading Training Data

In [35]:
def loading_training_set(df_train_path, subset = 0.1, balanced = False):
    """Load the sentence-level training CSV and return cleaned notes with a binary label.

    Steps:
      1. Read the CSV and derive `dyspnea` (1 when 'Dyspnea (# of simclins)' > 0,
         else 0; missing counts are treated as 0).
      2. Drop rows with a missing 'Note' and rows whose note has 35 or more words.
      3. Draw a random `subset` fraction of the remaining rows (seed 2019).
      4. If `balanced`, downsample the larger class to the size of the smaller
         one and shuffle the result.
      5. Clean every note with `cleanText` and collapse whitespace runs.

    Args:
        df_train_path: Path of the training CSV; it must contain the columns
            'Note' and 'Dyspnea (# of simclins)'.
        subset: Fraction of the filtered rows to keep.
        balanced: When True, return equally many positive and negative rows.

    Returns:
        DataFrame with the columns 'Note', 'dyspnea' and 'sent_len'.
    """
    count_col = 'Dyspnea (# of simclins)'

    # Loading and processing the sentence-level dataset
    df_train = pd.read_csv(df_train_path)
    # .copy() yields an independent frame, so the column assignment below does
    # not write into a slice of df_train (which raises SettingWithCopyWarning).
    df_train_dyspnea = df_train[['Note', count_col]].copy()
    df_train_dyspnea['dyspnea'] = np.where(df_train_dyspnea[count_col].fillna(0.0) > 0.0, 1, 0)
    df_train_dyspnea = df_train_dyspnea[['Note', 'dyspnea']].reset_index(drop = True)

    # Remove rows where 'Note' is empty
    df_train_dyspnea = df_train_dyspnea[pd.notnull(df_train_dyspnea['Note'])].copy()
    df_train_dyspnea['sent_len'] = df_train_dyspnea['Note'].apply(str_len)

    # Keep only notes shorter than 35 words (longer notes are dropped, not truncated).
    df_train_dyspnea = df_train_dyspnea[df_train_dyspnea['sent_len'] < 35]

    # Subset
    df_train_dyspnea = df_train_dyspnea.sample(frac = subset, random_state = 2019)

    if balanced:
        # Balance the training set by downsampling the larger class.
        df_pos = df_train_dyspnea[df_train_dyspnea['dyspnea'] == 1]
        df_neg = df_train_dyspnea[df_train_dyspnea['dyspnea'] == 0]
        if len(df_neg) >= len(df_pos):
            df_neg = df_neg.sample(n = len(df_pos), random_state = 2019)
        else:
            # Asking sample() for more rows than exist raises ValueError, so
            # shrink the positives instead when negatives are the minority.
            df_pos = df_pos.sample(n = len(df_neg), random_state = 2019)
        df_train_dyspnea = pd.concat([df_pos, df_neg])
        # Shuffle: Keras' validation_split takes the LAST rows of the data, so
        # leaving all negatives at the end would give an all-negative validation set.
        df_train_dyspnea = df_train_dyspnea.sample(frac = 1, random_state = 2019).reset_index(drop = True)

    # Final processing
    df_train_dyspnea['Note'] = df_train_dyspnea['Note'].apply(cleanText)
    df_train_dyspnea['Note'] = df_train_dyspnea['Note'].str.replace(r'\s+', ' ', regex = True)

    return df_train_dyspnea

### Check Balanced Case

In [37]:
# Sanity check: build the balanced training subset and look at the class ratio.
df_train = loading_training_set(df_train_path, subset = 0.1, balanced = True)
X_train = np.array(df_train["Note"])
y_train = np.array(df_train["dyspnea"])
# Fraction of rows labelled 1; shown as the cell output.
sum(y_train)/len(y_train)

0.5

### Check Unbalanced Case

In [38]:
# Sanity check: build the unbalanced training subset and look at the class ratio.
df_train = loading_training_set(df_train_path, subset = 0.1)
X_train = np.array(df_train["Note"])
y_train = np.array(df_train["dyspnea"])
# Fraction of rows labelled 1; shown as the cell output.
sum(y_train)/len(y_train)

0.006426119064585087

### 2.2. Loading Test Data

In [39]:
def loading_test_set(df_test_path, balanced = False):
    """Load the sentence-level test CSV and return cleaned notes with a binary label.

    A row is labelled 1 when any of the columns 'Category 1' .. 'Category 4'
    equals 'Dyspnea', otherwise 0. Rows with a missing 'Note' and rows whose
    note has 35 or more words are dropped. Every remaining note is cleaned
    with `cleanText` and has its whitespace runs collapsed.

    Args:
        df_test_path: Path of the test CSV; it must contain 'Note' and
            'Category 1' .. 'Category 4'.
        balanced: When True, downsample the larger class (seed 2019) so both
            classes have the same number of rows; positives come first.

    Returns:
        DataFrame with the columns 'Note', 'dyspnea' and 'sent_len'.
    """
    category_cols = ['Category ' + str(i) for i in range(1, 5)]

    # Loading and processing the sentence-level dataset
    raw = pd.read_csv(df_test_path)
    has_dyspnea = (raw[category_cols] == 'Dyspnea').any(axis = 1)
    # Build the working frame as an explicit copy so the assignments below do
    # not write into a slice of the raw frame (SettingWithCopyWarning).
    df = raw[['Note']].copy()
    df['dyspnea'] = np.where(has_dyspnea, 1, 0)

    # Remove rows where 'Note' is empty
    df = df[pd.notnull(df['Note'])].copy()
    df['sent_len'] = df['Note'].apply(str_len)

    # Keep only notes shorter than 35 words (longer notes are dropped, not truncated).
    df = df[df['sent_len'] < 35].copy()
    if balanced:
        df_pos = df[df['dyspnea'] == 1]
        df_neg = df[df['dyspnea'] == 0]
        if len(df_neg) >= len(df_pos):
            df_neg = df_neg.sample(n = len(df_pos), random_state = 2019)
        else:
            # Asking sample() for more rows than exist raises ValueError, so
            # shrink the positives instead when negatives are the minority.
            df_pos = df_pos.sample(n = len(df_neg), random_state = 2019)
        df = pd.concat([df_pos, df_neg]).reset_index(drop = True)

    # Final processing
    df['Note'] = df['Note'].apply(cleanText)
    df['Note'] = df['Note'].str.replace(r'\s+', ' ', regex = True)
    return df

### Check Balanced Case

In [40]:
# Sanity check: build the balanced test set and look at the class ratio.
df_test = loading_test_set(df_test_path, balanced = True)
X_test = np.array(df_test["Note"])
y_test = np.array(df_test["dyspnea"])
# Fraction of rows labelled 1; shown as the cell output.
sum(y_test)/len(y_test)

0.5

### Check Unbalanced Case

In [41]:
# Sanity check: build the unbalanced test set and look at the class ratio.
df_test = loading_test_set(df_test_path)
X_test = np.array(df_test["Note"])
y_test = np.array(df_test["dyspnea"])
# Fraction of rows labelled 1; shown as the cell output.
sum(y_test)/len(y_test)

0.006663753550360498

## 3. Modelling Functions

In [42]:
def train_elmo(elmo_module_path, elmo_model_path, epochs=20, batch_size=256):
    """Train the ELMo-based binary classifier and save its weights.

    NOTE(review): the training data is not a parameter. `X_train` and
    `y_train` are read from the enclosing (notebook-global) scope, so they
    must be defined before this function is called.

    Args:
        elmo_module_path: Location passed to `hub.Module` to load the ELMo module.
        elmo_model_path: File the trained weights are written to with `save_weights`.
        epochs: Number of epochs passed to `fit`.
        batch_size: Batch size passed to `fit`.

    Returns:
        The object returned by `fit`; `plot_training` reads its `.history`.
    """
    embed = hub.Module(elmo_module_path)
    def ELMoEmbedding(x):
        # Cast the input to tf.string, squeeze it, and return the module's "default" output.
        return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]
    def build_model():
        # String input -> ELMo embedding (declared output shape (1024,)) ->
        # 256-unit ReLU layer with L2 regularisation -> single sigmoid unit.
        input_text = Input(shape=(1,), dtype="string")
        embedding = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)
        dense = Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001))(embedding)
        pred = Dense(1, activation='sigmoid')(dense)
        model = Model(inputs=[input_text], outputs=pred)
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        return model
    model_elmo_dyspnea = build_model()
    with tf.Session() as session:
        # Register this session with Keras, then run the variable and table
        # initializers before fitting.
        K.set_session(session)
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        # X_train / y_train come from the notebook's global namespace (see NOTE above).
        history = model_elmo_dyspnea.fit(X_train, y_train,
                                         epochs=epochs,
                                         batch_size=batch_size,
                                         validation_split = 0.2)
        # Saved inside the `with` block, i.e. while the session is still open.
        model_elmo_dyspnea.save_weights(elmo_model_path)
    return history

In [56]:
def plot_training(history, model_plot_path):
    """Plot training and validation accuracy per epoch and save the figure.

    Args:
        history: Object whose `.history` dict holds the per-epoch accuracy
            lists, e.g. the return value of `train_elmo`.
        model_plot_path: File path the figure is written to.

    Raises:
        KeyError: If the history holds neither 'acc'/'val_acc' nor
            'accuracy'/'val_accuracy'.
    """
    metrics = history.history
    # Older Keras releases log accuracy as 'acc' / 'val_acc'; newer ones use
    # 'accuracy' / 'val_accuracy'. Accept either.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'
    acc = metrics[acc_key]
    val_acc = metrics['val_' + acc_key]
    epochs = range(1, len(acc) + 1)
    plt.figure(figsize = (6, 6))
    plt.plot(epochs, acc, 'g', label='Training Acc')
    plt.plot(epochs, val_acc, 'b', label='Validation Acc')
    plt.title('Training and validation Acc')
    plt.xlabel('Epochs')
    plt.ylabel('Acc')
    plt.legend()
    plt.savefig(model_plot_path, bbox_inches = 'tight', dpi = 500)

## 4. Prediction Functions

In [55]:
def run_prediction(df_test, elmo_model_path, elmo_predict_path, elmo_module_path):
    """Rebuild the classifier, load saved weights, predict on `df_test` and save the scores.

    Args:
        df_test: DataFrame with a 'Note' column holding the texts to score.
        elmo_model_path: Weights file read with `load_weights`.
        elmo_predict_path: File the predictions are written to with `savetxt`
            (comma-delimited).
        elmo_module_path: Location passed to `hub.Module` to load the ELMo module.

    Returns:
        The array returned by `predict`.
    """
    df_test_text = df_test['Note'].to_list()
    # Object array with an added trailing axis: one text per row.
    test_text_to_pred = np.array(df_test_text, dtype=object)[:, np.newaxis]
    embed = hub.Module(elmo_module_path)
    def ELMoEmbedding(x):
        # Cast the input to tf.string, squeeze it, and return the module's "default" output.
        return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]
    def build_model():
        # Same layer stack as the build_model defined inside train_elmo.
        input_text = Input(shape=(1,), dtype="string")
        embedding = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)
        dense = Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001))(embedding)
        pred = Dense(1, activation='sigmoid')(dense)
        model = Model(inputs=[input_text], outputs=pred)
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        return model
    model_elmo_dyspnea = build_model()
    with tf.Session() as session:
        K.set_session(session)
        # Initializers run first; the saved weights are loaded afterwards.
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        model_elmo_dyspnea.load_weights(elmo_model_path)
        predicts = model_elmo_dyspnea.predict(test_text_to_pred)
        savetxt(elmo_predict_path, predicts, delimiter=',')
    return predicts

In [58]:
def get_confusion_matrix(predicts, df_test, threshold = 0.5):
    """Threshold predicted scores and compare them with the true labels.

    Args:
        predicts: Predicted scores, one per row of `df_test`; a flat sequence
            or a column-shaped array are both accepted.
        df_test: DataFrame whose 'dyspnea' column holds the true 0/1 labels.
        threshold: Scores strictly above this value count as class 1
            (default 0.5).

    Returns:
        Tuple `(confusion_matrix, macro_f1)`. The matrix is always 2x2, with
        rows and columns ordered as labels 0 then 1.
    """
    # Flatten so the predictions are 1-D whatever shape the model returned.
    scores = np.asarray(predicts).ravel()
    predicted_labels = np.where(scores > threshold, 1, 0)
    y_true = np.asarray(df_test['dyspnea'])
    # labels=[0, 1] keeps the matrix 2x2 even when only one class is present.
    matrix = confusion_matrix(y_true, predicted_labels, labels = [0, 1])
    macro_f1 = f1_score(y_true, predicted_labels, average = 'macro')
    return matrix, macro_f1

## 5. Running Pipeline

### Unbalanced

In [None]:
# Prepare training set
df_train = loading_training_set(df_train_path, subset = 0.1)
X_train = np.array(df_train["Note"])
y_train = np.array(df_train["dyspnea"])

# Prepare test set
df_test = loading_test_set(df_test_path)
X_test = np.array(df_test["Note"])
y_test = np.array(df_test["dyspnea"])

# Training Elmo (train_elmo reads X_train / y_train from the notebook namespace)
history = train_elmo(elmo_module_path, elmo_model_path_unbalanced, epochs=5)
plot_training(history, model_plot_path_unbalanced)

# Running predictions
predicts = run_prediction(df_test, elmo_model_path_unbalanced, elmo_predict_path_unbalanced, elmo_module_path)
conf_matrix, macro_f1 = get_confusion_matrix(predicts, df_test)

# Write the confusion matrix listed under "Outputs"; it was previously only displayed.
savetxt(os.getcwd() + "/confusion_matrix_unbalanced.csv", conf_matrix, fmt='%d', delimiter=',')

# Cell output: confusion matrix and macro-averaged F1
conf_matrix, macro_f1

### Balanced

In [None]:
# Prepare training set
df_train = loading_training_set(df_train_path, subset = 0.1, balanced = True)
X_train = np.array(df_train["Note"])
y_train = np.array(df_train["dyspnea"])

# Prepare test set
df_test = loading_test_set(df_test_path, balanced = True)
X_test = np.array(df_test["Note"])
y_test = np.array(df_test["dyspnea"])

# Training Elmo (train_elmo reads X_train / y_train from the notebook namespace)
history = train_elmo(elmo_module_path, elmo_model_path_balanced, epochs=5)
plot_training(history, model_plot_path_balanced)

# Running predictions
predicts = run_prediction(df_test, elmo_model_path_balanced, elmo_predict_path_balanced, elmo_module_path)
conf_matrix, macro_f1 = get_confusion_matrix(predicts, df_test)

# Write the confusion matrix listed under "Outputs"; it was previously only displayed.
savetxt(os.getcwd() + "/confusion_matrix_balanced.csv", conf_matrix, fmt='%d', delimiter=',')

# Cell output: confusion matrix and macro-averaged F1
conf_matrix, macro_f1