### Inputs:

* Training Set: 'labeled-data-2019-07-18_14-22.csv': sentence level training dataset.
* Test Set: 'gold_standard_HF_150.csv': sentence level test set.
* Label: dyspnea.

Reason for sentence level:
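* BERT requires substantial computational resources and a long training time. Training cost grows rapidly with the length of the notes (self-attention cost is quadratic in sequence length), so the model is trained on individual sentences rather than on whole notes.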

### Outputs:

* model_bert_weights_dyspnea_sentences_unbalanced.h5
* model_bert_weights_dyspnea_sentences_balanced.h5

### References:
* https://github.com/strongio/keras-bert/blob/master/keras-bert.ipynb

### 1. Setting Up

### 1.1. Import Packages

In [1]:
import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation and data-conversion warnings raised by the libraries
# used below.
warnings.filterwarnings('ignore')

In [17]:
import pandas as pd
import tensorflow_hub as hub
import os
import re
import tensorflow as tf
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K
from sklearn.metrics import confusion_matrix, f1_score

### 1.2. Setting Directories and Parameters

In [14]:
# Shared TensorFlow session: used to fetch the tokenizer's vocabulary info and
# passed to initialize_vars() before training and prediction.
sess = tf.Session()
# TF-Hub handle of the BERT module read by create_tokenizer_from_hub_module().
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
# Used twice below: as the maximum number of whitespace-separated words kept
# per note, and as the fixed (padded) token length of every model input.
max_seq_length = 50

In [5]:
# Candidate project directories in order of preference. Previously both were
# passed to os.chdir() back to back, which raised FileNotFoundError on any
# machine that lacked the first (laptop) path even though the second one won.
_candidate_dirs = (
    "/work/han2114",
    "/Users/huynguyen/Desktop/cumc_research/Task1_BERT_or_Elmo",
)
for _candidate_dir in _candidate_dirs:
    if os.path.isdir(_candidate_dir):
        os.chdir(_candidate_dir)
        break
else:
    # Same exception type os.chdir() raises for a missing directory.
    raise FileNotFoundError(
        "None of the project directories exist: " + ", ".join(_candidate_dirs)
    )

# Input CSVs are expected directly inside the working directory.
df_train_path = os.getcwd() + "/labeled-data-2019-07-18_14-22.csv"
df_test_path = os.getcwd() + "/gold_standard_HF_150.csv"

### Balanced

In [7]:
# Output locations (inside the current working directory) for the model
# trained on the balanced training set.
bert_model_path_balanced = f"{os.getcwd()}/model_bert_weights_dyspnea_sentences_balanced.h5"
bert_model_plot_path_balanced = f"{os.getcwd()}/model_bert_weights_dyspnea_sentences_balanced.png"
bert_predict_path_balanced = f"{os.getcwd()}/predicts_balanced.csv"

### Unbalanced

In [8]:
# Output locations (inside the current working directory) for the model
# trained on the unbalanced training set.
bert_model_path_unbalanced = f"{os.getcwd()}/model_bert_weights_dyspnea_sentences_unbalanced.h5"
bert_model_plot_path_unbalanced = f"{os.getcwd()}/model_bert_weights_dyspnea_sentences_unbalanced.png"
bert_predict_path_unbalanced = f"{os.getcwd()}/predicts_unbalanced.csv"

### 1.3. Helper Functions

In [4]:
# Helper functions
# (compiled pattern, replacement) pairs, applied in this order. The specific
# contractions come before the generic suffix rules so that, for example,
# "won't" becomes "will not" instead of being split into "wo" + "n't".
# Replacements are raw strings: '\g' is not a valid escape in a normal literal.
_CONTRACTION_PATTERNS = tuple(
    (re.compile(regex), repl)
    for regex, repl in (
        (r'won\'t', r'will not'),
        (r'can\'t', r'can not'),
        (r'i\'m', r'i am'),
        (r'ain\'t', r'is not'),
        (r'(\w+)\'ll', r'\g<1> will'),
        (r'(\w+)n\'t', r'\g<1> not'),
        (r'(\w+)\'ve', r'\g<1> have'),
        (r'(\w+)\'s', r'\g<1> is'),
        (r'(\w+)\'re', r'\g<1> are'),
        (r'(\w+)\'d', r'\g<1> would'),
        (r'&', r'and'),
        (r'dammit', r'damn it'),
        (r'dont', r'do not'),
        (r'wont', r'will not'),
    )
)


def replace_contraction(text):
    """Expand English contractions (and a few informal spellings) in *text*.

    The patterns are case-sensitive and written in lowercase, and they are not
    anchored to word boundaries.

    Args:
        text: Input string.

    Returns:
        The string with every pattern occurrence replaced, applying the
        patterns one after another in their listed order.
    """
    for pattern, repl in _CONTRACTION_PATTERNS:
        text = pattern.sub(repl, text)
    return text


def replace_links(text, filler=' '):
    """Replace every link-like substring of *text* with *filler*.

    The optional scheme (http/https) is part of the match. Leading and trailing
    whitespace is stripped from the result.
    """
    link_pattern = re.compile(
        r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
    )
    substituted = link_pattern.sub(filler, text)
    return substituted.strip()

def remove_numbers(text):
    """Return *text* with every digit character removed."""
    return ''.join(filter(lambda ch: not ch.isdigit(), text))

def str_len(text):
    """Return the number of whitespace-separated words in *text*."""
    words = text.split()
    return len(words)

def strip_extra_ws(text):
    """Collapse every run of whitespace in *text* into a single space.

    Args:
        text: A plain string, or an object whose ``replace`` method accepts a
            ``regex=True`` keyword (the form the original implementation
            required, e.g. a pandas Series).

    Returns:
        The whitespace-normalised value, of the same kind as the input.
    """
    if isinstance(text, str):
        # str.replace() has no ``regex`` keyword, so plain strings need re.sub.
        return re.sub(r'\s+', ' ', text)
    return text.replace(r'\s+', ' ', regex = True)

def cleanText(text):
    """Normalise a note for tokenization.

    Steps, in order: trim and flatten line breaks, lowercase, expand
    contractions, replace links with the word "link", drop digits and strip
    punctuation.

    Args:
        text: Raw note text.

    Returns:
        The cleaned, lowercased string.
    """
    text = text.strip().replace("\n", " ").replace("\r", " ")
    # Lowercase first: the contraction patterns are lowercase and
    # case-sensitive, so "Won't" or "I'm" were previously mis-expanded
    # ("wo not", "im") because lowercasing only happened at the very end.
    text = text.lower()
    text = replace_contraction(text)
    text = replace_links(text, "link")
    text = remove_numbers(text)
    text = re.sub(r'[,!@#$%^&*)(|/><";:.?\'\\}{]',"",text)
    return text

### 2. Loading Data

### 2.1. Loading Training Data

In [9]:
def loading_training_set(df_train_path, subset = 0.1, balanced = False):
    """Load the sentence-level training CSV and derive a binary dyspnea label.

    Args:
        df_train_path: Path to a CSV with the columns 'Note' and
            'Dyspnea (# of simclins)'.
        subset: Fraction of the length-filtered rows to sample (seed 2019).
        balanced: When True, downsample the larger class so that positives
            and negatives are equally represented.

    Returns:
        DataFrame with the columns 'Note' (cleaned text), 'dyspnea' (0/1) and
        'sent_len' (word count of the raw note).
    """
    count_col = 'Dyspnea (# of simclins)'

    # Loading and processing the sentence-level dataset.
    # Explicit copies keep the column assignments below from writing into a
    # slice of another frame.
    df_train = pd.read_csv(df_train_path)
    df_train_dyspnea = df_train[['Note', count_col]].copy()
    df_train_dyspnea[count_col] = df_train_dyspnea[count_col].fillna(0.0)
    df_train_dyspnea['dyspnea'] = np.where(df_train_dyspnea[count_col] > 0.0, 1, 0)
    df_train_dyspnea = df_train_dyspnea[['Note', 'dyspnea']].reset_index(drop = True)

    # Remove rows where 'Note' is empty
    df_train_dyspnea = df_train_dyspnea[pd.notnull(df_train_dyspnea['Note'])].copy()
    df_train_dyspnea['sent_len'] = df_train_dyspnea['Note'].apply(str_len)

    # Keep only notes shorter than 35 words (i.e. at most 34 words).
    df_train_dyspnea = df_train_dyspnea[df_train_dyspnea['sent_len'] < 35]

    # Subset
    df_train_dyspnea = df_train_dyspnea.sample(frac = subset, random_state = 2019)

    if balanced:
        # Balance the training set by downsampling the larger class.
        df_pos = df_train_dyspnea[df_train_dyspnea['dyspnea'] == 1]
        df_neg = df_train_dyspnea[df_train_dyspnea['dyspnea'] == 0]
        if len(df_neg) >= len(df_pos):
            df_neg = df_neg.sample(n = len(df_pos), random_state = 2019)
        else:
            # Fewer negatives than positives: sampling len(df_pos) negatives
            # would raise, so shrink the positives instead.
            df_pos = df_pos.sample(n = len(df_neg), random_state = 2019)
        df_train_dyspnea = pd.concat([df_pos, df_neg]).reset_index(drop = True)

    # Final processing
    df_train_dyspnea['Note'] = df_train_dyspnea['Note'].apply(cleanText)
    # Raw string: '\s' is not a valid escape in a normal string literal.
    df_train_dyspnea['Note'] = df_train_dyspnea['Note'].str.replace(r'\s+', ' ', regex = True)

    return df_train_dyspnea

In [19]:
# Unbalanced training set: a 10% sample of the length-filtered sentences.
train = loading_training_set(df_train_path, subset = 0.1)
# Balanced training set: drawn from all length-filtered sentences, with the
# negatives downsampled to the number of positives.
train_balanced = loading_training_set(df_train_path, subset = 1, balanced = True)

### 2.2. Loading Test Data

In [11]:
def loading_test_set(df_test_path, balanced = False):
    """Load the gold-standard test CSV and derive a binary dyspnea label.

    A row is positive when any of 'Category 1' .. 'Category 4' equals
    'Dyspnea'.

    Args:
        df_test_path: Path to a CSV with the columns 'Note' and
            'Category 1' .. 'Category 4'.
        balanced: When True, downsample the larger class so that positives
            and negatives are equally represented.

    Returns:
        DataFrame with the columns 'Note' (cleaned text), 'dyspnea' (0/1) and
        'sent_len' (word count of the raw note).
    """
    # Loading and processing the sentence-level dataset
    df = pd.read_csv(df_test_path)
    category_cols = ['Category ' + str(i) for i in range(1, 5)]
    has_dyspnea = (df[category_cols] == 'Dyspnea').any(axis = 1)
    # Explicit copy so the assignments below do not write into a slice of the
    # frame returned by read_csv.
    df = df[['Note']].copy()
    df['dyspnea'] = np.where(has_dyspnea, 1, 0)

    # Remove rows where 'Note' is empty
    df = df[pd.notnull(df['Note'])].copy()
    df['sent_len'] = df['Note'].apply(str_len)

    # Keep only notes shorter than 35 words (i.e. at most 34 words).
    df = df[df['sent_len'] < 35].copy()
    if balanced:
        # Balance by downsampling the larger class.
        df_pos = df[df['dyspnea'] == 1]
        df_neg = df[df['dyspnea'] == 0]
        if len(df_neg) >= len(df_pos):
            df_neg = df_neg.sample(n = len(df_pos), random_state = 2019)
        else:
            # Fewer negatives than positives: sampling len(df_pos) negatives
            # would raise, so shrink the positives instead.
            df_pos = df_pos.sample(n = len(df_neg), random_state = 2019)
        df = pd.concat([df_pos, df_neg]).reset_index(drop = True)

    # Final processing
    df['Note'] = df['Note'].apply(cleanText)
    # Raw string: '\s' is not a valid escape in a normal string literal.
    df['Note'] = df['Note'].str.replace(r'\s+', ' ', regex = True)
    return df

In [12]:
# Test set with its natural class distribution (balanced defaults to False).
test = loading_test_set(df_test_path)

### 2.3. Preparing Features and Labels

In [15]:
# Build model inputs from the unbalanced training set and the test set.
# Each note is cut to its first max_seq_length words (to limit memory) and the
# texts are stored as a column vector of Python strings, shape (n, 1).
train_text_bert = np.array(
    [' '.join(note.split()[0:max_seq_length]) for note in train['Note'].tolist()],
    dtype=object,
)[:, np.newaxis]
train_label_bert = train['dyspnea'].tolist()

test_text_bert = np.array(
    [' '.join(note.split()[0:max_seq_length]) for note in test['Note'].tolist()],
    dtype=object,
)[:, np.newaxis]
test_label_bert = test['dyspnea'].tolist()

### Balanced

In [None]:
# Build model inputs from the balanced training set and the test set.
# Each note is cut to its first max_seq_length words (to limit memory) and the
# texts are stored as a column vector of Python strings, shape (n, 1).
train_text_bert_balanced = np.array(
    [' '.join(note.split()[0:max_seq_length]) for note in train_balanced['Note'].tolist()],
    dtype=object,
)[:, np.newaxis]
train_label_bert_balanced = train_balanced['dyspnea'].tolist()
# The test arrays are rebuilt here so this cell can run on its own.
test_text_bert = np.array(
    [' '.join(note.split()[0:max_seq_length]) for note in test['Note'].tolist()],
    dtype=object,
)[:, np.newaxis]
test_label_bert = test['dyspnea'].tolist()

### 2.4. Tokenizing

In [16]:
class PaddingInputExample(object):
    """Fake example so the number of input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it
    means the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """

class InputExample(object):
    """One training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example.

        Args:
          guid: Unique id for the example.
          text_a: string. Untokenized text of the first sequence. For
            single-sequence tasks this is the only text required.
          text_b: (Optional) string. Untokenized text of the second sequence;
            only needed for sequence-pair tasks.
          label: (Optional) Label of the example. Expected for train and dev
            examples, not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

def create_tokenizer_from_hub_module():
    """Build a FullTokenizer from the vocab file and casing flag of the Hub module.

    Reads the module at the global ``bert_path`` and evaluates its
    "tokenization_info" signature in the global session ``sess``.
    """
    module = hub.Module(bert_path)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    vocab_file, do_lower_case = sess.run(fetches)
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

def convert_single_example(tokenizer, example, max_seq_length=50):
    """Turn one `InputExample` into fixed-length BERT input lists.

    Returns:
        (input_ids, input_mask, segment_ids, label). Each of the three lists
        has length ``max_seq_length``. A `PaddingInputExample` yields all-zero
        lists and label 0.
    """
    if isinstance(example, PaddingInputExample):
        input_ids, input_mask, segment_ids = ([0] * max_seq_length for _ in range(3))
        return input_ids, input_mask, segment_ids, 0

    # Reserve two positions for the [CLS] and [SEP] markers.
    budget = max_seq_length - 2
    word_pieces = tokenizer.tokenize(example.text_a)
    if len(word_pieces) > budget:
        word_pieces = word_pieces[0:budget]

    tokens = ["[CLS]", *word_pieces, "[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Number of zero entries needed to reach the fixed length (never negative
    # in effect: multiplying a list by a negative count gives an empty list).
    pad = max_seq_length - len(input_ids)

    # The mask is 1 for real tokens and 0 for padding; only real tokens are
    # attended to. Single-sequence input, so every segment id is 0.
    input_mask = [1] * len(input_ids) + [0] * pad
    segment_ids = [0] * len(tokens) + [0] * pad
    input_ids.extend([0] * pad)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=50):
    """Convert a collection of `InputExample`s into four NumPy arrays.

    Returns:
        (input_ids, input_masks, segment_ids, labels), where labels is
        reshaped into a column vector.
    """
    converted = [
        convert_single_example(tokenizer, example, max_seq_length)
        for example in tqdm_notebook(examples, desc="Converting examples to features")
    ]
    ids_rows = [row[0] for row in converted]
    mask_rows = [row[1] for row in converted]
    segment_rows = [row[2] for row in converted]
    label_rows = [row[3] for row in converted]
    return (
        np.array(ids_rows),
        np.array(mask_rows),
        np.array(segment_rows),
        np.array(label_rows).reshape(-1, 1),
    )

def convert_text_to_examples(texts, labels):
    """Pair each text with its label and wrap the pair in an `InputExample`.

    Each element of *texts* is joined with single spaces, so it is expected to
    be an iterable of strings (e.g. a one-element row), not a bare string.
    """
    return [
        InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        for text, label in zip(texts, labels)
    ]

In [None]:
# Instantiate the tokenizer from the Hub module at bert_path; this evaluates
# the module's tokenization info in the shared session `sess`.
tokenizer = create_tokenizer_from_hub_module()

In [None]:
# Convert data to InputExample format.
# Exactly one training set is used per run. Previously the unbalanced examples
# were built and then immediately overwritten by the balanced ones, so the
# choice was hidden in statement order; the flag below makes it explicit and
# keeps the same effective default (balanced).
use_balanced_training_set = True
if use_balanced_training_set:
    train_examples = convert_text_to_examples(train_text_bert_balanced, train_label_bert_balanced)
else:
    train_examples = convert_text_to_examples(train_text_bert, train_label_bert)
test_examples = convert_text_to_examples(test_text_bert, test_label_bert)

In [None]:
# Tokenize and pad both splits to max_seq_length, producing the id, mask,
# segment and label arrays that are fed to the model.
train_input_ids, train_input_masks, train_segment_ids, train_labels = convert_examples_to_features(
    tokenizer, train_examples, max_seq_length=max_seq_length
)
test_input_ids, test_input_masks, test_segment_ids, test_labels = convert_examples_to_features(
    tokenizer, test_examples, max_seq_length=max_seq_length
)

### 3. Modeling

### 3.1. BERT Layer

In [13]:
class BertLayer(tf.keras.layers.Layer):
    """Keras layer wrapping a TF-Hub BERT module with partial fine-tuning.

    The layer takes ``[input_ids, input_mask, segment_ids]`` and returns one
    vector per example, obtained either from the module's "pooled_output"
    (``pooling="first"``) or from a mask-weighted mean of its
    "sequence_output" (``pooling="mean"``).

    Args:
        n_fine_tune_layers: Number of encoder layers to train, counted down
            from ``encoder/layer_11``.
        pooling: "first" or "mean".
        bert_path: TF-Hub handle of the BERT module.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.

    Raises:
        NameError: If ``pooling`` is neither "first" nor "mean".
    """

    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="first",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        # Width reported by compute_output_shape(); matches the H-768 module
        # named in the default bert_path.
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Load the Hub module and register its trainable / frozen variables."""
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )
        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if "/cls/" not in var.name]
            trainable_layers = ["pooler/dense"]
        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if "/cls/" not in var.name and "/pooler/" not in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )
        # Select how many layers to fine tune (top encoder layers first)
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")
        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any(layer_name in var.name for layer_name in trainable_layers)
        ]
        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)
        # Every other module variable is kept but frozen
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)
        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]`` and pool the output."""
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]
            # Mean over real tokens only: zero out padded positions, then
            # divide by the number of unmasked tokens (epsilon avoids 0/0).
            input_mask = tf.cast(input_mask, tf.float32)
            masked_sum = tf.reduce_sum(result * tf.expand_dims(input_mask, axis=-1), axis=1)
            token_count = tf.reduce_sum(input_mask, axis=1, keepdims=True) + 1e-10
            pooled = masked_sum / token_count
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")
        return pooled

    def compute_output_shape(self, input_shape):
        """Return ``(batch, output_size)``."""
        return (input_shape[0], self.output_size)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized.

        Without this override the custom arguments are absent from a saved
        model's config, and newer tf.keras versions refuse to serialize a
        layer whose ``__init__`` takes extra arguments.
        """
        config = super(BertLayer, self).get_config()
        config.update(
            {
                "n_fine_tune_layers": self.n_fine_tune_layers,
                "pooling": self.pooling,
                "bert_path": self.bert_path,
            }
        )
        return config

### 3.2. Build Model

In [14]:
# Build model
def build_model(max_seq_length):
    """Build and compile the binary classifier on top of `BertLayer`.

    Three inputs of length ``max_seq_length`` feed BERT (3 fine-tuned layers,
    "first" pooling), followed by a 256-unit ReLU layer and a sigmoid output.
    The model summary is printed as a side effect.
    """
    input_names = ("input_ids", "input_masks", "segment_ids")
    bert_inputs = [
        tf.keras.layers.Input(shape=(max_seq_length,), name=input_name)
        for input_name in input_names
    ]
    pooled = BertLayer(n_fine_tune_layers=3, pooling="first")(bert_inputs)
    hidden = tf.keras.layers.Dense(256, activation='relu')(pooled)
    pred = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model
def initialize_vars(sess):
    """Run the local, global and table initializers in *sess*, then hand it to Keras."""
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    for make_initializer in initializer_factories:
        sess.run(make_initializer())
    K.set_session(sess)

### 4. Running Model

In [None]:
model = build_model(max_seq_length)
# Instantiate variables
initialize_vars(sess)
model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
    epochs=5,
    batch_size=32
)
# Save once, under the name of the training set that was actually used.
# Previously the same model was written to both the balanced and the
# unbalanced file, so one of the two was always mislabelled. The training
# features are currently built from the balanced set; set the flag to False
# when training on the unbalanced set instead.
trained_on_balanced = True
model.save(bert_model_path_balanced if trained_on_balanced else bert_model_path_unbalanced)

### 5. Predictions

In [None]:
# Clear and load model
# Drop the reference to any previous model before building a fresh one.
model = None
model = build_model(max_seq_length)
# initialize_vars() runs the global variable initializer, so the saved weights
# must be loaded after it, not before.
initialize_vars(sess)
# Relative path: resolved against the current working directory.
model.load_weights('model_bert_weights_dyspnea_sentences_balanced.h5')

# Sigmoid outputs for the test set; thresholded later in get_confusion_matrix().
preds = model.predict([test_input_ids, 
                       test_input_masks, 
                       test_segment_ids]) 

In [18]:
def get_confusion_matrix(predicts, df_test):
    """Threshold predictions at 0.5 and score them against ``df_test['dyspnea']``.

    Returns:
        (confusion matrix, macro-averaged F1 score).
    """
    predicted_labels = np.where(np.array(predicts) > 0.5, 1, 0)
    true_labels = np.array(df_test['dyspnea'])
    matrix = confusion_matrix(true_labels, predicted_labels)
    macro_f1 = f1_score(true_labels, predicted_labels, average = 'macro')
    return matrix, macro_f1

In [None]:
# Evaluate the test-set predictions: yields (confusion matrix, macro F1).
get_confusion_matrix(preds, test)