<a href="https://colab.research.google.com/github/johnc1231/ResearchNotebooks/blob/master/PICO_Classification/BiLSTM_for_PICO_Classification_11_5_18_Three_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/bepnye/EBM-NLP.git
!tar -xzf EBM-NLP/ebm_nlp_1_00.tar.gz

Cloning into 'EBM-NLP'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 106 (delta 0), reused 4 (delta 0), pack-reused 94[K
Receiving objects: 100% (106/106), 38.52 MiB | 10.12 MiB/s, done.
Resolving deltas: 100% (21/21), done.


Pip Stuff

In [0]:
#pip stuff

!pip -qqq install gensim
!pip -qqq install git+https://www.github.com/keras-team/keras-contrib.git
!pip -qqq install --upgrade keras

Python setup

In [5]:
import os
from collections import Counter

import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM, Bidirectional, TimeDistributed, Reshape, Dropout
from keras_contrib.layers import CRF

from sklearn.metrics import f1_score, precision_score, recall_score

from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import datapath

import nltk
nltk.download('punkt')

from google.colab import drive

Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


Paths

In [0]:
base_dir = 'ebm_nlp_1_00'
spans_dir = os.path.join(base_dir, 'annotations/aggregated/starting_spans')
participants_dir = os.path.join(spans_dir, 'participants')
participants_training_dir = os.path.join(participants_dir, 'train')
interventions_dir = os.path.join(spans_dir, 'interventions')
interventions_training_dir = os.path.join(interventions_dir, 'train')
outcomes_dir = os.path.join(spans_dir, 'outcomes')
outcomes_training_dir = os.path.join(outcomes_dir, 'train')


documents_dir = os.path.join(base_dir, 'documents')
embedding_path = "/content/drive/My Drive/Colab Notebooks/Research/PubMed-w2v.bin"

In [0]:
# Training Parameters
max_abstract_len = 400
max_num_tokens = 30000
embedding_dimension = 200

In [0]:
#Training data:
training_ids = [filename.split('_')[0] for filename in os.listdir(participants_training_dir) if filename.endswith('.ann')]


In [11]:
def tokens_and_labels(ids):
  all_participant_labels = []
  all_interventions_labels = []
  all_outcomes_labels = []
  
  abstract_lengths = []
  all_tokens = []
  all_labels = []
    
  for id in ids:
    with open(os.path.join(documents_dir, id + ".tokens")) as tokens_file:
      unlimited_tokens = tokens_file.read().split(" ")
      abstract_lengths.append(len(unlimited_tokens))
      tokens = unlimited_tokens[:max_abstract_len]
      all_tokens.append(tokens)
  
    with open(os.path.join(participants_training_dir, id + "_AGGREGATED.ann")) as annotations_file:
      participant_labels = [int(label) for label in annotations_file.read().split(",")[:max_abstract_len]]
      all_participant_labels.append(participant_labels)
      
    with open(os.path.join(interventions_training_dir, id + "_AGGREGATED.ann")) as annotations_file:
      interventions_labels = [int(label) for label in annotations_file.read().split(",")[:max_abstract_len]]
      all_interventions_labels.append(interventions_labels)
      
    with open(os.path.join(outcomes_training_dir, id + "_AGGREGATED.ann")) as annotations_file:
      outcomes_labels = [int(label) for label in annotations_file.read().split(",")[:max_abstract_len]]
      all_outcomes_labels.append(outcomes_labels)
      
  
  return (all_tokens, all_participant_labels, all_interventions_labels, all_outcomes_labels, abstract_lengths)
        
(all_tokens, all_participant_labels, all_intervention_labels, all_outcome_labels, abstract_lengths) = tokens_and_labels(training_ids)


# Need to get the max_num_tokens most common tokens
counter = Counter()

for tokens in all_tokens:
  counter.update(tokens)

# Leave space for reserved tokens when we do mapping
most_common_tokens = counter.most_common(max_num_tokens)   
num_regular_tokens = len(most_common_tokens) #Extra 2 for the unk token and the blank token.
num_tokens = num_regular_tokens + 2
unk_token = num_tokens - 2
blank_token = num_tokens - 1
print("Found {} distinct words".format(num_regular_tokens))
print("25th percentile abstract length is {}".format(np.percentile(abstract_lengths, 25)))
print("95th percentile abstract length is {}".format(np.percentile(abstract_lengths, 95)))

Found 30000 distinct words
25th percentile abstract length is 206.0
95th percentile abstract length is 438.0


In [0]:
# Need a word to index mapping
word_to_index = {word_count_pair[0]:index for index, word_count_pair in enumerate(most_common_tokens)}

#### Creating embedding matrix



In [0]:
#Embedding data
wv_from_bin = KeyedVectors.load_word2vec_format(datapath(embedding_path), binary=True)  # C binary format
#Get average vector in space.
average_vec = np.mean(wv_from_bin.vectors, axis=0).shape

In [14]:
embedding_matrix = np.zeros(shape=(num_tokens, embedding_dimension))
count_of_unknowns = 0
for i in range(0, num_regular_tokens):
  token = most_common_tokens[i][0]
  if token in wv_from_bin: #Ignoring words not in vocab
    embedding_matrix[i] = wv_from_bin[token]
  else:
    embedding_matrix[i] = np.copy(average_vec)
    count_of_unknowns += 1

print("There were {} unknown tokens".format(count_of_unknowns))


997


#### Creating vectors of token indicies

In [0]:
#Now I need to make a vector where the words are replaced with numbers. 
all_tokens_indices =[]

for tokens in all_tokens:
  all_tokens_indices.append([word_to_index.get(token, unk_token) for token in tokens])

In [16]:
count_of_unks = 0;
count_of_tokens = 0;
for token_indices in all_tokens_indices:
  for index in token_indices:
    count_of_tokens += 1
    if index == unk_token:
      count_of_unks += 1

print(count_of_tokens)
print(count_of_unks)

1271581
21525


# Participants

In [0]:
participants_x_train = []
participants_y_train = []

participants_x_train = keras.preprocessing.sequence.pad_sequences(all_tokens_indices, maxlen=max_abstract_len, value = blank_token)
participants_y_train = keras.preprocessing.sequence.pad_sequences(all_participant_labels, maxlen=max_abstract_len, value = 0.0)

participants_x_val = participants_x_train[4000:]
participants_y_val = participants_y_train[4000:]

participants_x_train = participants_x_train[:4000]
participants_y_train = participants_y_train[:4000]

participants_crf_y_train = (np.arange(participants_y_train.max() + 1) == participants_y_train[...,None]).astype(int)
participants_crf_y_val = (np.arange(participants_y_val.max() + 1) == participants_y_val[...,None]).astype(int)

In [20]:
model = Sequential()
model.add(Embedding(num_tokens,
                    embedding_dimension,
                    weights=[embedding_matrix],
                    input_length=max_abstract_len,
                    trainable=True))
model.add(Bidirectional(LSTM(180, return_sequences=True)))
model.add(Dropout(.7))
model.add(TimeDistributed(Dense(128, activation="relu")))
crf = CRF(2)
model.add(crf)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 200)          6000400   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 400, 360)          548640    
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 360)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 400, 128)          46208     
_________________________________________________________________
crf_1 (CRF)                  (None, 400, 2)            266       
Total params: 6,595,514
Trainable params: 6,595,514
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile(optimizer="adam",
              loss=crf.loss_function,
              metrics=[crf.accuracy])
history = model.fit(participants_x_train, participants_crf_y_train, epochs = 3, batch_size = 64, validation_data=(participants_x_val, participants_crf_y_val))

In [32]:
participants_val_predictions = model.predict(participants_x_val)
participants_val_predictions_indexed = np.argmax(participants_val_predictions, axis=2)
participants_val_predictions_flat = participants_val_predictions_indexed.flatten()
participants_val_actual_flat = participants_y_val.flatten()
computed_f1_score = f1_score(participants_val_actual_flat, participants_val_predictions_flat)
computed_precision = precision_score(participants_val_actual_flat, participants_val_predictions_flat)
computed_recall = recall_score(participants_val_actual_flat, participants_val_predictions_flat)

print("F1 Score: {} Precision: {} Recall: {}".format(computed_f1_score, computed_precision, computed_recall))

F1 Score: 0.6985229881554621 Precision: 0.799953466728711 Recall: 0.6199199451876961


# Interventions

In [0]:
interventions_x_train = []
interventions_y_train = []

interventions_x_train = keras.preprocessing.sequence.pad_sequences(all_tokens_indices, maxlen=max_abstract_len, value = blank_token)
interventions_y_train = keras.preprocessing.sequence.pad_sequences(all_participant_labels, maxlen=max_abstract_len, value = 0.0)

interventions_x_val = interventions_x_train[4000:]
interventions_y_val = interventions_y_train[4000:]

interventions_x_train = interventions_x_train[:4000]
interventions_y_train = interventions_y_train[:4000]

interventions_crf_y_train = (np.arange(interventions_y_train.max() + 1) == interventions_y_train[...,None]).astype(int)
interventions_crf_y_val = (np.arange(interventions_y_val.max() + 1) == interventions_y_val[...,None]).astype(int)

In [39]:
model = Sequential()
model.add(Embedding(num_tokens,
                    embedding_dimension,
                    weights=[embedding_matrix],
                    input_length=max_abstract_len,
                    trainable=True))
model.add(Bidirectional(LSTM(180, return_sequences=True)))
model.add(Dropout(.7))
model.add(TimeDistributed(Dense(128, activation="relu")))
crf = CRF(2)
model.add(crf)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 400, 200)          6000400   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 400, 360)          548640    
_________________________________________________________________
dropout_3 (Dropout)          (None, 400, 360)          0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 400, 128)          46208     
_________________________________________________________________
crf_3 (CRF)                  (None, 400, 2)            266       
Total params: 6,595,514
Trainable params: 6,595,514
Non-trainable params: 0
_________________________________________________________________


In [40]:
model.compile(optimizer="adam",
              loss=crf.loss_function,
              metrics=[crf.accuracy])
history = model.fit(interventions_x_train, interventions_crf_y_train, epochs = 3, batch_size = 64, validation_data=(interventions_x_val, interventions_crf_y_val))

Train on 4000 samples, validate on 802 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [41]:
interventions_val_predictions = model.predict(interventions_x_val)
interventions_val_predictions_indexed = np.argmax(interventions_val_predictions, axis=2)
interventions_val_predictions_flat = interventions_val_predictions_indexed.flatten()
interventions_val_actual_flat = interventions_y_val.flatten()
computed_f1_score = f1_score(interventions_val_actual_flat, interventions_val_predictions_flat)
computed_precision = precision_score(interventions_val_actual_flat, interventions_val_predictions_flat)
computed_recall = recall_score(interventions_val_actual_flat, interventions_val_predictions_flat)

print("F1 Score: {} Precision: {} Recall: {}".format(computed_f1_score, computed_precision, computed_recall))

F1 Score: 0.680953527570431 Precision: 0.7000533170843172 Recall: 0.6628682701669611


# Outcomes

In [0]:
outcomes_x_train = []
outcomes_y_train = []

outcomes_x_train = keras.preprocessing.sequence.pad_sequences(all_tokens_indices, maxlen=max_abstract_len, value = blank_token)
outcomes_y_train = keras.preprocessing.sequence.pad_sequences(all_participant_labels, maxlen=max_abstract_len, value = 0.0)

outcomes_x_val = outcomes_x_train[4000:]
outcomes_y_val = outcomes_y_train[4000:]

outcomes_x_train = outcomes_x_train[:4000]
outcomes_y_train = outcomes_y_train[:4000]

outcomes_crf_y_train = (np.arange(outcomes_y_train.max() + 1) == outcomes_y_train[...,None]).astype(int)
outcomes_crf_y_val = (np.arange(outcomes_y_val.max() + 1) == outcomes_y_val[...,None]).astype(int)

In [43]:
model = Sequential()
model.add(Embedding(num_tokens,
                    embedding_dimension,
                    weights=[embedding_matrix],
                    input_length=max_abstract_len,
                    trainable=True))
model.add(Bidirectional(LSTM(180, return_sequences=True)))
model.add(Dropout(.7))
model.add(TimeDistributed(Dense(128, activation="relu")))
crf = CRF(2)
model.add(crf)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 400, 200)          6000400   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 400, 360)          548640    
_________________________________________________________________
dropout_4 (Dropout)          (None, 400, 360)          0         
_________________________________________________________________
time_distributed_4 (TimeDist (None, 400, 128)          46208     
_________________________________________________________________
crf_4 (CRF)                  (None, 400, 2)            266       
Total params: 6,595,514
Trainable params: 6,595,514
Non-trainable params: 0
_________________________________________________________________


In [44]:
model.compile(optimizer="adam",
              loss=crf.loss_function,
              metrics=[crf.accuracy])
history = model.fit(outcomes_x_train, outcomes_crf_y_train, epochs = 3, batch_size = 64, validation_data=(outcomes_x_val, outcomes_crf_y_val))

Train on 4000 samples, validate on 802 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [45]:
outcomes_val_predictions = model.predict(outcomes_x_val)
outcomes_val_predictions_indexed = np.argmax(outcomes_val_predictions, axis=2)
outcomes_val_predictions_flat = outcomes_val_predictions_indexed.flatten()
outcomes_val_actual_flat = outcomes_y_val.flatten()
computed_f1_score = f1_score(outcomes_val_actual_flat, outcomes_val_predictions_flat)
computed_precision = precision_score(outcomes_val_actual_flat, outcomes_val_predictions_flat)
computed_recall = recall_score(outcomes_val_actual_flat, outcomes_val_predictions_flat)

print("F1 Score: {} Precision: {} Recall: {}".format(computed_f1_score, computed_precision, computed_recall))

F1 Score: 0.7076986337214735 Precision: 0.7345956852397951 Recall: 0.6827016696116259
