In [17]:
import tensorflow as tf
import time
import logging
import os
import sys

import numpy as np
from tqdm import tqdm

from data_batcher import get_batch_generator
from vocab import get_glove, get_char_embed
from modules import RNNEncoder, masked_softmax
from tensorflow.python.ops.rnn_cell import DropoutWrapper

from bilm import Batcher, BidirectionalLanguageModel, weight_layers

# Raise the summarisation threshold to infinity so printed NumPy arrays are shown in
# full instead of being abbreviated with "...". Large arrays will therefore produce very long cell output.
np.set_printoptions(threshold=np.inf)

In [18]:
#Define statistics and hyperparameters
# These values are tiny on purpose-looking scale (batch of 2, hidden size 7); they are the
# ones handed to FLAGS and to get_batch_generator further down in the notebook.
batch_size = 2
hidden_size = 7
context_len = 100
question_len = 25
embedding_size = 50  # matches the 50d GloVe file loaded later in the notebook
char_size = 20
num_of_char = 72
max_word_len = 20
dropout = 0.2

#Define path
# The two root directories default to the original author's machine layout, so the
# resulting path strings are unchanged by default. Set QA_DATA_DIR / QA_ELMO_DIR in the
# environment to run the notebook on another machine without editing this cell.
squad_data_dir = os.environ.get(
    "QA_DATA_DIR", "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squadV2/data"
)
train_context_path = os.path.join(squad_data_dir, "train.context")
train_qn_path = os.path.join(squad_data_dir, "train.question")
train_ans_path = os.path.join(squad_data_dir, "train.span")
dev_qn_path = os.path.join(squad_data_dir, "dev.question")
dev_context_path = os.path.join(squad_data_dir, "dev.context")
dev_ans_path = os.path.join(squad_data_dir, "dev.span")

# NOTE(review): the ELMo directory lives under ".../qa/squad/" while the SQuAD data lives
# under ".../qa/squadV2/" - kept exactly as in the original; confirm this is intended.
elmo_dir = os.environ.get(
    "QA_ELMO_DIR", "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo"
)

class FLAGS(object):
    """Plain attribute bag holding the notebook's hyperparameters and the ELMo directory.

    Every constructor argument is stored unchanged on the instance under the same name.
    """

    def __init__(self, batch_size, hidden_size, context_len, question_len, embedding_size, char_size, num_of_char, max_word_len, dropout, elmo_dir):
        # Assign in the same order as the signature so the instance __dict__ keeps that order.
        settings = (
            ("batch_size", batch_size),
            ("hidden_size", hidden_size),
            ("context_len", context_len),
            ("question_len", question_len),
            ("embedding_size", embedding_size),
            ("char_size", char_size),
            ("num_of_char", num_of_char),
            ("max_word_len", max_word_len),
            ("dropout", dropout),
            ("elmo_dir", elmo_dir),
        )
        for attr_name, attr_value in settings:
            setattr(self, attr_name, attr_value)

# Bundle the settings above into one object. Note that this rebinds the name FLAGS from
# the class to an instance of it, so the class is no longer reachable by name afterwards.
FLAGS = FLAGS(
    batch_size=batch_size,
    hidden_size=hidden_size,
    context_len=context_len,
    question_len=question_len,
    embedding_size=embedding_size,
    char_size=char_size,
    num_of_char=num_of_char,
    max_word_len=max_word_len,
    dropout=dropout,
    elmo_dir=elmo_dir,
)

In [19]:
# Load the pretrained GloVe data and the character vocabulary. The returned lookups are
# passed to QAModel and get_batch_generator in later cells.
# NOTE(review): the "50d" in the file name presumably has to agree with
# FLAGS.embedding_size (50) - confirm against get_glove.
glove_path = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squadV2/data/glove.6B.50d.txt"
emb_matrix, word2id, id2word = get_glove(glove_path, FLAGS.embedding_size)
char2id, id2char = get_char_embed()

  0%|          | 0/400000 [00:00<?, ?it/s]

Loading GLoVE vectors from file: /Users/lam/Desktop/Lam-cs224n/Projects/qa/squadV2/data/glove.6B.50d.txt


100%|██████████| 400000/400000 [00:10<00:00, 37280.82it/s]


In [31]:
class QAModel(object):
    """Scratch question-answering model used to probe ELMo embeddings on SQuAD batches.

    The constructor builds the input placeholders and the frozen word-embedding lookups
    inside the "QAModel" variable scope. The ELMo part of the graph is NOT built by the
    constructor: call add_elmo_embedding_layer() before run_train_iter()/train(), because
    those fetch self.elmo_context_input.
    """

    # Fallback used only when FLAGS has no (or an empty) `elmo_dir`; it is the directory
    # the original hard-coded ELMo paths pointed into.
    DEFAULT_ELMO_DIR = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo"

    # Second argument handed to Batcher and size of the last axis of the ELMo
    # placeholders. Both must agree, so they share this constant.
    # presumably the maximum number of characters per token - confirm in the bilm docs
    ELMO_MAX_TOKEN_LEN = 50

    def __init__(self, FLAGS, id2word, word2id, emb_matrix, id2char, char2id):
        """Store the vocabularies/settings and build placeholders plus word embeddings.

        Inputs:
            FLAGS: settings object; `dropout` is read during training and `elmo_dir`
                (if present) locates the ELMo vocabulary, options and weight files.
            id2word, word2id: word vocabulary lookups.
            emb_matrix: pretrained word-embedding matrix fed to the embedding layer.
            id2char, char2id: character vocabulary lookups.
        """
        self.FLAGS = FLAGS
        self.id2word = id2word
        self.word2id = word2id
        self.emb_matrix =  emb_matrix
        self.id2char = id2char
        self.char2id = char2id

        # Resolve the ELMo files from the configured directory instead of repeating an
        # absolute path three times; with the notebook's FLAGS the resulting strings are
        # the same as the previously hard-coded ones.
        elmo_dir = getattr(FLAGS, "elmo_dir", None) or self.DEFAULT_ELMO_DIR
        self.batcher = Batcher(os.path.join(elmo_dir, "elmo_vocab.txt"), self.ELMO_MAX_TOKEN_LEN)
        self.filters = [(5,10)] #change back to 100 after

        self.options_file = os.path.join(elmo_dir, "elmo.json")
        self.weight_file = os.path.join(elmo_dir, "lm_weight.hdf5")

        with tf.variable_scope("QAModel"):
            self.add_placeholders()
            self.add_embedding_layer(emb_matrix)

    def add_placeholders(self):
        """Create the input placeholders that run_train_iter() feeds."""
        # Word ids and masks are left without a static shape.
        self.context_ids = tf.placeholder(tf.int32)
        self.context_mask = tf.placeholder(tf.int32)
        self.qn_ids = tf.placeholder(tf.int32)
        self.qn_mask = tf.placeholder(tf.int32)
        self.ans_span = tf.placeholder(tf.int32, shape=[None, 2])

        #NOTE:CHANGE - character-level placeholders are currently disabled
        #self.context_char = tf.placeholder(tf.int32, shape=[None, self.FLAGS.context_len, self.FLAGS.max_word_len])
        #self.qn_char = tf.placeholder(tf.int32, shape=[None, self.FLAGS.question_len, self.FLAGS.max_word_len])
        #The following two may not be necessary
        #self.context_char_mask = tf.placeholder(tf.int32, shape=[None, self.FLAGS.context_len, self.FLAGS.max_word_len])
        #self.qn_char_mask = tf.placeholder(tf.int32, shape=[None, self.FLAGS.question_len, self.FLAGS.max_word_len])

        # Character-id inputs for the biLM, filled with the output of
        # self.batcher.batch_sentences(...). The first two axes are left dynamic.
        self.context_elmo = tf.placeholder('int32', shape=[None, None, self.ELMO_MAX_TOKEN_LEN])
        self.qn_elmo = tf.placeholder('int32', shape=[None, None, self.ELMO_MAX_TOKEN_LEN])

        # Add a placeholder to feed in the keep probability (for dropout).
        # This is necessary so that we can instruct the model to use dropout when training, but not when testing
        self.keep_prob = tf.placeholder_with_default(1.0, shape=())

    def add_embedding_layer(self, emb_matrix):
        """Add the pretrained word-embedding lookups for context and question.

        Inputs:
            emb_matrix: pretrained embedding matrix; it is wrapped in a tf.constant so it
                is not trainable.
        """
        with tf.variable_scope("embeddings"):
            #set to constant so its untrainable
            # original author's note: shape (400002, embedding_size)
            embedding_matrix = tf.constant(emb_matrix, dtype=tf.float32, name="emb_matrix")

            # Get the word embeddings for the context and question,
            self.context_embs = tf.nn.embedding_lookup(embedding_matrix, self.context_ids)
            self.qn_embs = tf.nn.embedding_lookup(embedding_matrix, self.qn_ids)

        #self.add_char_embedding_layer()

    def add_elmo_embedding_layer(self, options_file, weight_file, output_use=False):
        """
        Adds ELMo lstm embeddings to the graph.

        Sets self.elmo_context_input and self.elmo_question_input, and additionally
        self.elmo_context_output / self.elmo_question_output when output_use is True.

        NOTE(review): call this once per graph. The traceback saved in this notebook
        ("Variable bilm/char_embed already exists") appears to come from building the
        biLM a second time in the same graph - confirm before calling it repeatedly.

        Inputs:
            options_file: json_file for the pretrained model
            weight_file: weights hdf5 file for the pretrained model
            output_use: determine if use elmo in output of biRNN (default False)
        """
        #Build biLM graph
        bilm = BidirectionalLanguageModel(options_file, weight_file)
        context_embeddings_op = bilm(self.context_elmo)
        question_embeddings_op = bilm(self.qn_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        #compute the final ELMo representations.
        self.elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.001)['weighted_op'] #(batch size, context size, ????)
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            self.elmo_question_input = weight_layers(
                'input', question_embeddings_op, l2_coef=0.001
            )['weighted_op']

        if output_use:
            self.elmo_context_output = weight_layers(
                'output', context_embeddings_op, l2_coef=0.001
            )['weighted_op']
            with tf.variable_scope('', reuse=True):
                # the reuse=True scope reuses weights from the context for the question
                self.elmo_question_output = weight_layers(
                    'output', question_embeddings_op, l2_coef=0.001
                )['weighted_op']

    def run_train_iter(self, session, batch):
        """Feed one batch through the graph and print the shape of each fetched output.

        Currently a debugging step: the only fetch is self.elmo_context_input, no loss
        or update op is run.

        Inputs:
            session: session used to evaluate the graph (the one passed in, not a
                notebook-level global).
            batch: object providing context_ids, context_mask, context_tokens, qn_ids,
                qn_mask, qn_tokens and ans_span.
        """
        input_feed = {
            self.context_ids: batch.context_ids,
            self.context_mask: batch.context_mask,
            #NOTE: CHANGE added context_char
            #self.context_char: batch.context_char,
            self.context_elmo: self.batcher.batch_sentences(batch.context_tokens),
            self.qn_ids: batch.qn_ids,
            self.qn_mask: batch.qn_mask,
            #NOTE: CHANGE added qn_char
            #self.qn_char: batch.qn_char,
            self.qn_elmo: self.batcher.batch_sentences(batch.qn_tokens),
            self.ans_span: batch.ans_span,
            self.keep_prob: 1.0 - self.FLAGS.dropout, # apply dropout
        }

        output_feed = [self.elmo_context_input]
        # Use the `session` argument and keep what it returns, so the shapes printed
        # below are those of the evaluated values rather than the static tensor shapes.
        fetched_values = session.run(output_feed, feed_dict=input_feed)
        for value in fetched_values:
            print(value.shape)

    def train(self, session, train_context_path, train_qn_path, train_ans_path, dev_qn_path, dev_context_path, dev_ans_path):
        """Run run_train_iter() on the first training batch only, then stop.

        The batch is also stored on self.sample_batch for later inspection. The dev_*
        paths are accepted but not used by the current body.

        Inputs:
            session: session forwarded to run_train_iter().
            train_context_path, train_qn_path, train_ans_path: training data files
                handed to get_batch_generator.
            dev_qn_path, dev_context_path, dev_ans_path: currently unused.
        """
        # The ELMo layer is deliberately not built here; it has to exist before train()
        # is called.
        #self.add_elmo_embedding_layer(self.options_file, self.weight_file)
        for batch in get_batch_generator(self.word2id, self.char2id, train_context_path, train_qn_path, train_ans_path, self.FLAGS.batch_size, self.FLAGS.context_len, self.FLAGS.question_len, self.FLAGS.max_word_len, discard_long=True):
            self.sample_batch = batch

            self.run_train_iter(session, batch)
            break  # debugging: only the first batch is processed

In [36]:
# Start from an empty default graph so re-running this cell does not collide with
# variables created by a previous run.
tf.reset_default_graph()

options_file = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/elmo.json"
weight_file = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/lm_weight.hdf5"

# Build the model, then attach the ELMo layer before any session is opened.
qa_model = QAModel(FLAGS, id2word, word2id, emb_matrix, id2char, char2id)
qa_model.add_elmo_embedding_layer(options_file, weight_file)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    qa_model.train(
        sess,
        train_context_path,
        train_qn_path,
        train_ans_path,
        dev_qn_path,
        dev_context_path,
        dev_ans_path,
    )
    # Disabled: dump the trainable variables for inspection.
    #variables_names =[v.name for v in tf.trainable_variables()]
    #values = sess.run(variables_names)

USING SKIP CONNECTIONS
USING SKIP CONNECTIONS


ValueError: Variable bilm/char_embed already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "/anaconda3/envs/cs231n/lib/python3.6/site-packages/bilm-0.1-py3.6.egg/bilm/model.py", line 266, in custom_getter
    return getter(name, *args, **kwargs)
  File "/anaconda3/envs/cs231n/lib/python3.6/site-packages/bilm-0.1-py3.6.egg/bilm/model.py", line 336, in _build_word_char_embeddings
    initializer=tf.random_uniform_initializer(-1.0, 1.0)
  File "/anaconda3/envs/cs231n/lib/python3.6/site-packages/bilm-0.1-py3.6.egg/bilm/model.py", line 281, in _build
    self._build_word_char_embeddings()


In [26]:
# Build a standalone biLM from the pretrained options/weights files.
# The options file is "elmo.json" (this cell previously spelled it "emlo.json", which
# does not match the file name used by the other cells of this notebook).
options_file = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/elmo.json"
weight_file = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/lm_weight.hdf5"

bilm = BidirectionalLanguageModel(options_file, weight_file)

In [6]:
# Compare what the ELMo Batcher produces for two hand-written sentences with what it
# produces for the questions of the batch cached on qa_model.
batcher = Batcher(
    "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/elmo_vocab.txt",
    50,
)

raw_context = [
    'Pretrained biLMs compute representations useful for NLP tasks .',
    'They give state of the art performance for many tasks .',
]
# Whitespace tokenisation of every sentence.
tokenized_context = list(map(str.split, raw_context))
context_ids = batcher.batch_sentences(tokenized_context)
qn_elmo = batcher.batch_sentences(qa_model.sample_batch.qn_tokens)

# Hand-written sentences: batched shape versus raw token counts.
print("batcher sentence shape: ", context_ids.shape)
print("First sentence len: ", len(tokenized_context[0]))
print("Second sentence len: ", len(tokenized_context[1]))
print("-" * 40)
# Cached batch questions: batched shape versus raw token counts.
print(
    "batcher sentence shape: ",
    batcher.batch_sentences(qa_model.sample_batch.qn_tokens).shape,
)
print("First question len: ", len(qa_model.sample_batch.qn_tokens[0]))
print("Second question len: ", len(qa_model.sample_batch.qn_tokens[1]))

batcher sentence shape:  (2, 13, 50)
First sentence len:  9
Second sentence len:  11
----------------------------------------
batcher sentence shape:  (2, 17, 50)
First question len:  15
Second question len:  8


In [20]:
# Show the row at index 23 of the first question in `qn_elmo`.
# NOTE(review): this needs at least 24 rows along the second axis. The Batcher demo cell
# in this notebook prints a (2, 17, 50) array for the questions, for which index 23
# would be out of range - presumably the saved output came from a different batch.
qn_elmo[0][23]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [7]:
# Shape of context_ids for the batch that QAModel.train stored on `sample_batch`.
qa_model.sample_batch.context_ids.shape

(2, 100)

In [57]:
# Take only the first batch the generator yields and keep it as `sample`.
for batch in get_batch_generator(
    word2id,
    char2id,
    train_context_path,
    train_qn_path,
    train_ans_path,
    batch_size,
    context_len,
    question_len,
    max_word_len,
    discard_long=True,
):
    sample = batch
    break

Refilling batches...
Refilling batches took 1.08 seconds


In [63]:
# Print the shape of each id/mask array of the sampled batch, context first, then question.
for _field in ("context_ids", "context_mask", "qn_ids", "qn_mask"):
    print(_field + " shape: ", getattr(sample, _field).shape)

context_ids shape:  (2, 85)
context_mask shape:  (2, 85)
qn_ids shape:  (2, 11)
qn_mask shape:  (2, 11)


In [61]:
# Print the token counts of the first two contexts, then of the first two questions,
# in the sampled batch.
for _kind, _attr in (("context", "context_tokens"), ("question", "qn_tokens")):
    for _position, _ordinal in enumerate(("first", "second")):
        print(_ordinal + " " + _kind + " tokens len: ", len(getattr(sample, _attr)[_position]))

first context tokens len:  85
second context tokens len:  85
first question tokens len:  7
second question tokens len:  11
