In [56]:
from __future__ import absolute_import
from __future__ import division

from tqdm import tqdm
import numpy as np
import os
import io
import json
import sys
import logging

import tensorflow as tf

# from qa_model import Model
%run Model.py

In [57]:
########################################################################################
# Run configuration and model hyperparameters.
# NOTE(review): none of these values are passed to QAModel explicitly in this
# notebook (the constructor call only receives the vocab mappings and the
# embedding matrix) — confirm how Model.py picks them up.
mode = "train"

num_epochs = 0  # NOTE(review): presumably 0 means "no epoch limit" in Model.py — confirm
learning_rate = 0.001
max_gradient_norm = 5.0
dropout = 0.15
batch_size = 60
hidden_size_encoder = 150
hidden_size_qp_matching = 150
hidden_size_sm_matching = 50
hidden_size_fully_connected = 200
context_len = 300
question_len = 30
# Single source of truth for the embedding width; it also selects which
# glove.6B.{embedding_size}d.txt file gets loaded further down.
embedding_size = 100

########################################################################################
data_dir = "../Data/"

In [58]:
def initialize_model(session, model, train_dir, expect_exists):
    """
    Restores model variables from the latest checkpoint in train_dir, or
    initializes them from scratch when no checkpoint is available.

    NOTE(review): this function is defined again in a later cell of this
    notebook; the later definition is the one in effect after Run All.

    Inputs:
      session: TensorFlow session
      model: QAModel; only its `saver` attribute is used here
      train_dir: path to directory where we'll look for checkpoint
      expect_exists: If True, throw an error if no checkpoint is found.
        If False, initialize fresh model if no checkpoint is found.

    Raises:
      Exception: if expect_exists is True and no usable checkpoint is found.
    """
    print("Looking for model at %s..." % train_dir)
    ckpt = tf.train.get_checkpoint_state(train_dir)
    # A checkpoint counts as present if either the path itself or its
    # ".index" companion file exists on disk.
    v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""
    if ckpt and (tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path)):
        # Fixed: the attribute is `model_checkpoint_path` (no trailing underscore);
        # the old spelling raised AttributeError before restore could run.
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        if expect_exists:
            raise Exception("There is no saved checkpoint at %s" % train_dir)
        else:
            print("There is no saved checkpoint at %s. Creating model with fresh parameters." % train_dir)
            session.run(tf.global_variables_initializer())
            print('Num params: %d' % sum(v.get_shape().num_elements() for v in tf.trainable_variables()))


In [59]:
# Special vocabulary tokens (stored as bytes) and the ids reserved for them.
# The order of _START_VOCAB must line up with PAD_ID / UNK_ID, because
# get_glove assigns ids to these tokens by enumerating the list from 0.
_PAD, _UNK = b"<pad>", b"<unk>"
_START_VOCAB = [_PAD, _UNK]
PAD_ID, UNK_ID = 0, 1

def get_glove(glove_path, glove_dim, vocab_size=int(4e5), random_init=True):
    """Reads from original GloVe .txt file and returns embedding matrix and
    mappings from words to word ids.

    Input:
      glove_path: path to glove.6B.{glove_dim}d.txt
      glove_dim: integer; needs to match the dimension in glove_path
      vocab_size: integer; number of word lines expected in the file.
        Defaults to 400000, the vocab size of the downloaded glove.6B corpus.
      random_init: if True (default) the special-token rows are drawn from a
        standard normal distribution; if False they stay all-zero.

    Returns:
      emb_matrix: Numpy array shape (vocab_size + 2, glove_dim) containing glove
        embeddings (plus PAD and UNK embeddings in first two rows).
        The rows of emb_matrix correspond to the word ids given in word2id and id2word
      word2id: dictionary mapping word to word id (int)
      id2word: dictionary mapping word id (int) to word
        Note: the special tokens are keyed by the bytes values from
        _START_VOCAB, while words read from the file are text strings.

    Raises:
      Exception: if a line's vector length differs from glove_dim.
      AssertionError: if the number of distinct words read differs from vocab_size.
    """

    print("Loading GLoVE vectors from file: %s" % glove_path)

    num_special = len(_START_VOCAB)
    final_vocab_size = vocab_size + num_special

    emb_matrix = np.zeros((final_vocab_size, glove_dim))
    word2id = {}
    id2word = {}

    # randomly initialize the special tokens
    if random_init:
        emb_matrix[:num_special, :] = np.random.randn(num_special, glove_dim)

    # put start tokens in the dictionaries
    idx = 0
    for word in _START_VOCAB:
        word2id[word] = idx
        id2word[idx] = word
        idx += 1

    # go through glove vecs
    # The file is decoded explicitly as UTF-8 so that non-ASCII tokens load the
    # same way regardless of the platform's default locale encoding.
    with io.open(glove_path, 'r', encoding='utf-8') as fh:
        for line in tqdm(fh, total=vocab_size):
            fields = line.strip().split(" ")
            word = fields[0]
            vector = [float(value) for value in fields[1:]]
            if glove_dim != len(vector):
                raise Exception("You set --glove_path=%s but --embedding_size=%i. If you set --glove_path yourself then make sure that --embedding_size matches!" % (glove_path, glove_dim))
            emb_matrix[idx, :] = vector
            word2id[word] = idx
            id2word[idx] = word
            idx += 1

    # Sanity checks: every line produced a distinct word and the file had
    # exactly vocab_size entries.
    assert len(word2id) == final_vocab_size
    assert len(id2word) == final_vocab_size
    assert idx == final_vocab_size

    return emb_matrix, word2id, id2word


In [60]:
def initialize_model(session, model, train_dir, expect_exists):
    """
    Restores model variables from the latest checkpoint in train_dir, or
    initializes them from scratch when no checkpoint is available.

    NOTE(review): this is a second definition of a function already defined
    in an earlier cell of this notebook; this later one is the one in effect.
    Consider keeping a single definition.

    Inputs:
      session: TensorFlow session
      model: QAModel; only its `saver` attribute is used here
      train_dir: path to directory where we'll look for checkpoint
      expect_exists: If True, throw an error if no checkpoint is found.
        If False, initialize fresh model if no checkpoint is found.

    Raises:
      Exception: if expect_exists is True and no usable checkpoint is found.
    """
    print("Looking for model at %s..." % train_dir)
    ckpt = tf.train.get_checkpoint_state(train_dir)
    # A checkpoint counts as present if either the path itself or its
    # ".index" companion file exists on disk.
    v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""
    if ckpt and (tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path)):
        # Fixed: the attribute is `model_checkpoint_path` (no trailing underscore);
        # the old spelling raised AttributeError before restore could run.
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        if expect_exists:
            raise Exception("There is no saved checkpoint at %s" % train_dir)
        else:
            print("There is no saved checkpoint at %s. Creating model with fresh parameters." % train_dir)
            session.run(tf.global_variables_initializer())
            print('Num params: %d' % sum(v.get_shape().num_elements() for v in tf.trainable_variables()))


In [13]:
# Build the path of the GloVe file matching the configured embedding width,
# then load the embedding matrix and the vocab mappings from it.
glove_path = "{}Glove/glove.6B.{}d.txt".format(data_dir, embedding_size)
emb_matrix, word2id, id2word = get_glove(glove_path, embedding_size)

Loading GLoVE vectors from file: ../Data/Glove/glove.6B.50d.txt


100%|██████████| 400000/400000 [00:07<00:00, 55379.10it/s]


In [61]:
# Locations of the tokenized context / question / answer-span files,
# first for the training split and then for the dev split.
train_context_path = "{}Train/context".format(data_dir)
train_qn_path = "{}Train/question".format(data_dir)
train_ans_path = "{}Train/span".format(data_dir)

dev_context_path = "{}Dev/context".format(data_dir)
dev_qn_path = "{}Dev/question".format(data_dir)
dev_ans_path = "{}Dev/span".format(data_dir)

# Clear the default graph before constructing the model.
tf.reset_default_graph()
qa_model = QAModel(id2word, word2id, emb_matrix)

















last dim 300
Basic attn keys (?, 300, 300)
Basic attn values (?, 300, 30)
Basic attn logits (?, 300, 30)














In [62]:
# Bare expression: displays the repr of the constructed model object.
qa_model

<__main__.QAModel at 0x7f831feb6a58>

In [63]:
 # Split by mode
if mode == "train":
    train_dir_path = data_dir + "Train/"

    # Attach the training log file to the root logger only once. Re-running
    # this cell used to add a new FileHandler every time, which duplicated
    # every log line and left extra open file handles behind.
    log_path = os.path.abspath(train_dir_path + "log.txt")
    root_logger = logging.getLogger()
    file_handler = next(
        (h for h in root_logger.handlers if getattr(h, "baseFilename", None) == log_path),
        None,
    )
    if file_handler is None:
        file_handler = logging.FileHandler(log_path)
        root_logger.addHandler(file_handler)

    with tf.Session() as sess:

        # Load most recent model (or start fresh if train_dir_path has no checkpoint)
        initialize_model(sess, qa_model, train_dir_path, expect_exists=False)

        # Train
        # NOTE(review): the train paths are passed context-then-question but the
        # dev paths question-then-context — confirm against QAModel.train's signature.
        qa_model.train(train_dir_path, sess, train_context_path, train_qn_path, train_ans_path, dev_qn_path, dev_context_path, dev_ans_path)


Looking for model at ../Data/Train/...
There is no saved checkpoint at ../Data/Train/. Creating model with fresh parameters.
Num params: 301502


INFO:root:Number of params: 301502 (retrieval took 1.844465 secs)
INFO:root:Beginning training loop...


Refilling batches...


ValueError: invalid literal for int() with base 10: "b'115"

In [67]:
# Scratch check: recover the two integers from a span string that looks like
# the repr of a bytes object ("b'<start> <end>'").
string = "b'115 114'"
words = string.split()
# The first token carries the leading b' and the last token the closing quote.
print(words[0][2:])
ans = [int(words[0][2:]), int(words[1][:-1])]
print(ans)

115
[115, 114]
