In [None]:
import json
import os
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import variable_scope as vs
from tqdm import tqdm

In [None]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize

## Preprocessing Data

In [None]:
def load_json(filename):
    """Read a JSON file and return the parsed object.

    Args:
        filename: path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding,
    # which is a legacy code page on Windows and mis-decodes non-ASCII text.
    with open(filename, encoding="utf-8") as data_file:
        return json.load(data_file)

In [None]:
def tokenize_data(string):
    """Tokenize ``string`` with NLTK and return the lower-cased tokens.

    The tokenizer may emit the two-character quote tokens `` and ''; both are mapped
    back to a plain double quote. The replacement deliberately contains no
    whitespace: get_Word_Index skips spaces while matching characters against
    tokens, so a token with a trailing space could never be matched, and it would
    also corrupt the space-separated token files written by preprocess.

    Args:
        string: text to tokenize.

    Returns:
        List of lower-cased token strings.
    """
    tokens = []
    for token in nltk.word_tokenize(string):
        tokens.append(token.replace("''", '"').replace("``", '"').lower())
    return tokens

In [None]:
def get_Word_Index(context, context_tokens):
    """Map character positions of ``context`` to the token they belong to.

    Walks ``context`` character by character, ignoring spaces and newlines, and
    accumulates characters until they spell the next expected token. When a token
    is completed, the ``len(token)`` character positions ending at the current
    character are mapped to ``(token, token_index)``.

    Args:
        context: the text that was tokenized.
        context_tokens: list of token strings, in order of appearance.

    Returns:
        dict ``{char_position: (token, token_index)}``, or ``None`` when the
        characters and the tokens cannot be lined up (a token never matches,
        tokens are left over, or text is left over after the last token).
    """
    wordMap = dict()
    pending = ''
    token_idx = 0
    num_tokens = len(context_tokens)

    for char_index, char in enumerate(context):
        if char == '\n' or char == ' ':
            continue
        if token_idx >= num_tokens:
            # Non-space text remains but every token has been consumed: report a
            # failed mapping instead of indexing past the end of the token list.
            return None
        pending += char
        if pending == context_tokens[token_idx]:
            start = char_index - len(pending) + 1
            for char_position in range(start, char_index + 1):
                wordMap[char_position] = (pending, token_idx)
            pending = ''
            token_idx += 1

    if token_idx != num_tokens:
        return None
    return wordMap

In [None]:
def preprocess(dataset, datatype, jsonDir):
    """Tokenize a question-answering dataset and write it out as four aligned text files.

    For every paragraph the context is normalized (two-character quote marks replaced,
    lower-cased), tokenized, and a character -> token map is built. For every question
    only the first listed answer is used; its character span is converted to a token
    span. Examples that cannot be converted are counted and skipped.

    The kept examples are shuffled with ``np.random.shuffle`` (no seed is set here, so
    the order depends on the global NumPy random state) and written line by line to
    ``<datatype>.context``, ``<datatype>.question``, ``<datatype>.answer`` and
    ``<datatype>.span`` inside ``jsonDir``. A summary of discarded examples is printed.

    Args:
        dataset: parsed JSON with the layout
            ``dataset['data'][i]['paragraphs'][j]`` -> ``{'context': str, 'qas': [...]}``,
            each qa holding ``'question'`` and ``'answers'`` (list of dicts with
            ``'text'`` and ``'answer_start'``).
        datatype: file-name prefix, e.g. "train" or "dev".
        jsonDir: directory the four output files are written to.

    Returns:
        None.
    """
    num_map_problem = 0
    num_token_problem = 0
    num_align_problem = 0
    examples = []

    for event in tqdm(dataset['data']):
        for paragraph in event['paragraphs']:
            qaSet = paragraph['qas']

            # Context Data. The two-character replacement keeps the text length
            # unchanged, so the dataset's character offsets stay valid.
            context = paragraph['context'].replace("''", '" ').replace("``", '" ').lower()
            context_tokens = tokenize_data(context)

            wordIndex = get_Word_Index(context, context_tokens)
            if wordIndex is None:
                # Characters and tokens could not be lined up: drop the whole paragraph.
                num_map_problem += len(qaSet)
                continue

            # Question and Answer Data
            for qa in qaSet:
                question = qa['question'].replace("''", '" ').replace("``", '" ').lower()
                question_tokens = tokenize_data(question)

                first_answer = qa['answers'][0]
                ans_text = first_answer['text'].lower()
                ans_start_index = first_answer['answer_start']
                ans_end_index = ans_start_index + len(ans_text)

                if context[ans_start_index:ans_end_index] != ans_text:
                    num_align_problem += 1
                    continue

                # The map has no entry for whitespace positions, so an answer that is
                # empty or starts/ends on whitespace cannot be placed on token
                # boundaries. Count it as a tokenization problem instead of raising
                # KeyError.
                start_entry = wordIndex.get(ans_start_index)
                end_entry = wordIndex.get(ans_end_index - 1)
                if not ans_text or start_entry is None or end_entry is None:
                    num_token_problem += 1
                    continue

                ans_start_word = start_entry[1]
                ans_end_word = end_entry[1]
                ans_tokens = context_tokens[ans_start_word:ans_end_word+1]
                # The answer must consist of whole tokens (compared with whitespace removed).
                if "".join(ans_tokens) != "".join(ans_text.split()):
                    num_token_problem += 1
                    continue

                examples.append((
                    " ".join(context_tokens),
                    " ".join(question_tokens),
                    " ".join(ans_tokens),
                    " ".join([str(ans_start_word), str(ans_end_word)]),
                ))

    num_examples = len(examples)

    # Creating files for context, questions, answers, and answer span indexes
    index = list(range(num_examples))
    np.random.shuffle(index)
    with open(os.path.join(jsonDir, datatype +'.context'), 'w', encoding="utf-8") as context_file,  \
         open(os.path.join(jsonDir, datatype +'.question'), 'w', encoding="utf-8") as question_file,\
         open(os.path.join(jsonDir, datatype +'.answer'), 'w', encoding="utf-8") as answer_file, \
         open(os.path.join(jsonDir, datatype +'.span'), 'w', encoding="utf-8") as span_file:

        for i in index:
            (context, question, answer, span_index) = examples[i]
            context_file.write(context + '\n')
            question_file.write(question + '\n')
            answer_file.write(answer + '\n')
            span_file.write(span_index + '\n')

    # Returning results
    print ("Number of triples ignored due to token mapping problems: ", num_map_problem)
    print ("Number of triples ignored due to unalignment with tokenization problems: ", num_token_problem)
    print ("Number of triples ignored due to span alignment problems: ", num_align_problem)
    print ("Processed examples: %i out of %i" % (num_examples, num_examples+num_map_problem+num_token_problem+num_align_problem))

In [None]:
# Read data
# Directory holding the raw JSON files; the preprocessed files are written here too.
jsonDir = "./dataset/"
dev_data, train_data = [
    load_json(os.path.join(jsonDir, json_name))
    for json_name in ("dev-v1.1.json", "train-v1.1.json")
]

In [None]:
# Tokenize the dev split and write dev.context / dev.question / dev.answer / dev.span into jsonDir.
preprocess(dev_data, "dev", jsonDir)

In [None]:
# Tokenize the train split and write train.context / train.question / train.answer / train.span into jsonDir.
preprocess(train_data, "train", jsonDir)

In [None]:
# NEED TO BE REPLACED WITH STUTI'S FILE
# Special vocabulary entries. They are text strings (not bytes) so that every key of
# word2id and every value of id2word has the same type as the words read from the
# GloVe file, which is opened in text mode.
_PAD = "<pad>"
_UNK = "<unk>"
_START_VOCAB = [_PAD, _UNK]
# Ids of the special entries: their positions in _START_VOCAB.
PAD_ID = 0
UNK_ID = 1

def get_glove(glove_path, glove_dim, vocab_size=int(4e5)):
    """Read a GloVe .txt file and return lookup tables plus the embedding matrix.

    Input:
      glove_path: path to the GloVe text file (e.g. glove.6B.{glove_dim}d.txt); every
        line holds a word followed by its vector components, separated by spaces.
      glove_dim: integer; must equal the number of vector components on every line.
      vocab_size: number of word lines expected in the file. Defaults to 400000, the
        vocabulary size of the corpus that was downloaded.
    Returns:
      A tuple (id2word, word2id, emb_matrix), in that order:
      id2word: dictionary mapping word id (int) to word
      word2id: dictionary mapping word to word id (int)
      emb_matrix: Numpy array of shape (vocab_size + len(_START_VOCAB), glove_dim).
        The first len(_START_VOCAB) rows belong to the special tokens and are drawn
        from np.random.randn (no seed is set here); the remaining rows hold the GloVe
        vectors in file order. Row numbers are the word ids.
    Raises:
      ValueError: if a line's vector length differs from glove_dim, or the file holds
        more than vocab_size lines.
      AssertionError: if the number of distinct words read differs from vocab_size.
    """

    print ("Loading GLoVE vectors from file: %s" % glove_path)

    num_special = len(_START_VOCAB)
    final_vocab_size = vocab_size + num_special

    emb_matrix = np.zeros((final_vocab_size, glove_dim))
    # randomly initialize the special tokens
    emb_matrix[:num_special, :] = np.random.randn(num_special, glove_dim)

    # put start tokens in the dictionaries
    word2id = {}
    id2word = {}
    for idx, word in enumerate(_START_VOCAB):
        word2id[word] = idx
        id2word[idx] = word
    idx = num_special

    # go through glove vecs
    with open(glove_path, 'r', encoding="utf-8") as fh:
        for line in tqdm(fh, total=vocab_size):
            fields = line.strip().split(" ")
            word = fields[0]
            vector = list(map(float, fields[1:]))
            if glove_dim != len(vector):
                raise ValueError("GloVe file %s has a %i-dimensional vector for %r but glove_dim=%i; make sure glove_dim matches the file." % (glove_path, len(vector), word, glove_dim))
            if idx >= final_vocab_size:
                # Fail with a clear message instead of an out-of-bounds row index.
                raise ValueError("GloVe file %s has more than the expected %i lines; pass a matching vocab_size." % (glove_path, vocab_size))
            emb_matrix[idx, :] = vector
            word2id[word] = idx
            id2word[idx] = word
            idx += 1

    # Fewer lines than expected, or duplicate words, trip these checks.
    assert len(word2id) == final_vocab_size
    assert len(id2word) == final_vocab_size
    assert idx == final_vocab_size

    return id2word, word2id, emb_matrix

## Model

In [None]:
class mrcModel(object):
    """TensorFlow graph for the reading-comprehension model (work in progress).

    Constructing an instance builds, inside the "QAModel" variable scope, the input
    placeholders, the embedding lookups, the highway / encoder / attention layers and
    then the loss. The output layer that has to produce ``self.logits_start`` and
    ``self.logits_end`` is not written yet, so ``create_layers`` raises
    ``NotImplementedError`` when it reaches that point.

    NOTE(review): ``RNNEncoder``, ``BidafAttention``, ``masked_softmax`` and ``Softmax``
    are not defined in the visible part of this notebook; presumably they come from a
    separate ``Layers`` module -- confirm and import them explicitly. The module-level
    names ``context_len``, ``question_len`` and ``hidden_encoder_size`` must exist
    before the class is instantiated.
    """
    hidden_bidaf_size = 150  # size passed to the RNNEncoder that follows the attention layer
    hidden_full_size = 200   # num_outputs of the fully connected layer

    def __init__(self, id2word, word2id, embed_matrix):
        """Store the vocabulary tables and build the whole graph.

        Args:
            id2word: mapping from word id to word.
            word2id: mapping from word to word id.
            embed_matrix: embedding matrix; it is turned into a float32 constant.
        """
        self.id2word = id2word
        self.word2id = word2id
        with tf.variable_scope("QAModel", initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, uniform=True)):
            self.add_placeholders()
            self.add_embed_layer(embed_matrix)
            self.create_layers()
            self.add_loss()

    def add_placeholders(self):
        """Create the int32 input placeholders and the dropout-probability placeholder."""
        # Add placeholders for the inputs
        self.context_ids = tf.placeholder(tf.int32, shape=[None, context_len])
        self.context_mask = tf.placeholder(tf.int32, shape=[None, context_len])
        self.question_ids = tf.placeholder(tf.int32, shape=[None, question_len])
        self.question_mask = tf.placeholder(tf.int32, shape=[None, question_len])
        self.answer_span = tf.placeholder(tf.int32, shape=[None, 2]) # The start and end index

        # Add a placeholder to feed in the probability (for dropout); defaults to 1.0
        self.prob_dropout = tf.placeholder_with_default(1.0, shape=())

    def add_embed_layer(self, embed_matrix):
        """Look up context and question ids in a constant (non-trainable) embedding matrix."""
        with tf.variable_scope("embedding"):
            embedding_matrix = tf.constant(embed_matrix, dtype=tf.float32, name="embed_matrix")
            self.context_embed = embedding_ops.embedding_lookup(embedding_matrix, self.context_ids)
            self.question_embed = embedding_ops.embedding_lookup(embedding_matrix, self.question_ids)

    def create_layers(self):
        """Stack highway, encoder and attention layers on top of the embeddings.

        Raises:
            NotImplementedError: always, once the implemented layers are built, because
                the start/end output layer is still missing.
        """
        ### Add highway layer
        context_size = self.context_embed.get_shape().as_list()[-1]
        for i in range(2):
            self.context_embed = self.create_highway_layer(self.context_embed, context_size, scope_name = "HighwayLayer", carry_bias = -1.0)
            self.question_embed = self.create_highway_layer(self.question_embed, context_size, scope_name = "HighwayLayer", carry_bias = -1.0)

        ### Add RNN Encoder Layer
        rnn_encoder = RNNEncoder(hidden_encoder_size, self.prob_dropout)
        context_hidden_layer = rnn_encoder.add_layer(self.context_embed, self.context_mask, scopename="EncoderLayer")
        question_hidden_layer = rnn_encoder.add_layer(self.question_embed, self.question_mask, scopename="EncoderLayer")

        ### Add Attention Layer using BiDAF
        attention_layer = BidafAttention(self.prob_dropout, 2*hidden_encoder_size)
        output_BiDAF = attention_layer.add_layer(question_hidden_layer, self.question_mask, context_hidden_layer, self.context_mask)
        self.attention = tf.reduce_max(output_BiDAF, axis=2)
        # TODO: see whether the first return value can be dropped since it is not used
        _, self.bidaf_probability = masked_softmax(self.attention, self.context_mask, 1)
        combination_cq = tf.concat([context_hidden_layer, output_BiDAF], axis=2)

        hidden_BiDAF = RNNEncoder(self.hidden_bidaf_size, self.prob_dropout)
        # The final BiDAF layer is the output_hidden_BiDAF
        output_hidden_BiDAF = hidden_BiDAF.add_layer(combination_cq, self.context_mask, scopename="BiDAFLayer")

        ### Add Output Layer: Predicting start and end of answer
        # NOTE(review): this consumes combination_cq, which leaves output_hidden_BiDAF
        # unused -- confirm which tensor the output layer is meant to read.
        final_combination_cq = tf.contrib.layers.fully_connected(combination_cq, num_outputs=self.hidden_full_size)

        # TODO: compute the start and end distributions. The unfinished draft opened
        # tf.variable_scope("Start") and created a Softmax() layer; the finished code has
        # to set self.logits_start and self.logits_end, which add_loss reads.
        raise NotImplementedError("mrcModel output layer (start/end logits) is not implemented yet")

    def add_loss(self):
        """Define the loss: mean cross-entropy of the start plus that of the end position."""
        with tf.variable_scope("loss"):
            # Loss for start prediction
            loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_start, labels=self.answer_span[:, 0])
            self.loss_start = tf.reduce_mean(loss_start) # Average across batch

            # Loss for end prediction
            loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_end, labels=self.answer_span[:, 1])
            self.loss_end = tf.reduce_mean(loss_end) #Average across batch

            # Total loss
            self.loss = self.loss_start + self.loss_end

    ### HELPER FUNCTIONS for the initialization of the model
    def create_highway_layer(self, x, size, scope_name, carry_bias=-1.0):
        """Apply one highway layer: y = H(x) * T(x) + x * (1 - T(x)).

        Args:
            x: input tensor whose last dimension is ``size``.
            size: width of the layer (input and output).
            scope_name: currently unused.
            carry_bias: initial value of the transform-gate bias.

        Returns:
            Tensor with the same last dimension as ``x``.
        """
        W_T = tf.Variable(tf.truncated_normal([size, size], stddev=0.1), name="weight_transform")
        b_T = tf.Variable(tf.constant(carry_bias, shape=[size]), name="bias_transform")

        W = tf.Variable(tf.truncated_normal([size, size], stddev=0.1), name="weight")
        b = tf.Variable(tf.constant(0.1, shape=[size]), name="bias")

        T = tf.sigmoid(self.highway_multi(x, W_T) + b_T, name="transform_gate")
        H = tf.nn.relu(self.highway_multi(x, W) + b, name="activation")
        C = tf.subtract(1.0, T, name="carry_gate")

        y = tf.add(tf.multiply(H, T), tf.multiply(x, C), "y")
        return y

    def highway_multi(self, matrix, weight):
        """Multiply the last dimension of ``matrix`` by the 2-D ``weight``.

        ``matrix`` is flattened to 2-D, multiplied, and reshaped back using its static
        second dimension, so that dimension must be known when the graph is built.
        """
        matrixShape = matrix.get_shape().as_list()
        weightShape = weight.get_shape().as_list()
        matrixTempShape = tf.reshape(matrix, [-1, matrixShape[-1]])
        result = tf.matmul(matrixTempShape, weight)

        return tf.reshape(result, [-1, matrixShape[1], weightShape[-1]])
        
        
    

In [None]:
def create_highway_layer(self, x, size, scope_name, carry_bias=-1.0):
    """Apply one highway layer: y = H(x) * T(x) + x * (1 - T(x)).

    H is a ReLU transform and T a sigmoid gate, each with its own ``size`` x ``size``
    weight matrix and bias.

    NOTE(review): this is written in method form (it takes ``self`` and calls
    ``self.highway_multi``) and mrcModel.create_layers invokes it as
    ``self.create_highway_layer``; presumably both functions in this cell are meant
    to be methods of mrcModel -- confirm.

    Args:
        self: object that provides ``highway_multi``.
        x: input tensor whose last dimension is ``size``.
        size: width of the layer (input and output).
        scope_name: currently unused.
        carry_bias: initial value of the transform-gate bias; a negative value makes
            the gate start out mostly carrying the input through.

    Returns:
        Tensor with the same last dimension as ``x``.
    """
    W_T = tf.Variable(tf.truncated_normal([size, size], stddev=0.1), name="weight_transform")
    b_T = tf.Variable(tf.constant(carry_bias, shape=[size]), name="bias_transform")

    W = tf.Variable(tf.truncated_normal([size, size], stddev=0.1), name="weight")
    b = tf.Variable(tf.constant(0.1, shape=[size]), name="bias")

    T = tf.sigmoid(self.highway_multi(x, W_T) + b_T, name="transform_gate")
    H = tf.nn.relu(self.highway_multi(x, W) + b, name="activation")
    C = tf.subtract(1.0, T, name="carry_gate")

    y = tf.add(tf.multiply(H, T), tf.multiply(x, C), "y")
    return y


def highway_multi(self, matrix, weight):
    """Multiply the last dimension of ``matrix`` by the 2-D ``weight``.

    ``matrix`` is flattened to 2-D, multiplied with ``weight``, and reshaped to
    ``[-1, matrix.shape[1], weight.shape[-1]]``. The static second dimension of
    ``matrix`` is used, so it must be known when the graph is built.

    Args:
        self: unused.
        matrix: tensor to transform; presumably (batch, sequence, features) -- TODO confirm.
        weight: 2-D weight tensor whose first dimension matches the last of ``matrix``.

    Returns:
        The reshaped product tensor.
    """
    matrixShape = matrix.get_shape().as_list()
    weightShape = weight.get_shape().as_list()
    matrixTempShape = tf.reshape(matrix, [-1, matrixShape[-1]])
    result = tf.matmul(matrixTempShape, weight)

    return tf.reshape(result, [-1, matrixShape[1], weightShape[-1]])

## Main Program

In [None]:
## Static variables
# Directory containing the preprocessed *.context / *.question / *.span files
# (same value as jsonDir used during preprocessing).
data_dir = "./dataset/"

# Hyperparameters
# NOTE(review): learning_rate and batch_size are not referenced by any code visible in
# this notebook yet -- presumably reserved for the training loop.
learning_rate = 0.001
batch_size = 60
# Size handed to the first RNNEncoder in mrcModel.create_layers; the attention layer is
# created with twice this value.
hidden_encoder_size = 150

# Fixed second dimension of the context / question placeholders in mrcModel.add_placeholders.
context_len = 300
question_len = 30
# Dimension of the word embeddings; has to agree with the GloVe file that is loaded.
embed_size = 100

In [None]:
## Getting train and dev data
# Paths of the preprocessed files, one (context, question, span) triple per split.
train_context, train_questions, train_ans_span = (
    os.path.join(data_dir, "train.context"),
    os.path.join(data_dir, "train.question"),
    os.path.join(data_dir, "train.span"),
)
dev_context, dev_questions, dev_ans_span = (
    os.path.join(data_dir, "dev.context"),
    os.path.join(data_dir, "dev.question"),
    os.path.join(data_dir, "dev.span"),
)

In [None]:
## Create Glove Vector
# The GloVe file location is machine specific: it can be overridden through the
# GLOVE_PATH environment variable and falls back to the original location otherwise.
glove_path = os.environ.get("GLOVE_PATH", "G:/glove.6B/glove.6B.100d.txt")
# Use embed_size (defined with the hyperparameters) instead of repeating the literal;
# get_glove raises if it does not match the dimension of the vectors in the file.
id2word, word2id, embed_matrix = get_glove(glove_path, embed_size)

In [None]:
# Initialize model
# NOTE(review): this rebinds the name `mrcModel` from the class to the new instance, so
# running this cell a second time calls the instance (which defines no __call__) unless
# the class cell is re-run first. Consider a separate variable name for the instance.
mrcModel = mrcModel(id2word, word2id, embed_matrix)