In [1]:
class Params():
    '''
    Static hyper-parameter and configuration container.

    Every setting is a class attribute, so values are read directly as
    Params.<name> without creating an instance.
    '''

    # Data
    data_size = -1 # presumably -1 means "use the whole dataset" -- TODO confirm against the data loader
    num_epochs = 10
    train_prop = 0.9 # presumably the fraction of data used for training -- TODO confirm

    # Training
    # NOTE: To use the demo, set batch_size = 1
    mode = "train" # case-insensitive options: ['train', 'test', 'debug']
    dropout = 0.2 # dropout probability (keep probability is 1 - dropout); None disables dropout
    zoneout = None # zoneout probability; when not None it is used instead of dropout
    optimizer = "adam" # Options: ['adadelta', 'adam', 'gradientdescent', 'adagrad']
    # Size of the mini-batch: 100 in test mode, 50 otherwise.
    # Strings are compared by value (and case-insensitively, as documented for
    # `mode`); an identity test (`is not`) on a string literal is unreliable.
    batch_size = 50 if mode.lower() != "test" else 100
    save_steps = 50
    clip = True # clip gradient norm
    norm = 5.0 # presumably the gradient-norm threshold used when clip is True -- TODO confirm
    # NOTE: Change the hyperparameters of your learning algos here
    opt_arg = {
        'adadelta':{'learning_rate':1, 'rho': 0.95, 'epsilon':1e-6},
        'adam':{'learning_rate':1e-3, 'beta1':0.9, 'beta2':0.999, 'epsilon':1e-8},
        'gradientdescent':{'learning_rate':1},
        'adagrad':{'learning_rate':1}
    }

    # Architecture
    SRU = True # Use SRU cell, if False, use standard GRU cell
    max_p_len = 300 # Maximum number of words in each passage context
    max_q_len = 30 # Maximum number of words in each question context
    max_char_len = 16 # Maximum number of characters in a word
    vocab_size = 91605 # Number of vocabulary entries after GloVe training
    char_vocab_size = 95 # Number of characters in GloVe
    emb_size = 300 # Embedding size for words
    char_emb_size = 8 # Embedding size for characters
    attn_size = 75 # RNN cell and attention module size
    num_layers = 3 # Number of layers at question-passage matching
    bias = True # Use bias term in attention

In [2]:
import numpy as np
import tqdm
import spacy
import tensorflow as tf

# GRU part

In [5]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import hashlib
import numbers

from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.util import nest
from tensorflow.contrib.rnn import RNNCell

In [None]:
'''
attention weights from https://www.microsoft.com/en-us/research/wp-content/uploads/2017/05/r-net.pdf
W_u^Q.shape:    (2 * attn_size, attn_size)
W_u^P.shape:    (2 * attn_size, attn_size)
W_v^P.shape:    (attn_size, attn_size)
W_g.shape:      (4 * attn_size, 4 * attn_size)
W_h^P.shape:    (2 * attn_size, attn_size)
W_v^Phat.shape: (2 * attn_size, attn_size)
W_h^a.shape:    (2 * attn_size, attn_size)
W_v^Q.shape:    (attn_size, attn_size)
'''

def get_attn_params(attn_size, initializer = tf.truncated_normal_initializer):
    '''
    Create the trainable attention weights under the "attention_weights" variable scope.

    Args:
        attn_size: the size of attention specified in https://www.microsoft.com/en-us/research/wp-content/uploads/2017/05/r-net.pdf
        initializer: zero-argument callable returning a TensorFlow initializer; it is called
                     once per variable. The author of the original paper used Gaussian
                     initialization, however xavier converges faster.
    Returns:
        params: A dict mapping each weight name to its tf.Variable, used throughout the layers
    '''
    # (name, shape) pairs listed in creation order.
    # W_u_P is (2 * attn_size, attn_size): it matches W_u_Q and the shape table
    # documented for this module, so its projection has the same length as `v`.
    weight_shapes = [
        ("W_u_Q", (2 * attn_size, attn_size)),
        ("W_u_P", (2 * attn_size, attn_size)),
        ("W_v_P", (attn_size, attn_size)),
        ("W_v_P_2", (2 * attn_size, attn_size)),
        ("W_g", (4 * attn_size, 4 * attn_size)),
        ("W_h_P", (2 * attn_size, attn_size)),
        ("W_v_Phat", (2 * attn_size, attn_size)),
        ("W_h_a", (2 * attn_size, attn_size)),
        ("W_v_Q", (attn_size, attn_size)),
        ("v", (attn_size,)), # explicit 1-tuple: a vector of length attn_size
    ]
    with tf.variable_scope("attention_weights"):
        params = {}
        for name, shape in weight_shapes:
            # A fresh initializer instance is built for every variable.
            params[name] = tf.get_variable(name, dtype = tf.float32, shape = shape, initializer = initializer())
        return params

def encoding(word, char, word_embeddings, char_embeddings, scope = "embedding"):
    '''
    Look up the embeddings for word ids and character ids.

    Args:
        word: ids looked up in word_embeddings
        char: ids looked up in char_embeddings
        word_embeddings: embedding table for words
        char_embeddings: embedding table for characters
        scope: name of the variable scope the lookups are created in
    Returns:
        A (word_encoding, char_encoding) tuple holding the two lookup results.
    '''
    with tf.variable_scope(scope):
        # Word lookup is created first, then the character lookup.
        encoded_words = tf.nn.embedding_lookup(word_embeddings, word)
        encoded_chars = tf.nn.embedding_lookup(char_embeddings, char)
    return encoded_words, encoded_chars

def apply_dropout(inputs, size = None, is_training = True):
    '''
    Wrap an RNN cell with zoneout or dropout according to Params.

    ZoneOut reference: https://arxiv.org/pdf/1606.01305.pdf

    Args:
        inputs: the object to wrap; it is handed unchanged to the wrapper
                constructor (callers in this file pass an RNN cell)
        size: accepted for interface compatibility; not used here
        is_training: forwarded to ZoneoutWrapper; for dropout, the wrapper is
                     only applied when this is True
    Returns:
        - ZoneoutWrapper(inputs, ...) when Params.zoneout is set (takes priority over dropout),
        - DropoutWrapper(inputs, ...) when only Params.dropout is set and is_training is True,
        - inputs unchanged otherwise.
    '''
    zoneout_prob = Params.zoneout
    if zoneout_prob is not None:
        return ZoneoutWrapper(inputs, state_zoneout_prob = zoneout_prob, is_training = is_training)

    dropout_prob = Params.dropout
    if dropout_prob is None or not is_training:
        # Nothing configured, or running in inference mode: leave the cell as is.
        return inputs

    return tf.contrib.rnn.DropoutWrapper(inputs,
                                        output_keep_prob = 1 - dropout_prob,
                                        dtype = tf.float32)


def bidirectional_GRU(inputs, inputs_len, cell = None, cell_fn = tf.contrib.rnn.GRUCell, units = Params.attn_size,
                     layers = 1, scope = "Bidirectional_GRU", output = 0, is_training = True, reuse = None):
    '''
    Bidirectional RNN with GRU cells.

    Args:
        inputs: rnn input of shape (batch_size, timestep, dim). An input with more than
                3 dimensions has its first two dimensions merged before it is fed to the rnn.
        inputs_len: rnn input_len of shape (batch_size, ); flattened together with
                    inputs when inputs has more than 3 dimensions.
        cell: optional (forward_cell, backward_cell) pair of type RNN_Cell.
              If None, cells are built from cell_fn.
        cell_fn: constructor called as cell_fn(units) to build each cell when cell is None.
        units: number of units per cell; also used for the output == 1 reshape.
        layers: number of stacked cells per direction (only used when cell is None).
        scope: name of the variable scope.
        output: if 0, output returns rnn output for every timestep,
                if 1, output returns concatenated state of backward and forward rnn,
                reshaped to (Params.batch_size, inputs.shape[1], 2 * units).
                NOTE(review): that reshape looks intended for inputs whose first two
                dimensions were merged (rank > 3) -- confirm against callers.
        is_training: forwarded to apply_dropout for the cells built here.
        reuse: forwarded to tf.variable_scope.
    Returns:
        A tensor, as described under `output`.
    Raises:
        ValueError: if output is neither 0 nor 1.
    '''
    if output not in (0, 1):
        # Previously an unsupported value silently returned None.
        raise ValueError("output must be 0 (per-timestep outputs) or 1 (final states), got %r" % (output,))

    with tf.variable_scope(scope, reuse = reuse):
        # Read the static shape up front: the output == 1 branch needs it even
        # when the caller supplies its own cells.
        shapes = inputs.get_shape().as_list()
        if len(shapes) > 3:
            # Merge the two leading dimensions so the rnn sees a rank-3 input.
            inputs = tf.reshape(inputs, (shapes[0]*shapes[1], shapes[2], -1))
            inputs_len = tf.reshape(inputs_len, (shapes[0]*shapes[1],))

        if cell is not None:
            (cell_fw, cell_bw) = cell
        else:
            # if no cells are provided, use standard GRU cell implementation
            input_dim = inputs.shape[-1]

            def build_cell(input_size):
                # One cell, wrapped with zoneout/dropout as configured in Params.
                return apply_dropout(cell_fn(units), size = input_size, is_training = is_training)

            if layers > 1:
                # The first layer sees the input dimension, deeper layers see `units`.
                cell_fw = tf.contrib.rnn.MultiRNNCell([build_cell(input_dim if i == 0 else units)
                                                       for i in range(layers)])
                cell_bw = tf.contrib.rnn.MultiRNNCell([build_cell(input_dim if i == 0 else units)
                                                       for i in range(layers)])
            else:
                cell_fw = build_cell(input_dim)
                cell_bw = build_cell(input_dim)

        outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs,
                                                         sequence_length = inputs_len,
                                                         dtype = tf.float32)

        if output == 0:
            return tf.concat(outputs, 2)
        # output == 1
        return tf.reshape(tf.concat(states, 1), (Params.batch_size, shapes[1], 2*units))
        

def pointer_net(passage, passage_len, question, question_len, cell, params, scope = "pointer_network"):
    