"""
Goal:
- Takes in batches of (document, question, answer) tuples,
runs bidirectional rnn, finds attention weights, and calculates loss
Architecture Overview:
- Bidirectional LSTM/GRU on documents and questions (concatenate depth-wise)
- Take last outputs of questions (from each direction) as query vector
- Use bilinear weight to calculate similarity metric/attention weight for
each word in the document using the query vector
- Take weighted sum of word vectors and use that to make prediction
Issues:
- Better to pass mask itself instead of repeatedly creating masks with seq_lens?
- Make softmax numerically stable
- Gradient Clipping in GRU
Credits: Attentive Reader model developed by https://arxiv.org/pdf/1506.03340.pdf
and Stanford Reader model developed by https://arxiv.org/pdf/1606.02858v2.pdf
"""
import tensorflow as tf
import numpy as np
from rnn_cell import GRUCell
from rnn import bidirectional_rnn, rnn
from attention import BilinearFunction
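
# Shape walkthrough of the graph built in StanfordReader.__init__ below
# (orientation comment; the shapes follow directly from the construction code):
#   input_d (batch, doc_len), input_q (batch, q_len): int32 word ids, padded
#       with a negative id so the seq_lens_* counts cover only real tokens
#   word embeddings via tf.gather:      (batch, len, embedding_dim)
#   bidirectional GRU hidden states:    (batch, len, 2 * hidden_size)
#   query vector (last_state_q):        (batch, 2 * hidden_size)
#   bilinear attention over document:   alpha_weights (batch, doc_len),
#                                       attend_result (batch, 2 * hidden_size)
#   entity scores:                      (batch, max_entities), normalized with
#                                       a masked, numerically stable softmax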


class StanfordReader(object):
    """
    Purpose:
    Instances of this class run the whole StanfordReader model.
    """
    def __init__(self, max_entities, hidden_size=128, vocab_size=50000, embedding_dim=100, batch_size=32):
        self.max_entities = max_entities
        tf.set_random_seed(1234)

        # Placeholders
        # (could add assert statements to ensure the shared None dimensions are equal, i.e. batch_size)
        self.input_d = tf.placeholder(tf.int32, [None, None], name="input_d")
        self.input_q = tf.placeholder(tf.int32, [None, None], name="input_q")
        self.input_a = tf.placeholder(tf.int32, [None, ], name="input_a")
        self.input_m = tf.placeholder(tf.int32, [None, ], name="input_m")

        # Sequence lengths: padding ids are assumed to be negative, so count ids >= 0
        seq_lens_d = tf.reduce_sum(tf.cast(self.input_d >= 0, tf.int32), 1)
        seq_lens_q = tf.reduce_sum(tf.cast(self.input_q >= 0, tf.int32), 1)

        mask_d = tf.cast(tf.sequence_mask(seq_lens_d), tf.int32)
        mask_q = tf.cast(tf.sequence_mask(seq_lens_q), tf.int32)
        mask_m = tf.cast(tf.sequence_mask(self.input_m, maxlen=max_entities), dtype=tf.float32)

        # Masked document and query word ids; one-hot-encoded answers
        masked_d = tf.mul(self.input_d, mask_d)
        masked_q = tf.mul(self.input_q, mask_q)
        one_hot_a = tf.one_hot(self.input_a, self.max_entities)

        # Building Graph (Network Layers)
        # ==================================================
        with tf.device('/cpu:0'), tf.variable_scope("embedding"):
            W_embeddings = tf.get_variable(shape=[vocab_size, embedding_dim],
                initializer=tf.random_uniform_initializer(-0.01, 0.01),
                name="W_embeddings")
            ################## Make option to use pre-trained embeddings ##################

            # Dimensions: batch x max_length x embedding_dim
            document_embedding = tf.gather(W_embeddings, masked_d)
            question_embedding = tf.gather(W_embeddings, masked_q)
with tf.variable_scope("bidirection_rnn"):
mask_d = tf.cast(tf.sequence_mask(seq_lens_d), tf.float32)
mask_q = tf.cast(tf.sequence_mask(seq_lens_q), tf.float32)
# Bidirectional RNNs for Document and Question
forward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU-Forward-D")
backward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU-Backward-D")
forward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU-Forward-Q")
backward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU-Backward-Q")
hidden_states_d, last_state_d = bidirectional_rnn(forward_cell_d, backward_cell_d, \
document_embedding, mask_d, concatenate=True)
hidden_states_q, last_state_q = bidirectional_rnn(forward_cell_q, backward_cell_q, \
question_embedding, mask_q, concatenate=True)
with tf.variable_scope("attention"):
# Attention Layer
attention = BilinearFunction(attending_size=hidden_size*2, attended_size=hidden_size*2)
self.alpha_weights, self.attend_result = attention(attending=last_state_q, attended=hidden_states_d, \
time_mask=mask_d)
with tf.variable_scope("prediction"):
W_predict = tf.get_variable(name="predict_weight", shape=[hidden_size*2, self.max_entities], \
initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
b_predict = tf.get_variable(name="predict_bias", shape=[self.max_entities],
initializer=tf.constant_initializer(0.0))
# Dimensions (batch_size x max_entities)
predict_probs = (tf.matmul(self.attend_result, W_predict) + b_predict) * mask_m
# Custom Softmax b/c need to use time_mask --------------------
# Also numerical stability:
# e_x = exp(x - x.max(axis=1))
# out = e_x / e_x.sum(axis=1)
numerator = tf.exp(tf.sub(predict_probs, tf.expand_dims(tf.reduce_max(predict_probs, 1), -1))) * mask_m
denom = tf.reduce_sum(numerator, 1)
# Transpose so broadcasting scalar division works properly
# Dimensions (batch x max_entities)
predict_probs_normalized = tf.div(numerator, tf.expand_dims(denom, 1))
likelihoods = tf.reduce_sum(tf.mul(predict_probs_normalized, one_hot_a), 1)
log_likelihoods = tf.log(likelihoods+0.00000000000000000001)
# Negative log-likelihood loss
self.loss = tf.mul(tf.reduce_sum(log_likelihoods), -1)/tf.cast(tf.shape(self.input_d)[0], tf.float32)
correct_vector = tf.cast(tf.equal(tf.argmax(one_hot_a, 1), tf.argmax(predict_probs_normalized, 1)), \
tf.float32, name="correct_vector")
self.accuracy = tf.reduce_mean(correct_vector)

    def get_mask_shape(self):
        print(self.mask_d.get_shape(), self.mask_q.get_shape())
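

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original model code). It assumes the
# legacy TensorFlow API used above (tf.mul/tf.sub, roughly TF 0.12), that the
# local rnn_cell / rnn / attention modules are importable and behave as the
# graph code expects, and that documents and questions are padded with -1 so
# the seq_lens_* counts (ids >= 0) cover only real tokens. The optimizer
# choice and the toy data below are purely illustrative.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    max_entities = 5
    reader = StanfordReader(max_entities=max_entities, hidden_size=64,
                            vocab_size=1000, embedding_dim=50)

    # One gradient step on the reader's loss; Adam is an arbitrary choice here.
    optimizer = tf.train.AdamOptimizer(0.001)
    train_op = optimizer.minimize(reader.loss)

    # Toy batch of 2 examples: documents of length 6, questions of length 4,
    # padded with -1; answers are gold entity indices in [0, max_entities).
    docs = np.array([[4, 8, 15, 16, 23, 42],
                     [7, 1, 3, -1, -1, -1]], dtype=np.int32)
    questions = np.array([[9, 2, 5, 11],
                          [6, 4, -1, -1]], dtype=np.int32)
    answers = np.array([2, 0], dtype=np.int32)          # gold entity index per example
    num_entities = np.array([4, 3], dtype=np.int32)     # candidate entities per example

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _, loss, acc = sess.run(
            [train_op, reader.loss, reader.accuracy],
            feed_dict={reader.input_d: docs,
                       reader.input_q: questions,
                       reader.input_a: answers,
                       reader.input_m: num_entities})
        print("loss: %.4f  accuracy: %.2f" % (loss, acc))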