model.py
"""
Define the recurrent entity network model.
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

from functools import partial

import tensorflow as tf

from entity_networks.dynamic_memory_cell import DynamicMemoryCell
from entity_networks.model_ops import cyclic_learning_rate, \
                                      get_sequence_length, \
                                      count_parameters, \
                                      prelu

OPTIMIZER_SUMMARIES = [
    "learning_rate",
    "loss",
    "gradients",
    "gradient_norm",
]
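
# These names are forwarded to the `summaries` argument of
# tf.contrib.layers.optimize_loss in get_train_op below, selecting which
# optimizer statistics are written to TensorBoard.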

def get_input_encoding(inputs, initializer=None, scope=None):
    """
    Implementation of the learned multiplicative mask from Section 2.1, Equation 1.
    This module is also described in [End-To-End Memory Networks](https://arxiv.org/abs/1503.08895)
    as Position Encoding (PE). The mask allows the ordering of words in a sentence to affect the
    encoding.
    """
    with tf.variable_scope(scope, 'Encoding', initializer=initializer):
        _, _, max_sentence_length, embedding_size = inputs.get_shape().as_list()
        positional_mask = tf.get_variable(
            name='positional_mask',
            shape=[max_sentence_length, embedding_size])
        encoded_input = tf.reduce_sum(inputs * positional_mask, axis=2)
        return encoded_input
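
# Shape sketch for get_input_encoding (assuming the usual bAbI-style pipeline of
# padded stories; for the query the sentence axis has size 1):
#   inputs:          [batch_size, num_sentences, max_sentence_length, embedding_size]
#   positional_mask: [max_sentence_length, embedding_size], broadcast over the
#                    batch and sentence axes
#   encoded_input:   [batch_size, num_sentences, embedding_size], the masked word
#                    embeddings summed over axis=2 (the word positions)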

def get_output_module(
        last_state,
        encoded_query,
        num_blocks,
        vocab_size,
        activation=tf.nn.relu,
        initializer=None,
        scope=None):
    """
    Implementation of Section 2.3, Equation 6. This module is also described in more detail here:
    [End-To-End Memory Networks](https://arxiv.org/abs/1503.08895).
    """
    with tf.variable_scope(scope, 'Output', initializer=initializer):
        last_state = tf.stack(tf.split(last_state, num_blocks, axis=1), axis=1)
        _, _, embedding_size = last_state.get_shape().as_list()

        # Use the encoded_query to attend over memories
        # (hidden states of dynamic last_state cell blocks)
        attention = tf.reduce_sum(last_state * encoded_query, axis=2)

        # Subtract max for numerical stability (softmax is shift invariant)
        attention_max = tf.reduce_max(attention, axis=-1, keep_dims=True)
        attention = tf.nn.softmax(attention - attention_max)
        attention = tf.expand_dims(attention, axis=2)

        # Weight memories by attention vectors
        u = tf.reduce_sum(last_state * attention, axis=1)

        # R acts as the decoder matrix to convert from internal state to the output vocabulary size
        R = tf.get_variable('R', [embedding_size, vocab_size])
        H = tf.get_variable('H', [embedding_size, embedding_size])

        q = tf.squeeze(encoded_query, axis=1)
        y = tf.matmul(activation(q + tf.matmul(u, H)), R)
        return y
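
# Correspondence with Section 2.3 of the paper, using its notation (h_j is the
# j-th memory block, q the encoded query, phi the `activation` argument):
#   p_j = softmax(q^T h_j)   -> `attention`
#   u   = sum_j p_j h_j      -> `u`
#   y   = R phi(q + H u)     -> `y` (Equation 6)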

def get_outputs(inputs, params):
    "Return the outputs from the model which will be used in the loss function."
    embedding_size = params['embedding_size']
    num_blocks = params['num_blocks']
    vocab_size = params['vocab_size']

    story = inputs['story']
    query = inputs['query']

    batch_size = tf.shape(story)[0]

    normal_initializer = tf.random_normal_initializer(stddev=0.1)
    ones_initializer = tf.constant_initializer(1.0)

    # Extend the vocab to include keys for the dynamic memory cell,
    # allowing the initialization of the memory to be learned.
    vocab_size = vocab_size + num_blocks

    with tf.variable_scope('EntityNetwork', initializer=normal_initializer):
        # PReLU activations have their alpha parameters initialized to 1
        # so they may be identity before training.
        alpha = tf.get_variable(
            name='alpha',
            shape=embedding_size,
            initializer=ones_initializer)
        activation = partial(prelu, alpha=alpha)

        # Embeddings
        embedding_params = tf.get_variable(
            name='embedding_params',
            shape=[vocab_size, embedding_size])

        # The embedding mask forces the special "pad" embedding to zeros.
        embedding_mask = tf.constant(
            value=[0 if i == 0 else 1 for i in range(vocab_size)],
            shape=[vocab_size, 1],
            dtype=tf.float32)
        embedding_params_masked = embedding_params * embedding_mask

        story_embedding = tf.nn.embedding_lookup(embedding_params_masked, story)
        query_embedding = tf.nn.embedding_lookup(embedding_params_masked, query)

        # Input Module
        encoded_story = get_input_encoding(
            inputs=story_embedding,
            initializer=ones_initializer,
            scope='StoryEncoding')
        encoded_query = get_input_encoding(
            inputs=query_embedding,
            initializer=ones_initializer,
            scope='QueryEncoding')

        # Memory Module
        # We define the keys outside of the cell so they may be used for memory initialization.
        # Keys are initialized to a range outside of the main vocab.
        keys = [key for key in range(vocab_size - num_blocks, vocab_size)]
        keys = tf.nn.embedding_lookup(embedding_params_masked, keys)
        keys = tf.split(keys, num_blocks, axis=0)
        keys = [tf.squeeze(key, axis=0) for key in keys]

        cell = DynamicMemoryCell(
            num_blocks=num_blocks,
            num_units_per_block=embedding_size,
            keys=keys,
            initializer=normal_initializer,
            recurrent_initializer=normal_initializer,
            activation=activation)

        # Recurrence
        initial_state = cell.zero_state(batch_size, tf.float32)
        sequence_length = get_sequence_length(encoded_story)
        _, last_state = tf.nn.dynamic_rnn(
            cell=cell,
            inputs=encoded_story,
            sequence_length=sequence_length,
            initial_state=initial_state)

        # Output Module
        outputs = get_output_module(
            last_state=last_state,
            encoded_query=encoded_query,
            num_blocks=num_blocks,
            vocab_size=vocab_size,
            initializer=normal_initializer,
            activation=activation)

        parameters = count_parameters()
        print('Parameters: {}'.format(parameters))

        return outputs

def get_predictions(outputs):
    "Return the actual predictions for use with evaluation metrics or TF Serving."
    predictions = tf.argmax(outputs, axis=-1)
    return predictions

def get_loss(outputs, labels, mode):
    "Return the loss function which will be used with an optimizer."
    loss = None
    if mode == tf.contrib.learn.ModeKeys.INFER:
        return loss

    loss = tf.losses.sparse_softmax_cross_entropy(
        logits=outputs,
        labels=labels)
    return loss

def get_train_op(loss, params, mode):
    "Return the training operation which will be used to train the model."
    train_op = None
    if mode != tf.contrib.learn.ModeKeys.TRAIN:
        return train_op

    global_step = tf.contrib.framework.get_or_create_global_step()

    learning_rate = cyclic_learning_rate(
        learning_rate_min=params['learning_rate_min'],
        learning_rate_max=params['learning_rate_max'],
        step_size=params['learning_rate_step_size'],
        mode='triangular',
        global_step=global_step)
    tf.summary.scalar('learning_rate', learning_rate)

    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=global_step,
        learning_rate=learning_rate,
        optimizer='Adam',
        clip_gradients=params['clip_gradients'],
        gradient_noise_scale=params['gradient_noise_scale'],
        summaries=OPTIMIZER_SUMMARIES)

    return train_op
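
# Note on the schedule above: `cyclic_learning_rate` with mode='triangular' is
# presumably the triangular policy from Smith, "Cyclical Learning Rates for
# Training Neural Networks" (https://arxiv.org/abs/1506.01186): the learning rate
# ramps linearly from learning_rate_min up to learning_rate_max and back down
# over 2 * step_size global steps, repeating for the rest of training. The exact
# implementation lives in entity_networks.model_ops.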

def model_fn(features, labels, mode, params):
    "Return ModelFnOps for use with Estimator."
    outputs = get_outputs(features, params)
    predictions = get_predictions(outputs)
    loss = get_loss(outputs, labels, mode)
    train_op = get_train_op(loss, params, mode)

    return tf.contrib.learn.ModelFnOps(
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        mode=mode)
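
# Minimal usage sketch, assuming only the hyperparameter names consumed above;
# the values are illustrative placeholders (not the settings of the accompanying
# training script), and `train_input_fn` stands in for a bAbI input function:
#
#   params = {
#       'embedding_size': 100,
#       'num_blocks': 20,
#       'vocab_size': 200,
#       'learning_rate_min': 2e-4,
#       'learning_rate_max': 1e-2,
#       'learning_rate_step_size': 1000,
#       'clip_gradients': 40.0,
#       'gradient_noise_scale': 0.005,
#   }
#   estimator = tf.contrib.learn.Estimator(model_fn=model_fn, params=params)
#   estimator.fit(input_fn=train_input_fn, steps=10000)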