# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Cudnn RNN operators."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from tensorflow.contrib.checkpoint.python import split_dependency
from tensorflow.contrib.rnn.python.ops import lstm_ops
from tensorflow.python.compat import compat
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import random_seed
from tensorflow.python.keras.engine import base_layer
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_cudnn_rnn_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.training import saver
from tensorflow.python.training.tracking import tracking as trackable_lib
CUDNN_RNN_UNIDIRECTION = "unidirectional"
CUDNN_RNN_BIDIRECTION = "bidirectional"
CUDNN_LSTM = "lstm"
CUDNN_GRU = "gru"
CUDNN_RNN_RELU = "rnn_relu"
CUDNN_RNN_TANH = "rnn_tanh"
# Half for cell input, half for hidden states.
CUDNN_LSTM_PARAMS_PER_LAYER = 8
CUDNN_GRU_PARAMS_PER_LAYER = 6
CUDNN_RNN_TANH_PARAMS_PER_LAYER = 2
CUDNN_RNN_RELU_PARAMS_PER_LAYER = 2
CUDNN_INPUT_LINEAR_MODE = "linear_input"
CUDNN_INPUT_SKIP_MODE = "skip_input"
CUDNN_INPUT_AUTO_MODE = "auto_select"
# pylint:disable=protected-access
_BIAS_VARIABLE_NAME = rnn_cell_impl._BIAS_VARIABLE_NAME
_WEIGHTS_VARIABLE_NAME = rnn_cell_impl._WEIGHTS_VARIABLE_NAME
# pylint:enable=protected-access
class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell):
"""Cudnn Compatible LSTMCell.
A simple wrapper around `tf.contrib.rnn.LSTMBlockCell` to use along with
`tf.contrib.cudnn_rnn.CudnnLSTM`. The latter's params can be used by
this cell seamlessly.
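A minimal usage sketch (illustrative only; `inputs` and the surrounding
graph are assumed to exist):
```python
cell = CudnnCompatibleLSTMCell(num_units=128)
outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
# The variables created here ("kernel"/"bias") line up with the canonical
# tensors written by CudnnOpaqueParamsSaveable, so a CudnnLSTM checkpoint
# can be restored into this cell.
```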
"""
def __init__(self, num_units, reuse=None):
super(CudnnCompatibleLSTMCell, self).__init__(
num_units,
forget_bias=0,
cell_clip=None,
use_peephole=False,
reuse=reuse,
name="cudnn_compatible_lstm_cell")
self._names.update({"scope": "cudnn_compatible_lstm_cell"})
class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
r"""Cudnn Compatible GRUCell.
A GRU impl akin to `tf.compat.v1.nn.rnn_cell.GRUCell` to use along with
`tf.contrib.cudnn_rnn.CudnnGRU`. The latter's params can be used by
it seamlessly.
It differs from platform-independent GRUs in how the new memory gate is
calculated. Nvidia picks this variant based on the GRU authors'[1]
suggestion and the fact that it has no accuracy impact[2].
[1] https://arxiv.org/abs/1406.1078
[2] http://svail.github.io/diff_graphs/
Cudnn compatible GRU (from Cudnn library user guide):
```python
# reset gate
$$r_t = \sigma(x_t * W_r + h_{t-1} * R_r + b_{Wr} + b_{Rr})$$
# update gate
$$u_t = \sigma(x_t * W_u + h_{t-1} * R_u + b_{Wu} + b_{Ru})$$
# new memory gate
$$h'_t = \tanh(x_t * W_h + r_t .* (h_{t-1} * R_h + b_{Rh}) + b_{Wh})$$
$$h_t = (1 - u_t) .* h'_t + u_t .* h_{t-1}$$
```
Other GRU (see `tf.compat.v1.nn.rnn_cell.GRUCell` and
`tf.contrib.rnn.GRUBlockCell`):
```python
# new memory gate
\\(h'_t = \tanh(x_t * W_h + (r_t .* h_{t-1}) * R_h + b_{Wh})\\)
```
which is not equivalent to the Cudnn GRU: in addition to the extra bias term
\\(b_{Rh}\\),
```python
\\(r .* (h * R) != (r .* h) * R\\)
```
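A tiny numeric check of the inequality (illustrative, using numpy):
```python
import numpy as np
r = np.array([[0.5, 1.0]])              # reset gate values
h = np.array([[1.0, 2.0]])              # previous hidden state
R = np.array([[1.0, 2.0], [3.0, 4.0]])  # recurrent kernel
print(r * (h @ R))   # [[3.5, 10.0]]  (Cudnn-style)
print((r * h) @ R)   # [[6.5,  9.0]]  (GRUCell-style)
```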
"""
def __init__(self, num_units, reuse=None, kernel_initializer=None):
super(CudnnCompatibleGRUCell, self).__init__(
num_units,
activation=None,
reuse=reuse,
kernel_initializer=kernel_initializer)
def build(self, inputs_shape):
if inputs_shape[1].value is None:
raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
inputs_shape)
input_depth = inputs_shape[1].value
self._gate_kernel = self.add_variable(
"gates/%s" % _WEIGHTS_VARIABLE_NAME,
shape=[input_depth + self._num_units, 2 * self._num_units],
initializer=self._kernel_initializer)
self._gate_bias = self.add_variable(
"gates/%s" % _BIAS_VARIABLE_NAME,
shape=[2 * self._num_units],
initializer=(self._bias_initializer
if self._bias_initializer is not None else
init_ops.constant_initializer(1.0, dtype=self.dtype)))
self._candidate_input_kernel = self.add_variable(
"candidate/input_projection/%s" % _WEIGHTS_VARIABLE_NAME,
shape=[input_depth, self._num_units],
initializer=self._kernel_initializer)
self._candidate_hidden_kernel = self.add_variable(
"candidate/hidden_projection/%s" % _WEIGHTS_VARIABLE_NAME,
shape=[self._num_units, self._num_units],
initializer=self._kernel_initializer)
self._candidate_input_bias = self.add_variable(
"candidate/input_projection/%s" % _BIAS_VARIABLE_NAME,
shape=[self._num_units],
initializer=(self._bias_initializer
if self._bias_initializer is not None else
init_ops.zeros_initializer(dtype=self.dtype)))
self._candidate_hidden_bias = self.add_variable(
"candidate/hidden_projection/%s" % _BIAS_VARIABLE_NAME,
shape=[self._num_units],
initializer=(self._bias_initializer
if self._bias_initializer is not None else
init_ops.zeros_initializer(dtype=self.dtype)))
def call(self, inputs, state):
"""Gated recurrent unit (GRU) with nunits cells."""
gate_inputs = math_ops.matmul(
array_ops.concat([inputs, state], 1), self._gate_kernel)
gate_inputs = nn_ops.bias_add(gate_inputs, self._gate_bias)
value = math_ops.sigmoid(gate_inputs)
r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
candidate = nn_ops.bias_add(
math_ops.matmul(inputs, self._candidate_input_kernel),
self._candidate_input_bias)
candidate += r * nn_ops.bias_add(
math_ops.matmul(state, self._candidate_hidden_kernel),
self._candidate_hidden_bias)
candidate = self._activation(candidate)
new_h = (1 - u) * candidate + u * state
return new_h, new_h
class CudnnParamsFormatConverter(object):
"""Abstract class that converts between params of Cudnn Rnn and TF Rnn."""
def __init__(self,
num_layers,
num_units,
input_size,
num_proj=None,
input_mode=CUDNN_INPUT_LINEAR_MODE,
direction=CUDNN_RNN_UNIDIRECTION):
"""Constructor.
Args:
num_layers: the number of layers for the RNN model.
num_units: the number of units within the RNN model.
input_size: the size of the input; it could be different from num_units.
num_proj: the output dimensionality for the projection matrices. If None
or 0, no projection is performed.
input_mode: indicates whether there is a linear projection between the
input and the actual computation before the first layer. It can be one
of 'linear_input', 'skip_input' or 'auto_select'. 'linear_input'
(default) always applies a linear projection of the input onto the RNN
hidden state (standard RNN behavior). 'skip_input' is only allowed when
input_size == num_units; 'auto_select' implies 'skip_input' when
input_size == num_units and otherwise implies 'linear_input'.
direction: the direction in which the model operates. It can be either
'unidirectional' or 'bidirectional'.
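A round-trip sketch using the LSTM subclass defined below (illustrative;
`opaque_params` is assumed to be an existing opaque buffer):
```python
converter = CudnnParamsFormatConverterLSTM(
    num_layers=1, num_units=128, input_size=64)
weights, biases = converter.opaque_to_tf_canonical(opaque_params)
opaque_again = converter.tf_canonical_to_opaque(weights + biases)
```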
"""
self._num_layers = num_layers
self._input_size = input_size
self._num_units = num_units
self._input_mode = input_mode
self._num_proj = num_proj
self._direction = direction
self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2
self._num_params = (
self._num_params_per_layer * self._num_layers * self._num_dirs)
def tf_canonical_to_opaque(self, tf_canonicals, weights_proj=None):
r"""Converts tf canonical weights to cudnn opaque param."""
cu_weights, cu_biases = self._tf_canonical_to_cu_canonical(tf_canonicals,
weights_proj)
cu_weights = [array_ops.reshape(w, [-1]) for w in cu_weights]
opaque_params = self._cu_canonical_to_opaque(cu_weights, cu_biases)
return opaque_params
def opaque_to_tf_canonical(self, opaque_param):
r"""Converts cudnn opaque param to tf canonical weights."""
cu_weights, cu_biases = self._opaque_to_cu_canonical(opaque_param)
if self._num_proj:
weights, biases, weights_proj = self._cu_canonical_to_tf_canonical(
cu_weights, cu_biases)
return weights, biases, weights_proj
else:
weights, biases = self._cu_canonical_to_tf_canonical(
cu_weights, cu_biases)
return weights, biases
def _opaque_to_cu_canonical(self, opaque_param):
"""Converts opaque params to Cudnn canonical format.
Args:
opaque_param: An opaque tensor storing cudnn rnn params (weights and
biases).
Returns:
Two lists, holding the weights and biases respectively.
"""
with ops.device("/gpu:0"):
if compat.forward_compatible(2019, 6, 26) and self._num_proj:
num_params_weights = (self._num_params +
1 * self._num_layers * self._num_dirs)
num_params_biases = self._num_params
weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical_v2(
num_layers=self._num_layers,
num_units=self._num_units,
input_size=self._input_size,
params=opaque_param,
rnn_mode=self._rnn_mode,
input_mode=self._input_mode,
direction=self._direction,
num_params_weights=num_params_weights,
num_params_biases=num_params_biases,
num_proj=self._num_proj)
else:
weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical(
num_layers=self._num_layers,
num_units=self._num_units,
input_size=self._input_size,
params=opaque_param,
num_params=self._num_params,
rnn_mode=self._rnn_mode,
input_mode=self._input_mode,
direction=self._direction)
return (weights, biases)
def _cu_canonical_to_opaque(self, cu_weights, cu_biases):
"""Converts from Cudnn canonical format to opaque params.
Args:
cu_weights: a list of tensors, Cudnn canonical weights.
cu_biases: a list of tensors, Cudnn canonical biases.
Returns:
a single opaque tensor.
"""
with ops.device("/gpu:0"):
if compat.forward_compatible(2019, 6, 26) and self._num_proj:
return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params_v2(
num_layers=self._num_layers,
num_units=self._num_units,
input_size=self._input_size,
weights=cu_weights,
biases=cu_biases,
rnn_mode=self._rnn_mode,
input_mode=self._input_mode,
num_proj=self._num_proj,
direction=self._direction)
else:
return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params(
num_layers=self._num_layers,
num_units=self._num_units,
input_size=self._input_size,
weights=cu_weights,
biases=cu_biases,
rnn_mode=self._rnn_mode,
input_mode=self._input_mode,
direction=self._direction)
def _cu_canonical_to_tf_canonical(self, cu_weights, cu_biases):
r"""Transform from Cudnn canonical to tf canonical.
The elements of argument lists are laid out in the following format:
------------------------------------------------------------
| weights                    |  biases                     |
------------------------------------------------------------
 \                             \
  \                             \
   -------------------------------
   | layer1     |layer2     |... |
   -------------------------------
   \             \
    ---------------
    |fwd   |bak   |
    ---------------
Args:
cu_weights: a list of tensors of Cudnn canonical weights.
cu_biases: a list of tensors of Cudnn canonical biases.
Returns:
A tuple of tf canonical weights and biases (plus projection weights,
when `num_proj` is set).
"""
tf_weights, tf_biases = [], []
tf_weights_proj = []
layer_weights_num = self._num_params_per_layer * self._num_dirs
layer_biases_num = layer_weights_num
layer_weights_num += (1 * self._num_dirs) if self._num_proj else 0
for i in range(self._num_layers):
layer_weights = cu_weights[i * layer_weights_num:(i + 1) *
layer_weights_num]
layer_biases = cu_biases[i * layer_biases_num:(i + 1) * layer_biases_num]
if self._direction == CUDNN_RNN_UNIDIRECTION:
self._cu_canonical_to_tf_canonical_single_layer(layer_weights,
layer_biases,
tf_weights, tf_biases,
tf_weights_proj)
else:
fw_weights = layer_weights[:len(layer_weights) // 2]
bw_weights = layer_weights[len(layer_weights) // 2:]
fw_biases = layer_biases[:len(layer_biases) // 2]
bw_biases = layer_biases[len(layer_biases) // 2:]
self._cu_canonical_to_tf_canonical_single_layer(
fw_weights,
fw_biases,
tf_weights,
tf_biases,
tf_weights_proj,
)
self._cu_canonical_to_tf_canonical_single_layer(
bw_weights,
bw_biases,
tf_weights,
tf_biases,
tf_weights_proj,
)
if self._num_proj:
return (tf_weights, tf_biases, tf_weights_proj)
else:
return (tf_weights, tf_biases)
def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
tf_weights, tf_biases,
tf_weights_proj=None):
r"""Transform single layer Cudnn canonicals to tf canonicals.
The elements of cu_weights, cu_biases are laid out in the following format:
-------------------------------------------------------------------------
| gate0 param on inputs | gate0 param on hidden state | gate1 ..........|
-------------------------------------------------------------------------
Args:
cu_weights: a list of tensors, single layer weights.
cu_biases: a list of tensors, single layer biases.
tf_weights: a list where transformed weights are stored.
tf_biases: a list where transformed biases are stored.
tf_weights_proj: an optional list where transformed projection weights
are stored.
"""
raise NotImplementedError("Abstract method")
def _tf_canonical_to_cu_canonical(self, tf_canonicals, weights_proj):
r"""Transform from tf canonical to Cudnn canonical.
This is the reverse routine of _TransformCanonical().
Args:
tf_canonicals: a list of tensors of tf canonical params. The elements are
laid out in the following format:
------------------------------------------------------------
| weights                    |  biases                     |
------------------------------------------------------------
 \                             \
  \                             \
   -------------------------------
   | layer1     |layer2     |... |
   -------------------------------
   \             \
    ---------------
    |fwd   |bak   |
    ---------------
weights_proj: (optional) weights matrices for projection
Returns:
2 lists: the recovered cudnn canonical weights and biases.
"""
weights = tf_canonicals[:len(tf_canonicals) // 2]
biases = tf_canonicals[len(tf_canonicals) // 2:]
cu_weights, cu_biases = [], []
layer_weights_num = len(weights) // self._num_layers
layer_biases_num = len(biases) // self._num_layers
for i in range(self._num_layers):
layer_weights = weights[i * layer_weights_num:(i + 1) * layer_weights_num]
layer_biases = biases[i * layer_biases_num:(i + 1) * layer_biases_num]
if self._direction == CUDNN_RNN_UNIDIRECTION:
cu_weights.extend(self._tf_to_cudnn_weights(i, *layer_weights))
if weights_proj is not None:
pw = array_ops.transpose(weights_proj[i])
cu_weights.append(pw)
cu_biases.extend(self._tf_to_cudnn_biases(*layer_biases))
else:
mid_w = len(layer_weights) // 2
fw_weights, bw_weights = layer_weights[:mid_w], layer_weights[mid_w:]
mid_b = len(layer_biases) // 2
fw_biases, bw_biases = layer_biases[:mid_b], layer_biases[mid_b:]
cu_weights.extend(self._tf_to_cudnn_weights(i, *fw_weights))
if weights_proj is not None:
pw0 = array_ops.transpose(weights_proj[2*i+0])
cu_weights.append(pw0)
cu_biases.extend(self._tf_to_cudnn_biases(*fw_biases))
cu_weights.extend(self._tf_to_cudnn_weights(i, *bw_weights))
if weights_proj is not None:
pw1 = array_ops.transpose(weights_proj[2*i+1])
cu_weights.append(pw1)
cu_biases.extend(self._tf_to_cudnn_biases(*bw_biases))
return cu_weights, cu_biases
def _cudnn_to_tf_weights(self, *cu_weights):
r"""Stitches cudnn canonical weights to generate tf canonical weights."""
raise NotImplementedError("Abstract method")
def _tf_to_cudnn_weights(self, layer, *tf_weights):
r"""Reverses the operations in StitchWeights()."""
raise NotImplementedError("Abstract method")
def _cudnn_to_tf_biases(self, *biases):
r"""Stitches cudnn canonical biases to generate tf canonical biases."""
raise NotImplementedError("Abstract method")
def _tf_to_cudnn_biases(self, *tf_biases):
r"""Reverses the operations in StitchBiases()."""
raise NotImplementedError("Abstract method")
class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter):
"""Helper class that converts between params of Cudnn and TF LSTM."""
_rnn_mode = CUDNN_LSTM
_num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
def _cudnn_to_tf_gate_params(self, *cu_gate_order):
i_g, f_g, c_g, o_g = cu_gate_order
return [i_g, c_g, f_g, o_g]
def _tf_to_cudnn_gate_params(self, *tf_gate_order):
i_g, c_g, f_g, o_g = tf_gate_order
return [i_g, f_g, c_g, o_g]
def _cudnn_to_tf_weights(self, *cu_weights):
r"""Stitching cudnn canonical weights to generate tf canonical weights."""
if self._num_proj:
w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o, pw = cu_weights
else:
w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o = cu_weights
# pylint: disable=invalid-name
W_i = array_ops.concat([w_i, r_i], axis=1)
W_f = array_ops.concat([w_f, r_f], axis=1)
W_c = array_ops.concat([w_c, r_c], axis=1)
W_o = array_ops.concat([w_o, r_o], axis=1)
# pylint: enable=invalid-name
# Cudnn LSTM weights are in ifco order, other tf LSTMs are in icfo order.
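# E.g. cudnn [W_i, W_f, W_c, W_o] is reordered to tf [W_i, W_c, W_f, W_o].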
reordered = self._cudnn_to_tf_gate_params(*[W_i, W_f, W_c, W_o])
if self._num_proj:
return (array_ops.transpose(array_ops.concat(reordered, axis=0)),
array_ops.transpose(pw))
else:
return (array_ops.transpose(array_ops.concat(reordered, axis=0)),)
def _tf_to_cudnn_weights(self, layer, *tf_weights):
r"""Reverse the operations in StitchWeights()."""
input_size = self._input_size
num_units = self._num_units
if layer == 0:
input_weight_width = input_size
else:
input_weight_width = self._num_proj if self._num_proj else num_units
if self._direction == CUDNN_RNN_BIDIRECTION:
input_weight_width *= 2
(tf_weight,) = tf_weights
w = array_ops.transpose(tf_weight)
# pylint: disable=invalid-name
W_i, W_f, W_c, W_o = self._tf_to_cudnn_gate_params(
*array_ops.split(w, 4, axis=0))
hidden_state_width = self._num_proj if self._num_proj else num_units
w_i, r_i = array_ops.split(W_i, [input_weight_width, hidden_state_width],
axis=1)
w_c, r_c = array_ops.split(W_c, [input_weight_width, hidden_state_width],
axis=1)
w_f, r_f = array_ops.split(W_f, [input_weight_width, hidden_state_width],
axis=1)
w_o, r_o = array_ops.split(W_o, [input_weight_width, hidden_state_width],
axis=1)
return w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o
# pylint: enable=invalid-name
def _cudnn_to_tf_biases(self, *cu_biases):
r"""Stitching cudnn canonical biases to generate tf canonical biases."""
b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro = cu_biases
# Save only the sum instead of individual biases. When recovering, return
# two biases each with half the value. Since RNN does not regularize by
# weight decay, it has no side effect in training or inference.
# pylint: disable=invalid-name
B_i = b_wi + b_ri
B_f = b_wf + b_rf
B_c = b_wc + b_rc
B_o = b_wo + b_ro
# pylint: enable=invalid-name
reordered = self._cudnn_to_tf_gate_params(*[B_i, B_f, B_c, B_o])
return (array_ops.concat(reordered, axis=0),)
def _tf_to_cudnn_biases(self, *tf_biases):
r"""Reverse the operations in StitchBiases()."""
(tf_bias,) = tf_biases
# pylint: disable=invalid-name
B_i, B_f, B_c, B_o = self._tf_to_cudnn_gate_params(
*array_ops.split(tf_bias, 4, axis=0))
# pylint: enable=invalid-name
# pylint: disable=unbalanced-tuple-unpacking
b_wi, b_ri = (B_i * 0.5,) * 2
b_wf, b_rf = (B_f * 0.5,) * 2
b_wc, b_rc = (B_c * 0.5,) * 2
b_wo, b_ro = (B_o * 0.5,) * 2
# pylint: enable=unbalanced-tuple-unpacking
# Return ifco order for Cudnn LSTM.
return b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro
def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
tf_weights, tf_biases,
tf_weights_proj=None):
if self._num_proj:
(w, pw) = self._cudnn_to_tf_weights(*cu_weights)
tf_weights.append(w)
tf_weights_proj.append(pw)
else:
(w,) = self._cudnn_to_tf_weights(*cu_weights)
tf_weights.append(w)
(b,) = self._cudnn_to_tf_biases(*cu_biases)
tf_biases.append(b)
class CudnnParamsFormatConverterGRU(CudnnParamsFormatConverter):
"""Helper class that converts between params of Cudnn and TF GRU."""
_rnn_mode = CUDNN_GRU
_num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
_rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__)
def _cudnn_to_tf_weights(self, *cu_weights):
r"""Stitching cudnn canonical weights to generate tf canonical weights."""
w_i, w_r, w_h, r_i, r_r, r_h = cu_weights
# pylint: disable=invalid-name
W_i = array_ops.concat([w_i, r_i], axis=1)
W_r = array_ops.concat([w_r, r_r], axis=1)
# pylint: enable=invalid-name
return (array_ops.transpose(array_ops.concat([W_i, W_r], axis=0)),
array_ops.transpose(w_h), array_ops.transpose(r_h))
def _tf_to_cudnn_weights(self, layer, *tf_weights):
r"""Reverse the operations in StitchWeights()."""
input_size = self._input_size
num_units = self._num_units
if layer == 0:
input_weight_width = input_size
else:
input_weight_width = num_units
if self._direction == CUDNN_RNN_BIDIRECTION:
input_weight_width *= 2
# pylint: disable=invalid-name
W_ir, w_h, r_h = tf_weights
W_ir = array_ops.transpose(W_ir)
w_h = array_ops.transpose(w_h)
r_h = array_ops.transpose(r_h)
W_i, W_r = array_ops.split(W_ir, 2, axis=0)
w_i, r_i = array_ops.split(W_i, [input_weight_width, num_units], axis=1)
w_r, r_r = array_ops.split(W_r, [input_weight_width, num_units], axis=1)
# pylint: enable=invalid-name
return w_i, w_r, w_h, r_i, r_r, r_h
def _cudnn_to_tf_biases(self, *biases):
r"""Stitching cudnn canonical biases to generate tf canonical biases."""
b_wi, b_wr, b_wh, b_ri, b_rr, b_rh = biases
return (
# Save only the sum instead of individual biases. When recovering,
# return two biases each with half the value. Since RNN does not
# regularize by weight decay, it has no side effect in training or
# inference.
array_ops.concat([b_wi, b_wr], axis=0) +
array_ops.concat([b_ri, b_rr], axis=0),
b_wh,
b_rh)
def _tf_to_cudnn_biases(self, *tf_biases):
r"""Reverse the operations in StitchBiases()."""
# b_ir is the summed bias of reset and update gate.
b_ir, b_wh, b_rh = tf_biases
bi, br = b_ir * 0.5, b_ir * 0.5
b_wi, b_wr = array_ops.split(bi, 2, axis=0)
b_ri, b_rr = array_ops.split(br, 2, axis=0)
return b_wi, b_wr, b_wh, b_ri, b_rr, b_rh
def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
tf_weights, tf_biases,
tf_weights_proj=None):
# pylint: disable=invalid-name
W_ir, w_h, r_h = self._cudnn_to_tf_weights(*cu_weights)
b_ir, b_wh, b_rh = self._cudnn_to_tf_biases(*cu_biases)
# pylint: enable=invalid-name
tf_weights.extend([W_ir, w_h, r_h])
tf_biases.extend([b_ir, b_wh, b_rh])
class CudnnParamsFormatConverterBasic(CudnnParamsFormatConverterLSTM):
"""Helper class that converts between params of Cudnn and TF Relu/Tanh RNN."""
def _cudnn_to_tf_weights(self, *cu_weights):
r"""Stitching cudnn canonical weights to generate tf canonical weights."""
w_i, w_h = cu_weights
W = array_ops.concat([w_i, w_h], axis=1) # pylint: disable=invalid-name
return (array_ops.transpose(W),)
def _tf_to_cudnn_weights(self, layer, *tf_weights):
r"""Reverse the operations in StitchWeights()."""
input_size = self._input_size
num_units = self._num_units
if layer == 0:
input_weight_width = input_size
else:
input_weight_width = num_units
if self._direction == CUDNN_RNN_BIDIRECTION:
input_weight_width *= 2
(tf_weight,) = tf_weights
# pylint: disable=invalid-name
W = array_ops.transpose(tf_weight)
w_i, w_h = array_ops.split(W, [input_weight_width, num_units], axis=1)
return w_i, w_h
# pylint: enable=invalid-name
def _cudnn_to_tf_biases(self, *cu_biases):
r"""Stitching cudnn canonical biases to generate tf canonical biases."""
# Save only the sum instead of individual biases. When recovering, return
# two biases each with half the value. Since RNN does not regularize by
# weight decay, it has no side effect in training or inference.
b_wi, b_wh = cu_biases
return (b_wi + b_wh,)
def _tf_to_cudnn_biases(self, *tf_biases):
r"""Reverse the operations in StitchBiases()."""
(tf_bias,) = tf_biases
b_i = tf_bias * 0.5
b_h = tf_bias * 0.5
return b_i, b_h
class CudnnParamsFormatConverterTanh(CudnnParamsFormatConverterBasic):
"""Helper class that converts between params of Cudnn and TF Tanh RNN."""
_rnn_mode = CUDNN_RNN_TANH
_num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER
class CudnnParamsFormatConverterRelu(CudnnParamsFormatConverterBasic):
"""Helper class that converts between params of Cudnn and TF Relu RNN."""
_rnn_mode = CUDNN_RNN_RELU
_num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER
# TODO(yaozhang): make sure we only save the canonical version of params and
# don't save the platform-specific version to avoid potential race
# conditions where params is updated by both versions when being restored.
# Currently, checkpointing will function properly, despite that we save both
# versions, because Saver restores customized savables after Variables.
# However, it is good to not rely on this restoring order of Saver and to
# avoid unnecessary storage. Add a test to check only the canonical version is
# saved.
class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
"""Abstract SaveableObject implementation handling Cudnn opaque params."""
def __init__(self,
opaque_params,
num_layers,
num_units,
input_size,
input_mode=CUDNN_INPUT_LINEAR_MODE,
direction=CUDNN_RNN_UNIDIRECTION,
scope=None,
name="cudnn_rnn_saveable"):
"""Creates a CudnnOpaqueParamsSaveable object.
CudnnOpaqueParamsSaveable is saveable/restorable in a checkpoint file
and is used to save/restore the weight and bias parameters in a
canonical format that is directly consumable by platform-independent tf
RNN cells. Parameters are saved as tensors layer by layer, with weight
tensors followed by bias tensors, and forward direction followed by
backward direction (if applicable). When restoring, a user can name the
parameter variables as desired and restore the weight and bias tensors
into those variables.
For CudnnRNNRelu or CudnnRNNTanh, there are 2 tensors per weight and per
bias for each layer: tensor 0 is applied to the input from the previous
layer and tensor 1 to the recurrent input.
For CudnnLSTM, there are 8 tensors per weight and per bias for each
layer: tensor 0-3 are applied to the input from the previous layer and
tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input gate;
tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory gate;
tensor 3 and 7 the output gate.
For CudnnGRU, there are 6 tensors per weight and per bias for each layer:
tensor 0-2 are applied to the input from the previous layer and
tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset gate;
tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate.
Args:
opaque_params: a variable, Cudnn RNN opaque params.
num_layers: the number of layers for the RNN model.
num_units: the number of units within the RNN model.
input_size: the size of the input; it could be different from num_units.
input_mode: indicates whether there is a linear projection between the
input and the actual computation before the first layer. It can be
'linear_input', 'skip_input' or 'auto_select'. 'linear_input' (default)
always applies a linear projection of the input onto the RNN hidden
state (standard RNN behavior). 'skip_input' is only allowed when
input_size == num_units; 'auto_select' implies 'skip_input' when
input_size == num_units and otherwise implies 'linear_input'.
direction: the direction in which the model operates. It can be either
'unidirectional' or 'bidirectional'.
scope: string of VariableScope, the scope of equivalent subgraph
consisting only platform-independent tf RNN cells.
name: the name of the CudnnOpaqueParamsSaveable object.
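A registration sketch (illustrative; `opaque_params` is an existing
opaque buffer variable, and the concrete LSTM subclass below is used):
```python
saveable = CudnnLSTMSaveable(
    opaque_params, num_layers=1, num_units=128, input_size=64)
ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
# A subsequently created tf.compat.v1.train.Saver() will then save and
# restore the canonical weight/bias tensors.
```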
"""
# Define in subclasses.
self._num_layers = num_layers
self._input_size = input_size
self._num_units = num_units
self._input_mode = input_mode
self._direction = direction
if scope is not None:
scope_name = scope.name if isinstance(scope, vs.VariableScope) else scope
self._scope = scope_name or None
else:
self._scope = None
self._variables = opaque_params
self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2
# Defined in subclasses.
self._format_converter = None
tf_weights, tf_biases = (
self.format_converter.opaque_to_tf_canonical(self._variables))
tf_weight_names, tf_bias_names = self._tf_canonical_names()
# We currently don't use slice_spec. It might be useful in a distributed
# setting where each parameter server node stores a slice of the variable,
# instead of having the master pull all slices and then save them.
slice_spec = ""
params = tf_weights + tf_biases
self._weight_names = tf_weight_names
self._bias_names = tf_bias_names
self._param_names = tf_weight_names + tf_bias_names
prefixed_param_names = tf_weight_names + tf_bias_names
if self._scope:
prefixed_param_names = [
"%s/%s" % (self._scope, pn) for pn in prefixed_param_names
]
specs = [
saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
for param, param_name in zip(params, prefixed_param_names)
]
super(CudnnOpaqueParamsSaveable,
self).__init__(array_ops.identity(self._variables), specs, name)
@property
def format_converter(self):
if self._format_converter is None:
self._format_converter = self._format_converter_cls(
self._num_layers, self._num_units, self._input_size,
input_mode=self._input_mode,
direction=self._direction)
return self._format_converter
def restore(self, restored_tensors, restored_shapes):
opaque_params = self.format_converter.tf_canonical_to_opaque(
restored_tensors)
return state_ops.assign(
self._variables, opaque_params, validate_shape=False)
def _trackable_save(self, save_buffer):
weights, biases = self.format_converter.opaque_to_tf_canonical(
self._variables)
for name, tensor in zip(self._param_names, weights + biases):
save_buffer[name] = array_ops.identity(tensor)
def _trackable_restore(self, restore_buffer):
tensors = [
array_ops.identity(restore_buffer[name]) for name in self._param_names
]
return self.restore(
restored_tensors=tensors,
restored_shapes=None # Unused
)
def _add_trackable_dependencies(self, trackable, dtype):
"""Add canonical weight dependencies to `trackable`.
When saving or restoring, converts to or from the opaque buffer
format. Weights are saved and loaded in the configuration expected by
cuDNN-compatible cells.
Args:
trackable: An object inheriting from `Trackable` to add dependencies to
(typically the cuDNN `Layer`).
dtype: The dtype for the canonical parameter Tensors.
"""
split_dependencies = split_dependency.split_dependency(
component_names=self._param_names,
component_dtypes=(dtype,) * len(self._param_names),
fill_save_buffer_fn=self._trackable_save,
consume_restore_buffer_fn=self._trackable_restore,
device=self._variables[0].device)
self._trackable_track_params(trackable, split_dependencies)
def _trackable_track_params(self, trackable, params):
"""Tracks parameters in a canonical configuration."""
return # NotImplementedError raised by the Layer.
def _tf_canonical_names(self):
tf_weights_names, tf_biases_names = [], []
for i in range(self._num_layers):
if self._direction == CUDNN_RNN_UNIDIRECTION:
prefix = self._tf_canonical_name_prefix(i)
self._tf_canonical_names_single_layer(prefix, tf_weights_names,
tf_biases_names)
else:
fwd_prefix = self._tf_canonical_name_prefix(i, is_fwd=True)
bak_prefix = self._tf_canonical_name_prefix(i, is_fwd=False)
self._tf_canonical_names_single_layer(fwd_prefix, tf_weights_names,
tf_biases_names)
self._tf_canonical_names_single_layer(bak_prefix, tf_weights_names,
tf_biases_names)
return tf_weights_names, tf_biases_names
def _tf_canonical_name_prefix(self, layer, is_fwd=True):
if self._direction == CUDNN_RNN_UNIDIRECTION:
return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name)
else:
if is_fwd:
return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/fw/%s" %
(layer, self._rnn_cell_name))
else:
return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/bw/%s" %
(layer, self._rnn_cell_name))
def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
tf_biases_names):
raise NotImplementedError("Abstract method")
class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
"""SaveableObject implementation handling Cudnn LSTM opaque params."""
_format_converter_cls = CudnnParamsFormatConverterLSTM
_rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__)
def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
tf_bias_names):
tf_weights_names.append(prefix + "/kernel")
tf_bias_names.append(prefix + "/bias")
def _trackable_track_params(self, trackable, params):
"""Track parameters for compatibility with CudnnCompatibleLSTMCell."""
biases = []
weights = []
for name in self._weight_names:
weights.append(params[name])
for name in self._bias_names:
biases.append(params[name])
assert len(params) == len(weights) + len(biases)
if len(weights) == 1 and len(biases) == 1:
# For single-layer cells, allow substituting a cell with no MultiRNNCell
# wrapping.
kernel, = weights # pylint: disable=unbalanced-tuple-unpacking
bias, = biases # pylint: disable=unbalanced-tuple-unpacking
trackable._track_trackable(kernel, name="kernel") # pylint: disable=protected-access
trackable._track_trackable(bias, name="bias") # pylint: disable=protected-access
assert len(biases) == len(weights)
for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
cell = trackable_lib.AutoTrackable()
trackable._track_trackable(cell, name="cell-%d" % cell_index) # pylint: disable=protected-access
cell.bias = bias
cell.kernel = kernel
class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
"""SaveableObject implementation handling Cudnn GRU opaque params."""
_format_converter_cls = CudnnParamsFormatConverterGRU
_rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__)
def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
tf_bias_names):
tf_weights_names.append(prefix + "/gates/kernel")
tf_weights_names.append(prefix + "/candidate/input_projection/kernel")
tf_weights_names.append(prefix + "/candidate/hidden_projection/kernel")
tf_bias_names.append(prefix + "/gates/bias")
tf_bias_names.append(prefix + "/candidate/input_projection/bias")
tf_bias_names.append(prefix + "/candidate/hidden_projection/bias")
class CudnnRNNTanhSaveable(CudnnLSTMSaveable):
_format_converter_cls = CudnnParamsFormatConverterTanh
_rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
class CudnnRNNReluSaveable(CudnnLSTMSaveable):
_format_converter_cls = CudnnParamsFormatConverterRelu
_rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
_cudnn_rnn_common_doc_string = """
Cudnn RNN has an opaque parameter buffer that can be used for inference and
training. However, the layout of the parameter buffer may change between
generations, so it is highly recommended to use CudnnOpaqueParamsSaveable to
save and restore weights and biases in a canonical format.
This is a typical use case:
* The user creates a CudnnRNN model.
* The user queries the parameter buffer size.
* The user creates a variable of that size that serves as the parameter
buffers.
* The user either initializes the parameter buffer or loads the canonical
weights into the parameter buffer.
* The user calls the model with the parameter buffer for inference or
training.
* If training, the user creates a Saver object.
* If training, the user creates a CudnnOpaqueParamsSaveable object from the
parameter buffer so that it can later be saved in the canonical format. When
creating a CudnnOpaqueParamsSaveable object, a name can be provided, which
is useful for distinguishing between multiple CudnnOpaqueParamsSaveable
objects (e.g. for an encoder-decoder model).
* Once in a while, the user saves the parameter buffer into model checkpoints
with Saver.save().
* When restoring, the user creates a CudnnOpaqueParamsSaveable object and
uses Saver.restore() to restore the parameter buffer from the canonical
format to a user-defined format, as well as to restore other savable
objects in the checkpoint file.
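A condensed sketch of this workflow (illustrative; it assumes the
op-level `CudnnLSTM` wrapper defined later in this file, and the layer
and shape values are placeholders):
```python
model = CudnnLSTM(num_layers=1, num_units=128, input_size=64)
params = tf.Variable(
    tf.random_uniform([model.params_size()]), validate_shape=False)
outputs, output_h, output_c = model(input_data, input_h, input_c, params)
saveable = CudnnLSTMSaveable(
    params, num_layers=1, num_units=128, input_size=64)
saver = tf.train.Saver()  # later: saver.save(...) / saver.restore(...)
```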
"""
def _check_rnn_mode(rnn_mode):
if rnn_mode not in (CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_TANH, CUDNN_RNN_RELU):
raise ValueError(
"Invalid rnn_mode: %s, expect one of (%s, %s, %s, %s)" %
(rnn_mode, CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_TANH, CUDNN_RNN_RELU))
def _get_seed(seed):