In [1]:
import tensorflow as tf

rnn, 即循环神经网络（recurrent neural network）. 相比全连接神经网络，可以更好的处理序列数据。对于序列数据，其中的每个元素是前后相关联的。全连接神经可以处理相对独立的数据，

举个例子，对于词性标注问题，我/nn 吃/v 苹果/nn

对于这个任务来说，那么全连接神经网络会如下的对应。我 -> nn 吃 -> v 苹果 -> nn
很容易看到，对于词性标注问题，前面的词对于后面的词有影响，例如预测苹果的此行是，如果前面的吃是动词时，那么苹果作为名词的概率就会很大。因为动词后面接名词的概率很大。

为了更好的处理的这样的问题，RNN就诞生了。

RNN的网络结构如下
输入层、隐藏层和输出层组成
![title](img/rnn1.png)

对于RNN的网络结构，其形态比较抽象。其中一个比较突出的特点就是隐藏层再次连接到隐藏层。这样的原理在于将上一个隐藏层的输出状态当成下一次输入的数据。就可以有效的利用序列的前一个状态信息。

图中，W是RNN隐藏层的权重，V是隐藏层到输出层的权重，U是输入层到隐藏层的权重。

输入值为X, 输出值为O, 隐藏状态为S

那么
S_i = f(UX_i + WS_i-1)
O_i = g(VS_i)

这样我们可以看到隐藏状态S_i， 和X_i以及S_i-1都是相关的

对于rnn,其参数梯度的计算是使用TBP算法


In [84]:
(train_data, train_target), (test_data, test_target) = tf.keras.datasets.imdb.load_data()

In [85]:
max_len =64

In [99]:
train_target

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [100]:

train_target = train_target.astype("float32")

In [83]:
train_padding = tf.keras.preprocessing.sequence.pad_sequences(train_data, padding="post", maxlen=max_len)
test_padding = tf.keras.preprocessing.sequence.pad_sequences(test_data, padding="post", maxlen=max_len)

In [91]:
train_padding = train_padding.astype("float32")
test_padding = test_padding.astype("float32")

In [10]:
word_index = tf.keras.datasets.imdb.get_word_index()

In [107]:
VOCAB_SIZE = 100000
EMBED_SIZE = 10
RNN_SIZE = 10

In [103]:
dataset = tf.data.Dataset.from_tensor_slices((train_padding, train_target))
dataset = dataset.shuffle(100).batch(100)

In [108]:
class JRNN(tf.keras.Model):
    
    def __init__(self):
        super(JRNN, self).__init__()
        
        self.embed = tf.keras.layers.Embedding(VOCAB_SIZE, output_dim=EMBED_SIZE)
        self.rnn = tf.keras.layers.SimpleRNN(RNN_SIZE, return_sequences=True, return_state=True)
        self.out = tf.keras.layers.Dense(1, activation="sigmoid")
        
    def call(self, input_x):
        
        x = self.embed(input_x)
        out, state = self.rnn(x)
        out = out[:,-1,:]
        logits = self.out(out)
        
        return logits

In [109]:
model = JRNN()

In [79]:
model(tf.constant([[1, 2], [3, 4]]))

<tf.Tensor: id=2909, shape=(2, 1), dtype=float32, numpy=
array([[0.50424653],
       [0.49094072]], dtype=float32)>

In [31]:
optimizer = tf.keras.optimizers.Adam()
loss_func = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [33]:
acc = tf.keras.metrics.Accuracy()

In [111]:
@tf.function()
def train_step(input_x, input_y):
    
    with tf.GradientTape() as tape:
        
        logits = model(input_x)
        loss = loss_func(logits, input_y)
        
    variables = model.variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    

    
    return loss

In [112]:
EPOCH = 100

for e in range(EPOCH):
    
    for i, (train_x, train_y) in enumerate(dataset):
        loss = train_step(train_x, train_y)
        
        if i% 100 == 0:
            print("epoch {0} batch {1} loss {2}".format(e, i, loss))

epoch 0 batch 0 loss 0.7394543290138245
epoch 0 batch 100 loss 0.5515481233596802
epoch 0 batch 200 loss 0.5070056319236755
epoch 1 batch 0 loss 0.5624309778213501
epoch 1 batch 100 loss 0.5231655836105347
epoch 1 batch 200 loss 0.49936443567276
epoch 2 batch 0 loss 0.551193356513977
epoch 2 batch 100 loss 0.5130713582038879
epoch 2 batch 200 loss 0.5088028907775879
epoch 3 batch 0 loss 0.5274919271469116
epoch 3 batch 100 loss 0.5234302282333374
epoch 3 batch 200 loss 0.4816802740097046
epoch 4 batch 0 loss 0.5420442819595337
epoch 4 batch 100 loss 0.5116667151451111
epoch 4 batch 200 loss 0.5115427374839783
epoch 5 batch 0 loss 0.5417777895927429
epoch 5 batch 100 loss 0.5227643251419067
epoch 5 batch 200 loss 0.5037571787834167
epoch 6 batch 0 loss 0.5340408086776733
epoch 6 batch 100 loss 0.5339873433113098
epoch 6 batch 200 loss 0.5149919986724854
epoch 7 batch 0 loss 0.5301385521888733
epoch 7 batch 100 loss 0.5073539018630981
epoch 7 batch 200 loss 0.5073172450065613
epoch 8 bat

epoch 65 batch 0 loss 0.5335954427719116
epoch 65 batch 100 loss 0.5335953831672668
epoch 65 batch 200 loss 0.48041149973869324
epoch 66 batch 0 loss 0.5373942255973816
epoch 66 batch 100 loss 0.5108022689819336
epoch 66 batch 200 loss 0.5070034265518188
epoch 67 batch 0 loss 0.5525896549224854
epoch 67 batch 100 loss 0.49560683965682983
epoch 67 batch 200 loss 0.5070033669471741
epoch 68 batch 0 loss 0.5525895953178406
epoch 68 batch 100 loss 0.4918079376220703
epoch 68 batch 200 loss 0.5146011114120483
epoch 69 batch 0 loss 0.5032045245170593
epoch 69 batch 100 loss 0.4918080270290375
epoch 69 batch 200 loss 0.49560683965682983
epoch 70 batch 0 loss 0.5411930680274963
epoch 70 batch 100 loss 0.5411930680274963
epoch 70 batch 200 loss 0.48800909519195557
epoch 71 batch 0 loss 0.5259976387023926
epoch 71 batch 100 loss 0.5183999538421631
epoch 71 batch 200 loss 0.5146010518074036
epoch 72 batch 0 loss 0.5259976387023926
epoch 72 batch 100 loss 0.48800909519195557
epoch 72 batch 200 los

In [98]:
train_x

<tf.Tensor: id=3103, shape=(100, 64), dtype=float32, numpy=
array([[2.7800e+02, 3.6000e+01, 6.9000e+01, ..., 7.0000e+00, 1.2900e+02,
        1.1300e+02],
       [1.9400e+02, 1.3519e+04, 1.1697e+04, ..., 1.4000e+01, 1.0700e+02,
        1.0200e+02],
       [1.4000e+02, 1.4500e+02, 8.0000e+00, ..., 7.0000e+00, 2.5820e+03,
        1.0200e+02],
       ...,
       [3.2100e+02, 8.1100e+02, 5.7328e+04, ..., 3.3630e+03, 1.6860e+03,
        4.5100e+02],
       [2.5000e+02, 4.7500e+02, 1.1000e+01, ..., 4.0000e+00, 4.2368e+04,
        7.0660e+03],
       [1.4000e+01, 9.0000e+00, 6.0000e+00, ..., 1.1675e+04, 8.7000e+01,
        2.2000e+01]], dtype=float32)>