# Sprint22 リカレントニューラルネットワーク

スクラッチを通してリカレントニューラルネットワークの基礎を理解する

## Simple RNN フォワードプロパゲーション

SimpleRNNのクラスSimpleRNNを作成する。

$$
a_t = x_t \dot W_x + h_{t_1} \cdot W_h + B \\
h_t = tanh(a_t)
$$

at  : 時刻tの活性化関数を通す前の状態 (batch_size, n_nodes)
ht : 時刻tの状態・出力 (batch_size, n_nodes)
xt : 時刻tの入力 (batch_size, n_features)
Wx : 入力に対する重み (n_features, n_nodes)
ht−1 : 時刻t-1の状態（前の時刻から伝わる順伝播） (batch_size, n_nodes)
Wh : 状態に対する重み。 (n_nodes, n_nodes)
B : バイアス項 (n_nodes,)

In [4]:
#基本ライブラリ
import time
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [3]:
x = np.array([[[1, 2], [2, 3], [3, 4]]])/100 # (batch_size, n_sequences, n_features)
w_x = np.array([[1, 3, 5, 7], [3, 5, 7, 8]])/100 # (n_features, n_nodes)
w_h = np.array([[1, 3, 5, 7], [2, 4, 6, 8], [3, 5, 7, 8], [4, 6, 8, 10]])/100 # (n_nodes, n_nodes)
batch_size = x.shape[0] # 1
n_sequences = x.shape[1] # 3
n_features = x.shape[2] # 2
n_nodes = w_x.shape[1] # 4
h = np.zeros((batch_size, n_nodes)) # (batch_size, n_nodes)
b = np.array([1, 1, 1, 1]) # (n_nodes,)

print("x\n", x)
print("w_x\n", w_x)
print("w_h\n", w_h)

x
 [[[0.01 0.02]
  [0.02 0.03]
  [0.03 0.04]]]
w_x
 [[0.01 0.03 0.05 0.07]
 [0.03 0.05 0.07 0.08]]
w_h
 [[0.01 0.03 0.05 0.07]
 [0.02 0.04 0.06 0.08]
 [0.03 0.05 0.07 0.08]
 [0.04 0.06 0.08 0.1 ]]


In [185]:
class SimpleRNN():
    def __init__(self, recurrents=3, alpha=1, num_itr=10):
        self.recurrents = recurrents
        self.alpha = alpha
        self.num_itr = num_itr
        
    def fit(self, X, y):
        #初期化
        self.w_x = np.array([[1, 3, 5, 7], [3, 5, 7, 8]])/100 # (n_features, n_nodes)
        self.w_a = np.array([[1, 3, 5, 7], [2, 4, 6, 8], [3, 5, 7, 8], [4, 6, 8, 10]])/100 # (n_nodes, n_nodes)
        self.batch_size = X.shape[0] # 1
        n_sequences = X.shape[1] # 3
        n_features = X.shape[2] # 2
        n_nodes = self.w_x.shape[1] # 4
        self.h = np.zeros(( 4, n_nodes)) # (batch_size, n_nodes)
        self.bias = np.array([1.0, 1.0, 1.0, 1.0])
        self.y_hat = np.empty((4,4))
        
        for _ in range(self.num_itr):
            
            #forward
            for i in range(n_sequences):
                h = np.tanh(np.dot(X[0,i].T, self.w_x) + np.dot(self.h[i], self.w_a) + self.bias)
                self.y_hat[i] = self.softmax(h)
                self.h[i+1] = h
                     
            #backward
            count = 0
            for i in reversed(range(n_sequences)):
                if i == 2:
                    LH = self.y_hat[i] - y
                    HA = LH * (1 - np.tanh(self.h[i])*np.tanh(self.h[i]))
                    LB = np.copy(HA)
                    LWx = np.dot(X[0, i].reshape(-1,1), HA.reshape(1, -1))
                    LWh = np.dot(self.h[i].reshape(-1,1), HA.reshape(1, -1))
                    LH_next = np.dot(HA, self.w_a.T)
                    LX = np.dot(HA.reshape(-1, 1), X[0,-1].reshape(1,-1))
                else:
                    LH = (self.y_hat[i] - y) + LH_next
                    HA = LH * (1 - np.tanh(self.h[i])*np.tanh(self.h[i]))
                    LB *= HA
                    LWx *= np.dot(X[0, i].reshape(-1,1), HA.reshape(1, -1))
                    LWh *= np.dot(self.h[i].reshape(-1,1), HA.reshape(1, -1))
                    LH_next = np.dot(HA, self.w_a.T)
                    LX *= np.dot(HA.reshape(-1, 1), X[0,-1].reshape(1,-1))                
            self.LWx = LWx
            self.LWh = LWh
            self.LB = LB
            self.LX = LX


            #更新
            self.w_x -= self.alpha*self.LWx
            self.w_a -= self.alpha*self.LWh
            self.bias -= self.alpha*self.LB
            

        return self.y_hat
            
            
    # ソフトマックス関数
    def softmax(self, a):
        # 一番大きい値を取得
        c = np.max(a)
        # 各要素から一番大きな値を引く（オーバーフロー対策）
        exp_a = np.exp(a - c)
        sum_exp_a = np.sum(exp_a)
        # 要素の値/全体の要素の合計
        y = exp_a / sum_exp_a

        return y

        

In [186]:
#test
y = np.array([0.24, 0.24, 0.26, 0.26])

SRNN = SimpleRNN()
result = SRNN.fit(x, y)
print(" ")
print(result)
print(" ")
print(SRNN.h)

 
[[0.24991093 0.24997366 0.25003683 0.25007858]
 [0.24247109 0.24785809 0.25283146 0.25683935]
 [0.24201507 0.24775671 0.2530165  0.25721173]
 [0.04       0.06       0.08       0.1       ]]
 
[[0.         0.         0.         0.        ]
 [0.76188792 0.76213887 0.76239156 0.76255851]
 [0.79220896 0.81418283 0.83404957 0.84977726]
 [0.79494223 0.81838947 0.83939692 0.85584181]]


## バックプロパゲーションの実装

$$
W'_x = W_x - \alpha \frac{\partial L}{\partial W_x} \\
W'_h = W_h - \alpha \frac{\partial L}{\partial W_h} \\
B' = B - \alpha \frac{\partial L}{\partial B}
$$

勾配を求めるためのバックプロパゲーションの数式が以下

$$
\frac{\partial h_t}{\partial a_t} = \frac{\partial L}{\partial h_t} \times (1 - tanh^2(a_t))  \\
\frac{\partial L}{\partial B} = \frac{\partial h_t}{\partial a_t} \\
\frac{\partial L}{\partial W_x} = x_t^T \cdot \frac{\partial h_t}{\partial a_t} \\
\frac{\partial L}{\partial W_h} = h_{t-1}^T \cdot \frac{\partial h_t}{\partial a_t}
$$

各時間にて生み出された勾配は、１つに掛け合わされる。

L/h_t は前の時刻からの状態の誤差と出力の誤差の合計。hは順伝播時に出力と次の層に伝わる状態双方に使われているからである。

前の時刻や層に流す誤差の数式は以下。

$$
\frac{\partial L}{\partial h_{t-1}} = \frac{\partial h_t}{\partial a_t} \cdot W_h^T \\
\frac{\partial L}{\partial X_t} =  \frac{\partial h_t}{\partial a_t} \cdot W_x^T  \\
\frac{\partial L}{\partial h_{next}} = \frac{\partial L}{\partial h_{t-1}} + (yhat - y)
$$

In [172]:
#バックワード
y = np.array([0.24, 0.24, 0.26, 0.26]) #適当に決める
LH = result[-1] - y
print(LH)

[-0.2  -0.18 -0.18 -0.16]


In [173]:
np.tanh(SRNN.h)

array([[0.        , 0.        , 0.        , 0.        ],
       [0.64218764, 0.64233507, 0.64248348, 0.64258151],
       [0.65965859, 0.67189125, 0.68264445, 0.69095309],
       [0.66119969, 0.67419234, 0.6854895 , 0.69410906]])

In [174]:
HA = LH * (1 - np.tanh(SRNN.h[-1])*np.tanh(SRNN.h[-1]))
print(HA)
print(HA.shape)

[-0.11256299 -0.09818364 -0.09541875 -0.08291402]
(4,)


In [177]:
LWx = np.dot(x[0, 2].reshape(-1,1), HA.reshape(1, -1))
print(LWx)

[[-0.00337689 -0.00294551 -0.00286256 -0.00248742]
 [-0.00450252 -0.00392735 -0.00381675 -0.00331656]]


In [180]:
LWh =  np.dot(SRNN.h[2].reshape(-1,1), HA.reshape(1, -1))
print(LWh)

[[-0.08917341 -0.07778196 -0.07559159 -0.06568523]
 [-0.09164686 -0.07993944 -0.0776883  -0.06750717]
 [-0.09388312 -0.08189003 -0.07958396 -0.0691544 ]
 [-0.09565347 -0.08343423 -0.08108468 -0.07045845]]


In [181]:
LH_1 = np.dot(HA, SRNN.w_a.T)
print(LH_1)

[-0.01464606 -0.01853685 -0.02159851 -0.02631844]


In [184]:
LX = np.dot(HA.reshape(-1, 1), x[0,-1].reshape(1,-1))
print(LX)

[[-0.00337689 -0.00450252]
 [-0.00294551 -0.00392735]
 [-0.00286256 -0.00381675]
 [-0.00248742 -0.00331656]]


以上