In [1]:
import numpy as np

$$
h_{t}=\tanh(h_{t-1}W_{h}+x_{t}W_{x}+b) \newline \newline
{\partial \tanh(x) \over \partial x}=(1+\tanh(x))(1-\tanh(x))=1-\tanh^{2}(x)
$$


In [2]:
class RNN:
    """One time step of a vanilla tanh recurrent layer.

    Forward:  ``h_next = tanh(h_prev @ Wh + x @ Wx + b)``.
    Backward: propagates the upstream gradient ``dh_next`` through tanh and the
    affine transform, storing parameter gradients in ``self.grads``.

    Shapes below use B = batch size, D = input dim, H = hidden dim.
    """

    def __init__(self, Wx, Wh, b):
        """Store the parameters and allocate matching gradient buffers.

        Args:
            Wx: input-to-hidden weights, [D, H].
            Wh: hidden-to-hidden weights, [H, H].
            b: bias, [1, H].
        """
        self.params = [Wx, Wh, b]
        # Gradient buffers are written in place by backward(), in the same
        # order as self.params: [dWx, dWh, db].
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        # Filled by forward() with the values backward() needs.
        self.cache = None

    def forward(self, x, h_prev):
        """Compute the next hidden state.

        Args:
            x: input at this time step, [B, D].
            h_prev: hidden state from the previous time step, [B, H].

        Returns:
            h_next: new hidden state, [B, H].
        """
        Wx, Wh, b = self.params
        # Pre-activation t: [B, H] (b is broadcast over the batch axis).
        t = np.matmul(h_prev, Wh) + np.matmul(x, Wx) + b
        # h_next: [B, H]
        h_next = np.tanh(t)
        # Save what the backward pass needs.
        self.cache = (x, h_prev, h_next)
        return h_next

    def backward(self, dh_next):
        """Back-propagate through this time step.

        Must be called after forward(), which populates ``self.cache``.

        Args:
            dh_next: gradient of the loss w.r.t. ``h_next``, [B, H].

        Returns:
            (dx, dh_prev): gradients w.r.t. the input [B, D] and the previous
            hidden state [B, H].
        """
        Wx, Wh, _ = self.params
        x, h_prev, h_next = self.cache
        # Chain rule through tanh: dL/dt = dL/dh_next * (1 - tanh(t)^2).
        # The upstream gradient must be multiplied in here; without it every
        # gradient below would be independent of the loss.  [B, H]
        dt = dh_next * (1 - h_next ** 2)
        # b was broadcast over the batch, so its gradient sums over the batch.
        # Result is [H]; it broadcasts into the [1, H] buffer below.
        db = np.sum(dt, axis=0)
        # dL/dWh [H, H]
        dWh = np.matmul(h_prev.T, dt)
        # dL/dh_prev [B, H]
        dh_prev = np.matmul(dt, Wh.T)
        # dL/dWx [D, H]
        dWx = np.matmul(x.T, dt)
        # dL/dx [B, D]
        dx = np.matmul(dt, Wx.T)

        # Write in place so external references to the grad arrays stay valid.
        self.grads[0][...] = dWx
        self.grads[1][...] = dWh
        self.grads[2][...] = db

        return dx, dh_prev


In [3]:
class TimeRNN:
    """Unrolls a single-step ``RNN`` layer over the time axis of a sequence.

    A fresh ``RNN`` step object is created per time step in forward(); all of
    them share the same parameter arrays. backward() walks the steps in
    reverse (back-propagation through time) and sums their parameter
    gradients into ``self.grads``.

    Shapes below use B = batch size, T = time steps, D = input dim,
    H = hidden dim.
    """

    def __init__(self, Wx, Wh, b, stateful=False):
        """Store shared parameters and allocate gradient buffers.

        Args:
            Wx: input-to-hidden weights, [D, H].
            Wh: hidden-to-hidden weights, [H, H].
            b: bias.
            stateful: if True, the hidden state left by one forward() call is
                used as the initial state of the next call instead of zeros.
        """
        self.params = [Wx, Wh, b]
        # Same order as self.params: [dWx, dWh, db]; overwritten by backward().
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        # Per-time-step RNN objects; rebuilt on every forward() call.
        self.layers = None

        # h: last hidden state from forward(); dh: gradient w.r.t. the initial
        # hidden state, set by backward().
        self.h, self.dh = None, None
        self.stateful = stateful

    def set_state(self, h):
        """Set the hidden state used to start the next forward() (if stateful)."""
        self.h = h

    def reset_state(self):
        """Clear the hidden state so the next forward() starts from zeros."""
        self.h = None

    def forward(self, xs):
        """Run the RNN over every time step of ``xs``.

        Args:
            xs: input sequence, [B, T, D].

        Returns:
            hs: hidden states for all time steps, [B, T, H], stored as float32.
        """
        B, T, _ = xs.shape  # [batch, time, input_dim]
        _, H = self.params[0].shape  # Wx is [input_dim, hidden_dim]

        self.layers = []
        hs = np.empty((B, T, H), dtype='f')

        if not self.stateful or self.h is None:
            # Initialize the hidden state to zeros.
            self.h = np.zeros((B, H), dtype='f')

        for t in range(T):
            # Each step gets its own object (own cache) but shares parameters.
            layer = RNN(*self.params)
            self.h = layer.forward(xs[:, t, :], self.h)
            hs[:, t, :] = self.h
            self.layers.append(layer)

        return hs

    def backward(self, dhs):
        """Back-propagate through time over the steps recorded by forward().

        Args:
            dhs: gradient from the upper layer for every time step, [B, T, H].

        Returns:
            dxs: gradient to the lower layer, [B, T, D], stored as float32.

        Side effects:
            Overwrites ``self.grads`` with the parameter gradients summed over
            all time steps, and sets ``self.dh`` to the gradient w.r.t. the
            hidden state that entered the first time step.
        """
        B, T, _ = dhs.shape
        D, _ = self.params[0].shape  # Wx is [D, H]

        dxs = np.empty((B, T, D), dtype="f")
        # Gradient flowing back from the following time step, [B, H]; there is
        # none after the last step, so start at 0.
        dh = 0
        grads = [0, 0, 0]
        for t in reversed(range(T)):
            layer = self.layers[t]
            # In the forward pass h branches to the upper layer and to the
            # next time step, so the two incoming gradients are added.
            dx, dh = layer.backward(dhs[:, t, :] + dh)
            dxs[:, t, :] = dx

            # Parameters are shared across time steps, so their gradients
            # are summed over all steps.
            for i, grad in enumerate(layer.grads):
                grads[i] += grad

        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        self.dh = dh

        return dxs