## Recurrent neural networks

In [None]:
class RNN:
    """Single time-step of a tanh recurrent layer.

    Computes ``h_next = tanh(h_prev @ Wh + x @ Wx + b)`` in ``forward`` and
    the corresponding gradients in ``backward``.

    Attributes:
        params: ``[Wx, Wh, b]`` as passed to the constructor (not copied).
        grads: Arrays with the same shapes as ``params``; overwritten in
            place by every call to ``backward``.
        cache: ``(x, h_prev, h_next)`` saved by the most recent ``forward``.
    """

    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.cache = None

    def forward(self, x, h_prev):
        """Run one step and return the next hidden state.

        Args:
            x: Input for this step; multiplied on the right by ``Wx``.
            h_prev: Previous hidden state; multiplied on the right by ``Wh``.

        Returns:
            ``tanh(h_prev @ Wh + x @ Wx + b)``.
        """
        Wx, Wh, b = self.params
        t = np.dot(h_prev, Wh) + np.dot(x, Wx) + b
        h_next = np.tanh(t)

        # Saved for backward(): tanh' is expressed through its output.
        self.cache = (x, h_prev, h_next)
        return h_next

    def backward(self, dh_next):
        """Back-propagate through the step cached by the last ``forward``.

        Args:
            dh_next: Gradient of the loss with respect to ``h_next``.

        Returns:
            Tuple ``(dx, dh_prev)``: gradients with respect to the ``x`` and
            ``h_prev`` that were passed to ``forward``.

        Side effects:
            Overwrites ``self.grads`` in place with ``[dWx, dWh, db]``.
        """
        Wx, Wh, _ = self.params
        x, h_prev, h_next = self.cache

        # d tanh(t) / dt = 1 - tanh(t)^2 = 1 - h_next^2
        dt = dh_next * (1 - h_next ** 2)

        # b is broadcast over the leading axis, so its gradient sums over it.
        db = np.sum(dt, axis=0)
        dWh = np.dot(h_prev.T, dt)
        # h_prev entered through h_prev @ Wh, so its gradient flows back via Wh.T.
        dh_prev = np.dot(dt, Wh.T)
        dWx = np.dot(x.T, dt)
        dx = np.dot(dt, Wx.T)

        # In-place write keeps the identity of the gradient arrays stable for
        # anything holding a reference to them.
        self.grads[0][...] = dWx
        self.grads[1][...] = dWh
        self.grads[2][...] = db
        return dx, dh_prev



In [None]:
class TimeRNN:
    """Unrolls an ``RNN`` step over the time axis of a batch of sequences.

    One ``RNN`` layer is created per time step in ``forward``; all of them
    share the same parameter arrays.

    Attributes:
        params: ``[Wx, Wh, b]`` as passed to the constructor (not copied).
        grads: Arrays shaped like ``params``; overwritten in place by
            ``backward`` with the gradients summed over all time steps.
        layers: The per-step ``RNN`` layers built by the last ``forward``.
        h: Hidden state after the last processed time step, or ``None``.
        dh: Gradient flowing out of the first time step after ``backward``.
        stateful: When true, ``forward`` continues from ``self.h`` instead of
            starting from zeros.
    """

    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.layers = None

        self.h, self.dh = None, None
        self.stateful = stateful

    def forward(self, xs):
        """Run the RNN over every time step of ``xs``.

        Args:
            xs: Array of shape ``(N, T, D)``.

        Returns:
            Array of shape ``(N, T, H)`` holding the hidden state at each
            step, where ``H`` is the second dimension of ``Wx``.
        """
        Wx = self.params[0]
        N, T, _ = xs.shape
        H = Wx.shape[1]

        self.layers = []
        hs = np.empty((N, T, H), dtype='f')

        # Start from a zero state unless we are stateful and already have one.
        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')

        for t in range(T):
            layer = RNN(*self.params)
            self.h = layer.forward(xs[:, t, :], self.h)
            hs[:, t, :] = self.h
            self.layers.append(layer)

        return hs

    def backward(self, dhs):
        """Back-propagate through time over the layers built by ``forward``.

        Args:
            dhs: Array of shape ``(N, T, H)`` with the gradient of the loss
                with respect to each step's hidden state.

        Returns:
            Array of shape ``(N, T, D)`` with the gradient with respect to
            the input, where ``D`` is the first dimension of ``Wx``.

        Side effects:
            Overwrites ``self.grads`` in place and stores the gradient that
            leaves the first time step in ``self.dh``.
        """
        Wx = self.params[0]
        N, T, _ = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        dh = 0
        grads = [0, 0, 0]
        for t in reversed(range(T)):
            layer = self.layers[t]
            # Each step receives gradient both from its own output and from
            # the following step's hidden-state input.
            dx, dh = layer.backward(dhs[:, t, :] + dh)
            dxs[:, t, :] = dx

            # Parameters are shared across steps, so their gradients add up.
            for i, grad in enumerate(layer.grads):
                grads[i] += grad

        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        self.dh = dh

        return dxs

    def set_state(self, h):
        """Set the hidden state used as the starting point by ``forward``."""
        self.h = h

    def reset_state(self):
        """Clear the hidden state so the next ``forward`` starts from zeros."""
        self.h = None

## Language models

In [None]:
# PPP exercise
class TimeEmbedding:
    """Applies an ``Embedding`` layer independently at every time step.

    ``Embedding`` is not defined in this block; it is used here as an object
    built from ``W`` that provides ``forward(idx)``, ``backward(dout)`` and a
    ``grads`` list.

    Attributes:
        params: ``[W]``.
        grads: ``[dW]``; overwritten in place by ``backward``.
        layers: The per-step ``Embedding`` layers built by the last
            ``forward``.
        W: The weight array passed to the constructor.
    """

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.layers = None
        self.W = W

    def forward(self, xs):
        """Embed every time step of ``xs``.

        Args:
            xs: Array of shape ``(N, T)``; column ``t`` is passed to the
                ``t``-th ``Embedding`` layer.

        Returns:
            Array of shape ``(N, T, D)`` where ``D`` is the second dimension
            of ``W``.
        """
        N, T = xs.shape
        D = self.W.shape[1]

        out = np.empty((N, T, D), dtype='f')
        self.layers = []

        for t in range(T):
            layer = Embedding(self.W)
            out[:, t, :] = layer.forward(xs[:, t])
            self.layers.append(layer)
        return out

    def backward(self, dout):
        """Accumulate the weight gradient over all time steps.

        Args:
            dout: Array of shape ``(N, T, D)``.

        Returns:
            ``None``; no gradient is propagated to the input.

        Side effects:
            Overwrites ``self.grads[0]`` in place with the sum of every
            per-step layer's ``grads[0]``.
        """
        T = dout.shape[1]
        grad = 0
        for t in range(T):
            layer = self.layers[t]
            layer.backward(dout[:, t, :])
            # The weight is shared across steps, so per-step gradients add up.
            grad += layer.grads[0]
        self.grads[0][...] = grad
        return None


class TimeAffine:
    """Affine (fully connected) layer applied at every time step.

    The batch and time axes are flattened together so that a single matrix
    product handles all steps at once.

    Attributes:
        params: ``[W, b]`` as passed to the constructor (not copied).
        grads: ``[dW, db]``; overwritten in place by ``backward``.
        xs: The input saved by the most recent ``forward``.
    """

    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.xs = None

    def forward(self, xs):
        """Compute ``xs @ W + b`` for every ``(n, t)`` position.

        Args:
            xs: Array of shape ``(N, T, D)``.

        Returns:
            Array of shape ``(N, T, M)`` where ``M`` is the number of columns
            of ``W``.
        """
        N, T, _ = xs.shape
        W, b = self.params

        # Merge batch and time so one matmul covers every step.
        rx = xs.reshape(N * T, -1)
        out = np.dot(rx, W) + b
        self.xs = xs
        return out.reshape(N, T, -1)

    def backward(self, dout):
        """Back-propagate through the affine transform.

        Args:
            dout: Gradient with respect to the output of ``forward``; it is
                reshaped to ``(N * T, -1)`` using the cached input's ``N``
                and ``T``.

        Returns:
            Gradient with respect to the input, with the same shape as the
            ``xs`` passed to ``forward``.

        Side effects:
            Overwrites ``self.grads`` in place with ``[dW, db]``.
        """
        xs = self.xs
        N, T, _ = xs.shape
        W, _ = self.params

        dout = dout.reshape(N * T, -1)
        rx = xs.reshape(N * T, -1)

        # b is broadcast over all N * T rows, so its gradient sums over them.
        db = np.sum(dout, axis=0)
        dW = np.dot(rx.T, dout)
        dx = np.dot(dout, W.T).reshape(*xs.shape)

        self.grads[0][...] = dW
        self.grads[1][...] = db
        return dx


class TimeSoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, averaged over time steps.

    Positions whose label equals ``ignore_label`` contribute neither to the
    loss nor to the gradient. ``softmax`` is not defined in this block and
    must be available in the enclosing scope.

    Attributes:
        params, grads: Empty lists; this layer has no parameters.
        cache: ``(ts, ys, mask, (N, T, V))`` saved by the last ``forward``.
        ignore_label: Label value to skip (``-1``).
    """

    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None
        self.ignore_label = -1

    def forward(self, xs, ts):
        """Compute the mean cross-entropy loss over non-ignored positions.

        Args:
            xs: Scores of shape ``(N, T, V)``.
            ts: Labels, either integer ids reshapeable to ``(N * T,)`` or a
                3-D array that is reduced with ``argmax`` over its last axis.

        Returns:
            The summed negative log-likelihood divided by the number of
            non-ignored positions.
        """
        N, T, V = xs.shape

        # A 3-D label array is treated as one-hot and converted to ids.
        if ts.ndim == 3:
            ts = ts.argmax(axis=2)

        mask = (ts != self.ignore_label)

        # Merge batch and time so every position is one row.
        xs = xs.reshape(N * T, V)
        ts = ts.reshape(N * T)
        mask = mask.reshape(N * T)

        ys = softmax(xs)
        ls = np.log(ys[np.arange(N * T), ts])
        ls *= mask  # zero out the loss at ignored positions
        loss = -np.sum(ls)
        loss /= mask.sum()

        self.cache = (ts, ys, mask, (N, T, V))
        return loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the scores.

        Args:
            dout: Upstream gradient multiplier (default ``1``).

        Returns:
            Array of shape ``(N, T, V)``.
        """
        ts, ys, mask, (N, T, V) = self.cache

        # Copy so the cached probabilities stay intact and backward() can be
        # called again without compounding the in-place updates below.
        dx = ys.copy()
        dx[np.arange(N * T), ts] -= 1
        dx *= dout
        dx /= mask.sum()
        dx *= mask[:, np.newaxis]  # no gradient at ignored positions

        dx = dx.reshape((N, T, V))
        return dx

