<a href="https://colab.research.google.com/github/kake01/Bot/blob/master/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
'''テキストデータ(コーパス)の正規化'''
def preprocess(text):
    """Normalize a text corpus and map its words to integer ids.

    The text is lower-cased, every '.' is turned into ' .' so that it
    becomes its own token, and the result is split on single spaces.

    Args:
        text: Raw corpus string.

    Returns:
        A tuple ``(corpus, word_to_id, id_to_word)`` where ``corpus`` is a
        list of word ids in text order and the two dicts map words to ids
        and ids back to words. Ids are assigned in order of first appearance.
    """
    tokens = text.lower().replace('.', ' .').split(' ')

    word_to_id = {}
    id_to_word = {}
    corpus = []
    for token in tokens:
        # setdefault only inserts on first sight, so ids stay 0, 1, 2, ...
        token_id = word_to_id.setdefault(token, len(word_to_id))
        id_to_word[token_id] = token
        corpus.append(token_id)

    return corpus, word_to_id, id_to_word


'''共起行列の作成'''
def create_co_matrix(corpus, vocab_size, window_size=1):
    """Build a word co-occurrence matrix from a corpus of word ids.

    For every position in the corpus, each word that lies within
    ``window_size`` positions to the left or right is counted once.

    Args:
        corpus: Sequence of integer word ids.
        vocab_size: Number of distinct ids; the matrix is
            ``(vocab_size, vocab_size)``.
        window_size: How many neighbours on each side count as context.

    Returns:
        An ``np.int32`` array where entry ``[a, b]`` is the number of times
        id ``b`` appeared within the window around id ``a``.
    """
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for offset in range(1, window_size + 1):
            # The offset must be applied here; using a fixed +/-1 would only
            # ever look at the immediate neighbours and count them repeatedly.
            left_idx = idx - offset
            right_idx = idx + offset

            if left_idx >= 0:
                co_matrix[word_id, corpus[left_idx]] += 1

            if right_idx < corpus_size:
                co_matrix[word_id, corpus[right_idx]] += 1

    return co_matrix


'''コサイン類似度'''
def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of two vectors.

    Args:
        x: First vector (NumPy array).
        y: Second vector (NumPy array).
        eps: Small constant added to each norm so that an all-zero vector
            does not cause a division by zero.

    Returns:
        The dot product of the two (approximately) unit-normalized vectors.
    """
    x_norm = np.sqrt(np.sum(x ** 2))
    y_norm = np.sqrt(np.sum(y ** 2))
    return np.dot(x / (x_norm + eps), y / (y_norm + eps))


'''類似単語の検索'''
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print the words whose vectors are most similar to the query word.

    Similarity is measured with ``cos_similarity`` against every row of
    ``word_matrix``. The query word itself is skipped in the listing.

    Args:
        query: Word to look up.
        word_to_id: Mapping from word to row index of ``word_matrix``.
        id_to_word: Mapping from row index back to word.
        word_matrix: Array whose rows are indexed by word id.
        top: Number of result lines to print.

    Returns:
        None. Results are written to stdout; an unknown query prints a
        "not found" message instead.
    """
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_vec = word_matrix[word_to_id[query]]

    # Similarity of every vocabulary entry to the query, stored as float64.
    similarity = np.array(
        [cos_similarity(word_matrix[word_id], query_vec)
         for word_id in range(len(id_to_word))],
        dtype=np.float64,
    )

    shown = 0
    # Sorting the negated scores yields ids from most to least similar.
    for word_id in np.argsort(-1 * similarity):
        word = id_to_word[word_id]
        if word != query:
            print(' %s: %s' % (word, similarity[word_id]))
            shown += 1
            if shown >= top:
                return


'''正の相互情報量'''
def ppmi(C, verbose=False, eps=1e-8):
    """Compute the positive pointwise mutual information (PPMI) matrix.

    Args:
        C: Co-occurrence matrix (2-D NumPy array).
        verbose: When True, print progress roughly once per percent of the
            cells processed.
        eps: Small constant added inside the logarithm so that a zero
            co-occurrence count does not produce ``log2(0)``.

    Returns:
        A ``float32`` array shaped like ``C`` holding
        ``max(0, log2(C[i, j] * N / (S[i] * S[j]) + eps))``, where ``N`` is
        the sum of all counts and ``S`` holds the column sums of ``C``.
    """
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    # Progress step; clamped to 1 so matrices with fewer than 100 cells do
    # not trigger a modulo-by-zero when verbose output is requested.
    report_every = max(1, total // 100)
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % report_every == 0:
                    print('%.1f%% done' % (100 * cnt / total))
    return M


'''コンテキストとターゲットの生成'''
def create_contexts_target(corpus, window_size=1):
    """Split a corpus into (context words, target word) training pairs.

    Every position that has ``window_size`` neighbours on both sides is
    used as a target; its context is the surrounding words, left to right,
    excluding the target itself.

    Args:
        corpus: Sequence of word ids.
        window_size: Number of context words taken from each side.

    Returns:
        A tuple ``(contexts, target)`` of NumPy arrays: one context row per
        target position, and the target ids themselves.
    """
    target = corpus[window_size:-window_size]

    # Relative positions of the context words around a target.
    offsets = [t for t in range(-window_size, window_size + 1) if t != 0]
    contexts = [
        [corpus[center + t] for t in offsets]
        for center in range(window_size, len(corpus) - window_size)
    ]

    return np.array(contexts), np.array(target)


'''コンテキストとターゲットをone-hot表現への変換'''
def convert_one_hot(corpus, vocab_size):
    """Convert an array of word ids into one-hot vectors.

    Args:
        corpus: 1-D array of ids (e.g. targets) or 2-D array of ids
            (e.g. one row of context ids per sample).
        vocab_size: Length of each one-hot vector.

    Returns:
        An ``np.int32`` array with one extra trailing axis of size
        ``vocab_size``: ``(N, vocab_size)`` for 1-D input and
        ``(N, C, vocab_size)`` for 2-D input.
    """
    N = corpus.shape[0]

    if corpus.ndim == 1:
        one_hot = np.zeros((N, vocab_size), dtype=np.int32)
        # Row r gets a 1 in column corpus[r].
        one_hot[np.arange(N), corpus] = 1

    elif corpus.ndim == 2:
        C = corpus.shape[1]
        one_hot = np.zeros((N, C, vocab_size), dtype=np.int32)
        rows = np.arange(N)[:, None]
        cols = np.arange(C)[None, :]
        # Cell (r, c) gets a 1 at position corpus[r, c] of its last axis.
        one_hot[rows, cols, corpus] = 1

    return one_hot


def cross_entropy_error(y, t):
    """Return the mean cross-entropy loss of predictions against labels.

    Args:
        y: Predicted probabilities, either one vector or a batch of rows.
        t: Labels, given as class indices or as one-hot vectors with the
            same number of elements as ``y``.

    Returns:
        The negative log-probability of the correct class, averaged over
        the batch. ``1e-7`` is added inside the log to avoid ``log(0)``.
    """
    if y.ndim == 1:
        # Promote a single sample to a batch of one.
        y = y.reshape(1, y.size)
        t = t.reshape(1, t.size)

    # One-hot labels have as many elements as y; reduce them to indices.
    labels = t.argmax(axis=1) if t.size == y.size else t

    n_samples = y.shape[0]
    picked = y[np.arange(n_samples), labels]
    return -np.sum(np.log(picked + 1e-7)) / n_samples

def softmax(x):
    """Apply the softmax function to a vector or to each row of a matrix.

    The maximum is subtracted before exponentiation for numerical
    stability. Inputs that are neither 1-D nor 2-D are returned unchanged.

    Args:
        x: NumPy array of scores.

    Returns:
        An array of the same shape whose values along the last axis sum
        to 1 (for 1-D and 2-D inputs).
    """
    if x.ndim == 2:
        exps = np.exp(x - x.max(axis=1, keepdims=True))
        exps /= exps.sum(axis=1, keepdims=True)
        return exps

    if x.ndim == 1:
        exps = np.exp(x - np.max(x))
        return exps / np.sum(exps)

    return x

'''パラメータ配列中の重複する重みをひとつに集約,その重みに対応する勾配を加算する'''
def remove_duplicate(params, grads):
  """Collapse shared weights into one entry and sum their gradients.

  Two entries are treated as the same weight when they are the same object,
  or when both are 2-D and one equals the transpose of the other by value
  (weight tying). For each such pair the later entry is removed and its
  gradient (transposed in the tying case) is added to the earlier one.

  Args:
    params: List of parameter arrays.
    grads: List of gradient arrays, parallel to ``params``.

  Returns:
    A tuple ``(params, grads)`` of new, de-duplicated lists. The input lists
    are not modified, but the surviving gradient arrays are updated in place
    by ``+=``.
  """
  params, grads = params[:], grads[:]  # shallow copies; the arrays are shared
  while True:
    find_flg = False
    L = len(params)

    for i in range(0, L - 1):
        for j in range(i + 1, L):
            # Case 1: the very same array object is used twice.
            if params[i] is params[j]:
                grads[i] += grads[j]  # accumulate the gradient
                find_flg = True
                params.pop(j)
                grads.pop(j)
            # Case 2: weights shared as a transposed matrix (weight tying).
            # NOTE: this is a value comparison, not an identity check.
            elif params[i].ndim == 2 and params[j].ndim == 2 and \
                  params[i].T.shape == params[j].shape and np.all(params[i].T == params[j]):
                grads[i] += grads[j].T
                find_flg = True
                params.pop(j)
                grads.pop(j)

            # The lists just shrank, so abandon this scan and start over.
            if find_flg: break
        if find_flg: break

    # A full scan without any merge means no duplicates remain.
    if not find_flg: break
  return params, grads

def clip_grads(grads, max_norm):
    """Rescale gradients in place so their global L2 norm is at most max_norm.

    Args:
        grads: Iterable of gradient arrays; they are modified in place.
        max_norm: Upper bound for the L2 norm taken over all arrays together.

    Returns:
        None.
    """
    squared_sum = sum(np.sum(g ** 2) for g in grads)
    global_norm = np.sqrt(squared_sum)

    # The small constant keeps the ratio finite when all gradients are zero.
    scale = max_norm / (global_norm + 1e-6)
    if scale < 1:
        for g in grads:
            g *= scale



In [0]:
class MatMul:
    """Matrix-multiplication layer computing ``x . W``.

    Attributes are plain lists so that an optimizer can iterate over them:
    ``params`` holds the weight matrix and ``grads`` a same-shaped buffer
    that ``backward`` overwrites in place.
    """

    def __init__(self, W):
        self.x = None  # input of the latest forward pass
        self.params = [W]
        self.grads = [np.zeros_like(W)]

    def forward(self, x):
        """Return ``np.dot(x, W)`` and remember ``x`` for the backward pass."""
        (weight,) = self.params
        result = np.dot(x, weight)
        self.x = x
        return result

    def backward(self, dout):
        """Store the weight gradient and return the gradient w.r.t. the input."""
        (weight,) = self.params
        grad_input = np.dot(dout, weight.T)
        # Write into the existing buffer so outside references stay valid.
        self.grads[0][...] = np.dot(self.x.T, dout)
        return grad_input

class SoftmaxWithLoss:
    """Softmax activation combined with a cross-entropy loss."""

    def __init__(self):
        self.params, self.grads = [], []
        self.y = None  # softmax output of the latest forward pass
        self.t = None  # labels of the latest forward pass, as class indices

    def forward(self, x, t):
        """Return the cross-entropy loss of scores ``x`` against labels ``t``.

        ``t`` may hold class indices or one-hot vectors; one-hot labels are
        converted to indices before being stored.
        """
        self.t = t
        self.y = softmax(x)

        labels_are_one_hot = self.t.size == self.y.size
        if labels_are_one_hot:
            self.t = self.t.argmax(axis=1)

        return cross_entropy_error(self.y, self.t)

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the scores."""
        n_samples = self.t.shape[0]

        # Work on a copy so the cached softmax output is left intact.
        grad = self.y.copy()
        grad[np.arange(n_samples), self.t] -= 1
        grad *= dout
        return grad / n_samples

# SimpleCBOWモデル
class SimpleCBOW:
    """Minimal CBOW model with exactly two context words.

    Both input layers share one weight matrix, which is also exposed as
    ``word_vecs`` (the learned word representations).
    """

    def __init__(self, vocab_size, hidden_size):
        # W_in is drawn before W_out so the random stream is consumed in a
        # fixed order.
        W_in = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')
        W_out = 0.01 * np.random.randn(hidden_size, vocab_size).astype('f')

        self.in_layer0 = MatMul(W_in)
        self.in_layer1 = MatMul(W_in)
        self.out_layer = MatMul(W_out)
        self.loss_layer = SoftmaxWithLoss()

        # Flatten every layer's weights and gradients into two lists.
        self.params, self.grads = [], []
        for layer in (self.in_layer0, self.in_layer1, self.out_layer):
            self.params += layer.params
            self.grads += layer.grads

        # Expose the input weights as the word vectors.
        self.word_vecs = W_in

    def forward(self, contexts, target):
        """Return the loss for a batch of context pairs and their targets."""
        left = self.in_layer0.forward(contexts[:, 0])
        right = self.in_layer1.forward(contexts[:, 1])
        hidden = (left + right) * 0.5
        scores = self.out_layer.forward(hidden)
        return self.loss_layer.forward(scores, target)

    def backward(self, dout=1):
        """Back-propagate through all layers; gradients land in ``grads``."""
        dscores = self.loss_layer.backward(dout)
        dhidden = self.out_layer.backward(dscores)
        dhidden *= 0.5  # each input layer contributed half of the average
        self.in_layer1.backward(dhidden)
        self.in_layer0.backward(dhidden)
        return None

# CBOWモデル
class CBOW:
    """CBOW model using Embedding input layers and a negative-sampling loss.

    There is one Embedding layer per context position
    (``2 * window_size`` in total), all sharing the same input weights.
    """

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        # Weight initialization (W_in first, then W_out).
        W_in = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')
        W_out = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')

        # Layer construction.
        self.in_layers = [Embedding(W_in) for _ in range(2 * window_size)]
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        # Flatten every layer's weights and gradients into two lists.
        self.params, self.grads = [], []
        for layer in self.in_layers + [self.ns_loss]:
            self.params += layer.params
            self.grads += layer.grads

        # Expose the input weights as the word vectors.
        self.word_vecs = W_in

    def forward(self, contexts, target):
        """Return the loss for a batch of contexts and their target ids."""
        hidden = sum(
            layer.forward(contexts[:, col])
            for col, layer in enumerate(self.in_layers)
        )
        hidden *= 1 / len(self.in_layers)  # average over context positions
        return self.ns_loss.forward(hidden, target)

    def backward(self, dout=1):
        """Back-propagate through the loss and every input layer."""
        dhidden = self.ns_loss.backward(dout)
        dhidden *= 1 / len(self.in_layers)
        for layer in self.in_layers:
            layer.backward(dhidden)
        return None


class Adam:
    """Adam optimizer that updates parameter arrays in place."""

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0   # number of update() calls so far
        self.m = None   # first-moment estimates, one array per parameter
        self.v = None   # second-moment estimates, one array per parameter

    def update(self, params, grads):
        """Apply one Adam step to ``params`` using ``grads``.

        Args:
            params: List of parameter arrays, modified in place.
            grads: List of gradient arrays, parallel to ``params``.
        """
        if self.m is None:
            # Lazily create the moment buffers on the first call.
            self.m = [np.zeros_like(p) for p in params]
            self.v = [np.zeros_like(p) for p in params]

        self.iter += 1
        first_correction = 1.0 - self.beta1 ** self.iter
        second_correction = 1.0 - self.beta2 ** self.iter
        # Learning rate with both bias corrections folded in.
        lr_t = self.lr * np.sqrt(second_correction) / first_correction

        for i, _ in enumerate(params):
            grad = grads[i]
            self.m[i] += (1 - self.beta1) * (grad - self.m[i])
            self.v[i] += (1 - self.beta2) * (grad ** 2 - self.v[i])

            params[i] -= lr_t * self.m[i] / (np.sqrt(self.v[i]) + 1e-7)


class Trainer:
    """Mini-batch training loop for a model/optimizer pair.

    The model must provide ``forward(x, t)``, ``backward()``, ``params`` and
    ``grads``; the optimizer must provide ``update(params, grads)``.
    """

    def __init__(self, model, optimizer):
        self.model = model
        self.optimizer = optimizer
        self.loss_list = []         # averaged losses recorded during fit()
        self.eval_interval = None   # reporting interval of the latest fit()
        self.current_epoch = 0      # epochs completed across all fit() calls

    def fit(self, x, t, max_epoch=10, batch_size=32, max_grad=None, eval_interval=20):
        """Train the model and record the averaged loss.

        Args:
            x: Input samples; must support ``len`` and NumPy-style indexing.
            t: Targets aligned with ``x``.
            max_epoch: Number of passes over the data.
            batch_size: Samples per mini-batch; a trailing partial batch is
                dropped.
            max_grad: If not None, gradients are clipped to this global norm.
            eval_interval: Print and record the mean loss every this many
                iterations; ``None`` disables reporting.
        """
        # Imported here because the module itself only imports numpy.
        import time

        data_size = len(x)
        max_iters = data_size // batch_size
        self.eval_interval = eval_interval
        model, optimizer = self.model, self.optimizer
        total_loss = 0
        loss_count = 0

        start_time = time.time()
        for _ in range(max_epoch):
            # Shuffle the samples at the start of every epoch.
            idx = np.random.permutation(np.arange(data_size))
            x = x[idx]
            t = t[idx]

            for iters in range(max_iters):
                batch_x = x[iters * batch_size:(iters + 1) * batch_size]
                batch_t = t[iters * batch_size:(iters + 1) * batch_size]

                # Compute gradients and update the parameters.
                loss = model.forward(batch_x, batch_t)
                model.backward()
                # Merge shared weights so each is updated only once.
                params, grads = remove_duplicate(model.params, model.grads)
                if max_grad is not None:
                    clip_grads(grads, max_grad)
                optimizer.update(params, grads)
                total_loss += loss
                loss_count += 1

                # Periodic reporting.
                if (eval_interval is not None) and (iters % eval_interval) == 0:
                    avg_loss = total_loss / loss_count
                    elapsed_time = time.time() - start_time
                    print('| epoch %d |  iter %d / %d | time %d[s] | loss %.2f'
                          % (self.current_epoch + 1, iters + 1, max_iters, elapsed_time, avg_loss))
                    self.loss_list.append(float(avg_loss))
                    total_loss, loss_count = 0, 0

            self.current_epoch += 1

    def plot(self, ylim=None):
        """Plot the recorded loss curve.

        NOTE(review): relies on a module-level ``plt`` (presumably
        ``matplotlib.pyplot``) that is not imported in the visible part of
        this file; confirm it is provided elsewhere.

        Args:
            ylim: Optional ``(low, high)`` pair for the y-axis limits.
        """
        x = np.arange(len(self.loss_list))
        if ylim is not None:
            plt.ylim(*ylim)
        plt.plot(x, self.loss_list, label='train')
        plt.xlabel('iterations (x' + str(self.eval_interval) + ')')
        plt.ylabel('loss')
        plt.show()

class Embedding:
    """Lookup layer that selects rows of a weight matrix by index."""

    def __init__(self, W):
        self.idx = None  # indices used by the latest forward pass
        self.params = [W]
        self.grads = [np.zeros_like(W)]

    def forward(self, idx):
        """Return the rows of ``W`` selected by ``idx``."""
        (weight,) = self.params
        self.idx = idx
        return weight[idx]

    def backward(self, dout):
        """Accumulate ``dout`` into the gradient rows selected in forward.

        The gradient buffer is cleared first. An accumulating add is used so
        that an index appearing several times receives the sum of its rows.
        ``GPU`` is a module-level flag defined outside this class.
        """
        (grad_w,) = self.grads
        grad_w[...] = 0
        accumulate = np.scatter_add if GPU else np.add.at
        accumulate(grad_w, self.idx, dout)
        return None


'''多く使われている単語を確率分布で取り出す'''
class UnigramSampler:
    """Draw negative samples from a smoothed unigram distribution.

    Word frequencies are counted from the corpus, raised to ``power`` and
    normalized into a probability vector ``word_p``.
    """

    def __init__(self, corpus, power, sample_size):
        """Build the sampling distribution.

        Args:
            corpus: Iterable of word ids. Ids are assumed to be exactly
                ``0 .. vocab_size - 1``, because probabilities are stored
                by id in an array whose length is the number of distinct ids.
            power: Exponent applied to the raw counts.
            sample_size: Number of negative samples drawn per target.
        """
        # Imported here because the module itself only imports numpy.
        import collections

        self.sample_size = sample_size

        counts = collections.Counter(corpus)
        self.vocab_size = len(counts)

        self.word_p = np.zeros(self.vocab_size)
        for word_id in range(self.vocab_size):
            self.word_p[word_id] = counts[word_id]

        self.word_p = np.power(self.word_p, power)
        self.word_p /= np.sum(self.word_p)

    def get_negative_sample(self, target):
        """Return negative sample ids for each target.

        ``GPU`` is a module-level flag defined outside this class.

        Args:
            target: 1-D array of target word ids.

        Returns:
            An array of shape ``(len(target), sample_size)``. On the CPU
            path a row never contains its own target and has no repeated
            ids; on the GPU path both can happen.
        """
        batch_size = target.shape[0]

        if not GPU:
            negative_sample = np.zeros((batch_size, self.sample_size), dtype=np.int32)

            for i in range(batch_size):
                # Zero out the target's probability, then renormalize.
                p = self.word_p.copy()
                p[target[i]] = 0
                p /= p.sum()
                negative_sample[i, :] = np.random.choice(
                    self.vocab_size, size=self.sample_size, replace=False, p=p)
        else:
            # When computing on the GPU (cupy), speed takes priority:
            # the negatives may occasionally include the target.
            negative_sample = np.random.choice(
                self.vocab_size, size=(batch_size, self.sample_size),
                replace=True, p=self.word_p)

        return negative_sample


'''多値分類から二値分類,多く使われている単語で正解かどうか判定する'''
class NegativeSamplingLoss:
    """Loss that replaces multi-class classification by binary decisions.

    One positive example (the true target) and ``sample_size`` negative
    examples drawn by a ``UnigramSampler`` are each scored by their own
    ``EmbeddingDot`` / ``SigmoidWithLoss`` pair. Index 0 of both layer lists
    handles the positive example.
    """

    def __init__(self, W, corpus, power=0.75, sample_size=5):
        n_pairs = sample_size + 1  # one positive plus the negatives
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)
        self.loss_layers = [SigmoidWithLoss() for _ in range(n_pairs)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(n_pairs)]

        self.params, self.grads = [], []
        for embed_dot in self.embed_dot_layers:
            self.params += embed_dot.params
            self.grads += embed_dot.grads

    def forward(self, h, target):
        """Return the summed loss of the positive and negative examples."""
        n_samples = target.shape[0]
        negatives = self.sampler.get_negative_sample(target)

        # Positive example: label 1.
        ones = np.ones(n_samples, dtype=np.int32)
        pos_score = self.embed_dot_layers[0].forward(h, target)
        loss = self.loss_layers[0].forward(pos_score, ones)

        # Negative examples: label 0.
        zeros = np.zeros(n_samples, dtype=np.int32)
        for k in range(self.sample_size):
            embed_dot = self.embed_dot_layers[1 + k]
            neg_score = embed_dot.forward(h, negatives[:, k])
            loss += self.loss_layers[1 + k].forward(neg_score, zeros)

        return loss

    def backward(self, dout=1):
        """Return the gradient w.r.t. ``h``, summed over all examples."""
        return sum(
            embed_dot.backward(loss_layer.backward(dout))
            for loss_layer, embed_dot in zip(self.loss_layers, self.embed_dot_layers)
        )

In [0]:
class RNN:
    """Single time step of a tanh recurrent layer."""

    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(p) for p in (Wx, Wh, b)]
        self.cache = None  # (x, h_prev, h_next) of the latest forward pass

    def forward(self, x, h_prev):
        """Return ``tanh(h_prev . Wh + x . Wx + b)``."""
        Wx, Wh, b = self.params
        pre_activation = np.dot(h_prev, Wh) + np.dot(x, Wx) + b
        h_next = np.tanh(pre_activation)

        self.cache = (x, h_prev, h_next)
        return h_next

    def backward(self, dh_next):
        """Back-propagate one step.

        Weight and bias gradients are written into ``grads``.

        Returns:
            A tuple ``(dx, dh_prev)``: gradients w.r.t. the input and the
            previous hidden state.
        """
        Wx, Wh, _ = self.params
        x, h_prev, h_next = self.cache

        # Derivative of tanh expressed through its output.
        dpre = dh_next * (1 - h_next ** 2)

        grad_wx, grad_wh, grad_b = self.grads
        grad_wx[...] = np.dot(x.T, dpre)
        grad_wh[...] = np.dot(h_prev.T, dpre)
        grad_b[...] = np.sum(dpre, axis=0)

        return np.dot(dpre, Wx.T), np.dot(dpre, Wh.T)

class TimeRNN:
    """RNN layer unrolled over the time axis of a ``(N, T, D)`` input.

    With ``stateful=True`` the last hidden state is carried over to the
    next ``forward`` call; otherwise every call starts from zeros.
    """

    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(p) for p in (Wx, Wh, b)]
        self.layers = None  # per-time-step RNN cells of the latest forward

        self.h = None       # current hidden state
        self.dh = None      # gradient flowing to the state before step 0
        self.stateful = stateful

    def forward(self, xs):
        """Run every time step and return all hidden states ``(N, T, H)``."""
        Wx = self.params[0]
        N, T, _ = xs.shape
        _, H = Wx.shape

        self.layers = []
        hs = np.empty((N, T, H), dtype='f')

        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')

        for step in range(T):
            cell = RNN(*self.params)
            self.h = cell.forward(xs[:, step, :], self.h)
            hs[:, step, :] = self.h
            self.layers.append(cell)

        return hs

    def backward(self, dhs):
        """Back-propagate through time.

        Gradients of all steps are summed into ``grads``.

        Returns:
            Gradients w.r.t. the inputs, shaped ``(N, T, D)``.
        """
        Wx = self.params[0]
        N, T, _ = dhs.shape
        D, _ = Wx.shape

        dxs = np.empty((N, T, D), dtype='f')
        dh = 0
        totals = [0, 0, 0]
        for step in reversed(range(T)):
            cell = self.layers[step]
            # Gradient from the output at this step plus the one coming
            # back from the following step.
            dx, dh = cell.backward(dhs[:, step, :] + dh)
            dxs[:, step, :] = dx

            for k, step_grad in enumerate(cell.grads):
                totals[k] += step_grad

        for k, total in enumerate(totals):
            self.grads[k][...] = total
        self.dh = dh

        return dxs

    def set_state(self, h):
        """Replace the hidden state."""
        self.h = h

    def reset_state(self):
        """Forget the hidden state."""
        self.h = None


class TimeEmbedding:
    """Embedding lookup applied at every time step of an ``(N, T)`` id array."""

    def __init__(self, W):
        self.W = W
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.layers = None  # per-time-step Embedding layers of the latest forward

    def forward(self, xs):
        """Return the embedded sequence shaped ``(N, T, D)``."""
        N, T = xs.shape
        _, D = self.W.shape

        embedded = np.empty((N, T, D), dtype='f')
        self.layers = []

        for step in range(T):
            lookup = Embedding(self.W)
            embedded[:, step, :] = lookup.forward(xs[:, step])
            self.layers.append(lookup)

        return embedded

    def backward(self, dout):
        """Sum the weight gradients of all time steps into ``grads``."""
        _, T, _ = dout.shape

        total = 0
        for step in range(T):
            lookup = self.layers[step]
            lookup.backward(dout[:, step, :])
            total += lookup.grads[0]

        self.grads[0][...] = total
        return None


class TimeAffine:
    """Affine layer applied independently at every time step.

    The batch and time axes are merged so the whole sequence is processed
    with a single matrix product.
    """

    def __init__(self, W, b):
        self.x = None  # input of the latest forward pass
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]

    def forward(self, x):
        """Return ``x . W + b`` for an input shaped ``(N, T, D)``."""
        N, T, _ = x.shape
        W, b = self.params

        flat_x = x.reshape(N * T, -1)
        flat_out = np.dot(flat_x, W) + b
        self.x = x
        return flat_out.reshape(N, T, -1)

    def backward(self, dout):
        """Store weight/bias gradients and return the input gradient."""
        x = self.x
        N, T, _ = x.shape
        W, _ = self.params

        flat_dout = dout.reshape(N * T, -1)
        flat_x = x.reshape(N * T, -1)

        grad_b = np.sum(flat_dout, axis=0)
        grad_w = np.dot(flat_x.T, flat_dout)
        dx = np.dot(flat_dout, W.T).reshape(*x.shape)

        self.grads[0][...] = grad_w
        self.grads[1][...] = grad_b

        return dx


class TimeSoftmaxWithLoss:
    """Softmax cross-entropy loss over every time step of a sequence.

    Positions whose label equals ``ignore_label`` contribute neither to the
    loss nor to the gradient.
    """

    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None       # (labels, probabilities, mask, (N, T, V))
        self.ignore_label = -1

    def forward(self, xs, ts):
        """Return the mean loss over all non-ignored positions.

        Args:
            xs: Scores shaped ``(N, T, V)``.
            ts: Labels shaped ``(N, T)`` (class indices) or ``(N, T, V)``
                (one-hot vectors).
        """
        N, T, V = xs.shape

        if ts.ndim == 3:  # labels given as one-hot vectors
            ts = ts.argmax(axis=2)

        mask = (ts != self.ignore_label)

        # Merge the batch and time axes.
        xs = xs.reshape(N * T, V)
        ts = ts.reshape(N * T)
        mask = mask.reshape(N * T)

        ys = softmax(xs)
        ls = np.log(ys[np.arange(N * T), ts])
        ls *= mask  # positions with ignore_label contribute zero loss
        loss = -np.sum(ls)
        loss /= mask.sum()

        self.cache = (ts, ys, mask, (N, T, V))
        return loss

    def backward(self, dout=1):
        """Return the gradient w.r.t. the scores, shaped ``(N, T, V)``."""
        ts, ys, mask, (N, T, V) = self.cache

        # Copy first: modifying the cached probabilities in place would make
        # a second backward() call start from already-altered values.
        dx = ys.copy()
        dx[np.arange(N * T), ts] -= 1
        dx *= dout
        dx /= mask.sum()
        dx *= mask[:, np.newaxis]  # positions with ignore_label get zero gradient

        dx = dx.reshape((N, T, V))

        return dx


class SimpleRnnlm:
    """Language model: TimeEmbedding -> stateful TimeRNN -> TimeAffine."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn

        # Weight initialization; the draw order is kept fixed.
        embed_W = (randn(V, D) / 100).astype('f')
        rnn_Wx = (randn(D, H) / np.sqrt(D)).astype('f')
        rnn_Wh = (randn(H, H) / np.sqrt(H)).astype('f')
        rnn_b = np.zeros(H).astype('f')
        affine_W = (randn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        # Layer construction.
        self.rnn_layer = TimeRNN(rnn_Wx, rnn_Wh, rnn_b, stateful=True)
        self.layers = [
            TimeEmbedding(embed_W),
            self.rnn_layer,
            TimeAffine(affine_W, affine_b),
        ]
        self.loss_layer = TimeSoftmaxWithLoss()

        # Flatten every layer's weights and gradients into two lists.
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, ts):
        """Return the loss for input ids ``xs`` and target ids ``ts``."""
        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        return self.loss_layer.forward(out, ts)

    def backward(self, dout=1):
        """Back-propagate through the loss and all layers in reverse order."""
        grad = self.loss_layer.backward(dout)
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def reset_state(self):
        """Clear the hidden state of the RNN layer."""
        self.rnn_layer.reset_state()


class LSTM:
    """Single time step of an LSTM cell.

    The four gate pre-activations are packed side by side in the columns of
    ``Wx``, ``Wh`` and ``b`` in the order: forget, candidate (g), input,
    output.
    """

    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(p) for p in (Wx, Wh, b)]
        self.cache = None  # intermediate values of the latest forward pass

    def forward(self, x, h_prev, c_prev):
        """Advance one step and return ``(h_next, c_next)``."""
        Wx, Wh, b = self.params
        _, H = h_prev.shape

        gates = np.dot(x, Wx) + np.dot(h_prev, Wh) + b

        # Slice the packed pre-activations and apply each gate's nonlinearity.
        f = sigmoid(gates[:, :H])
        g = np.tanh(gates[:, H:2 * H])
        i = sigmoid(gates[:, 2 * H:3 * H])
        o = sigmoid(gates[:, 3 * H:])

        c_next = f * c_prev + g * i
        h_next = o * np.tanh(c_next)

        self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
        return h_next, c_next

    def backward(self, dh_next, dc_next):
        """Back-propagate one step.

        Weight and bias gradients are written into ``grads``.

        Returns:
            A tuple ``(dx, dh_prev, dc_prev)``.
        """
        Wx, Wh, _ = self.params
        x, h_prev, c_prev, i, f, g, o, c_next = self.cache

        tanh_c = np.tanh(c_next)

        # Total gradient reaching the cell state.
        dcell = dc_next + (dh_next * o) * (1 - tanh_c ** 2)
        dc_prev = dcell * f

        # Gradients w.r.t. the gate outputs ...
        d_forget = dcell * c_prev
        d_cand = dcell * i
        d_input = dcell * g
        d_output = dh_next * tanh_c

        # ... and through each gate's nonlinearity.
        d_forget *= f * (1 - f)
        d_cand *= (1 - g ** 2)
        d_input *= i * (1 - i)
        d_output *= o * (1 - o)

        # Repack in the same column order as the forward slices.
        dgates = np.hstack((d_forget, d_cand, d_input, d_output))

        self.grads[0][...] = np.dot(x.T, dgates)
        self.grads[1][...] = np.dot(h_prev.T, dgates)
        self.grads[2][...] = dgates.sum(axis=0)

        dx = np.dot(dgates, Wx.T)
        dh_prev = np.dot(dgates, Wh.T)

        return dx, dh_prev, dc_prev


class TimeLSTM:
    """LSTM layer unrolled over the time axis of a ``(N, T, D)`` input.

    With ``stateful=True`` the hidden and cell states are carried over to
    the next ``forward`` call; otherwise every call starts from zeros.
    """

    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(p) for p in (Wx, Wh, b)]
        self.layers = None  # per-time-step LSTM cells of the latest forward

        self.h, self.c = None, None  # hidden state and cell state
        self.dh = None               # gradient flowing to the state before step 0
        self.stateful = stateful

    def forward(self, xs):
        """Run every time step and return all hidden states ``(N, T, H)``."""
        Wh = self.params[1]
        N, T, _ = xs.shape
        H = Wh.shape[0]

        self.layers = []
        hs = np.empty((N, T, H), dtype='f')

        start_fresh = not self.stateful
        if start_fresh or self.h is None:
            self.h = np.zeros((N, H), dtype='f')
        if start_fresh or self.c is None:
            self.c = np.zeros((N, H), dtype='f')

        for step in range(T):
            cell = LSTM(*self.params)
            self.h, self.c = cell.forward(xs[:, step, :], self.h, self.c)
            hs[:, step, :] = self.h
            self.layers.append(cell)

        return hs

    def backward(self, dhs):
        """Back-propagate through time.

        Gradients of all steps are summed into ``grads``.

        Returns:
            Gradients w.r.t. the inputs, shaped ``(N, T, D)``.
        """
        Wx = self.params[0]
        N, T, _ = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        dh, dc = 0, 0

        totals = [0, 0, 0]
        for step in reversed(range(T)):
            cell = self.layers[step]
            dx, dh, dc = cell.backward(dhs[:, step, :] + dh, dc)
            dxs[:, step, :] = dx
            for k, step_grad in enumerate(cell.grads):
                totals[k] += step_grad

        for k, total in enumerate(totals):
            self.grads[k][...] = total
        self.dh = dh
        return dxs

    def set_state(self, h, c=None):
        """Replace the hidden state and the cell state."""
        self.h, self.c = h, c

    def reset_state(self):
        """Forget both the hidden state and the cell state."""
        self.h, self.c = None, None

class TimeDropout:
    """Inverted dropout layer.

    During training, elements are zeroed at random and the survivors are
    scaled by ``1 / (1 - dropout_ratio)``. When ``train_flg`` is False the
    input passes through unchanged.
    """

    def __init__(self, dropout_ratio=0.5):
        self.params, self.grads = [], []
        self.dropout_ratio = dropout_ratio
        self.mask = None       # scaled keep-mask of the latest training forward
        self.train_flg = True

    def forward(self, xs):
        """Apply dropout to ``xs`` (training) or return it as is (inference)."""
        if not self.train_flg:
            return xs

        keep = np.random.rand(*xs.shape) > self.dropout_ratio
        rescale = 1 / (1.0 - self.dropout_ratio)
        self.mask = keep.astype(np.float32) * rescale
        return xs * self.mask

    def backward(self, dout):
        """Route the gradient through the same mask used in forward."""
        return dout * self.mask


class BaseModel:
    """Base class providing parameter save/load for models.

    Subclasses are expected to fill ``params`` and ``grads`` with lists of
    arrays and to implement ``forward`` and ``backward``.

    ``GPU``, ``to_cpu`` and ``to_gpu`` are module-level names defined outside
    this class; the conversion helpers are only used when ``GPU`` is truthy.
    """

    def __init__(self):
        self.params, self.grads = None, None

    def forward(self, *args):
        raise NotImplementedError

    def backward(self, *args):
        raise NotImplementedError

    def save_params(self, file_name=None):
        """Pickle the parameters to ``file_name`` as float16 arrays.

        Args:
            file_name: Destination path; defaults to ``<ClassName>.pkl``.
        """
        # Imported here because the module itself only imports numpy.
        import pickle

        if file_name is None:
            file_name = self.__class__.__name__ + '.pkl'

        # Stored in half precision to reduce the file size.
        params = [p.astype(np.float16) for p in self.params]
        if GPU:
            params = [to_cpu(p) for p in params]

        with open(file_name, 'wb') as f:
            pickle.dump(params, f)

    def load_params(self, file_name=None):
        """Load parameters from ``file_name`` into the existing arrays.

        SECURITY: ``pickle.load`` can execute arbitrary code contained in
        the file; only load parameter files from trusted sources.

        Args:
            file_name: Source path; defaults to ``<ClassName>.pkl``. Forward
                slashes are converted to the platform separator.

        Raises:
            IOError: If the file does not exist.
        """
        # Imported here because the module itself only imports numpy.
        import os
        import pickle

        if file_name is None:
            file_name = self.__class__.__name__ + '.pkl'

        if '/' in file_name:
            file_name = file_name.replace('/', os.sep)

        if not os.path.exists(file_name):
            raise IOError('No file: ' + file_name)

        with open(file_name, 'rb') as f:
            params = pickle.load(f)

        params = [p.astype('f') for p in params]
        if GPU:
            params = [to_gpu(p) for p in params]

        # Copy into the existing arrays so layers sharing them see the values.
        for i, param in enumerate(self.params):
            param[...] = params[i]


class Rnnlm(BaseModel):
    """Language model: TimeEmbedding -> stateful TimeLSTM -> TimeAffine."""

    def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100):
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn

        # Weight initialization; the draw order is kept fixed.
        embed_W = (randn(V, D) / 100).astype('f')
        lstm_Wx = (randn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (randn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (randn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        # Layer construction.
        self.lstm_layer = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.layers = [
            TimeEmbedding(embed_W),
            self.lstm_layer,
            TimeAffine(affine_W, affine_b),
        ]
        self.loss_layer = TimeSoftmaxWithLoss()

        # Flatten every layer's weights and gradients into two lists.
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs):
        """Return the scores produced by passing ``xs`` through all layers."""
        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def forward(self, xs, ts):
        """Return the loss for input ids ``xs`` and target ids ``ts``."""
        return self.loss_layer.forward(self.predict(xs), ts)

    def backward(self, dout=1):
        """Back-propagate through the loss and all layers in reverse order."""
        grad = self.loss_layer.backward(dout)
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def reset_state(self):
        """Clear the state of the LSTM layer."""
        self.lstm_layer.reset_state()


# Rnnlmの改良,忘れる,重み共有,LSTMの多層化
class BetterRnnlm(BaseModel):
    """Language model with two LSTM layers, dropout and weight tying.

    Layer stack: TimeEmbedding, then twice (TimeDropout -> TimeLSTM), then a
    final TimeDropout and a TimeAffine whose weight is the transpose of the
    embedding matrix.
    """

    def __init__(self, vocab_size=10000, wordvec_size=650, hidden_size=650, dropout_ratio=0.5):
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn

        # Weight initialization; the draw order is kept fixed.
        embed_W = (randn(V, D) / 100).astype('f')
        lstm_Wx1 = (randn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh1 = (randn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b1 = np.zeros(4 * H).astype('f')
        lstm_Wx2 = (randn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_Wh2 = (randn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b2 = np.zeros(4 * H).astype('f')
        affine_b = np.zeros(V).astype('f')

        first_lstm = TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True)
        second_lstm = TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True)
        drop_in = TimeDropout(dropout_ratio)
        drop_mid = TimeDropout(dropout_ratio)
        drop_out = TimeDropout(dropout_ratio)

        self.layers = [
            TimeEmbedding(embed_W),
            drop_in,
            first_lstm,
            drop_mid,
            second_lstm,
            drop_out,
            TimeAffine(embed_W.T, affine_b),  # weight tying with the embedding
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layers = [first_lstm, second_lstm]
        self.drop_layers = [drop_in, drop_mid, drop_out]

        # Flatten every layer's weights and gradients into two lists.
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs, train_flg=False):
        """Return scores for ``xs``; dropout is active only if ``train_flg``."""
        for drop in self.drop_layers:
            drop.train_flg = train_flg

        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def forward(self, xs, ts, train_flg=True):
        """Return the loss for input ids ``xs`` and target ids ``ts``."""
        return self.loss_layer.forward(self.predict(xs, train_flg), ts)

    def backward(self, dout=1):
        """Back-propagate through the loss and all layers in reverse order."""
        grad = self.loss_layer.backward(dout)
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def reset_state(self):
        """Clear the state of both LSTM layers."""
        for lstm in self.lstm_layers:
            lstm.reset_state()




In [0]:
# 7章で使用
# 文章作成
class RnnlmGen(Rnnlm):
    """``Rnnlm`` extended with sampling-based text generation."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Generate a sequence of word ids by repeated sampling.

        Args:
            start_id: Id of the first word; it is also the first output.
            skip_ids: Optional container of ids that must never be emitted.
                Any container supporting ``in`` with an int works.
            sample_size: Total length of the returned sequence.

        Returns:
            A list of ``sample_size`` Python ints beginning with ``start_id``.
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)
            p = softmax(score.flatten())

            # Take the single drawn element explicitly: calling int() on a
            # 1-element array is deprecated in recent NumPy releases.
            sampled_id = int(np.random.choice(len(p), size=1, p=p)[0])
            if (skip_ids is None) or (sampled_id not in skip_ids):
                x = sampled_id
                word_ids.append(sampled_id)

        return word_ids

    def get_state(self):
        """Return the LSTM layer's ``(h, c)`` state."""
        return self.lstm_layer.h, self.lstm_layer.c

    def set_state(self, state):
        """Restore a state previously obtained from ``get_state``."""
        self.lstm_layer.set_state(*state)


# より良い文章生成
class BetterRnnlmGen(BetterRnnlm):
    """``BetterRnnlm`` extended with sampling-based text generation."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Generate a sequence of word ids by repeated sampling.

        Args:
            start_id: Id of the first word; it is also the first output.
            skip_ids: Optional container of ids that must never be emitted.
                Any container supporting ``in`` with an int works.
            sample_size: Total length of the returned sequence.

        Returns:
            A list of ``sample_size`` Python ints beginning with ``start_id``.
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x).flatten()
            p = softmax(score).flatten()

            # Take the single drawn element explicitly: calling int() on a
            # 1-element array is deprecated in recent NumPy releases.
            sampled_id = int(np.random.choice(len(p), size=1, p=p)[0])
            if (skip_ids is None) or (sampled_id not in skip_ids):
                x = sampled_id
                word_ids.append(sampled_id)

        return word_ids

    def get_state(self):
        """Return a list of ``(h, c)`` pairs, one per LSTM layer."""
        return [(layer.h, layer.c) for layer in self.lstm_layers]

    def set_state(self, states):
        """Restore states previously obtained from ``get_state``."""
        for layer, state in zip(self.lstm_layers, states):
            layer.set_state(*state)


class Encoder:
    """Sequence encoder: TimeEmbedding followed by a stateless TimeLSTM.

    Only the hidden state of the last time step is returned to the caller.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn

        # Weight initialization; the draw order is kept fixed.
        embed_W = (randn(V, D) / 100).astype('f')
        lstm_Wx = (randn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (randn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        self.hs = None  # all hidden states of the latest forward pass

    def forward(self, xs):
        """Encode ``xs`` and return the final hidden state ``(N, H)``."""
        embedded = self.embed.forward(xs)
        self.hs = self.lstm.forward(embedded)
        return self.hs[:, -1, :]

    def backward(self, dh):
        """Back-propagate a gradient that arrives at the last time step only."""
        # Every earlier time step receives no direct gradient.
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh

        dembedded = self.lstm.backward(dhs)
        return self.embed.backward(dembedded)


class Decoder:
    """Sequence decoder: TimeEmbedding -> stateful TimeLSTM -> TimeAffine.

    The LSTM's hidden state is initialized from a vector supplied by the
    caller.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn

        # Weight initialization; the draw order is kept fixed.
        embed_W = (randn(V, D) / 100).astype('f')
        lstm_Wx = (randn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (randn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (randn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, h):
        """Return scores for ``xs`` with the LSTM state initialized to ``h``."""
        self.lstm.set_state(h)

        embedded = self.embed.forward(xs)
        hidden = self.lstm.forward(embedded)
        return self.affine.forward(hidden)

    def backward(self, dscore):
        """Back-propagate and return the gradient w.r.t. the initial state."""
        dhidden = self.affine.backward(dscore)
        dembedded = self.lstm.backward(dhidden)
        self.embed.backward(dembedded)
        return self.lstm.dh

    def generate(self, h, start_id, sample_size):
        """Greedily generate ``sample_size`` ids starting after ``start_id``.

        At each step the id with the highest score is chosen and fed back
        as the next input.

        Returns:
            A list of Python ints; ``start_id`` itself is not included.
        """
        self.lstm.set_state(h)

        sampled = []
        current_id = start_id
        for _ in range(sample_size):
            x = np.array(current_id).reshape((1, 1))
            embedded = self.embed.forward(x)
            hidden = self.lstm.forward(embedded)
            score = self.affine.forward(hidden)

            current_id = np.argmax(score.flatten())
            sampled.append(int(current_id))

        return sampled


# many to many
class Seq2seq(BaseModel):
    """Encoder-decoder (many-to-many) sequence model."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        self.encoder = Encoder(vocab_size, wordvec_size, hidden_size)
        self.decoder = Decoder(vocab_size, wordvec_size, hidden_size)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, xs, ts):
        """Return the loss of predicting ``ts`` from the source ``xs``.

        The decoder is fed ``ts`` without its last column and is trained to
        output ``ts`` without its first column (shifted by one step).
        """
        decoder_inputs = ts[:, :-1]
        decoder_targets = ts[:, 1:]

        encoded = self.encoder.forward(xs)
        score = self.decoder.forward(decoder_inputs, encoded)
        return self.softmax.forward(score, decoder_targets)

    def backward(self, dout=1):
        """Back-propagate through the loss, the decoder and the encoder."""
        dscore = self.softmax.backward(dout)
        dencoded = self.decoder.backward(dscore)
        return self.encoder.backward(dencoded)

    def generate(self, xs, start_id, sample_size):
        """Encode ``xs`` and let the decoder generate ``sample_size`` ids."""
        encoded = self.encoder.forward(xs)
        return self.decoder.generate(encoded, start_id, sample_size)

# より良いmany to many
class PeekyDecoder:
  """Decoder that shows the state ``h`` to the LSTM and Affine layers at every step.

  Besides being installed as the LSTM state, ``h`` is concatenated in front of
  the LSTM input and in front of the Affine input for each time step. That is
  why ``lstm_Wx`` has ``H + D`` rows and ``affine_W`` has ``H + H`` rows.
  """

  def __init__(self, vocab_size, wordvec_size, hidden_size):
    """Create the Embedding, LSTM and Affine layers with random weights.

    Args:
      vocab_size: V, rows of the embedding matrix and columns of the Affine weight.
      wordvec_size: D, width of the embedding vectors.
      hidden_size: H, width of the LSTM hidden state.
    """
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    # LSTM input is [h ; embedding], hence H + D input rows.
    lstm_Wx = (rn(H + D, 4 * H) / np.sqrt(H + D)).astype('f')
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')
    # Affine input is [h ; lstm output], hence H + H input rows.
    affine_W = (rn(H + H, V) / np.sqrt(H + H)).astype('f')
    affine_b = np.zeros(V).astype('f')

    self.embed = TimeEmbedding(embed_W)
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
    self.affine = TimeAffine(affine_W, affine_b)

    self.params, self.grads = [], []
    for layer in (self.embed, self.lstm, self.affine):
      self.params += layer.params
      self.grads += layer.grads
    # Holds H between forward() and backward().
    self.cache = None

  def forward(self, xs, h):
    """Compute scores for ``xs`` given the state ``h``.

    Args:
      xs: 2-D array of shape (N, T) passed to the embedding layer.
      h: 2-D array of shape (N, H).

    Returns:
      The output of the Affine layer.
    """
    N, T = xs.shape
    N, H = h.shape

    self.lstm.set_state(h)

    out = self.embed.forward(xs)
    # Copy each row of h T times -> (N, T, H) so it can be joined to every step.
    hs = np.repeat(h, T, axis=0).reshape(N, T, H)
    out = np.concatenate((hs, out), axis=2)

    out = self.lstm.forward(out)
    out = np.concatenate((hs, out), axis=2)

    score = self.affine.forward(out)
    self.cache = H
    return score

  def backward(self, dscore):
    """Back-propagate ``dscore`` and return the gradient with respect to ``h``.

    The first H channels of the Affine and LSTM input gradients belong to the
    concatenated copies of ``h``; they are summed over time and added to the
    gradient of the LSTM state (``self.lstm.dh``).
    """
    H = self.cache

    dout = self.affine.backward(dscore)
    # Split [h ; lstm output] gradients.
    dout, dhs0 = dout[:, :, H:], dout[:, :, :H]
    dout = self.lstm.backward(dout)
    # Split [h ; embedding] gradients.
    dembed, dhs1 = dout[:, :, H:], dout[:, :, :H]
    self.embed.backward(dembed)

    dhs = dhs0 + dhs1
    dh = self.lstm.dh + np.sum(dhs, axis=1)
    return dh

  def generate(self, h, start_id, sample_size):
    """Generate ``sample_size`` ids by taking the argmax score at each step.

    Args:
      h: state holding a single row; it is reshaped to (1, 1, H) with
        H = h.shape[1], so it must have shape (1, H).
      start_id: id fed at the first step.
      sample_size: number of ids to generate.

    Returns:
      List of generated ids as plain Python ints (``start_id`` not included).
    """
    sampled = []
    char_id = start_id
    self.lstm.set_state(h)

    H = h.shape[1]
    peeky_h = h.reshape(1, 1, H)
    for _ in range(sample_size):
      x = np.array([char_id]).reshape((1, 1))
      out = self.embed.forward(x)

      out = np.concatenate((peeky_h, out), axis=2)
      out = self.lstm.forward(out)
      out = np.concatenate((peeky_h, out), axis=2)
      score = self.affine.forward(out)

      # Convert the NumPy integer from argmax to a Python int, matching the
      # plain decoder's generate() so callers get the same element type.
      char_id = int(np.argmax(score.flatten()))
      sampled.append(char_id)

    return sampled


# Seq2seq with its Decoder replaced by PeekyDecoder
class PeekySeq2seq(Seq2seq):
  """Seq2seq variant whose decoder is a PeekyDecoder; everything else is inherited."""

  def __init__(self, vocab_size, wordvec_size, hidden_size):
    """Build the encoder, the peeky decoder and the loss layer."""
    dims = (vocab_size, wordvec_size, hidden_size)
    self.encoder = Encoder(*dims)
    self.decoder = PeekyDecoder(*dims)
    self.softmax = TimeSoftmaxWithLoss()

    enc, dec = self.encoder, self.decoder
    self.params = enc.params + dec.params
    self.grads = enc.grads + dec.grads

In [0]:
# Chapter 8
class WeightSum:
  """Weighted sum over the time axis: c[n] = sum_t a[n, t] * hs[n, t].

  The layer has no trainable parameters.
  """

  def __init__(self):
    self.params, self.grads = [], []
    # (hs, a reshaped to (N, T, 1)) saved by forward() for backward().
    self.cache = None

  def forward(self, hs, a):
    """Return the (N, H) weighted sum of ``hs`` (N, T, H) with weights ``a`` (N, T)."""
    N, T, H = hs.shape

    weights = a.reshape(N, T, 1)  # broadcasts across the H axis
    self.cache = (hs, weights)
    return (hs * weights).sum(axis=1)

  def backward(self, dc):
    """Return ``(dhs, da)`` for the upstream gradient ``dc`` of shape (N, H)."""
    hs, weights = self.cache
    N, T, H = hs.shape

    # (N, 1, H) broadcasts over the T axis, which equals repeating dc T times.
    dc_step = dc.reshape(N, 1, H)
    dhs = dc_step * weights
    da = (dc_step * hs).sum(axis=2)
    return dhs, da


class AttentionWeight:
  """Scores each time step of ``hs`` against ``h`` by dot product, then applies softmax.

  The layer has no trainable parameters.
  """

  def __init__(self):
    self.params, self.grads = [], []
    self.softmax = Softmax()
    # (hs, h reshaped to (N, 1, H)) saved by forward() for backward().
    self.cache = None

  def forward(self, hs, h):
    """Return the softmax output for the (N, T) dot products of ``hs`` (N, T, H) and ``h`` (N, H)."""
    N, T, H = hs.shape

    query = h.reshape(N, 1, H)  # broadcasts across the T axis
    scores = (hs * query).sum(axis=2)
    weights = self.softmax.forward(scores)

    self.cache = (hs, query)
    return weights

  def backward(self, da):
    """Return ``(dhs, dh)`` for the upstream gradient ``da`` of shape (N, T)."""
    hs, query = self.cache
    N, T, H = hs.shape

    dscores = self.softmax.backward(da)
    # (N, T, 1) broadcasts over the H axis, which equals repeating it H times.
    dscores_col = dscores.reshape(N, T, 1)
    dhs = dscores_col * query
    dh = (dscores_col * hs).sum(axis=1)
    return dhs, dh


class Attention:
  """Chains AttentionWeight and WeightSum into a single attention step.

  The weights produced by the last forward() call are kept in
  ``attention_weight``.
  """

  def __init__(self):
    self.params, self.grads = [], []
    self.attention_weight_layer = AttentionWeight()
    self.weight_sum_layer = WeightSum()
    self.attention_weight = None

  def forward(self, hs, h):
    """Compute weights from ``(hs, h)`` and return the weighted sum of ``hs``."""
    weights = self.attention_weight_layer.forward(hs, h)
    context = self.weight_sum_layer.forward(hs, weights)
    self.attention_weight = weights
    return context

  def backward(self, dout):
    """Return ``(dhs, dh)``; ``hs`` feeds both sub-layers, so its two gradients are added."""
    dhs_from_sum, da = self.weight_sum_layer.backward(dout)
    dhs_from_weight, dh = self.attention_weight_layer.backward(da)
    return dhs_from_sum + dhs_from_weight, dh


class TimeAttention:
  """Runs one new Attention layer per time step of ``hs_dec``.

  After forward(), ``layers`` holds the per-step Attention objects and
  ``attention_weights`` holds each step's ``attention_weight``.
  """

  def __init__(self):
    self.params, self.grads = [], []
    self.layers = None
    self.attention_weights = None

  def forward(self, hs_enc, hs_dec):
    """Return an array shaped like ``hs_dec`` (N, T, H) with one attention output per step."""
    _, steps, _ = hs_dec.shape
    contexts = np.empty_like(hs_dec)
    self.layers = []
    self.attention_weights = []

    for t in range(steps):
      step_layer = Attention()
      contexts[:, t, :] = step_layer.forward(hs_enc, hs_dec[:, t, :])
      self.layers.append(step_layer)
      self.attention_weights.append(step_layer.attention_weight)

    return contexts

  def backward(self, dout):
    """Return ``(dhs_enc, dhs_dec)`` for the upstream gradient ``dout`` (N, T, H).

    ``hs_enc`` is shared by every step, so its gradient is accumulated over
    all steps; ``dhs_dec`` is filled in one time slice at a time.
    """
    _, steps, _ = dout.shape
    dhs_dec = np.empty_like(dout)
    dhs_enc = 0

    for t in range(steps):
      dhs, dh = self.layers[t].backward(dout[:, t, :])
      dhs_enc += dhs
      dhs_dec[:, t, :] = dh

    return dhs_enc, dhs_dec
  

class AttentionEncoder(Encoder):
  """Encoder whose forward() returns the whole output of its LSTM layer.

  The attention decoder consumes this output as a 3-D array (it indexes
  ``enc_hs[:, -1]`` and attends over the middle axis).
  """

  def forward(self, xs):
    """Embed ``xs`` and return the LSTM output unchanged."""
    embedded = self.embed.forward(xs)
    return self.lstm.forward(embedded)

  def backward(self, dhs):
    """Back-propagate ``dhs`` through lstm -> embed and return the embed layer's result."""
    dembedded = self.lstm.backward(dhs)
    return self.embed.backward(dembedded)


class AttentionDecoder:
  """Decoder with an attention layer between the LSTM and the Affine layer.

  The Affine layer receives the attention output concatenated with the LSTM
  output, which is why ``affine_W`` has ``2 * H`` rows.
  """

  def __init__(self, vocab_size, wordvec_size, hidden_size):
    """Create the Embedding, LSTM, attention and Affine layers.

    Args:
      vocab_size: V, rows of the embedding matrix and columns of the Affine weight.
      wordvec_size: D, width of the embedding vectors.
      hidden_size: H, width of the LSTM hidden state.
    """
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')
    # Affine input is [attention output ; lstm output], hence 2 * H rows.
    affine_W = (rn(2*H, V) / np.sqrt(2*H)).astype('f')
    affine_b = np.zeros(V).astype('f')

    self.embed = TimeEmbedding(embed_W)
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
    self.attention = TimeAttention()
    self.affine = TimeAffine(affine_W, affine_b)
    layers = [self.embed, self.lstm, self.attention, self.affine]

    self.params, self.grads = [], []
    for layer in layers:
      self.params += layer.params
      self.grads += layer.grads

  def forward(self, xs, enc_hs):
    """Compute scores for ``xs`` while attending over ``enc_hs``.

    Args:
      xs: ids passed to the embedding layer.
      enc_hs: encoder output; its last time slice ``enc_hs[:, -1]`` becomes
        the initial LSTM state and the whole array is the attention input.

    Returns:
      The output of the Affine layer.
    """
    h = enc_hs[:,-1]
    self.lstm.set_state(h)

    out = self.embed.forward(xs)
    dec_hs = self.lstm.forward(out)
    c = self.attention.forward(enc_hs, dec_hs)
    out = np.concatenate((c, dec_hs), axis=2)
    score = self.affine.forward(out)

    return score

  def backward(self, dscore):
    """Back-propagate ``dscore`` and return the gradient with respect to ``enc_hs``."""
    dout = self.affine.backward(dscore)
    N, T, H2 = dout.shape
    H = H2 // 2

    # Split the gradient of [attention output ; lstm output].
    dc, ddec_hs0 = dout[:,:,:H], dout[:,:,H:]
    denc_hs, ddec_hs1 = self.attention.backward(dc)
    ddec_hs = ddec_hs0 + ddec_hs1
    dout = self.lstm.backward(ddec_hs)
    dh = self.lstm.dh
    # forward() used enc_hs[:, -1] as the LSTM state, so the state gradient
    # is added to that time slice.
    denc_hs[:, -1] += dh
    self.embed.backward(dout)

    return denc_hs

  def generate(self, enc_hs, start_id, sample_size):
    """Generate ``sample_size`` ids by taking the argmax score at each step.

    Args:
      enc_hs: encoder output; ``enc_hs[:, -1]`` is the initial LSTM state.
      start_id: id fed at the first step.
      sample_size: number of ids to generate.

    Returns:
      List of generated ids as plain Python ints (``start_id`` not included).
    """
    sampled = []
    sample_id = start_id
    h = enc_hs[:, -1]
    self.lstm.set_state(h)

    for _ in range(sample_size):
      x = np.array([sample_id]).reshape((1, 1))

      out = self.embed.forward(x)
      dec_hs = self.lstm.forward(out)
      c = self.attention.forward(enc_hs, dec_hs)
      out = np.concatenate((c, dec_hs), axis=2)
      score = self.affine.forward(out)

      # Convert the NumPy integer from argmax to a Python int, matching the
      # plain decoder's generate() so callers get the same element type.
      sample_id = int(np.argmax(score.flatten()))
      sampled.append(sample_id)

    return sampled


class AttentionSeq2seq(Seq2seq):
  """Seq2seq variant built from AttentionEncoder and AttentionDecoder; the rest is inherited."""

  def __init__(self, vocab_size, wordvec_size, hidden_size):
    """Build the attention encoder, attention decoder and the loss layer."""
    V, D, H = vocab_size, wordvec_size, hidden_size
    self.encoder = AttentionEncoder(V, D, H)
    self.decoder = AttentionDecoder(V, D, H)
    self.softmax = TimeSoftmaxWithLoss()

    enc, dec = self.encoder, self.decoder
    self.params = enc.params + dec.params
    self.grads = enc.grads + dec.grads


In [0]:
# Used in chapter 5
import sys
import numpy
import time
import matplotlib.pyplot as plt

# Settings for the SimpleCBOW training run below.
window_size = 1
hidden_size = 5
batch_size = 3
max_epoch = 1000

# One-sentence corpus; preprocess() lower-cases it, separates '.', splits on
# spaces and returns the id sequence plus the word<->id dictionaries.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

# Build contexts/target from the id sequence with create_contexts_target, then
# pass both through convert_one_hot (helpers defined elsewhere in the file).
vocab_size = len(word_to_id)
contexts, target = create_contexts_target(corpus, window_size)
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)

# Model, optimizer and training driver.
model = SimpleCBOW(vocab_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Train, then plot via the trainer (presumably the loss curve -- confirm in Trainer.plot).
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# Print every word next to its row of model.word_vecs.
word_vecs = model.word_vecs
for word_id, word in id_to_word.items():
    print(word, word_vecs[word_id])
