<a href="https://colab.research.google.com/github/hyesungKomet/deep_learning/blob/main/ch_6_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 기존 RNN
* 기울기 소실

tanh 노드의 역전파에서는 미분값(1 - y²)이 항상 1 이하이므로, 역전파를 거듭할수록 기울기가 점점 작아져 소실됨
* 기울기 폭발

MatMul 노드의 역전파에서는 매 시각 같은 가중치 행렬(Wh)이 반복해서 곱해지므로, 역전파를 거듭할수록 기울기가 지수적으로 커져 오버플로(NaN)가 발생하거나, 반대로 지수적으로 작아져 소실됨

## 기울기 폭발 대책

기울기 클리핑 - 모든 기울기를 합친 L2 노름이 문턱값(threshold)을 넘으면, 기울기에 (threshold / 노름)을 곱해 노름이 문턱값이 되도록 줄여준다

## 게이트 추가
* output 게이트
  * 다음 LSTM계층에서 이 원소가 얼마나 중요한가
  * Sigmoid 함수를 사용

* forget 게이트
  * 이전 셀에서 불필요한 기억 제거
  * Sigmoid 함수를 사용

* 새로운 기억셀 추가
  * 새로 기억해야할 정보 추가
  * tanh 함수를 사용(게이트가 아님)

* input 게이트
  * g(새로운 기억셀 추가)에서 추가되는 정보의 가치가 얼마나 큰지 판단
  * Sigmoid 함수를 사용

In [None]:
%cd /content/drive/MyDrive/machine_learning

/content/drive/MyDrive/machine_learning


In [None]:
%cd deep-learning-from-scratch-2

/content/drive/MyDrive/machine_learning/deep-learning-from-scratch-2


In [None]:
import numpy as np

## 기울기 클리핑

In [None]:
# Sample data for trying out gradient clipping: two 3x3 "gradients" with
# entries drawn uniformly from [0, 10), plus the norm threshold to clip at.
max_norm = 5.0
dw1 = 10 * np.random.rand(3, 3)
dw2 = 10 * np.random.rand(3, 3)
grads = [dw1, dw2]

In [None]:
def clip_grads(grads, max_norm):
  """Scale gradients in place so their combined L2 norm is at most max_norm.

  The norm is taken over all arrays in ``grads`` together (square root of
  the sum of every squared element). When that norm exceeds ``max_norm``,
  each array is multiplied in place by ``max_norm / (norm + 1e-6)``;
  otherwise the arrays are left untouched.

  Args:
    grads: iterable of arrays supporting ``**``, ``np.sum`` and in-place
      ``*=``. The arrays are modified in place.
    max_norm: threshold for the combined L2 norm.

  Returns:
    None.
  """
  squared_total = sum(np.sum(g ** 2) for g in grads)
  global_norm = np.sqrt(squared_total)

  # The small epsilon keeps the division finite when every gradient is zero.
  scale = max_norm / (global_norm + 1e-6)
  if not scale < 1:
    return

  for g in grads:
    g *= scale

In [None]:
class LSTM:
  """Single-time-step LSTM layer with explicit forward and backward passes.

  The affine transforms for the forget gate (f), the candidate memory (g),
  the input gate (i) and the output gate (o) are packed side by side, in
  that column order, into one weight set so that a single matrix product
  computes all four at once.
  """

  def __init__(self, Wx, Wh, b):
    """Store the packed parameters and allocate matching gradient buffers.

    Args:
      Wx: input-to-hidden weights, packed as (D, 4H).
      Wh: hidden-to-hidden weights, packed as (H, 4H).
      b: bias added to the packed pre-activations.
    """
    self.params = [Wx, Wh, b]
    self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
    self.cache = None  # intermediate results kept by forward() for backward()

  @staticmethod
  def _sigmoid(x):
    """Logistic sigmoid, kept local so the class needs no external helper."""
    return 1 / (1 + np.exp(-x))

  def forward(self, x, h_prev, c_prev):
    """Advance the cell by one time step.

    Args:
      x: input at this time step, (N, D).
      h_prev: previous hidden state, (N, H).
      c_prev: previous cell state, same shape as h_prev.

    Returns:
      Tuple ``(h_next, c_next)`` with the new hidden and cell states.
    """
    Wx, Wh, b = self.params
    _, H = h_prev.shape

    # Shapes: (N, D) x (D, 4H) + (N, H) x (H, 4H) -> (N, 4H)
    A = np.matmul(x, Wx) + np.matmul(h_prev, Wh) + b

    # Slice the packed pre-activations back into their four parts.
    f = self._sigmoid(A[:, :H])
    g = np.tanh(A[:, H:2*H])
    i = self._sigmoid(A[:, 2*H:3*H])
    o = self._sigmoid(A[:, 3*H:])

    # Keep what the forget gate allows, then add the gated new information.
    c_next = f * c_prev + g * i
    # The output gate decides how much of the cell state is exposed.
    h_next = o * np.tanh(c_next)

    self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
    return h_next, c_next

  def backward(self, dh_next, dc_next):
    """Backpropagate through one time step; forward() must have run first.

    Args:
      dh_next: gradient with respect to h_next.
      dc_next: gradient with respect to c_next.

    Returns:
      Tuple ``(dx, dh_prev, dc_prev)``. The parameter gradients are written
      in place into ``self.grads`` in the order (Wx, Wh, b).
    """
    Wx, Wh, b = self.params
    x, h_prev, c_prev, i, f, g, o, c_next = self.cache

    tanh_c_next = np.tanh(c_next)

    # Total gradient reaching c_next: the direct path plus the path through
    # h_next = o * tanh(c_next).
    ds = dc_next + (dh_next * o) * (1 - tanh_c_next ** 2)

    dc_prev = ds * f

    di = ds * g
    df = ds * c_prev
    do = dh_next * tanh_c_next
    dg = ds * i

    # Push each gradient back through its activation function.
    di *= i * (1 - i)
    df *= f * (1 - f)
    do *= o * (1 - o)
    dg *= (1 - g ** 2)

    # Backward of the slice node: concatenate horizontally in the same
    # f, g, i, o column order that forward() sliced.
    dA = np.hstack((df, dg, di, do))

    dWh = np.dot(h_prev.T, dA)
    dWx = np.dot(x.T, dA)
    db = dA.sum(axis=0)

    # Write in place so external references to the gradient arrays stay valid.
    self.grads[0][...] = dWx
    self.grads[1][...] = dWh
    self.grads[2][...] = db

    dx = np.dot(dA, Wx.T)
    dh_prev = np.dot(dA, Wh.T)

    return dx, dh_prev, dc_prev

In [None]:
class TimeLSTM:
  """Runs an LSTM cell over every time step of a batch of sequences.

  One ``LSTM`` layer object is created per time step, all sharing the same
  parameters. With ``stateful=True`` the hidden and cell states are carried
  over between successive ``forward`` calls; otherwise they restart at zero.
  """

  def __init__(self, Wx, Wh, b, stateful=False):
    """Store shared parameters and initialise empty state.

    Args:
      Wx: input-to-hidden weights; its first dimension is taken as the
        input size D.
      Wh: hidden-to-hidden weights; its first dimension is taken as the
        hidden size H.
      b: bias.
      stateful: keep h and c between forward() calls when True.
    """
    self.params = [Wx, Wh, b]
    self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
    self.layers = None
    self.h, self.c = None, None
    self.dh = None
    self.stateful = stateful

  def forward(self, xs):
    """Process a whole batch of sequences.

    Args:
      xs: inputs of shape (N, T, D).

    Returns:
      Hidden states for every time step, shape (N, T, H), dtype float32.
    """
    Wh = self.params[1]
    N, T, D = xs.shape
    H = Wh.shape[0]

    self.layers = []
    hs = np.empty((N, T, H), dtype='f')

    # Start from zeros unless stateful and a previous state is available.
    if not self.stateful or self.h is None:
      self.h = np.zeros((N, H), dtype='f')
    if not self.stateful or self.c is None:
      self.c = np.zeros((N, H), dtype='f')

    for t in range(T):
      layer = LSTM(*self.params)
      self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
      hs[:, t, :] = self.h

      # Keep the per-step layer; backward() needs its cached activations.
      self.layers.append(layer)

    return hs

  def backward(self, dhs):
    """Backpropagate through all time steps (last to first).

    Args:
      dhs: gradients with respect to the hidden states, shape (N, T, H).

    Returns:
      Gradients with respect to the inputs, shape (N, T, D). Parameter
      gradients summed over time are written into ``self.grads``, and the
      hidden-state gradient leaving the first step is stored in ``self.dh``.
    """
    Wx = self.params[0]
    N, T, H = dhs.shape
    D = Wx.shape[0]

    dxs = np.empty((N, T, D), dtype='f')
    dh, dc = 0, 0

    grads = [0, 0, 0]
    for t in reversed(range(T)):
      layer = self.layers[t]
      # Gradient from the layer above plus the one flowing back through time.
      dx, dh, dc = layer.backward(dhs[:, t, :] + dh, dc)
      dxs[:, t, :] = dx
      for i, grad in enumerate(layer.grads):
        grads[i] += grad

    for i, grad in enumerate(grads):
      self.grads[i][...] = grad

    self.dh = dh
    return dxs

  def set_state(self, h, c=None):
    """Set the hidden state h and cell state c used by the next forward()."""
    self.h, self.c = h, c

  def reset_state(self):
    """Clear the stored states so the next forward() starts from zeros."""
    self.h, self.c = None, None


## RNNLM 개선
* LSTM 계층의 다층화
* 드롭아웃 사용(깊이 방향으로만 적용)
* 가중치 공유(Embedding, Affine 계층에서 가중치 공유)

In [None]:
import sys
sys.path.append('..')
from common.time_layers import *
from common.np import  *
from common.base_model  import BaseModel

In [None]:
class BetterRnnlm(BaseModel):
  """Two-layer LSTM language model with dropout and tied weights.

  Layer stack: embedding -> dropout -> LSTM -> dropout -> LSTM -> dropout
  -> affine, followed by a softmax-with-loss layer over the vocabulary.
  The affine layer reuses the transposed embedding matrix as its weight.
  """

  def __init__(self, vocab_size=10000, wordvec_size=650, hidden_size=650, dropout_ratio=0.5):
    """Initialise weights and build the layer stack.

    Args:
      vocab_size: number of words V.
      wordvec_size: embedding dimension D.
      hidden_size: LSTM hidden dimension H.
      dropout_ratio: ratio passed to every TimeDropout layer.
    """
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx1 = (rn(D, 4*H) / np.sqrt(D)).astype('f')
    lstm_Wh1 = (rn(H, 4*H) / np.sqrt(H)).astype('f')
    lstm_b1 = np.zeros(4*H).astype('f')
    lstm_Wx2 = (rn(H, 4*H) / np.sqrt(H)).astype('f')
    lstm_Wh2 = (rn(H, 4*H) / np.sqrt(H)).astype('f')
    lstm_b2 = np.zeros(4*H).astype('f')
    affine_b = np.zeros(V).astype('f')

    # Improvements over a plain RNNLM: stacked LSTMs, dropout between
    # layers, and weight tying between the embedding and affine layers.
    # NOTE(review): tying passes embed_W.T (D x V) to the affine layer, which
    # presumably requires wordvec_size == hidden_size - confirm against
    # TimeAffine.
    self.layers = [
        TimeEmbedding(embed_W),
        TimeDropout(dropout_ratio),
        TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
        TimeDropout(dropout_ratio),
        TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
        TimeDropout(dropout_ratio),
        TimeAffine(embed_W.T, affine_b),  # shares the embedding weights
    ]
    # Next-word prediction is a choice among V classes, so the loss is a
    # softmax cross-entropy rather than a sigmoid loss.
    self.loss_layer = TimeSoftmaxWithLoss()
    self.lstm_layers = [self.layers[2], self.layers[4]]
    self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

    self.params, self.grads = [], []
    for layer in self.layers:
      self.params += layer.params
      self.grads += layer.grads

  def predict(self, xs, train_flg=False):
    """Run the layer stack and return the scores from the last layer.

    Args:
      xs: input batch handed to the first layer.
      train_flg: value assigned to ``train_flg`` on every dropout layer.
    """
    for layer in self.drop_layers:
      layer.train_flg = train_flg
    for layer in self.layers:
      xs = layer.forward(xs)
    return xs

  def forward(self, xs, ts, train_flg=True):
    """Return the loss of the model's scores for ``xs`` against ``ts``."""
    score = self.predict(xs, train_flg)
    loss = self.loss_layer.forward(score, ts)
    return loss

  def backward(self, dout=1):
    """Backpropagate from the loss layer through the stack in reverse order."""
    dout = self.loss_layer.backward(dout)
    for layer in reversed(self.layers):
      dout = layer.backward(dout)
    return dout

  def reset_state(self):
    """Reset the stored state of both LSTM layers."""
    for layer in self.lstm_layers:
      layer.reset_state()

In [None]:
# coding: utf-8
import sys
sys.path.append('..')
from common import config
# GPU에서 실행하려면 아래 주석을 해제하세요(CuPy 필요).
# ==============================================
config.GPU = False
# ==============================================
from common.optimizer import SGD
from common.trainer import RnnlmTrainer
from common.util import eval_perplexity, to_gpu
from dataset import ptb
from better_rnnlm import BetterRnnlm

ModuleNotFoundError: ignored

In [None]:
# Hyperparameter settings
# Batching: sequences per mini-batch and time steps per training window.
batch_size, time_size = 20, 35
# Model size: embedding and hidden dimensions share one value.
wordvec_size = hidden_size = 650
dropout = 0.5
# Optimisation: initial learning rate, epoch budget, gradient-clipping threshold.
lr, max_epoch, max_grad = 20.0, 40, 0.25

In [None]:
# Load the training, validation and test splits
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_val, _, _ = ptb.load_data('val')
corpus_test, _, _ = ptb.load_data('test')

# Hand every split to to_gpu() when GPU mode is switched on
if config.GPU:
    corpus, corpus_val, corpus_test = (
        to_gpu(corpus),
        to_gpu(corpus_val),
        to_gpu(corpus_test),
    )

vocab_size = len(word_to_id)
# Inputs and targets are the same corpus shifted by one position.
xs, ts = corpus[:-1], corpus[1:]

model = BetterRnnlm(vocab_size, wordvec_size, hidden_size, dropout)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

best_ppl = float('inf')
for epoch in range(max_epoch):
    # Train for a single epoch, then evaluate on the validation split.
    trainer.fit(
        xs,
        ts,
        max_epoch=1,
        batch_size=batch_size,
        time_size=time_size,
        max_grad=max_grad,
    )

    model.reset_state()
    ppl = eval_perplexity(model, corpus_val)
    print('검증 퍼플렉서티: ', ppl)

    if not best_ppl > ppl:
        # No improvement: cut the learning rate to a quarter.
        lr /= 4.0
        optimizer.lr = lr
    else:
        # New best validation score: remember it and save the parameters.
        best_ppl = ppl
        model.save_params()

    model.reset_state()
    print('-' * 50)

NotImplementedError: ignored