<a href="https://colab.research.google.com/github/jeahoyang/DeepLearning_practcie/blob/main/colab/VanillaRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import numpy as np
import re

In [29]:
data = """
나라의 말이 중국과 달라 문자와 서로 통하지 아니하니, 이런 까닭으로 어리석은 백성이 이르고자 할 바가 있어도 마침내 제 뜻을 능히 펴지 못할 사람이 많다. 내가 이를 위하여 가엾이 여겨 새로 스물여덟 자를 만드노니 사람마다 하여금 쉬이 익혀 날마다 쓰는 데 편하게 하고자 할 따름이다.
"""

In [30]:
def data_preprocessing(data):
  data = re.sub('[^가-힣]', ' ', data)
  tokens = data.split()
  vocab = list(set(tokens))
  vocab_size = len(vocab)

  word2ix = {word: i for i, word in enumerate(vocab)}
  ix2word = {i: word for i, word in enumerate(vocab)}

  return tokens, vocab_size, word2ix, ix2word

In [31]:
def init_weights(h_size, vocab_size):
  U = np.random.randn(h_size, vocab_size) * 0.01
  W = np.random.randn(h_size, h_size) * 0.01
  V = np.random.randn(vocab_size, h_size) * 0.01
  return U,W,V

In [44]:
def feedforward(inputs, targets, hprev):
  loss = 0
  xs, hs, ps, ys = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  for i in range(seq_len):
    xs[i] = np.zeros((vocab_size, 1))
    xs[i][inputs[i]] = 1 # one hot encoding for each word
    hs[i] = np.tanh(np.dot(U, xs[i]) + np.dot(W, hs[i-1]))
    ys[i] = np.dot(V, hs[i])
    ps[i] = np.exp(ys[i]) / np.sum(np.exp(ys[i])) # softmax calculate
    loss += -np.log(ps[i][targets[i], 0]) # cross entropy

  return loss, ps, hs, xs

In [65]:
def backward(ps, hs, xs):

  # Backwaard propagation through time (BPTT)
  # Initial bias set to 0
  dV = np.zeros(V.shape)
  dW = np.zeros(W.shape)
  dU = np.zeros(U.shape)

  for i in range(seq_len)[::-1]: # This is because to go in reverse order
    output = np.zeros((vocab_size, 1))
    output[targets[i]] = 1
    ps[i] = ps[i] - output.reshape(-1,1)
    dV_step_i = ps[i] @ (hs[i]).T # (y_hat - y) @ hs.T  for each step

    dV = dV + dV_step_i # sum all of dL/dVi

    # To find V and W for each i,
    # first calculate the commonly calculated part as delta,
    # then go back in time to find dL/dWij and dL/dUij,
    # add them together to get dL/dW and dL/dU,
    # and update the commonly calculated delta again.

    # delta to be commonly used in the ith step
    delta_recent = (V.T @ ps[i]) * (1 - hs[i] ** 2)

    # Go back in time and find dL/dW and dL/dU
    for j in range(i+1)[::-1]:
      dW_ij = delta_recent @ hs[j - 1].T
      dW = dW + dW_ij

      dU_ij = delta_recent @ xs[j].reshape(1, -1)
      dU = dU + dU_ij

      # Delta update to be commonly calculated in the next step j
      delta_recent = (W.T @ delta_recent) * (1 - hs[j - 1] ** 2)

      for d in [dV, dW, dU]:
        np.clip(d, -1, 1, out=d)

  return dV, dW, dU, hs[len(inputs) -1]

In [66]:
def predict(word, length):
  x = np.zeros((vocab_size, 1))
  x[word2ix[word]] = 1
  ixes = []
  h = np.zeros((h_size, 1))

  for t in range(length):
    h = np.tanh(np.dot(U, x) + np.dot(W, h))
    y = np.dot(V, h)
    p = np.exp(y) / np.sum(np.exp(y)) # softmax
    ix = np.argmax(p)                 # Return the index with the highest probability
    x = np.zeros((vocab_size, 1))     # prepare next input x
    x[ix] = 1
    ixes.append(ix)

  pred_words = ' '.join(ix2word[i] for i in ixes)

  return pred_words

In [67]:
# parameters

epochs = 10000
h_size = 100
seq_len = 3
learning_rate = 1e-2

In [68]:
tokens, vocab_size, word2ix, ix2word = data_preprocessing(data)

In [69]:
tokens

['나라의',
 '말이',
 '중국과',
 '달라',
 '문자와',
 '서로',
 '통하지',
 '아니하니',
 '이런',
 '까닭으로',
 '어리석은',
 '백성이',
 '이르고자',
 '할',
 '바가',
 '있어도',
 '마침내',
 '제',
 '뜻을',
 '능히',
 '펴지',
 '못할',
 '사람이',
 '많다',
 '내가',
 '이를',
 '위하여',
 '가엾이',
 '여겨',
 '새로',
 '스물여덟',
 '자를',
 '만드노니',
 '사람마다',
 '하여금',
 '쉬이',
 '익혀',
 '날마다',
 '쓰는',
 '데',
 '편하게',
 '하고자',
 '할',
 '따름이다']

In [70]:
vocab_size

43

In [71]:
word2ix

{'바가': 0,
 '하여금': 1,
 '까닭으로': 2,
 '하고자': 3,
 '어리석은': 4,
 '사람이': 5,
 '쓰는': 6,
 '스물여덟': 7,
 '백성이': 8,
 '만드노니': 9,
 '가엾이': 10,
 '익혀': 11,
 '편하게': 12,
 '많다': 13,
 '제': 14,
 '문자와': 15,
 '중국과': 16,
 '내가': 17,
 '서로': 18,
 '날마다': 19,
 '이를': 20,
 '여겨': 21,
 '아니하니': 22,
 '있어도': 23,
 '쉬이': 24,
 '데': 25,
 '통하지': 26,
 '못할': 27,
 '이런': 28,
 '자를': 29,
 '나라의': 30,
 '마침내': 31,
 '따름이다': 32,
 '펴지': 33,
 '능히': 34,
 '말이': 35,
 '달라': 36,
 '할': 37,
 '위하여': 38,
 '새로': 39,
 '사람마다': 40,
 '뜻을': 41,
 '이르고자': 42}

In [72]:
ix2word

{0: '바가',
 1: '하여금',
 2: '까닭으로',
 3: '하고자',
 4: '어리석은',
 5: '사람이',
 6: '쓰는',
 7: '스물여덟',
 8: '백성이',
 9: '만드노니',
 10: '가엾이',
 11: '익혀',
 12: '편하게',
 13: '많다',
 14: '제',
 15: '문자와',
 16: '중국과',
 17: '내가',
 18: '서로',
 19: '날마다',
 20: '이를',
 21: '여겨',
 22: '아니하니',
 23: '있어도',
 24: '쉬이',
 25: '데',
 26: '통하지',
 27: '못할',
 28: '이런',
 29: '자를',
 30: '나라의',
 31: '마침내',
 32: '따름이다',
 33: '펴지',
 34: '능히',
 35: '말이',
 36: '달라',
 37: '할',
 38: '위하여',
 39: '새로',
 40: '사람마다',
 41: '뜻을',
 42: '이르고자'}

In [73]:
U, W, V = init_weights(h_size, vocab_size)

In [75]:
p = 0
hprev = np.zeros((h_size, 1))
for epoch in range(epochs):

  for p in range(len(tokens)-seq_len):
    inputs = [word2ix[word] for word in tokens[p:p + seq_len]]
    targets = [word2ix[word] for word in tokens[p + 1:p + seq_len + 1]]

    loss, ps, hs, xs = feedforward(inputs, targets, hprev)
    dV, dW, dU, hprev = backward(ps, hs, xs)

    # Update weights and biases using gradient descent
    W -= learning_rate * dW
    U -= learning_rate * dU
    V -= learning_rate * dV

    # p += seq_len

  if epoch % 100 == 0:
    print(f'epoch: {epoch}, loss: {loss}')

epoch: 0, loss: 0.16826204103608933
epoch: 100, loss: 0.045075354562351205
epoch: 200, loss: 0.02719595139490123
epoch: 300, loss: 0.020454189133170063
epoch: 400, loss: 0.01680663150625597
epoch: 500, loss: 0.014438260748168024
epoch: 600, loss: 0.012737071948426326
epoch: 700, loss: 0.01143433470452837
epoch: 800, loss: 0.010391864227331666
epoch: 900, loss: 0.009531135746568876
epoch: 1000, loss: 0.008804103547274865
epoch: 1100, loss: 0.008179548716693729
epoch: 1200, loss: 0.00763613833886128
epoch: 1300, loss: 0.007158653728040396
epoch: 1400, loss: 0.0067358183168768435
epoch: 1500, loss: 0.006358987639106922
epoch: 1600, loss: 0.006021334450523641
epoch: 1700, loss: 0.005717332178962401
epoch: 1800, loss: 0.005442421900738794
epoch: 1900, loss: 0.005192791947861225
epoch: 2000, loss: 0.004965226018936524
epoch: 2100, loss: 0.00475699313049654
epoch: 2200, loss: 0.004565763935241503
epoch: 2300, loss: 0.004389544646858366
epoch: 2400, loss: 0.004226623546626689
epoch: 2500, loss

In [76]:
while 1:
  try:
    user_input = input("input: ")
    if user_input == 'break':
      break
    response = predict(user_input, 20)
    print(response)

  except:
    print('try again!')

말이 중국과 달라 문자와 달라 문자와 달라 문자와 서로 통하지 서로 통하지 서로 통하지 아니하니 통하지 아니하니 이런 아니하니 이런
input: break
