# RNN Training on 훈민정음

In [1]:
import numpy as np
import re

In [2]:
data = """
나라의 말이 중국과 달라 문자와 서로 통하지 아니하기에 이런 까닭으로 어리석은 백성이 이르고자 할 바가 있어도 마침내 제 뜻을 능히 펴지 못할 사람이 많으니라 내가 이를 위해 가엾이 여겨 새로 스물여덟 글자를 만드노니 사람마다 하여 쉬이 익혀 날로 씀에 편안케 하고자 할 따름이니라
"""

In [3]:
def preprocessing(text):
    data = re.sub('[^가-힣]', ' ', text)
    tokens = data.split()
    vocab = list(set(tokens))
    vocab_size = len(vocab)

    word_to_idx = {word: i for i, word in enumerate(vocab)}
    idx_to_word = {i: word for i, word in enumerate(vocab)}

    return tokens, vocab_size, word_to_idx, idx_to_word

In [6]:
tokens, vocab_size, word_to_idx, idx_to_word = preprocessing(data)
print(tokens)
print()
print(vocab_size)
print()
print(word_to_idx)
print()
print(idx_to_word)

['나라의', '말이', '중국과', '달라', '문자와', '서로', '통하지', '아니하기에', '이런', '까닭으로', '어리석은', '백성이', '이르고자', '할', '바가', '있어도', '마침내', '제', '뜻을', '능히', '펴지', '못할', '사람이', '많으니라', '내가', '이를', '위해', '가엾이', '여겨', '새로', '스물여덟', '글자를', '만드노니', '사람마다', '하여', '쉬이', '익혀', '날로', '씀에', '편안케', '하고자', '할', '따름이니라']

42

{'가엾이': 0, '글자를': 1, '씀에': 2, '어리석은': 3, '백성이': 4, '이르고자': 5, '내가': 6, '바가': 7, '많으니라': 8, '여겨': 9, '서로': 10, '하여': 11, '편안케': 12, '능히': 13, '만드노니': 14, '쉬이': 15, '할': 16, '스물여덟': 17, '하고자': 18, '말이': 19, '까닭으로': 20, '사람마다': 21, '익혀': 22, '이런': 23, '펴지': 24, '따름이니라': 25, '있어도': 26, '위해': 27, '문자와': 28, '아니하기에': 29, '마침내': 30, '뜻을': 31, '새로': 32, '제': 33, '나라의': 34, '날로': 35, '중국과': 36, '통하지': 37, '못할': 38, '달라': 39, '사람이': 40, '이를': 41}

{0: '가엾이', 1: '글자를', 2: '씀에', 3: '어리석은', 4: '백성이', 5: '이르고자', 6: '내가', 7: '바가', 8: '많으니라', 9: '여겨', 10: '서로', 11: '하여', 12: '편안케', 13: '능히', 14: '만드노니', 15: '쉬이', 16: '할', 17: '스물여덟', 18: '하고자', 19: '말이', 20: '까닭으로', 21: '사람마다', 22: '익혀', 23: '이런', 24: '펴지', 25: '따

In [21]:
def init_weights(h_size, vocab_size):
    U = np.random.randn(h_size, vocab_size) * 0.01
    W = np.random.randn(h_size, h_size) * 0.01
    V = np.random.randn(vocab_size, h_size) * 0.01
    return U, W, V

In [22]:
def feedforward(inputs, targets, hprev):
    loss = 0
    xs, hs, ps, ys = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    for i in range(seq_len):
        xs[i] = np.zeros((vocab_size, 1))
        xs[i][inputs[i]] = 1 # one-hot encoding
        hs[i] = np.tanh(np.dot(U,xs[i]) + np.dot(W, hs[i-1])) # hidden_state
        ys[i] = np.dot(V, hs[i]) # o_1 = Vh_1
        ps[i] = np.exp(ys[i]) / np.sum(np.exp(ys[i])) # softmax
        loss += -np.log(ps[i][targets[i], 0]) # cross-entropy
    return loss, ps, hs, xs
        

In [28]:
def backward(ps, hs, xs):
    # backpropagation through time ## BPTT
    dV = np.zeros(V.shape)
    dW = np.zeros(W.shape)
    dU = np.zeros(U.shape)

    for i in range(seq_len)[::-1]:
        output = np.zeros((vocab_size, 1))
        output[targets[i]] = 1
        ps[i] = ps[i] - output.reshape(-1, 1)
        # calculating dL/dvi at each step i
        dV_step_i = ps[i] @ (hs[i]).T # (y_hat - y) @ hs.T

        dV = dV + dV_step_i # add all dv/dVi
        
        # in order to calculate V and W per each i
        # it is better to calculate the common part and save it as delta
        # Then get dL/dWij and dL/dUij
        # get dL/dW and dL/dU by adding up each of them
        # then update delta with it.

        # delta that will be used in the loop
        delta_recent = (V.T @ ps[i]) * (1 - hs[i] ** 2) # (y_hat - y) * V * (1 - h_{i-1}^2)

        for j in range(i + 1)[::-1]:
            dW_ij = delta_recent @ hs[j-1].T
            dW = dW + dW_ij
            dU_ij = delta_recent @ xs[j].reshape(1, -1)
            dU = dU + dU_ij
            
            # updating delta to be used in the next loop.
            delta_recent = (W.T @ delta_recent) * (1-hs[j-1]**2)
        for d in [dU, dW, dV]:
            np.clip(d, -1, 1, out=d)
    return dU, dW, dV, hs[len(inputs) - 1]


In [29]:
def predict(word, length):
    x = np.zeros((vocab_size, 1))
    x[word_to_idx[word]] = 1
    ixes = []
    h = np.zeros((h_size, 1))

    for t in range(length):
        h = np.tanh(np.dot(U, x) + np.dot(W, h))
        y = np.dot(V, h)
        p = np.exp(y) / np.sum(np.exp(y))    # softmax
        idx = np.argmax(p)                   # return the index that has the highest probability
        x = np.zeros((vocab_size, 1))        # prepare the next input
        x[idx] = 1
        ixes.append(idx)
    pred_words = ' '.join(idx_to_word[i] for i in ixes)
    return pred_words

In [30]:
epochs = 10000
h_size = 1000
seq_len = 3
learning_rate = 1e-2

In [31]:
U, W, V = init_weights(h_size, vocab_size)

In [33]:
from tqdm.auto import tqdm

p = 0
hprev = np.zeros((h_size, 1))
for epoch in tqdm(range(epochs)):
    for p in range(len(tokens) - seq_len):
        inputs = [word_to_idx[token] for token in tokens[p:p + seq_len]]
        targets = [word_to_idx[token] for token in tokens[p+1:p + seq_len+1]]

        loss, ps, hs, xs = feedforward(inputs, targets, hprev)

        dU, dW, dV, hprev = backward(ps, hs, xs)
        
        # update weights and biases using gradient descent
        W -= learning_rate * dW
        U -= learning_rate * dU
        V -= learning_rate * dV
        # p += seq_len

    if epoch %100 == 0:
        print(f"epoch {epoch}, loss: {loss}")
        

  0%|          | 0/10000 [00:00<?, ?it/s]

epoch 0, loss: 11.120202059885667
epoch 100, loss: 1.0539873788899932
epoch 200, loss: 0.08152424355107796
epoch 300, loss: 0.04898888026406524
epoch 400, loss: 0.02728792125964602
epoch 500, loss: 0.018152832688487996
epoch 600, loss: 0.012772458748214446
epoch 700, loss: 0.009398687458355058
epoch 800, loss: 0.007171430400222013
epoch 900, loss: 0.005614459393640043
epoch 1000, loss: 0.004496547313407035
epoch 1100, loss: 0.0037082819183412493
epoch 1200, loss: 0.003169889561582531
epoch 1300, loss: 0.0028272919007194474
epoch 1400, loss: 0.0026402149430549394
epoch 1500, loss: 0.002558961380749707
epoch 1600, loss: 0.0025224168161854703
epoch 1700, loss: 0.002476034191083159
epoch 1800, loss: 0.002381518376694911
epoch 1900, loss: 0.0022327983054101544
epoch 2000, loss: 0.0020584587873681962
epoch 2100, loss: 0.0018909908442846513
epoch 2200, loss: 0.0017451348416219412
epoch 2300, loss: 0.0016218391617572497
epoch 2400, loss: 0.0015172569677594679
epoch 2500, loss: 0.00142702463909

In [37]:
while 1:
    try:
        user_input = input('input: ')
        if user_input == 'break':
            break
        response = predict(user_input, 40)
        print(response)
    except:
        print('Try again')

input:  나라의


말이 중국과 달라 문자와 달라 문자와 서로 문자와 서로 통하지 서로 통하지 아니하기에 통하지 아니하기에 이런 아니하기에 이런 까닭으로 이런 까닭으로 어리석은 까닭으로 어리석은 백성이 어리석은 백성이 이르고자 백성이 이르고자 할 이르고자 할 바가 할 바가 있어도 바가 있어도 마침내


input:  break
