In [1]:
!pip install jamotools

Collecting jamotools
  Downloading jamotools-0.1.10-py2.py3-none-any.whl (13 kB)
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 318 kB/s eta 0:00:01
Building wheels for collected packages: future
  Building wheel for future (setup.py) ... [?25l- \ | / - done
[?25h  Created wheel for future: filename=future-0.18.2-py3-none-any.whl size=491059 sha256=3b254ed8a9a9f8abe505de22d8557a2261de2a245ae6b7eebd6bed2ac05644aa
  Stored in directory: /Users/jk/Library/Caches/pip/wheels/8e/70/28/3d6ccd6e315f65f245da085482a2e1c7d14b90b30f239e2cf4
Successfully built future
Installing collected packages: future, jamotools
Successfully installed future-0.18.2 jamotools-0.1.10


In [None]:
import tensorflow as tf
import numpy as np
import jamotools

# Fetch the corpus (get_file downloads it only if it is not already cached)
# and keep the local path.
path_to_file = tf.keras.utils.get_file('toji.txt', 'https://raw.githubusercontent.com/pykwon/etc/master/rnn_test_toji.txt')

# Load the corpus as UTF-8. The file is read as bytes and decoded explicitly so
# the text is unchanged (no newline translation); the context manager makes
# sure the file handle is closed.
with open(path_to_file, 'rb') as corpus_file:
    train_text = corpus_file.read().decode(encoding='utf-8')

# Report the corpus length
print('Length of text: {} characters'.format(len(train_text)))
print()

# Split Hangul syllables into individual jamo; other characters such as Hanja
# are left as they are.
s = train_text[:100]
s_split = jamotools.split_syllables(s)
print(s_split)

In [None]:
# Round-trip check: recombine the jamo into syllables, show the result, and
# show whether it equals the original sample.
s2 = jamotools.join_jamos(s_split)
print(s2, s == s2, sep='\n')


In [None]:
# Jamo tokenization
# Decompose the whole corpus into jamo. This takes a while.
train_text_X = jamotools.split_syllables(train_text)

# Vocabulary: every distinct symbol in sorted order, plus a trailing 'UNK' entry.
vocab = sorted(set(train_text_X)) + ['UNK']
print ('{} unique characters'.format(len(vocab)))

# Lookup tables in both directions: symbol -> index and index -> symbol.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the corpus as an array of vocabulary indices.
text_as_int = np.array([char2idx[symbol] for symbol in train_text_X])

In [None]:
# Preview the first 20 tokens: the raw jamo, then their integer codes.
print(train_text_X[:20], text_as_int[:20], sep='\n')

In [None]:
# Build the training dataset
seq_length = 80
# Each chunk below holds seq_length + 1 tokens, so that is the divisor for the
# number of examples in one pass over the corpus.
examples_per_epoch = len(text_as_int) // (seq_length + 1)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Group the token stream into chunks of seq_length + 1 tokens; drop_remainder
# discards the final chunk if it is shorter than that.
char_dataset = char_dataset.batch(seq_length+1, drop_remainder=True)
for item in char_dataset.take(1):
    print(idx2char[item.numpy()])
    print(item.numpy())

def split_input_target(chunk):
    """Split a chunk into the model input and the prediction target.

    Args:
        chunk: an indexable sequence (supports ``[:-1]`` and ``[-1]``).

    Returns:
        A two-element list ``[inputs, target]`` where ``inputs`` is every
        element except the last and ``target`` is the last element.
    """
    inputs, target = chunk[:-1], chunk[-1]
    return [inputs, target]

# Turn every chunk into an (input, target) pair.
train_dataset = char_dataset.map(split_input_target)

# Show the first three pairs, each as characters and then as integer codes.
for x, y in train_dataset.take(3):
    for tensor in (x, y):
        ids = tensor.numpy()
        print(idx2char[ids])
        print(ids)

BATCH_SIZE = 64
BUFFER_SIZE = 5000
steps_per_epoch = examples_per_epoch // BATCH_SIZE

# Shuffle with a bounded buffer, then form fixed-size batches.
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Define the jamo-level generation model
total_chars = len(vocab)

model = tf.keras.Sequential()
# Map each token index to a 100-dimensional vector.
model.add(tf.keras.layers.Embedding(total_chars, 100, input_length=seq_length))
# return_sequences is left at its default, so only the final LSTM output is passed on.
model.add(tf.keras.layers.LSTM(units=400, activation='tanh'))
# Softmax over the vocabulary.
model.add(tf.keras.layers.Dense(total_chars, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


In [None]:
# Train the jamo-level generation model
from tensorflow.keras.preprocessing.sequence import pad_sequences

EPOCHS = 50  # total training epochs; also used to recognise the final epoch

def testmodel(epoch, logs):
    """Generate and print a text sample at the end of selected epochs.

    Runs on every 5th epoch (0, 5, 10, ...) and on the final epoch
    (``EPOCHS - 1``); returns immediately on all other epochs.

    Args:
        epoch: zero-based epoch index passed by the Keras callback.
        logs: metrics dict passed by the Keras callback (unused).
    """
    if epoch % 5 != 0 and epoch != EPOCHS - 1:
        return

    # Seed with the first 48 characters of the corpus, decomposed into jamo.
    test_sentence = train_text[:48]
    test_sentence = jamotools.split_syllables(test_sentence)

    unk_idx = char2idx['UNK']
    next_chars = 300
    for _ in range(next_chars):
        # Use the last seq_length symbols as context; unknown symbols map to
        # 'UNK' and shorter contexts are left-padded with 'UNK'.
        test_text_X = test_sentence[-seq_length:]
        test_text_X = np.array([char2idx.get(c, unk_idx) for c in test_text_X])
        test_text_X = pad_sequences([test_text_X], maxlen=seq_length, padding='pre', value=unk_idx)

        # predict_classes was removed from Keras; take the argmax of the
        # softmax output instead. verbose=0 keeps the loop from printing a
        # progress bar for every generated symbol.
        output_idx = np.argmax(model.predict(test_text_X, verbose=0), axis=-1)
        test_sentence += idx2char[output_idx[0]]

    print()
    print(jamotools.join_jamos(test_sentence))
    print()

testmodelcb = tf.keras.callbacks.LambdaCallback(on_epoch_end=testmodel)

history = model.fit(train_dataset.repeat(), epochs=EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[testmodelcb], verbose=2)


In [None]:
# Save the trained model to disk.
model.save("rnnmodel2.hdf5")

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Seed sentence, decomposed into jamo.
test_sentence = '최참판댁 사랑은 무인지경처럼 적막하다'
test_sentence = jamotools.split_syllables(test_sentence)

unk_idx = char2idx['UNK']
next_chars = 5000
for _ in range(next_chars):
    # Use the last seq_length symbols as context; unknown symbols map to 'UNK'
    # and shorter contexts are left-padded with 'UNK'.
    test_text_X = test_sentence[-seq_length:]
    test_text_X = np.array([char2idx.get(c, unk_idx) for c in test_text_X])
    test_text_X = pad_sequences([test_text_X], maxlen=seq_length, padding='pre', value=unk_idx)

    # predict_classes was removed from Keras; take the argmax of the softmax
    # output instead. verbose=0 avoids one progress bar per generated symbol.
    output_idx = np.argmax(model.predict(test_text_X, verbose=0), axis=-1)
    test_sentence += idx2char[output_idx[0]]


# Recombine the generated jamo into syllables and print the text.
print(jamotools.join_jamos(test_sentence))
