## 텍스트 생성 - 단일 문장 생성

In [None]:
# 사용 모듈 import 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from konlpy.tag import Okt

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, RNN, LSTM, GRU, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers
from pprint import pprint

In [None]:
# Load the dataset from the Excel workbook into a DataFrame.
d_path = './data/4_문어체_한국문화_190920.xlsx'
xlsx = pd.read_excel(d_path)
# Trailing expression: previews the first rows when run as a notebook cell.
xlsx.head()

In [None]:
# Inspect the '원문' column of the loaded DataFrame.
print(xlsx['원문'])

In [None]:
# Take the '원문' column as an array of sentences and inspect its size and
# first entries.
sentences = xlsx['원문'].values
print(len(sentences))
print(sentences[:5])
# Only the first 100 sentences are used from here on.
sentences = sentences[:100]

In [None]:
# Preprocessing: tokenize every sentence with Okt, collecting the vocabulary
# (words_set) and the token count of the longest sentence (max_len).
okt = Okt()
words_set = set()
max_len = 0
for sentence in sentences:
    # okt.pos yields (token, tag) pairs; only the token text is kept.
    words = [word for word, _ in okt.pos(sentence)]
    max_len = max(max_len, len(words))
    words_set.update(words)

In [None]:
# Inspect the vocabulary size, then freeze the vocabulary into a sorted list
# whose index 0 is the empty string (index 0 is used as padding elsewhere).
print(len(words_set))
# Any '' already present is dropped before prepending it, so re-running this
# cell does not insert a second '' entry and shift every word index.
words_set = [''] + sorted(set(words_set) - {''})
print(max_len)

In [None]:
# Build the training data: for each tokenized sentence, every prefix of at
# least two tokens, left-padded with 0 up to max_len columns.

# Word -> index lookup built once, instead of a linear words_set.index() scan
# per token. Iterating in reverse keeps the FIRST index of any repeated entry,
# which is what list.index() would return.
word_to_index = {
    word: index for index, word in reversed(list(enumerate(words_set)))
}

train_sentences = []

for sentence in sentences:
    words = [word_to_index[word] for word, _ in okt.pos(sentence)]
    # Sentences with three tokens or fewer contribute no training rows.
    if len(words) > 3:
        for i in range(1, len(words)):
            prefix = words[:i+1]
            # Left-pad in one step; a non-positive count yields no padding.
            train_sentence = [0] * (max_len - len(prefix)) + prefix
            train_sentences.append(train_sentence)
train_sentences = np.asarray(train_sentences).astype('float')

In [None]:
# Inspect the first few training rows.
print(train_sentences[:5])

In [None]:
# Split each training row into input (all columns but the last) and
# output (the last column).
x_train = train_sentences[:, :-1]
y_train = train_sentences[:, -1]

In [None]:
# Inspect the first few inputs and outputs.
pprint(x_train[:5])
pprint(y_train[:5])

In [None]:
# One-hot encode the outputs, with one class per words_set entry.
y_train = to_categorical(y_train, num_classes=len(words_set))

In [None]:
# Inspect the encoded outputs and their shape.
print(y_train[:5])
print(y_train.shape)

In [None]:
# Callback class used to record the loss during training.
class LossHistory(keras.callbacks.Callback):
    """Keras callback that collects the ``loss`` value at the end of each epoch."""

    def __init__(self):
        super().__init__()
        # One entry per finished epoch, in order; ready without calling init().
        self.losses = []

    def init(self):
        """Reset the recorded losses.

        Kept so that existing ``history.init()`` calls continue to work.
        """
        self.losses = []

    def on_epoch_end(self, batch, logs=None):
        """Append the ``loss`` entry of ``logs`` to ``self.losses``.

        Args:
            batch: Index passed positionally by Keras to this hook (the epoch
                number per the Keras Callback API); unused here.
            logs: Metrics dict for the epoch. ``None`` is treated as empty,
                in which case ``None`` is recorded.
        """
        self.losses.append((logs or {}).get('loss'))

In [None]:
# Create and compile the model: Embedding -> LSTM -> softmax over words_set.
history = LossHistory()
history.init()

model = Sequential()
# mask_zero=True marks index 0 (the padding value) as masked for later layers.
model.add(Embedding(input_dim=len(words_set), output_dim=300, mask_zero=True))
# No input_shape here: the LSTM is not the first layer, so its input shape
# is taken from the Embedding output.
model.add(LSTM(units=64))
model.add(Dense(len(words_set), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Check the model output on the first training input, reshaped to a batch
# of one row (this cell comes before the training cell).
temp1 = model.predict(x_train[0].reshape(1, -1))
print(temp1)
print(temp1.shape)

In [None]:
# Train the model; the history callback is passed in to record the loss.
model.fit(x_train, y_train, epochs=90, batch_size=128, callbacks=[history])

In [None]:
# Plot the recorded losses against their position in the list.
loss = history.losses
epochs = range(len(loss))
plt.plot(epochs, loss)
plt.show()

In [None]:
# Test the model: predict again on the first training input (this cell
# comes after the training cell).
temp1 = model.predict(x_train[0].reshape(1, -1))
print(temp1)
print(temp1.shape)

In [None]:
# Function that generates a sentence with the model.
def sentence_generation(model, cur_word, n):
    """Extend ``cur_word`` by ``n`` predicted words.

    Relies on the module-level ``okt``, ``words_set`` and ``max_len``.

    Args:
        model: Object whose ``predict`` returns one score per ``words_set``
            entry for a single encoded row.
        cur_word: Seed text; it is tokenized with ``okt.pos``.
        n: Number of words to append.

    Returns:
        The seed text followed by the predicted words, joined by spaces.

    Raises:
        ValueError: If a token of the current text is not in ``words_set``.
    """
    # Training inputs are rows of max_len columns minus the final target
    # column, so the context is padded to max_len - 1 tokens.
    context_len = max_len - 1
    sentence = cur_word
    for _ in range(n):
        encoded = [words_set.index(word) for word, _ in okt.pos(sentence)]
        # Pad by the number of TOKENS; len(sentence) would be the character
        # count of the string and gives the wrong amount of padding.
        padding = [0] * max(0, context_len - len(encoded))
        encoded = np.asarray(padding + encoded).astype('float').reshape(1, -1)
        scores = np.squeeze(model.predict(encoded))
        # argmax returns the first index of the highest score.
        pred_word_idx = int(np.argmax(scores))
        pred_word = words_set[pred_word_idx]
        sentence = sentence + ' ' + pred_word
    return sentence

In [None]:
# Generate a sentence from a single seed word, appending as many words as
# one training input has columns.
sentence = sentence_generation(model, '많은', len(x_train[0]))

In [None]:
# Check the result: print the word at the last position of the first training
# input, then the generated sentence.
# The builtin int() is used because the np.int alias was deprecated in
# NumPy 1.20 and removed in 1.24.
print(words_set[int(x_train[0][-1])])
print(sentence)

In [None]:
# Generate a sentence from a multi-word seed, appending 11 words.
sentence = sentence_generation(model, '힘을 잃어가고, 민속의', 11)

In [None]:
# Check the generated sentence.
print(sentence)