In [51]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [52]:
# Three training sentences. Each of the first two is followed by a blank
# line (two consecutive newlines); the last ends with a single newline.
corpus = (
    "경마장에 있는 말이 뛰고 있다\n\n"
    "그의 말이 법이다\n\n"
    "가는 말이 고와야 오는 말이 곱다\n"
)

In [53]:
# Build the word -> integer index mapping from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])

# One extra slot on top of the learned words, to account for padding.
vocab_size = 1 + len(tokenizer.word_index)

In [54]:
# Build the training data: for every line of the corpus, collect each
# prefix of its encoded tokens that is at least two tokens long.
# Lines that encode to fewer than two tokens contribute nothing.
sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(corpus.split("\n"))
    for end in range(2, len(token_ids) + 1)
]

sequences

[[2, 3],
 [2, 3, 1],
 [2, 3, 1, 4],
 [2, 3, 1, 4, 5],
 [6, 1],
 [6, 1, 7],
 [8, 1],
 [8, 1, 9],
 [8, 1, 9, 10],
 [8, 1, 9, 10, 1],
 [8, 1, 9, 10, 1, 11]]

In [55]:
# Left-pad every n-gram with zeros up to the length of the longest one,
# so all rows share the same width.
max_len = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_len, padding="pre")

sequences

array([[ 0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  2,  3,  1],
       [ 0,  0,  2,  3,  1,  4],
       [ 0,  2,  3,  1,  4,  5],
       [ 0,  0,  0,  0,  6,  1],
       [ 0,  0,  0,  6,  1,  7],
       [ 0,  0,  0,  0,  8,  1],
       [ 0,  0,  0,  8,  1,  9],
       [ 0,  0,  8,  1,  9, 10],
       [ 0,  8,  1,  9, 10,  1],
       [ 8,  1,  9, 10,  1, 11]], dtype=int32)

In [56]:
# Split each padded row into inputs and label:
# every column but the last is the context, the last column is the next word.
sequences = np.array(sequences)
X_train, y_train = sequences[:, :-1], sequences[:, -1]

X_train, y_train

(array([[ 0,  0,  0,  0,  2],
        [ 0,  0,  0,  2,  3],
        [ 0,  0,  2,  3,  1],
        [ 0,  2,  3,  1,  4],
        [ 0,  0,  0,  0,  6],
        [ 0,  0,  0,  6,  1],
        [ 0,  0,  0,  0,  8],
        [ 0,  0,  0,  8,  1],
        [ 0,  0,  8,  1,  9],
        [ 0,  8,  1,  9, 10],
        [ 8,  1,  9, 10,  1]], dtype=int32),
 array([ 3,  1,  4,  5,  1,  7,  1,  9, 10,  1, 11], dtype=int32))

In [57]:
# One-hot encode the labels.
# The labels are taken straight from the last column of `sequences` rather
# than from `y_train` itself, so re-running this cell always yields the same
# result instead of one-hot encoding an already one-hot array.
y_train = to_categorical(sequences[:, -1], num_classes=vocab_size)

y_train

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [59]:
# RNN language model: embedding -> SimpleRNN -> softmax over the vocabulary.
embedding_dim = 10
hidden_size = 32

model = Sequential([
    Embedding(vocab_size, embedding_dim),
    SimpleRNN(hidden_size),
    Dense(vocab_size, activation="softmax"),
])
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.fit(X_train, y_train, epochs=200, verbose=2)

Epoch 1/200


2023-07-17 18:54:48.101151: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


1/1 - 1s - loss: 2.4986 - accuracy: 0.0909 - 648ms/epoch - 648ms/step
Epoch 2/200
1/1 - 0s - loss: 2.4863 - accuracy: 0.0909 - 49ms/epoch - 49ms/step
Epoch 3/200
1/1 - 0s - loss: 2.4742 - accuracy: 0.2727 - 48ms/epoch - 48ms/step
Epoch 4/200
1/1 - 0s - loss: 2.4622 - accuracy: 0.3636 - 46ms/epoch - 46ms/step
Epoch 5/200
1/1 - 0s - loss: 2.4503 - accuracy: 0.4545 - 45ms/epoch - 45ms/step
Epoch 6/200
1/1 - 0s - loss: 2.4383 - accuracy: 0.4545 - 47ms/epoch - 47ms/step
Epoch 7/200
1/1 - 0s - loss: 2.4260 - accuracy: 0.3636 - 36ms/epoch - 36ms/step
Epoch 8/200
1/1 - 0s - loss: 2.4135 - accuracy: 0.3636 - 35ms/epoch - 35ms/step
Epoch 9/200
1/1 - 0s - loss: 2.4005 - accuracy: 0.3636 - 33ms/epoch - 33ms/step
Epoch 10/200
1/1 - 0s - loss: 2.3871 - accuracy: 0.3636 - 53ms/epoch - 53ms/step
Epoch 11/200
1/1 - 0s - loss: 2.3731 - accuracy: 0.3636 - 47ms/epoch - 47ms/step
Epoch 12/200
1/1 - 0s - loss: 2.3585 - accuracy: 0.3636 - 43ms/epoch - 43ms/step
Epoch 13/200
1/1 - 0s - loss: 2.3432 - accuracy

<keras.callbacks.History at 0x29d2adac0>

In [60]:
# Evaluation
def sent_generator(model, tokenizer, current_word, n, max_len=5):
    """Generate a sentence by repeatedly predicting the next word.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-word scores for a batch holding a single padded sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Seed text the generation starts from.
        n: Maximum number of words to append to the seed.
        max_len: Length the encoded context is padded/truncated to before
            each prediction. Defaults to 5, the value previously hard-coded
            here.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Generation stops early if the model predicts an index that
        has no word in the vocabulary (e.g. the padding index 0).
    """
    # Invert the vocabulary once instead of scanning word_index on every step.
    index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}

    context = current_word
    generated = []

    for _ in range(n):
        encoded = tokenizer.texts_to_sequences([context])[0]
        encoded = pad_sequences([encoded], maxlen=max_len, padding="pre")
        scores = model.predict(encoded, verbose=0)
        # argmax over axis 1 yields a length-1 array; unwrap it to a plain int
        # so the vocabulary lookup does not compare against an ndarray.
        predicted_idx = int(np.argmax(scores, axis=1)[0])

        word = index_to_word.get(predicted_idx)
        if word is None:
            # No word for this index: stop rather than emit an unrelated word.
            break

        generated.append(word)
        context = f"{context} {word}"

    return " ".join([current_word, *generated])

# Generate a continuation for each (seed word, number of words) pair.
# NOTE(review): "그외" does not occur in the corpus (its second sentence starts
# with "그의"), so the tokenizer has no index for it and the first prediction is
# made from padding only. Confirm whether "그의" was intended.
for seed_word, n_words in [("경마장에", 4), ("그외", 2), ("가는", 5)]:
    print(sent_generator(model, tokenizer, seed_word, n_words))

2023-07-17 18:54:56.586595: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


경마장에 있는 말이 뛰고 있다
그외 말이 있는
가는 말이 고와야 오는 말이 곱다
