In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical




In [2]:
# Toy corpus: three Korean sentences, one per line (newline-separated).
text = "\n".join(
    [
        "경마장에 있는 말이 뛰고 있다",
        "그의 말이 법이다",
        "가는 말이 고와야 오늘 말이 곱다",
    ]
)

In [3]:
# Fit a word-level tokenizer on the corpus, passed in as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Word indices start at 1, so one extra slot is reserved for index 0,
# which the padded sequences use as filler.
vocab_size = 1 + len(tokenizer.word_index)
print("vocab_size: {}".format(vocab_size))

vocab_size: 12


In [4]:
# Show the word -> integer index mapping learned by the tokenizer.
print(tokenizer.word_index)

{'말이': 1, '경마장에': 2, '있는': 3, '뛰고': 4, '있다': 5, '그의': 6, '법이다': 7, '가는': 8, '고와야': 9, '오늘': 10, '곱다': 11}


In [5]:
# Build the training samples: for every line of the corpus, keep each prefix
# of its encoded word indices that is at least two tokens long. The final
# token of a prefix is what the model will later be asked to predict.
sequences = []
for line in text.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

print("sample couunt:{}".format(len(sequences)))

sample couunt:11


In [6]:
print(sequences)

[[2, 3], [2, 3, 1], [2, 3, 1, 4], [2, 3, 1, 4, 5], [6, 1], [6, 1, 7], [8, 1], [8, 1, 9], [8, 1, 9, 10], [8, 1, 9, 10, 1], [8, 1, 9, 10, 1, 11]]


In [7]:
# Pad every sample on the left with zeros up to the length of the longest sample.
max_len = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences)

[[ 0  0  0  0  2  3]
 [ 0  0  0  2  3  1]
 [ 0  0  2  3  1  4]
 [ 0  2  3  1  4  5]
 [ 0  0  0  0  6  1]
 [ 0  0  0  6  1  7]
 [ 0  0  0  0  8  1]
 [ 0  0  0  8  1  9]
 [ 0  0  8  1  9 10]
 [ 0  8  1  9 10  1]
 [ 8  1  9 10  1 11]]


In [8]:
# Define inputs and labels: every token but the last is the input,
# and the last token of each row is the label to predict.
sequences = np.array(sequences)
x, y = sequences[:, :-1], sequences[:, -1]
print(x)
print(y)

[[ 0  0  0  0  2]
 [ 0  0  0  2  3]
 [ 0  0  2  3  1]
 [ 0  2  3  1  4]
 [ 0  0  0  0  6]
 [ 0  0  0  6  1]
 [ 0  0  0  0  8]
 [ 0  0  0  8  1]
 [ 0  0  8  1  9]
 [ 0  8  1  9 10]
 [ 8  1  9 10  1]]
[ 3  1  4  5  1  7  1  9 10  1 11]


In [9]:
# One-hot encode the labels.
# categorical_crossentropy compares each label row with the softmax output,
# so the label width must equal the number of units of the final Dense layer
# (vocab_size). num_classes is passed explicitly so the width does not depend
# on the largest index that happens to occur among the labels.
# The labels are re-read from the last column of `sequences` so that
# re-running this cell does not one-hot encode an already encoded `y`.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
print(y)

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [10]:
# Design the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [11]:
# Hyperparameters: size of each word embedding and of the RNN hidden state.
embedding_dim = 10
hidden_units = 32

# Embedding -> SimpleRNN -> softmax over the vocabulary (one score per word index).
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(SimpleRNN(hidden_units))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(x, y, epochs=200, verbose=1)



Epoch 1/200


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoc

<keras.src.callbacks.History at 0x200a57401d0>

In [12]:
# Define the prediction (sentence generation) function
def sentence_generation(model, tokenizer, current_word, n, max_len=5, verbose=True):
    """Greedily extend a seed text by up to ``n`` predicted words.

    At every step the text generated so far is encoded with ``tokenizer``,
    left-padded to ``max_len`` tokens and passed to ``model``. The index with
    the highest predicted score is mapped back to a word and appended.

    Args:
        model: Object exposing ``predict(batch, verbose=0)`` that returns one
            row of per-index scores for the single encoded sample.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Seed text that the generated sentence starts with.
        n: Maximum number of words to append.
        max_len: Length the encoded input is padded to. Defaults to 5, the
            value that used to be hard-coded here.
        verbose: When true (the default, matching the earlier behaviour),
            print the raw score row and the chosen index at every step.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Generation stops early when the predicted index has no entry
        in the vocabulary (for example index 0, which is only used as
        padding filler).
    """
    # Invert the vocabulary once instead of scanning it on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    context = current_word
    generated = []

    for _ in range(n):
        encoded = tokenizer.texts_to_sequences([context])[0]
        encoded = pad_sequences([encoded], maxlen=max_len, padding='pre')

        scores = model.predict(encoded, verbose=0)
        # axis=1 gives the best index per row; the batch holds a single row.
        best = np.argmax(scores, axis=1)
        if verbose:
            print(scores)
            print(best)

        word = index_to_word.get(int(best[0]))
        if word is None:
            # No vocabulary entry for this index. The previous linear scan
            # silently reused whichever word it happened to visit last.
            break

        generated.append(word)
        context = context + ' ' + word

    return ' '.join([current_word] + generated)

In [13]:
# Generate up to 4 more words following the seed word "경마장에" and print the resulting sentence.
print(sentence_generation(model, tokenizer, "경마장에", 4))

[[0.00275484 0.27000433 0.00486427 0.69543386 0.00146947 0.00367961
  0.00443175 0.00296749 0.00146884 0.00848091 0.0036956  0.0007491 ]]
[3]
[[7.4127468e-04 9.5966834e-01 1.1301655e-03 9.8538660e-03 3.2205172e-04
  2.1075474e-03 1.2559313e-03 5.3454321e-03 2.4193404e-03 1.6854126e-02
  2.1789840e-04 8.4185369e-05]]
[1]
[[1.0261791e-03 3.0170963e-04 2.2898985e-03 1.4323788e-03 9.6437716e-01
  2.2767441e-04 9.8867167e-04 4.3889848e-03 1.4155237e-03 9.0711378e-03
  3.7006908e-03 1.0779920e-02]]
[4]
[[1.5028818e-03 9.9829929e-03 1.5186422e-03 1.3744995e-03 4.4109469e-04
  9.4986284e-01 8.2817336e-04 7.4918796e-03 5.2147650e-04 1.3872477e-03
  2.4853135e-02 2.3511930e-04]]
[5]
경마장에 있는 말이 뛰고 있다
