In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# Toy training corpus: three Korean sentences, one per physical line.
# The explicit "\n" escapes add a second newline after the first two sentences,
# so splitting on '\n' later also yields empty lines; judging by the recorded
# sample list, those empty lines contribute no training samples.
text="""
식탁에 앉아 피자를 먹는 사람\n
포크와 나이프를 사용하고 있네요\n
포크를 사용해 피자를 먹습니다
"""


In [3]:
# Build the word -> integer-index vocabulary from the whole corpus
# (the corpus is passed as a single-element list of texts).
tokenizer=Tokenizer()
tokenizer.fit_on_texts([text])

In [4]:
len(tokenizer.word_index)+1, "단어 집합의 크기"

(13, '단어 집합의 크기')

In [5]:
tokenizer.word_index

{'피자를': 1,
 '식탁에': 2,
 '앉아': 3,
 '먹는': 4,
 '사람': 5,
 '포크와': 6,
 '나이프를': 7,
 '사용하고': 8,
 '있네요': 9,
 '포크를': 10,
 '사용해': 11,
 '먹습니다': 12}

In [6]:
# Turn every sentence into its growing prefixes of length 2, 3, ..., len(sentence):
# the last token of each prefix is later used as the label, the rest as the input.
sequences = []
for sentence_line in text.split('\n'):
    token_ids = tokenizer.texts_to_sequences([sentence_line])[0]
    # Lines with fewer than two tokens produce an empty range and add nothing.
    sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

In [7]:
len(sequences), "학습에 사용할 샘플의 개수"

(10, '학습에 사용할 샘플의 개수')

In [8]:
sequences

[[2, 3],
 [2, 3, 1],
 [2, 3, 1, 4],
 [2, 3, 1, 4, 5],
 [6, 7],
 [6, 7, 8],
 [6, 7, 8, 9],
 [10, 11],
 [10, 11, 1],
 [10, 11, 1, 12]]

In [9]:
max(len(i) for i in sequences), "샘플의 최대 길이"

(5, '샘플의 최대 길이')

In [10]:
# Left-pad every sample with zeros up to the length of the longest sample.
# The length is derived from the data instead of being hard-coded, so the cell
# keeps working if the corpus changes (it evaluates to 5 for the corpus above).
max_len = max(len(sample) for sample in sequences)
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [11]:
sequences[0]

array([0, 0, 0, 2, 3])

In [12]:
# Split each padded sample: every column except the last is the model input,
# the last column is the index of the word to predict.
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

In [13]:
# One-hot encode the labels.
# The labels are read from the last column of `sequences` rather than from `y`
# itself, so re-running this cell does not one-hot encode an already encoded
# matrix. The class count is the vocabulary size plus one because word indices
# start at 1 and 0 is the padding value.
y = to_categorical(sequences[:, -1], num_classes=len(tokenizer.word_index) + 1)

In [14]:
y

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],
      dtype=float32)

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LSTM

In [16]:
# Hyperparameters of the next-word model.
embedding_dim = 13
hidden_units = 32
# Word indices start at 1 and 0 is the padding value, hence the +1.
# Derived from the tokenizer so the input and output sizes cannot drift apart
# from the vocabulary (evaluates to 13 for the corpus above).
vocab_size = len(tokenizer.word_index) + 1

# Embedding -> single SimpleRNN layer -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(SimpleRNN(hidden_units))
model.add(Dense(vocab_size, activation="softmax"))

# Labels are one-hot vectors, so categorical cross-entropy is the matching loss.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 13)          169       
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                1472      
                                                                 
 dense (Dense)               (None, 13)                429       
                                                                 
Total params: 2,070
Trainable params: 2,070
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train for 200 epochs; verbose=2 prints one summary line per epoch.
# The log below shows a single step per epoch ("1/1").
model.fit(X,y, epochs=200, verbose=2)

Epoch 1/200
1/1 - 1s - loss: 2.5678 - accuracy: 0.1000 - 720ms/epoch - 720ms/step
Epoch 2/200
1/1 - 0s - loss: 2.5563 - accuracy: 0.1000 - 2ms/epoch - 2ms/step
Epoch 3/200
1/1 - 0s - loss: 2.5448 - accuracy: 0.1000 - 2ms/epoch - 2ms/step
Epoch 4/200
1/1 - 0s - loss: 2.5334 - accuracy: 0.1000 - 2ms/epoch - 2ms/step
Epoch 5/200
1/1 - 0s - loss: 2.5221 - accuracy: 0.3000 - 2ms/epoch - 2ms/step
Epoch 6/200
1/1 - 0s - loss: 2.5107 - accuracy: 0.3000 - 2ms/epoch - 2ms/step
Epoch 7/200
1/1 - 0s - loss: 2.4992 - accuracy: 0.3000 - 3ms/epoch - 3ms/step
Epoch 8/200
1/1 - 0s - loss: 2.4876 - accuracy: 0.3000 - 2ms/epoch - 2ms/step
Epoch 9/200
1/1 - 0s - loss: 2.4758 - accuracy: 0.3000 - 998us/epoch - 998us/step
Epoch 10/200
1/1 - 0s - loss: 2.4639 - accuracy: 0.3000 - 2ms/epoch - 2ms/step
Epoch 11/200
1/1 - 0s - loss: 2.4517 - accuracy: 0.3000 - 2ms/epoch - 2ms/step
Epoch 12/200
1/1 - 0s - loss: 2.4392 - accuracy: 0.3000 - 2ms/epoch - 2ms/step
Epoch 13/200
1/1 - 0s - loss: 2.4265 - accuracy: 0.30

Epoch 105/200
1/1 - 0s - loss: 0.5360 - accuracy: 0.9000 - 3ms/epoch - 3ms/step
Epoch 106/200
1/1 - 0s - loss: 0.5245 - accuracy: 1.0000 - 2ms/epoch - 2ms/step
Epoch 107/200
1/1 - 0s - loss: 0.5133 - accuracy: 1.0000 - 2ms/epoch - 2ms/step
Epoch 108/200
1/1 - 0s - loss: 0.5024 - accuracy: 1.0000 - 2ms/epoch - 2ms/step
Epoch 109/200
1/1 - 0s - loss: 0.4918 - accuracy: 1.0000 - 2ms/epoch - 2ms/step
Epoch 110/200
1/1 - 0s - loss: 0.4815 - accuracy: 1.0000 - 3ms/epoch - 3ms/step
Epoch 111/200
1/1 - 0s - loss: 0.4714 - accuracy: 1.0000 - 3ms/epoch - 3ms/step
Epoch 112/200
1/1 - 0s - loss: 0.4617 - accuracy: 1.0000 - 2ms/epoch - 2ms/step
Epoch 113/200
1/1 - 0s - loss: 0.4521 - accuracy: 1.0000 - 2ms/epoch - 2ms/step
Epoch 114/200
1/1 - 0s - loss: 0.4429 - accuracy: 1.0000 - 2ms/epoch - 2ms/step
Epoch 115/200
1/1 - 0s - loss: 0.4339 - accuracy: 1.0000 - 2ms/epoch - 2ms/step
Epoch 116/200
1/1 - 0s - loss: 0.4251 - accuracy: 1.0000 - 2ms/epoch - 2ms/step
Epoch 117/200
1/1 - 0s - loss: 0.4166 - 

<keras.callbacks.History at 0x23d27ba0748>

In [19]:
def sentence_generation(model, tokenizer, current_word, n, max_len=4):
    """Generate a sentence by repeatedly predicting the next word.

    Starting from ``current_word``, the text generated so far is encoded,
    left-padded to ``max_len`` tokens and fed to ``model``; the most probable
    word is appended and the process repeats.

    Args:
        model: Trained model exposing ``predict(inputs, verbose=0)`` and
            returning one probability row per input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Seed text (one or more space-separated words).
        n: Maximum number of words to generate.
        max_len: Input length used for padding. The default of 4 matches the
            training inputs in this notebook (samples padded to 5, minus the
            label column); the previous hard-coded value of 5 fed the model
            one more padding step than it was trained with.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early if the model predicts an index that has no
        word in the vocabulary (for example the padding index 0).
    """
    init_word = current_word
    sentence = ""

    # Build the index -> word lookup once instead of scanning the vocabulary
    # on every generation step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(n):
        # Integer-encode the text generated so far and pad it on the left.
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=max_len, padding='pre')

        # Pick the most probable next-word index as a plain int.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=1)[0])

        word = index_to_word.get(predicted_index)
        if word is None:
            # No vocabulary entry for this index: stop rather than append an
            # unrelated word left over from a lookup loop.
            break

        current_word = current_word + ' ' + word
        sentence = sentence + ' ' + word

    return init_word + sentence

In [20]:
print(sentence_generation(model, tokenizer, "포크와", 5))

포크와 나이프를 사용하고 있네요 피자를 먹는


In [21]:
# !pip install word2word

In [22]:
from word2word import Word2word

In [23]:
translate=Word2word("en","ko")

Downloading data ...
100% [..........................................................................] 8030815 / 8030815

In [24]:
# Show the Korean candidates for each English keyword.
# A plain loop is used instead of a tuple of print() calls, which would make
# the cell's result the useless tuple (None, None, None).
for english_word in ("fork", "pizza", "knife"):
    print(translate(english_word))

['포크', '갈림길', '케익', '찌르', '볼로냐']
['피자', '맛있', '먹', '냉동', '조각']
['칼', '나이프', 'knife', '흔적', '찔렀']


(None, None, None)

In [25]:
translate("fork")[0]

'포크'

In [26]:
# Seed keywords for the demo. For "knife" the second candidate (index 1) is
# taken: per the output above, index 0 is '칼' while index 1 is '나이프',
# the form that appears in the training corpus.
keyword_list=[translate("fork")[0], translate("pizza")[0], translate("knife")[1]]

In [27]:
import random

In [28]:
from tkinter import *
from tkinter import messagebox
 
# Create the main application window and set its size to 360x100.
root = Tk()
root.geometry("360x100")
 
# Button click event handler
def okClick():
    """Replace the entry's text with a sentence generated from a random keyword.

    One keyword is drawn from ``keyword_list``, the particle "와" is appended
    to form the seed word, and between 2 and 5 words are requested from
    ``sentence_generation``.
    """
    # Clear the previous sentence first.
    txt.delete(0,"end")

    picked_keywords = random.sample(keyword_list, 1)
    seed_word = ' '.join(picked_keywords)+"와"
    word_count = random.randint(2, 5)

    generated = sentence_generation(model, tokenizer, seed_word, word_count)
    txt.insert(0, generated)
 
# Label listing the available keywords.
lbl = Label(root, text="키워드: "+", ".join(keyword_list))
lbl.grid(row=0, column=1)
# Entry that displays the generated sentence.
txt = Entry(root, width=50)
txt.grid(row=3, column=1)
 
# Define the button and bind its click event to the handler
btn = Button(root, text="문장 만들기", command=okClick)
 
btn.grid(row=1, column=1)
# Start the Tk event loop.
root.mainloop()