In [1]:
import numpy as np
import pandas as pd


# 파일 읽기
with open('bot.txt', 'r') as content_file:
    botdata = content_file.read()

In [2]:
Questions = []
Answers = []

for line in botdata.split("</pattern>"):
    if "<pattern>" in line:
        Quesn = line[line.find("<pattern>")+len("<pattern>"):]
        Questions.append(Quesn.lower())
        
for line in botdata.split("</template>"):
    if "<template>" in line:
        Ans = line[line.find("<template>")+len("<template>"):]
        Ans = Ans.lower()
        Answers.append(Ans.lower())

QnAdata = pd.DataFrame(np.column_stack([Questions,Answers]),columns = ["Questions","Answers"])
QnAdata["QnAcomb"] = QnAdata["Questions"]+" "+QnAdata["Answers"]


print(QnAdata.head())

            Questions                                            Answers  \
0               yahoo  a lot of people hear about <bot name="name"/> ...   
1        you are lazy                    actually i work 24 hours a day.   
2         you are mad                no i am quite logical and rational.   
3    you are thinking  i am a thinking machine.<think><set name="it">...   
4  you are dividing *            actually i am not too good at division.   

                                             QnAcomb  
0  yahoo a lot of people hear about <bot name="na...  
1       you are lazy actually i work 24 hours a day.  
2    you are mad no i am quite logical and rational.  
3  you are thinking i am a thinking machine.<thin...  
4  you are dividing * actually i am not too good ...  


In [3]:
import nltk
import collections

counter = collections.Counter()

for i in range(len(QnAdata)):
    for word in nltk.word_tokenize(QnAdata.iloc[i][2]):
        counter[word]+=1

word2idx = {w:(i+1) for i,(w,_) in enumerate(counter.most_common())}        
idx2word = {v:k for k,v in word2idx.items()}


idx2word[0] = "PAD"

vocab_size = len(word2idx)+1

print("\n\nVocabulary size:",vocab_size)



Vocabulary size: 3451


In [4]:
def encode(sentence, maxlen,vocab_size):
    indices = np.zeros((maxlen, vocab_size))
    for i, w in enumerate(nltk.word_tokenize(sentence)):
        if i == maxlen: break
        indices[i, word2idx[w]] = 1
    return indices


def decode(indices, calc_argmax=True):
    if calc_argmax:
        indices = np.argmax(indices, axis=-1)
    return ' '.join(idx2word[x] for x in indices)


question_maxlen = 10
answer_maxlen = 20

def create_questions(question_maxlen,vocab_size):
    question_idx = np.zeros(shape=(len(Questions),question_maxlen,vocab_size))
    
    for q in range(len(Questions)):
        question = encode(Questions[q],question_maxlen,vocab_size)

        question_idx[i] = question 
    return question_idx

quesns_train = create_questions(question_maxlen=question_maxlen,vocab_size=vocab_size)


def create_answers(answer_maxlen,vocab_size):
    answer_idx = np.zeros(shape=(len(Answers),answer_maxlen,vocab_size))
    
    for q in range(len(Answers)):
        answer = encode(Answers[q],answer_maxlen,vocab_size)

        answer_idx[i] = answer 
    return answer_idx

answs_train = create_answers(answer_maxlen=answer_maxlen,vocab_size=vocab_size)

In [5]:
from keras.layers import Input,Dense,Dropout,Activation
from keras.models import Model
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import Bidirectional
from keras.layers import RepeatVector,TimeDistributed,ActivityRegularization

Using TensorFlow backend.


In [6]:
n_hidden = 128

question_layer = Input(shape=(question_maxlen,vocab_size))
encoder_rnn = LSTM(n_hidden,dropout=0.2,recurrent_dropout=0.2)(question_layer)
repeat_encode = RepeatVector(answer_maxlen)(encoder_rnn)
dense_layer = TimeDistributed(Dense(vocab_size))(repeat_encode)
regularized_layer = ActivityRegularization(l2=1)(dense_layer)
softmax_layer = Activation('softmax')(regularized_layer)

model = Model([question_layer],[softmax_layer])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print (model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10, 3451)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               1832960   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 20, 128)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 20, 3451)          445179    
_________________________________________________________________
activity_regularization_1 (A (None, 20, 3451)          0         
_________________________________________________________________
activation_1 (Activation)    (None, 20, 3451)          0         
Total params: 2,278,139
Trainable params: 2,278,139
Non-trainable params: 0
_________________________________________________

In [7]:
quesns_train_2 = quesns_train.astype('float32')
answs_train_2 = answs_train.astype('float32')

In [8]:
# 모델 학습
model.fit(quesns_train_2, answs_train_2,batch_size=32,epochs=30,validation_split=0.05) # 오래 걸린다.

Train on 2803 samples, validate on 148 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x155fc485b80>

In [9]:
# 모델 예측
ans_pred = model.predict(quesns_train_2[0:3])

print (decode(ans_pred[0]))
print (decode(ans_pred[1]))

PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
