# <저장되어있는 것에서 대답해주는 ENG chatbot>

## 1.
### yaml 포멧 파일을 불러옴
- docs 에 dict type으로 저장(key값 두개)

### conversation 처리
- conv의 구조. 질문,답,답,답...
- 2일때는 질문,답

In [94]:
import os
import yaml

dir_path = './raw_data'
files_list = os.listdir(dir_path + os.sep)

questions = list()
answers = list()
for filepath in files_list:
    stream = open( dir_path + os.sep + filepath , 'rb')
    docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len( con ) > 2 :
            questions.append(con[0])
            replies = con[ 1 : ]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append( ans )
        elif len( con )> 1:
            questions.append(con[0])
            answers.append(con[1])

In [95]:
# 질문 / 답변 각각 텍스트 파일 저장

import numpy as np

np.savetxt("./data/questions.csv", questions, delimiter=",", fmt='%s')
np.savetxt("./data/answers.csv", answers, delimiter=",", fmt='%s')

In [96]:
# 질문 파일 로드

with open("./data/questions.csv", 'r') as f:
    questions_load = f.read().splitlines()

In [98]:
# 답변 파일 로드

with open("./data/answers.csv", 'r') as f:
    answers_load = f.read().splitlines()

In [99]:
# 파일 확인

print('첫번째 질문 : {} \n 첫번째 답변 : {}\n'.format(questions_load[0],answers_load[0]))
print('두번째 질문 : {} \n 두번째 답변 : {}'.format(questions_load[1],answers_load[1]))

첫번째 질문 : What is AI? 
 첫번째 답변 : Artificial Intelligence is the branch of engineering and science devoted to constructing machines that think.

두번째 질문 : What is AI? 
 두번째 답변 : AI is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind.


## preparing input data for the encoder

The Encoder model will be fed input data which are preprocessed English sentences.
The preprocessing is done as follows:

- Tokenizing the English sentences
- Determining the maximum length of the English sentence that's max_input_length
- Padding the *tokenized_eng_lines* to the max_input_length
- Determining the vocabulary size (num_eng_tokens) for English words

![Encoder-Decoder](./image/RNN.PNG)

## Encoder preprocessing
- tokenizing
- max length of sentence
- padding
- 전체 단어 갯수

In [100]:
from keras import preprocessing
import numpy as np

Using TensorFlow backend.


In [104]:
## 문장 토크나이징 및 벡터화

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questions_load)
tokenized_questions = tokenizer.texts_to_sequences(questions_load)

In [114]:
## 최대길이 문장의 길이 확인 --> input 텐서 생성 maxlen 활용

max_input_length = max([len(token_seq) for token_seq in tokenized_questions])
print('questions max length is {}'.format(max_input_length))

questions max length is 22


In [29]:
## pad_sentences로 입력 텐서 생성

## padding : padding위치
padded_questions = preprocessing.sequence.pad_sequences(tokenized_questions, maxlen = max_input_length, padding='post')
encoder_input = np.array(padded_questions)
print('Encoder input data shape --> {}'.format(encoder_input.shape))
encoder_input[0]

Encoder input data shape --> (523, 22)


array([  4,   3, 109,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])

In [138]:
## word_index로 전체 데이터의 딕셔너리 생성 및 전체 단어 수 확인

questions_word_dict = tokenizer.word_index

# embedding input_dim에는 전체 단어 수 + 1 <-- 0을 인덱스로 사용하지 않음 (470개 : 1~471)
num_questions_tokens = len(questions_word_dict)
print('Number of questions tokens = {}'.format(num_questions_tokens))

Number of questions tokens = 470


## Preparing input data for the Decoder

The Decoder model will be fed the preprocessed 'answers'.
The preprocessing steps are similar to the ones which are above.
This one step is carried out before the other steps.

- Append < Start > tag at the first position in each answer sentence.
- Append < End > tag at the last position in each answer sentence.

### a2 function이 더 좋음

In [179]:
def a1():
    ans = list()

    for i in range(len(answers_load)):
        ans.append('<START>' + answers_load[i] + '<END>')
def a2():
    ans = ['<START>'+answer+'<END>' for answer in answers_load]

In [180]:
%timeit a1()
%timeit a2()

148 µs ± 3.07 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
92.6 µs ± 2.37 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [183]:
ans = ['<START>'+answer+'<END>' for answer in answers_load]

In [271]:
tokenizer1 = preprocessing.text.Tokenizer()
tokenizer1.fit_on_texts(ans)
tokenized_ans = tokenizer1.texts_to_sequences(ans)

max_output_length = max([len(token_seq) for token_seq in tokenized_ans])
print('answers max length is {}'.format(max_output_length))

padded_ans = preprocessing.sequence.pad_sequences(tokenized_ans, maxlen=max_output_length,
                                                 padding='post')
decoder_input = np.array(padded_ans)
print('Decoder input data shape --> {}'.format(decoder_input.shape))

ans_word_dict = tokenizer1.word_index
num_ans_tokens = len(ans_word_dict)
print('Number of answers tokens = {}'.format(num_ans_tokens))

answers max length is 74
Decoder input data shape --> (523, 74)
Number of answers tokens = 1559


## Preparing target data for the Decoder

- Take a copy of tokenized_ans nad modify it like this

    1. Remove the < Start > tag which we appended earlier
    2. Convert the padded_ans to one-hot vectors

In [295]:
decoder_target.shape

(523, 74, 1560)

In [285]:
decoder_target = [token_seq[1:] for token_seq in tokenized_ans]
    
padded_ans1 = preprocessing.sequence.pad_sequences(decoder_target ,maxlen=max_output_length, padding='post')

# 1~1559까지 있었으나 padding을 넣으면서 0~1559, num_ans_tokens+1
onehot_ans = utils.to_categorical(padded_ans1, num_ans_tokens+1)
decoder_target= np.array(onehot_ans)
print( 'Decoder target data shape -> {}'.format( decoder_target.shape ))

Decoder target data shape -> (523, 74, 1560)


In [16]:
onehot_ans[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Defining Model

In [298]:
from keras.models import Sequential, Model
from keras import layers

[Masking and Padding](https://www.tensorflow.org/beta/guide/keras/masking_and_padding)  
[return_sequences, return_state](https://keras.io/layers/recurrent/)

In [299]:
# mask_zero = True : 0으로 된값은 고려하지 않아도 됨
# function 형으로 짜서 encode와 decode를 분리할 수 있음

In [300]:
# return_sequences : hidden state
# return_state : cell state
encoder_inputs = layers.Input(shape=(None, ))
encoder_embedding = layers.Embedding(num_questions_tokens, 256, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = layers.LSTM(128, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = layers.Input(shape=(None,  ))
decoder_embedding = layers.Embedding(num_ans_tokens, 256, mask_zero=True) (decoder_inputs)
decoder_lstm = layers.LSTM(128, return_state=True, return_sequences=True)
decoder_outputs , d_state_h , d_state_c = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = layers.Dense(num_ans_tokens, activation= 'softmax') 
output = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    120320      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    399104      input_2[0][0]                    
____________________________________________________________________________________________

In [20]:
model.fit([encoder_input, decoder_input], decoder_target, batch_size = 128, epochs = 300)
model.save('chat_model.h5')


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 7

  '. They will not be included '


## load model

In [30]:
from keras.models import load_model

model_loaded = load_model('chat_model.h5')

## encoder - decoder 모델 각각 분리 저장

[seq2seq model framework](https://keras.io/examples/lstm_seq2seq/)

In [21]:
encoder_model = Model(encoder_inputs, encoder_states)

In [27]:
decoder_state_input_h = layers.Input(shape=(128,))
decoder_state_input_c = layers.Input(shape=(128,))  
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding,
                                                 initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                          [decoder_outputs] + decoder_states)

In [88]:
encoder_model.save('chatbot_encoder_model.h5')
decoder_model.save('chatbot_decoder_model.h5')

  '. They will not be included '


In [23]:
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 256)         120576    
_________________________________________________________________
lstm_1 (LSTM)                [(None, 128), (None, 128) 197120    
Total params: 317,696
Trainable params: 317,696
Non-trainable params: 0
_________________________________________________________________


In [24]:
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 256)    399360      input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 128)          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 128)          0                                            
__________________________________________________________________________________________________
lstm_2 (LS

In [None]:
# 입력받음.

In [25]:
def str_to_tokens(sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append(questions_word_dict[word] ) 
    return preprocessing.sequence.pad_sequences([tokens_list], 
                                                maxlen=max_input_length, padding='post')

### chatbot test

In [28]:
for epoch in range(encoder_input.shape[0] ):
    states_values = encoder_model.predict(str_to_tokens(input('Enter eng sentence : ' )))
    #states_values = enc_model.predict( encoder_input_data[ epoch ] )
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = ans_word_dict['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs, h , c = decoder_model.predict([empty_target_seq] + states_values )
        sampled_word_index = np.argmax(dec_outputs[0, -1, :] )
        sampled_word = None
        for word, index in ans_word_dict.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format(word)
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > max_output_length:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print(decoded_translation)

Enter eng sentence : hello
 hi we we end
Enter eng sentence : z


KeyError: 'z'