##############################################
#      Sequence 2 Sequence 챗봇     
##############################################

## 0. 라이브러리 임포트

In [24]:
import pandas as pd
import string
import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking, Bidirectional, Add, BatchNormalization, Layer, Concatenate, Attention, LayerNormalization

from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint

## 1. 하이퍼파라미터 및 파일 불러오기

In [25]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.load_from_file('tokenizer')
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size+1]
VOCAB_SIZE = tokenizer.vocab_size + 2

QUE_MAX_LENGTH = 48
ANS_MAX_LENGTH = 138

embedding_dim = 256
hidden_units = 512
batch_size = 128
epochs = 100
lr=0.001
dropout_rate = 0.5
# initial_lr = 0.1
# final_lr = 0.0001

model_path = 'chatbot.h5'

src_vocab_size = VOCAB_SIZE
tar_vocab_size = src_vocab_size

encoder_input_train = np.load('encoder_input_train.npy')
decoder_input_train = np.load('decoder_input_train.npy')
decoder_target_train = np.load('decoder_target_train.npy')
encoder_input_test = np.load('encoder_input_test.npy')
decoder_input_test = np.load('decoder_input_test.npy')
decoder_target_test = np.load('decoder_target_test.npy')

## 2. 모델 정의

### 2-0. Basic Model

In [None]:
# src_vocab_size = tokenizer.vocab_size + 2
# tar_vocab_size = src_vocab_size

# encoder_inputs = Input(shape=(None,))
# enc_emb = Embedding(src_vocab_size, embedding_dim)(encoder_inputs)
# enc_masking = Masking(mask_value=0.0)(enc_emb)
# encoder_lstm = LSTM(hidden_units, return_state=True)
# encoder_outputs, state_h, state_c = encoder_lstm(enc_masking)
# encoder_states = [state_h, state_c]

# decoder_inputs = Input(shape=(None,))
# dec_emb_layer = Embedding(tar_vocab_size, hidden_units)
# dec_emb = dec_emb_layer(decoder_inputs)
# dec_masking = Masking(mask_value=0.0)(dec_emb)
# decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True) 

# decoder_outputs, _, _ = decoder_lstm(dec_masking,initial_state=encoder_states)

# decoder_dense = Dense(tar_vocab_size, activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs)

# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
# model.summary()

### 2-1. No Attention Model(Encoder-Decoder Layer Model)

In [26]:
# encoder_inputs = Input(shape=(None,))
# enc_emb = Embedding(src_vocab_size, embedding_dim)(encoder_inputs)
# enc_masking = Masking(mask_value=0.0)(enc_emb)

# encoder_bi_lstm1 = Bidirectional(LSTM(hidden_units, return_sequences=True, dropout=dropout_rate))
# encoder_outputs1 = encoder_bi_lstm1(enc_masking)

# encoder_bi_lstm2 = Bidirectional(LSTM(hidden_units, return_state=True, dropout=dropout_rate))
# encoder_outputs2, forward_h, forward_c, backward_h, backward_c = encoder_bi_lstm2(encoder_outputs1)

# state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
# state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
# encoder_states = [state_h, state_c]

# decoder_inputs = Input(shape=(None,))
# dec_emb_layer = Embedding(tar_vocab_size, hidden_units)
# dec_emb = dec_emb_layer(decoder_inputs)
# dec_masking = Masking(mask_value=0.0)(dec_emb)

# decoder_lstm1 = LSTM(hidden_units * 2, return_sequences=True, return_state=True, dropout=dropout_rate, kernel_regularizer=l2(0.01))
# decoder_outputs1, _, _ = decoder_lstm1(dec_masking, initial_state=encoder_states)

# decoder_outputs1_bn = BatchNormalization()(decoder_outputs1)

# dec_masking_dense = Dense(hidden_units * 2)(dec_masking)
# decoder_outputs1_residual = Add()([decoder_outputs1_bn, dec_masking_dense])

# decoder_lstm2 = LSTM(hidden_units * 2, return_sequences=True, return_state=True, dropout=dropout_rate)
# decoder_outputs2, _, _ = decoder_lstm2(decoder_outputs1_residual)

# decoder_dense = Dense(tar_vocab_size, activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs2)

# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr, rho=0.9)
# model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['acc'])
# model.summary()

### 2-2. Encoder - Decoder Attention Model

In [27]:
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(src_vocab_size, embedding_dim)(encoder_inputs)
enc_masking = Masking(mask_value=0.0)(enc_emb)
encoder_lstm = Bidirectional(LSTM(hidden_units, return_sequences=True, return_state=True, dropout=dropout_rate))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(enc_masking)

state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(tar_vocab_size, hidden_units)
dec_emb = dec_emb_layer(decoder_inputs)
dec_masking = Masking(mask_value=0.0)(dec_emb)
decoder_lstm = LSTM(hidden_units * 2, return_sequences=True, return_state=True, dropout=dropout_rate)
decoder_outputs, _, _ = decoder_lstm(dec_masking, initial_state=encoder_states)

attention = Attention()
attention_output = attention([decoder_outputs, encoder_outputs])

decoder_combined_context = Concatenate(axis=-1)([decoder_outputs, attention_output])

decoder_dense = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined_context)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr, rho=0.9)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['acc'])
model.summary()

### 2-3. Self Attention Model

In [28]:
# encoder_inputs = Input(shape=(None,))
# enc_emb = Embedding(src_vocab_size, embedding_dim)(encoder_inputs)
# enc_masking = Masking(mask_value=0.0)(enc_emb)
# encoder_lstm = Bidirectional(LSTM(hidden_units, return_sequences=True, return_state=True, dropout=dropout_rate))
# encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(enc_masking)

# state_h = Concatenate()([forward_h, backward_h])
# state_c = Concatenate()([forward_c, backward_c])
# encoder_states = [state_h, state_c]

# decoder_inputs = Input(shape=(None,))
# dec_emb_layer = Embedding(tar_vocab_size, hidden_units)
# dec_emb = dec_emb_layer(decoder_inputs)
# dec_masking = Masking(mask_value=0.0)(dec_emb)
# decoder_lstm = LSTM(hidden_units * 2, return_sequences=True, return_state=True, dropout=dropout_rate)
# decoder_outputs, _, _ = decoder_lstm(dec_masking, initial_state=encoder_states)

# attention = Attention()
# attention_output = attention([decoder_outputs, encoder_outputs])

# self_attention = tf.keras.layers.MultiHeadAttention(num_heads=8, key_dim=hidden_units * 2)
# self_attention_output = self_attention(decoder_outputs, decoder_outputs)

# decoder_outputs_with_attention = decoder_outputs + attention_output
# decoder_outputs_norm_with_attention = LayerNormalization()(decoder_outputs_with_attention)

# decoder_outputs_with_self_attention = decoder_outputs_norm_with_attention + self_attention_output
# decoder_outputs_norm = LayerNormalization()(decoder_outputs_with_self_attention)

# decoder_dense = Dense(tar_vocab_size, activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs_norm)

# # 모델 정의
# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr, rho=0.9)
# model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['acc'])
# model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_6 (Embedding)        (None, None, 256)    4172288     ['input_7[0][0]']                
                                                                                                  
 input_8 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 masking_6 (Masking)            (None, None, 256)    0           ['embedding_6[0][0]']            
                                                                                            

# 3. 콜백 함수 정의

In [29]:
# def scheduler(epoch, lr):
#     if epoch < epochs:
#         lr = initial_lr - (initial_lr - final_lr) * (epoch / epochs)
    
#     return lr

# lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
checkpoint = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1)



# 4. 학습

In [30]:
model.fit(x=[encoder_input_train, decoder_input_train], y=decoder_target_train, \
          validation_data=([encoder_input_test, decoder_input_test], decoder_target_test),
          batch_size=batch_size, epochs=epochs, 
          callbacks=[checkpoint])   # ,lr_callback

Epoch 1/100
Epoch 1: val_loss improved from inf to 1.87285, saving model to chatbot.h5
Epoch 2/100
Epoch 2: val_loss improved from 1.87285 to 0.18699, saving model to chatbot.h5
Epoch 3/100

KeyboardInterrupt: 

#   5. 검증

4.1 모델정의

In [32]:
model = load_model('D:/chatbot_please/chatbot0.186.h5')
pre_token = -1

print('################################################')
print('## 채팅을 끝내시려면 "끝!" 이라고 입력해주세요! ##')
print('################################################')

while True:
    question = []
    result_list = []
    final_sentence = ''
    input_sentence = input('나 : ')
    print(f'나 : {input_sentence}')
    if input_sentence == '끝!':
        break

    question.append(tokenizer.encode(input_sentence))
    tokenized_input_sentence = tf.keras.preprocessing.sequence.pad_sequences(
        question, maxlen=QUE_MAX_LENGTH, padding='post'
    )

    temparary_decoder_input = [START_TOKEN[0],]

    for i in range(0,ANS_MAX_LENGTH):
        pred = model.predict([tokenized_input_sentence, np.array(temparary_decoder_input).reshape(1, -1)],verbose=0)
        last_pred = pred[:, -1, :]
        sorted_indices = np.argsort(last_pred, axis=-1)[:, ::-1]
        second_max_index = sorted_indices[:, 0]
        next_token = second_max_index[0]

        if next_token >= 16260:
            break

        temparary_decoder_input.append(next_token)
        
        if pre_token != next_token:
            result_list.append(tokenizer.decode([next_token]))

        pre_token = next_token
    final_sentence = ''.join(result_list)

    special_characters = string.punctuation
    final_sentence = final_sentence.lstrip(special_characters)

    if not final_sentence.endswith(('.', '!')):
        final_sentence += '.'

    print(f'봇 : {final_sentence}')

################################################
## 채팅을 끝내시려면 "끝!" 이라고 입력해주세요! ##
################################################
나 : IMF의 구제금융에는 요구 조건이 있나요?
봇 : 없습니다 애플 없습니다 애플 없습니다 애플 .
나 : 뉴스에서 지난해 광의통화량이 증가했다고 하는데 무슨뜻인가요?
봇 : 없습니다 애플 없습니다 애플 없습니다 애플 .
