In [1]:
import tensorflow as tf
import numpy as np
import os

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt

from preprocess import *

In [2]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_INPUTS = 'train_inputs.npy'
TRAIN_OUTPUTS = 'train_outputs.npy'
TRAIN_TARGETS = 'train_targets.npy'
DATA_CONFIGS = 'data_configs.json'

SEED_NUM = 1234
tf.random.set_seed(SEED_NUM)

index_inputs = np.load(open(DATA_IN_PATH + TRAIN_INPUTS, 'rb'))
index_outputs = np.load(open(DATA_IN_PATH + TRAIN_OUTPUTS, 'rb'))
index_targets = np.load(open(DATA_IN_PATH + TRAIN_TARGETS, 'rb'))
prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'r'))

# 각 시퀀스 길이 확인
print(len(index_inputs), len(index_outputs), len(index_targets))

MODEL_NAME = 'seq2seq_kor'
BATCH_SIZE = 2
MAX_SEQUENCE = 25
EPOCH = 30
UNITS = 1024 # GRU rnn의 결과 차원
EMBEDDING_DIM = 256
VALIDATION_SPLIT = 0.1

char2idx = prepro_configs['char2idx']
idx2char = prepro_configs['idx2char']
std_index = prepro_configs['std_symbol'] # "<SOS>"
end_index = prepro_configs['end_symbol'] # "<END>"
vocab_size = prepro_configs['vocab_size'] # 111

20 20 20


# 모델 

## 인코더

In [3]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz # 2
        self.enc_units = enc_units # 1024
        self.vocab_size = vocab_size # 111
        self.embedding_dim = embedding_dim # 256
        
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                      return_sequences=True,
                                      return_state=True,
                                      recurrent_initializer='glorot_uniform')
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        
        return output, state
    
    def initialize_hidden_state(self, inp):
        return tf.zeros((tf.shape(inp)[0], self.enc_units))

## 어텐션

In [4]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        
        super(BahdanauAttention, self).__init__()
        self.w1 = tf.keras.layers.Dense(units)
        self.w2 = tf.keras.layers.Dense(units)
        self.v = tf.keras.layers.Dense(1)
        """ w1, w2, v는 학습을 통해 최적화됨!! """

    def call(self, query, values):
        """
        query: 인코더 GRU rnn의 은닉상태값 
        values: 인코더 GRU rnn의 결과값
        """
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # query를 w2에 행렬곱할 수 있는 형태를 만듦.
        
        score = self.v(tf.nn.tanh(
            self.w1(values) + self.w2(hidden_with_time_axis)))
        # 1차원의 벡터값 나옴
        
        # 어텐션 가중치
        attention_weights = tf.nn.softmax(score, axis=1)
        # 모델이 중요하다고 판단하는 값은 1에, 영향도 떨어질수록 0에 가까운 값
        
        context_vector = attention_weights * values
        # 인코더 결과값 중 1에 가까운 값은 커지고, 0에 가까운 값은 작아짐
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        """ 새로운 인코더 순환 신경망 결과값 만들어서 디코더에 전달!! """
        return context_vector, attention_weights

## 디코더

In [5]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        
        super(Decoder, self).__init__()
        
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        self.embedding = tf.keras.layers.Embedding(self.vocab_size,
                                                  self.embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                      return_sequences=True,
                                      return_state=True,
                                      recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(self.vocab_size)
        
        self.attention = BahdanauAttention(self.dec_units)
        
    def call(self, x, hidden, enc_output):
        
        # 1. 인코더 결과값에 어텐션 가중치 적용해서 새로운 결과값 "문맥 벡터" 만듦
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # context_vector : (20, 1024)
        
        # 2. 디코더 입력값 임베딩 
        x = self.embedding(x)
        
        # 3. 문맥벡터와 디코더 입력 임베딩값을 연결 
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # tf.concat([(20, 1, 1024), (20, 1, 256)], axis=-1)
        # concat 결과: (20, 1, 1280)
        
        # 4. 연결한 것을 GRU rnn 
        output, state = self.gru(x)
        # output: (20, 1, 1024)
        # state: (20, 1024) - 디코더 최종 은닉상태 행렬
        
        # 5. GRU 결과 reshape
        output = tf.reshape(output, (-1, output.shape[2]))
        # tf.reshape(output, (-1, 1024))
        # output: (20, 1024)
        
        # 6. 선형 층 통과 
        x = self.fc(output)
        
        # 디코더 출력값, 디코더 최종 은닉상태, 어텐션 가중치 
        return x, state, attention_weights
        # x: (20, 111)
        # state: (20, 1024)
        # attention_weights: (20, 25, 1)

In [6]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def loss(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

def accuracy(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.expand_dims(tf.cast(mask, dtype=pred.dtype), axis=-1)
    pred *= mask
    acc = train_accuracy(real, pred)
    
    return tf.reduce_mean(acc)

## 시퀀스 투 시퀀스 모델

In [7]:
class seq2seq(tf.keras.Model):
    
    def __init__(self, vocab_size, embedding_dim, enc_units, dec_units, batch_sz, end_token_idx=2):
        
        super(seq2seq, self).__init__()
        
        self.end_token_idx = end_token_idx
        self.encoder = Encoder(vocab_size, embedding_dim, enc_units, batch_sz)
        self.decoder = Decoder(vocab_size, embedding_dim, dec_units, batch_sz)
        
    def call(self, x):
        
        inp, tar = x # index_inputs, index_outputs
        
        enc_hidden = self.encoder.initialize_hidden_state(inp)
        enc_output, enc_hidden = self.encoder(inp, enc_hidden)
        
        dec_hidden = enc_hidden
        
        predict_tokens = list()
        
        for t in range(0, tar.shape[1]): # range(0, 25)
            # 각 타임스텝마다 (t: 0~24)
            dec_input = tf.dtypes.cast(tf.expand_dims(tar[:,t],1), tf.float32)
            
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            # 마지막 값은 "어텐션 가중치"
            
            predict_tokens.append(tf.dtypes.cast(predictions, tf.float32))
            
        return tf.stack(predict_tokens, axis=1)
    
    # 임의의 입력에 대한 모델의 결과값 확인하기 위한 테스트 목적 함수
    def inference(self, x):
        
        inp = x  # test_index_inputs
#         array([[41, 59, 56, 61,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
#          0,  0,  0,  0,  0,  0,  0,  0,  0]])
#         (1, 25)

        # 질문을 받아서 은닉상태를 초기화하고 
        enc_hidden = self.encoder.initialize_hidden_state(inp)
        # <tf.Tensor: shape=(1, 1024), dtype=float32, numpy=array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>
        
        # 해당 질문과 은닉상태로 GRU rnn 인코딩을 진행한다. 
        enc_output, enc_hidden = self.encoder(inp, enc_hidden)
        # enc_output = (1, 25, 1024) : 인코더 출력
        # enc_hidden = (1, 1024) : 최종 은닉상태벡터
        
        dec_hidden = enc_hidden
        
        dec_input = tf.expand_dims([char2idx[std_index]], 1) # ([1],1)
        # <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[1]])>
        
        predict_tokens = list()
        
        for t in range(0, MAX_SEQUENCE): # range(0, 25)
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            # decoder([[1]], 인코더 최종 은닉상태, 인코더 출력) 시작
            # "<SOS>" 의 정수인덱스로 예측 문장 시작
            
            predict_token = tf.argmax(predictions[0]) # 최대값의 위치 (111개 단어 중)
            # <tf.Tensor: shape=(), dtype=int64, numpy=101>
            
            # "<END>" 만나면 반복 멈춤
            if predict_token == self.end_token_idx: # 2
                break
                
            predict_tokens.append(predict_token)
            dec_input = tf.dtypes.cast(tf.expand_dims([predict_token], 0), tf.float32)
            # ex. <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[101.]], dtype=float32)>
            # 디코더를 거쳐나온 predictions 값이 가장 큰 위치를 실수화한 것
            
        return tf.stack(predict_tokens, axis=0).numpy()
        # ex. array([49, 45, 34,  3], dtype=int64)
        # idx2char 사전으로 해독하고, ' '.join시키면 답변 문장 나옴!! 

In [8]:
model = seq2seq(vocab_size, EMBEDDING_DIM, UNITS, UNITS, BATCH_SIZE, char2idx[end_index])
model.compile(loss=loss, optimizer=tf.keras.optimizers.Adam(1e-3), metrics=[accuracy])
# model.run_eagerly = True

# 학습 진행

In [9]:
PATH = DATA_OUT_PATH + MODEL_NAME

if not(os.path.isdir(PATH)):
    os.makedirs(os.path.join(PATH))
    
checkpoint_path = DATA_OUT_PATH + MODEL_NAME + '/weights.h5'

cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1,
    save_best_only=True, save_weights_only=True)

earlystop_callback = EarlyStopping(monitor='val_accuracy',
                                  min_delta = 0.0001,
                                  patience=10)

history = model.fit([index_inputs, index_outputs], index_targets,
                   batch_size = BATCH_SIZE, epochs=EPOCH,
                   validation_split=VALIDATION_SPLIT,
                   callbacks=[earlystop_callback, cp_callback])

Epoch 1/30
Epoch 1: val_accuracy improved from -inf to 0.87400, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.87400 to 0.87800, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 3/30
Epoch 3: val_accuracy improved from 0.87800 to 0.88400, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 4/30
Epoch 4: val_accuracy improved from 0.88400 to 0.88750, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 5/30
Epoch 5: val_accuracy improved from 0.88750 to 0.88960, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 6/30
Epoch 6: val_accuracy improved from 0.88960 to 0.89133, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 7/30
Epoch 7: val_accuracy improved from 0.89133 to 0.89314, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 8/30
Epoch 8: val_accuracy improved from 0.89314 to 0.89625, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 9/30
Epoch 9: val_accuracy improved from 0.89625 to 0.89978, 

Epoch 26/30
Epoch 26: val_accuracy improved from 0.93912 to 0.94092, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 27/30
Epoch 27: val_accuracy improved from 0.94092 to 0.94281, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 28/30
Epoch 28: val_accuracy improved from 0.94281 to 0.94457, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 29/30
Epoch 29: val_accuracy improved from 0.94457 to 0.94607, saving model to ./data_out/seq2seq_kor\weights.h5
Epoch 30/30
Epoch 30: val_accuracy improved from 0.94607 to 0.94747, saving model to ./data_out/seq2seq_kor\weights.h5


# 결과 확인

In [10]:
SAVE_FILE_NM = 'weights.h5'
model.load_weights(os.path.join(DATA_OUT_PATH, MODEL_NAME, SAVE_FILE_NM))

query = "남자친구 승진 선물로 뭐가 좋을까?"

# preprocess.py 의 함수 이용! 
test_index_inputs, _ = enc_processing([query], char2idx)
predict_tokens = model.inference(test_index_inputs)
print(predict_tokens)

print(' '.join([idx2char[str(t)] for t in predict_tokens]))

[49 45 34  3]
평소에 필요한 것 <UNK>


In [58]:
model.inference(test_index_inputs)

array([49, 45, 34,  3], dtype=int64)

In [57]:
predict_tokens

[<tf.Tensor: shape=(20, 111), dtype=float32, numpy=
 array([[ 8.4215831e-03, -1.1196337e-03,  1.4138895e-03, ...,
          1.6215544e-02,  1.0359879e-02, -4.1585290e-03],
        [ 7.9158768e-03, -1.1241965e-03,  1.3373462e-03, ...,
          1.6100917e-02,  1.0241423e-02, -4.1347970e-03],
        [ 8.4461328e-03, -1.2108093e-03,  1.3304802e-03, ...,
          1.6038792e-02,  9.7769313e-03, -3.8001370e-03],
        ...,
        [ 7.8215245e-03, -2.5606330e-04,  2.3081205e-03, ...,
          1.6792014e-02,  1.0876923e-02, -4.1219615e-03],
        [ 7.6563433e-03,  2.8635876e-04,  1.9904436e-03, ...,
          1.7501252e-02,  1.0738020e-02, -4.1031847e-03],
        [ 8.2794465e-03, -9.4826508e-05,  2.0592671e-03, ...,
          1.6905840e-02,  1.0454825e-02, -4.5564095e-03]], dtype=float32)>,
 <tf.Tensor: shape=(20, 111), dtype=float32, numpy=
 array([[ 0.00661545, -0.00986471, -0.01042728, ..., -0.00091787,
          0.01066955,  0.00064627],
        [ 0.00610968, -0.0098733 , -0.01050

In [48]:
test_index_inputs

array([[41, 59, 56, 61,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0]])