In [1]:
import tensorflow as tf
import numpy as np

In [2]:
#Dataset define
SOS_token = 0
EOS_token = 11
n_samples=6000
seq_len = 5

X = np.array([[np.random.randint(1, 6) for _ in range(seq_len)] for _ in range(n_samples)])
Y = X+5

X= np.insert(X, 0, SOS_token, axis=1)
Y = np.insert(Y, seq_len, EOS_token, axis=1)

x_data = X[:int(n_samples * 0.5),:]
y_data = Y[:int(n_samples * 0.5),:]

x_eval = X[int(n_samples * 0.5): ,:]
y_eval = Y[int(n_samples * 0.5): ,:]

In [3]:
#----vocab----#
vocab_size = 12
embedding_dim =16

#----training----#
batch_size = 3000
epochs = 1000

#----encoder,decoder----#
hidden_dim = 128

In [4]:
def batch_generator(x_data, y_data, batch_size):
    n_samples = len(x_data)
    while True:
        batches = range(0, n_samples, batch_size)
        for start in batches:
            end = start + batch_size
            X_batch = x_data[start:end]
            Y_batch = y_data[start:end]
            
            all_data = {
                'Encoder_input' : X_batch,
                'Decoder_input' : Y_batch
            }
            yield (all_data)

In [5]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        # make embedding matrix
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        '''
        LSTM args.  
            
            return_sequences
                True : [batch_size, input_seq_len, hidden_dim]  => return all sequence
                False: [batch_size, hidden_dim] => return last output
        '''
        self.lstm = tf.keras.layers.LSTM(hidden_dim,
                                        return_sequences=True,
                                        return_state=True) 
   
    def call(self, inputs):
        # embeded_input : [batch_size, input_seq_len, embedding_dim]
        embeded_inputs = self.embedding(inputs)
        output, memory_state, carry_state = self.lstm(embeded_inputs, initial_state = self.init_hidden_state())
        return output, memory_state, carry_state
    
    def init_hidden_state(self):
        return (tf.zeros((self.batch_size, self.hidden_dim)),tf.zeros((self.batch_size, self.hidden_dim))) #init state : tuple
    

In [6]:
class Attention(tf.keras.layers.Layer):
    '''
        Attention Mechanism
        1. encoder hidden state 생성
        2. previous decoder hidden state와 모든 encoder hidden state와의 Alignment score 계산 => attention score
        3. attention score softmax함 => attention weight
        4. attention weight X 모든 encoder hidden state => context vector
        5. previous decoder output과 context vector concat하여 Decoder의 input으로 사용함
        6. 1~5반복
    '''
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(hidden_dim)
        self.W2 = tf.keras.layers.Dense(hidden_dim)
        self.V = tf.keras.layers.Dense(1)
    def call(self, encoder_output, decoder_state, BahdanauAttn=True):
        '''
            encoder_output : [batch_size, time_step, hidden_dim]
            decoder_state : [batch_size, hidden_dim]
        '''
        # encoder_output과 shape 맞춤
        decoder_state = tf.expand_dims(decoder_state, 1)
        if BahdanauAttn:
            # self.W2(encoder_output)의 각 time step에 self.W1(decoder_state)더함
            # [batch_size, time_step, 1]
            attn_score = self.V(tf.nn.tanh(self.W1(decoder_state) + self.W2(encoder_output)))
        else:
            # dot_product
            decoder_state = tf.transpose(decoder_state, perm=[0,2,1])
            attn_score = tf.matmul(decoder_state,encoder_output)
            
        # [batch_size, input_seq_len, 1]
        attn_weights = tf.nn.softmax(attn_score, axis =1)
        
        # [batch_size, input_seq_len, 1], 각 encoder_hidden에 attn_weigts 곱함
        context_vector = attn_weights * encoder_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attn_weights

In [7]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        self.lstm = tf.keras.layers.LSTM(self.hidden_dim,
                                        return_sequences=True,
                                        return_state=True)
        self.attn = Attention(hidden_dim)
        #vocab_logit
        self.projection_layer = tf.keras.layers.Dense(vocab_size)
        
    def call(self, inputs, prev_decoder_state, encoder_output):
        context_vector,_ = self.attn(encoder_output, prev_decoder_state)

        #embeded_inputs: [batch_size, 1 , embedding_dim]
        embeded_inputs = self.embedding(inputs)
        
        #new_input: [batch_size, 1 , embedding_dim + hidden_dim]
        new_input = tf.concat([tf.expand_dims(context_vector, 1), embeded_inputs], axis=-1)
        decoder_output, state, carry_state = self.lstm(new_input)

        #[batch, hidden_dim]
        decoder_output = tf.reshape(decoder_output, (-1, decoder_output.shape[2]))
        v =  self.projection_layer(decoder_output)
        return v, state

In [8]:
@tf.function
def train_step(encoder, decoder, optimizer, loss_object, encoder_input, target):
    loss = 0
    with tf.GradientTape() as tape:
        encoder_output, memory_state, carry_state= encoder(encoder_input)
        # decoder init state : feature vector
        hidden = memory_state 
        
        # [[0]* batch_size]
        decoder_input = tf.expand_dims([SOS_token] * batch_size, 1)
        
        # Teacher forcing 
        for t in range(0,target.shape[1]):
            pred, hidden = decoder(decoder_input, hidden, encoder_output)
            # pred와 target[:,t]가 매칭되도록 학습 
            loss += loss_function(loss_object, target[:,t], pred)
            decoder_input = tf.expand_dims(target[:,t], 1)
            
        batch_loss = (loss/int(decoder_input.shape[1]))
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
    
        return batch_loss


In [9]:
def train_run(epoch, encoder, decoder):  
    generator= batch_generator(x_data,y_data,batch_size)
    
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
    
    for e in range(epoch):
        total_loss = 0
        for step in range(len(x_data)//batch_size):
            data = next(generator)
            batch_loss = train_step(encoder, decoder, optimizer, loss_object ,data['Encoder_input'] ,data['Decoder_input'])
        if e % 50 ==0:
            print(f'Epochs :{e}/{epoch}, \t Batch_loss : {batch_loss:.5f}')

In [10]:
def loss_function(loss_object, real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    # padding된 부분 masking하여 loss에 영향을 주지 않도록함
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [11]:
def evaluate(encoder, decoder, n_samples= 10):
    generator= batch_generator(x_data,y_data,1)
    encoder.batch_size = 1
    for i in range(n_samples):
        data = next(generator)
        inputs = data['Encoder_input']
        target = data['Decoder_input'][:,:-1].squeeze()
        encoder_output, memory_state, carry_state= encoder(inputs)
        hidden = memory_state
        
        decoder_input = tf.expand_dims([0], 1)
        result=''
        for t in range(inputs.shape[1]):
            pred, hidden = decoder(decoder_input, hidden, encoder_output)
            pred_id = tf.argmax(pred,axis=1).numpy()
            if pred_id ==EOS_token:
                break
            else :
                result += str(pred_id[0]) +' '
            decoder_input = tf.expand_dims(pred_id,1)
        print(f'real: {target} \t pred : {result}')
        result=''

In [12]:
def main(): 
    encoder = Encoder(vocab_size, embedding_dim, hidden_dim, batch_size)
    decoder = Decoder(vocab_size, embedding_dim, hidden_dim, batch_size)
    train_run(epochs, encoder, decoder)
    evaluate(encoder, decoder)

if __name__ == '__main__':
    main()

Epochs :0/1000, 	 Batch_loss : 14.90906
Epochs :50/1000, 	 Batch_loss : 10.73083
Epochs :100/1000, 	 Batch_loss : 10.53481
Epochs :150/1000, 	 Batch_loss : 8.99809
Epochs :200/1000, 	 Batch_loss : 8.46741
Epochs :250/1000, 	 Batch_loss : 8.19937
Epochs :300/1000, 	 Batch_loss : 8.05801
Epochs :350/1000, 	 Batch_loss : 7.82328
Epochs :400/1000, 	 Batch_loss : 7.51935
Epochs :450/1000, 	 Batch_loss : 11.33467
Epochs :500/1000, 	 Batch_loss : 10.04626
Epochs :550/1000, 	 Batch_loss : 9.52231
Epochs :600/1000, 	 Batch_loss : 9.21502
Epochs :650/1000, 	 Batch_loss : 8.49149
Epochs :700/1000, 	 Batch_loss : 3.08244
Epochs :750/1000, 	 Batch_loss : 1.26239
Epochs :800/1000, 	 Batch_loss : 0.69617
Epochs :850/1000, 	 Batch_loss : 1.23023
Epochs :900/1000, 	 Batch_loss : 0.41134
Epochs :950/1000, 	 Batch_loss : 0.24137
real: [8 8 8 7 7] 	 pred : 8 8 8 7 7 
real: [ 9  6 10  6  9] 	 pred : 9 6 10 6 9 
real: [7 7 6 9 6] 	 pred : 7 7 6 9 6 
real: [10  8  6  9  7] 	 pred : 10 8 6 9 7 
real: [6 7 9 8