# TF2 로 seq2seq 구현  (with attention)

In [1]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # 텐서플로가 첫 번째 GPU만 사용하도록 제한
    # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print('GPU[0] is ready')
    except RuntimeError as e:
        # 프로그램 시작시에 접근 가능한 장치가 설정되어야만 합니다
        print(e)
else:
    print('Please check GPU available')
    
import os
import sys
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, optimizers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
from tensorflow import feature_column as fc
import tensorflow_datasets as tfds
plt.rcParams["font.family"] = 'NanumBarunGothic'
TENSORBOARD_BINARY = '/home/hoondori/anaconda3/envs/ai/bin/tensorboard'
os.environ['TENSORBOARD_BINARY'] =  TENSORBOARD_BINARY   

GPU[0] is ready


In [2]:
import nltk
import numpy as np
import re
import shutil
import tensorflow as tf
import os
import unicodedata
import zipfile

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# 데이터 처리

* teacher forcing을 사용해야 하므로 (X, Y, Y') 형태

In [4]:
def preprocess_sentence(sent):
    sent = "".join([c for c in unicodedata.normalize("NFD", sent) 
        if unicodedata.category(c) != "Mn"])
    sent = re.sub(r"([!.?])", r" \1", sent)
    sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)
    sent = re.sub(r"\s+", " ", sent)
    sent = sent.lower()
    return sent


def download_and_read():
    en_sents, fr_sents_in, fr_sents_out = [], [], []
    local_file = '/home/hoondori/data/fra-eng/fra.txt'
    with open(local_file, "r") as fin:
        for i, line in enumerate(fin):
            en_sent, fr_sent, _ = line.strip().split('\t')
            en_sent = [w for w in preprocess_sentence(en_sent).split()]
            fr_sent = preprocess_sentence(fr_sent)
            fr_sent_in = [w for w in ("BOS " + fr_sent).split()]
            fr_sent_out = [w for w in (fr_sent + " EOS").split()]
            en_sents.append(en_sent)
            fr_sents_in.append(fr_sent_in)
            fr_sents_out.append(fr_sent_out)
            if i >= NUM_SENT_PAIRS - 1:
                break
    return en_sents, fr_sents_in, fr_sents_out

NUM_SENT_PAIRS = 30000
sents_en, sents_fr_in, sents_fr_out = download_and_read()
print(len(sents_en), len(sents_fr_in), len(sents_fr_out))

30000 30000 30000


In [5]:
tokenizer_en = tf.keras.preprocessing.text.Tokenizer(
    filters="", lower=False)
tokenizer_en.fit_on_texts(sents_en)
data_en = tokenizer_en.texts_to_sequences(sents_en)
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en, padding="post")

tokenizer_fr = tf.keras.preprocessing.text.Tokenizer(
    filters="", lower=False)
tokenizer_fr.fit_on_texts(sents_fr_in)
tokenizer_fr.fit_on_texts(sents_fr_out)
data_fr_in = tokenizer_fr.texts_to_sequences(sents_fr_in)
data_fr_in = tf.keras.preprocessing.sequence.pad_sequences(data_fr_in, padding="post")
data_fr_out = tokenizer_fr.texts_to_sequences(sents_fr_out)
data_fr_out = tf.keras.preprocessing.sequence.pad_sequences(data_fr_out, padding="post")

vocab_size_en = len(tokenizer_en.word_index)
vocab_size_fr = len(tokenizer_fr.word_index)
word2idx_en = tokenizer_en.word_index
idx2word_en = {v:k for k, v in word2idx_en.items()}
word2idx_fr = tokenizer_fr.word_index
idx2word_fr = {v:k for k, v in word2idx_fr.items()}
print("vocab size (en): {:d}, vocab size (fr): {:d}".format(
    vocab_size_en, vocab_size_fr))

maxlen_en = data_en.shape[1]
maxlen_fr = data_fr_out.shape[1]
print("seqlen (en): {:d}, (fr): {:d}".format(maxlen_en, maxlen_fr))

vocab size (en): 4339, vocab size (fr): 7649
seqlen (en): 8, (fr): 16


In [6]:
# dataset
BATCH_SIZE = 128
dataset = tf.data.Dataset.from_tensor_slices((data_en, data_fr_in, data_fr_out))
dataset = dataset.shuffle(10000)
test_size = NUM_SENT_PAIRS // 4
test_dataset = dataset.take(test_size).batch(BATCH_SIZE, drop_remainder=True)
train_dataset = dataset.skip(test_size).batch(BATCH_SIZE, drop_remainder=True)
train_dataset

<BatchDataset shapes: ((128, 8), (128, 16), (128, 16)), types: (tf.int32, tf.int32, tf.int32)>

# 모델

In [7]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_timesteps, encoder_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.encoder_dim = encoder_dim
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim, input_length=num_timesteps)
        self.rnn = tf.keras.layers.GRU(
            encoder_dim, return_sequences=True, return_state=True)   # attention 사용을 위해 return_sequences=True

    def call(self, x, state):
        x = self.embedding(x)
        x, state = self.rnn(x, initial_state=state)
        return x, state

    def init_state(self, batch_size):
        return tf.zeros((batch_size, self.encoder_dim))

## Bahdanau Attention : e = v_t tanh(W[query; value])  

* query는 디코더 은닉 상태, value는 인코더 은닉 상태

In [8]:
class BahdanauAttention(tf.keras.layers.Layer):
    
    def __init__(self, num_units, **kwargs):
        super(BahdanauAttention, self).__init__(**kwargs)
        
        self.Wq = tf.keras.layers.Dense(num_units) 
        self.Wv = tf.keras.layers.Dense(num_units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, query, values):
        
        # query shape : (batch_size, num_units)
        # value shape : (batch_size, timesteps, num_units)
        
        # (batch_size, 1, num_units)
        query_with_time_axis = tf.expand_dims(query, axis=1)  
        
        # (batch_size, timesteps, 1)
        scores = self.V( tf.keras.activations.tanh(             
            self.Wv(values) + self.Wq(query_with_time_axis)
        ))
        
        # (batch_size, timesteps, 1)
        alignments = tf.nn.softmax(scores, axis=1)  
        
        # (batch_size, num_units)
        context = tf.reduce_sum(
            tf.linalg.matmul(                           # (batch_size, 1, num_units)
                tf.linalg.matrix_transpose(alignments),   # (batch_size, 1, timesteps)
                values                                # (batch_size, timesteps, num_units)
            ), axis=1
        )
        
        #context = tf.expand_dims(context, axis=1)   ??
        return context, alignments

In [9]:
class LuongAttention(tf.keras.layers.Layer):
    
    def __init__(self, num_units):
        super(LuongAttention, self).__init__()
        self.W = tf.keras.layers.Dense(num_units)
        
    def call(self, query, values):
        
        # query shape : (batch_size, num_units)
        # value shape : (batch_size, timesteps, num_units)
        
        # (batch_size, 1, num_units)
        query_with_time_axis = tf.expand_dims(query, axis=1)  
        
        # (batch_size, 1, timesteps) = (batch_size, 1, num_units) * (batch_size, num_units, timesteps) 
        scores = tf.linalg.matmul(query_with_time_axis, self.W(values), transpose_b=True)
        
        # (batch_size, 1, timesteps)
        alignments = tf.nn.softmax(scores, axis=2)
        
        # (batch_size, num_units)
        context = tf.reduce_sum(
            tf.linalg.matmul(
                alignments, 
                values
            ), axis=1
        )
        
        return context, alignments
    
        

In [10]:
batch_size = 64
num_timesteps = maxlen_en
num_units = 256

query = np.random.random(size=(batch_size, num_units)).astype(np.float32)
values = np.random.random(size=(batch_size, num_timesteps, num_units)).astype(np.float32)

b_attn = BahdanauAttention(num_units)
context, alignments = b_attn(query, values)
print("Bahdanau: context.shape:", context.shape, "alignments.shape:", alignments.shape)
# # Bahdanau: context.shape: (64, 1024) alignments.shape: (64, 8, 1)

l_attn = LuongAttention(num_units)
context, alignments = l_attn(query, values)
print("Luong: context.shape:", context.shape, "alignments.shape:", alignments.shape)

Bahdanau: context.shape: (64, 256) alignments.shape: (64, 8, 1)
Luong: context.shape: (64, 256) alignments.shape: (64, 1, 8)


In [11]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_timesteps,decoder_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        
        self.decoder_dim = decoder_dim
        
        self.attention = LuongAttention(decoder_dim)
        # self.attention = BahdanauAttention(decoder_dim)
        
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim, input_length=num_timesteps)
        
        self.rnn = tf.keras.layers.GRU(
            decoder_dim, return_sequences=True, return_state=True)
        
        self.Wc = tf.keras.layers.Dense(decoder_dim, activation='tanh')
        self.Ws = tf.keras.layers.Dense(vocab_size)

    def call(self, x, state, encoder_out):
        
        x = self.embedding(x)
        
        context, alignments = self.attention(state, encoder_out)
        
        concat = tf.concat([x, context], axis=1)    # (batch_size, decoder_dim + embed_dim)
        
        x, state = self.rnn(
            tf.expand_dims(concat, axis=1)
        )
        x = self.Wc(x)
        x = self.Ws(x)
        return x, state, alignments
    
EMBEDDING_DIM = 256
ENCODER_DIM, DECODER_DIM = 1024, 1024

embedding_dim = EMBEDDING_DIM
encoder_dim, decoder_dim = ENCODER_DIM, DECODER_DIM

tf.random.set_seed(42)

encoder = Encoder(vocab_size_en+1, embedding_dim, maxlen_en, encoder_dim)
decoder = Decoder(vocab_size_fr+1, embedding_dim, maxlen_fr, decoder_dim)     

In [12]:
# 디버깅 (출력 점검)

for enc_in, dec_in, dec_out in train_dataset.take(1):
    enc_init_state = encoder.init_state(BATCH_SIZE)
    encoder_out,enc_state = encoder(enc_in, enc_init_state)
    decoder_state = enc_state

    print("encoder input :", enc_in.shape)   # (batch_size, maxlen_en)
    print("encoder state :", enc_state.shape) # (batch_size, encdoer_dim)
    print("decoder input :", dec_in.shape) # (batch_size, maxlen_fr)
    
    for t in range(dec_out.shape[1]):
        decoder_in_t = dec_in[:, t]
        decoder_pred_t, decoder_state, _ = decoder(decoder_in_t, decoder_state, encoder_out)
        if t == 0:
            print("decoder input at t:", decoder_in_t.shape) # (batch_size, )
            print("decoder state at t:", decoder_state.shape) # (batch_size, decoder_dim)
            print("decoder predict at t:", decoder_pred_t.shape) # (batch_size, 1, vocab_size_fr+1)

encoder input : (128, 8)
encoder state : (128, 1024)
decoder input : (128, 16)
decoder input at t: (128,)
decoder state at t: (128, 1024)
decoder predict at t: (128, 1, 7650)


# 평가 정의

In [13]:
def predict(encoder, decoder, batch_size, sents_en, data_en, sents_fr_out, word2idx_fr, idx2word_fr):
    random_id = np.random.choice(len(sents_en))
    print("input    : ",  " ".join(sents_en[random_id]))
    print("label    : ", " ".join(sents_fr_out[random_id]))

    encoder_in = tf.expand_dims(data_en[random_id], axis=0)
    decoder_out = tf.expand_dims(sents_fr_out[random_id], axis=0)

    encoder_state = encoder.init_state(1)
    encoder_out, encoder_state = encoder(encoder_in, encoder_state)
    decoder_state = encoder_state

    pred_sent_fr = []
    decoder_in = tf.expand_dims(tf.constant(word2idx_fr["BOS"]), axis=0)

    while True:
        decoder_pred, decoder_state, _ = decoder(decoder_in, decoder_state, encoder_out)
        decoder_pred = tf.argmax(decoder_pred, axis=-1)
        pred_word = idx2word_fr[decoder_pred.numpy()[0][0]]
        pred_sent_fr.append(pred_word)
        if pred_word == "EOS" or len(pred_sent_fr) >= maxlen_fr:
            break
        decoder_in = tf.squeeze(decoder_pred, axis=1)

    print("predicted: ", " ".join(pred_sent_fr))


def evaluate_bleu_score(encoder, decoder, test_dataset, 
        word2idx_fr, idx2word_fr):

    bleu_scores = []
    smooth_fn = SmoothingFunction()

    for encoder_in, decoder_in, decoder_out in test_dataset:
        encoder_state = encoder.init_state(batch_size)
        encoder_out, encoder_state = encoder(encoder_in, encoder_state)
        decoder_state = encoder_state

        ref_sent_ids = np.zeros_like(decoder_out)
        hyp_sent_ids = np.zeros_like(decoder_out)
        for t in range(decoder_out.shape[1]):
            decoder_out_t = decoder_out[:, t]
            decoder_in_t = decoder_in[:, t]
            decoder_pred_t, decoder_state, _ = decoder(
                decoder_in_t, decoder_state, encoder_out)
            decoder_pred_t = tf.argmax(decoder_pred_t, axis=-1)
            for b in range(decoder_pred_t.shape[0]):
                ref_sent_ids[b, t] = decoder_out_t.numpy()[0]
                hyp_sent_ids[b, t] = decoder_pred_t.numpy()[0][0]
        
        for b in range(ref_sent_ids.shape[0]):
            ref_sent = [idx2word_fr[i] for i in ref_sent_ids[b] if i > 0]
            hyp_sent = [idx2word_fr[i] for i in hyp_sent_ids[b] if i > 0]
            # remove trailing EOS
            ref_sent = ref_sent[0:-1]
            hyp_sent = hyp_sent[0:-1]
            bleu_score = sentence_bleu([ref_sent], hyp_sent,
                smoothing_function=smooth_fn.method1)
            bleu_scores.append(bleu_score)

    return np.mean(np.array(bleu_scores))
  

# 훈련

In [14]:
# loss 정의 : 둘 다 zero(PAD)인 것은 제외하고 평가

def loss_fn(ytrue, ypred):
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    mask = tf.math.logical_not(tf.math.equal(ytrue, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    loss = scce(ytrue, ypred, sample_weight=mask)
    return loss

In [15]:
@tf.function
def train_step(encoder_in, decoder_in, decoder_out, encoder_state):
    with tf.GradientTape() as tape:
        encoder_out, encoder_state = encoder(encoder_in, encoder_state)
        decoder_state = encoder_state

        loss = 0
        for t in range(decoder_out.shape[1]):
            decoder_in_t = decoder_in[:, t]
            decoder_pred_t, decoder_state, _ = decoder(decoder_in_t,
                decoder_state, encoder_out)
            loss += loss_fn(decoder_out[:, t], decoder_pred_t)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss / decoder_out.shape[1]

In [17]:
checkpoint_dir = '/tmp/logs/seq2seq_with_attn/'
optimizer = tf.keras.optimizers.Adam()
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

num_epochs = 50


In [18]:
eval_scores = []
batch_size = BATCH_SIZE

for e in range(num_epochs):
    encoder_state = encoder.init_state(batch_size)

    for i, data in enumerate(train_dataset):
        encoder_in, decoder_in, decoder_out = data
        # print(encoder_in.shape, decoder_in.shape, decoder_out.shape)
        loss = train_step(
            encoder_in, decoder_in, decoder_out, encoder_state)
        
        #print('{:d}/{:d} loss={:.4f}'.format(i, e, loss))
    
    print("Epoch: {}, Loss: {:.4f}".format(e + 1, loss.numpy()))

    if e % 10 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    
    predict(encoder, decoder, batch_size, sents_en, data_en,
        sents_fr_out, word2idx_fr, idx2word_fr)

    eval_score = evaluate_bleu_score(encoder, decoder, test_dataset, word2idx_fr, idx2word_fr)
    print("Eval Score (BLEU): {:.3e}".format(eval_score))
    # eval_scores.append(eval_score)

checkpoint.save(file_prefix=checkpoint_prefix)

Epoch: 1, Loss: 1.4314
input    :  he gave up hope .
label    :  il abandonna tout espoir . EOS
predicted:  il est un chemin . EOS
Eval Score (BLEU): 1.773e-02
Epoch: 2, Loss: 1.1245
input    :  i wanted more .
label    :  j en voulais plus . EOS
predicted:  je suis de l air . EOS
Eval Score (BLEU): 2.523e-02
Epoch: 3, Loss: 0.8434
input    :  we re worried .
label    :  nous nous faisons du souci . EOS
predicted:  nous sommes allees . EOS
Eval Score (BLEU): 3.435e-02
Epoch: 4, Loss: 0.7225
input    :  take command .
label    :  prenez le controle . EOS
predicted:  prenez la tete . EOS
Eval Score (BLEU): 4.168e-02
Epoch: 5, Loss: 0.6033
input    :  tom grinned .
label    :  tom souriait . EOS
predicted:  tom chatouillait . EOS
Eval Score (BLEU): 6.306e-02
Epoch: 6, Loss: 0.5205
input    :  i m desperate .
label    :  je suis desespere . EOS
predicted:  je suis desarme . EOS
Eval Score (BLEU): 8.144e-02
Epoch: 7, Loss: 0.4065
input    :  you re early .
label    :  vous etes en avance . 

'/tmp/logs/seq2seq_with_attn/ckpt-6'