<a href="https://colab.research.google.com/github/kimhwijin/TensorflowWithKeras/blob/master/RNN/seq2seq_translate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import numpy as np
import re
import shutil
import tensorflow as tf
import os
import unicodedata
import zipfile
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


In [2]:
def preprocessing_sentence(sent):
    sent = "".join([c for c in unicodedata.normalize("NFD", sent) if unicodedata.category(c) != "Mn"])
    #문자열 앞에 r 이붙으면 그대로 반환 r'abcd\n' = abcd\n
    sent = re.sub(r"([!.?])", r" \1", sent)
    #알파벳 또는 ! ? 제외하고 공백으로 치환
    sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)
    #공백문자를 띄어쓰기 한칸으로 변경
    sent = re.sub(r"\s+", " ", sent)
    sent = sent.lower()
    return sent

def download_and_read(url, num_sent_pairs=30000):

    local_file = url.split('/')[-1]
    drive_path = "drive/MyDrive/Datasets/anki-eng-frg"
    data_path = os.path.join(drive_path, local_file)
    if not os.path.isfile(data_path):
        os.system('wget -O {:s} -P {:s} {:s}'.format(local_file, drive_path, url))
        with zipfile.ZipFile(data_path, 'r') as zip_ref:
            zip_ref.extractall(data_path)
    file_path = os.path.join(drive_path, 'fra.txt')
    en_sents, fr_sents_in, fr_sents_out = [], [], []

    with open(file_path, 'r') as fin:
        for i , line in enumerate(fin):
            en_sent, fr_sent, _ = line.strip().split('\t')
            en_sent = [w for w in preprocessing_sentence(en_sent).split()]
            fr_sent = preprocessing_sentence(fr_sent)
            fr_sent_in = [w for w in ("BOS" + fr_sent).split()]
            fr_sent_out = [w for w in (fr_sent + "EOS").split()]
            en_sents.append(en_sent)
            fr_sents_in.append(fr_sent_in)
            fr_sents_out.append(fr_sent_out)
            if i >= num_sent_pairs - 1:
                break
    return en_sents, fr_sents_in, fr_sents_out


In [3]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_timestemps, encoder_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.encoder_dim = encoder_dim
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=num_timestemps)
        self.rnn = tf.keras.layers.GRU(self.encoder_dim, return_sequences=True, return_state=True)

    def call(self, x, state):
        x = self.embedding(x)
        x, state = self.rnn(x, initial_state=state)
        return x, state
    def init_state(self, batch_size):
        return tf.zeros((batch_size, self.encoder_dim))

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_timestemps, decoder_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.decoder_dim = decoder_dim
        self.embedding = tf.keras.layers.Embedding(vocab_size, decoder_dim, input_length=num_timestemps)
        self.rnn = tf.keras.layers.GRU(decoder_dim, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)
    
    def call(self, x, state):
        x = self.embedding(x)
        x, state = self.rnn(x, state)
        x = self.dense(x)
        return x, state

In [4]:
#패딩된 부분 마스킹후, 로스 계산
def loss_fn(ytrue, ypred):
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    mask = tf.math.logical_not(tf.math.equal(ytrue, 0 ))
    mask = tf.cast(mask, dtype=tf.int64)
    loss = scce(ytrue, ypred, sample_weight=mask)
    return loss

@tf.function
def train_step(encoder_in, decoder_in, decoder_out, encoder_state):
    with tf.GradientTape() as tape:
        decoder_state = encoder_state
        decoder_pred, decoder_state = decoder(decoder_in, decoder_state)
        loss = loss_fn(decoder_out, decoder_pred)
    variables = (encoder.trainable_variables + decoder.trainable_variables)
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss

In [5]:
#하이퍼 파라미터
NUM_SENT_PAIRS = 30000
EMBEDDING_DIM = 256
ENCODER_DIM, DECODER_DIM = 1024, 1024
BATCH_SIZE = 64
NUM_EPOCHS = 30

#
checkpoint_dir = 'drive/MyDrive/Colab Notebooks/models/RNN_GRU_seq2seq'
#문장 데이터
sents_en, sents_fr_in, sents_fr_out = download_and_read('https://www.manythings.org/anki/fra-eng.zip', NUM_SENT_PAIRS)

In [6]:
#토크나이저
tokenizer_en = tf.keras.preprocessing.text.Tokenizer(filters="", lower=False)
tokenizer_en.fit_on_texts(sents_en)
data_en = tokenizer_en.texts_to_sequences(sents_en)
#뒤쪽 빈 부분을 채워줌
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en, padding='post')

#데이터 및 토크나이저 설정
tokenizer_fr = tf.keras.preprocessing.text.Tokenizer(filters="", lower=False)
tokenizer_fr.fit_on_texts(sents_fr_in)
tokenizer_fr.fit_on_texts(sents_fr_out)
data_fr_in = tokenizer_fr.texts_to_sequences(sents_fr_in)
data_fr_out = tokenizer_fr.texts_to_sequences(sents_fr_out)
data_fr_in = tf.keras.preprocessing.sequence.pad_sequences(data_fr_in, padding='post')
data_fr_out = tf.keras.preprocessing.sequence.pad_sequences(data_fr_out, padding='post')

#단어 개수
vocab_size_en = len(tokenizer_en.word_index)
vocab_size_fr = len(tokenizer_fr.word_index)
word2idx_en = tokenizer_en.word_index
idx2word_en = {v: k for k , v in word2idx_en.items()}
word2idx_fr = tokenizer_fr.word_index
idx2word_fr = {v: k for k , v in word2idx_fr.items()}
print("단어 사이즈 (en) : {:d}, (fr) : {:d}".format(vocab_size_en, vocab_size_fr))
maxlen_en = data_en.shape[1]
maxlen_fr = data_fr_out.shape[1]
print("기준 시퀀셜 길이 (en) : {:d}, (fr) : {:d}".format(maxlen_en, maxlen_fr))

단어 사이즈 (en) : 4354, (fr) : 8740
기준 시퀀셜 길이 (en) : 8, (fr) : 15


In [7]:
#test, train dataset // 1 : 3 비율
dataset = tf.data.Dataset.from_tensor_slices((data_en, data_fr_in, data_fr_out))
dataset = dataset.shuffle(10000)
test_size = NUM_SENT_PAIRS // 4
test_dataset = dataset.take(test_size).batch(BATCH_SIZE, drop_remainder=True)
train_dataset = dataset.skip(test_size).batch(BATCH_SIZE, drop_remainder=True)

In [8]:
encoder = Encoder(vocab_size_en + 1, EMBEDDING_DIM, maxlen_en, ENCODER_DIM)
decoder = Decoder(vocab_size_fr + 1, EMBEDDING_DIM, maxlen_fr, ENCODER_DIM)

In [9]:
#shape test
for encoder_in, decoder_in, decoder_out in train_dataset:
    encoder_state = encoder.init_state(BATCH_SIZE)
    encoder_out, encoder_state = encoder(encoder_in, encoder_state)
    decoder_state = encoder_state
    decoder_pred, decoder_state = decoder(decoder_in, decoder_state)
    break

print("Encoder 입력 : ", encoder_in.shape)
print("ENcoder 출력 : ", encoder_out.shape, "state : ", encoder_state.shape)
print("Decoder 입력 : ", decoder_in.shape)
print("Decoder 출력 : ", decoder_out.shape, "state : ", decoder_state.shape)

Encoder 입력 :  (64, 8)
ENcoder 출력 :  (64, 8, 1024) state :  (64, 1024)
Decoder 입력 :  (64, 15)
Decoder 출력 :  (64, 15) state :  (64, 1024)


In [10]:
def predict(encoder, decoder, batch_size, sents_en, data_en, sents_fr_out, word2idx_fr, idx2word_fr):
    random_id = np.random.choice(len(sents_en))
    #입력 문장
    print("입력 : ", "".join(sents_en[random_id]))
    #출력 문장
    print("출력 : ", "".join(sents_fr_out[random_id]))
    #배치차원 생성
    encoder_in = tf.expand_dims(data_en[random_id], axis=0)
    decoder_out = tf.expand_dims(sents_fr_out[random_id], axis=0)
    
    #1 배치 사이스 state
    encoder_state = encoder.init_state(batch_size=1)
    #인코더
    encoder_out, encoder_state = encoder(encoder_in, encoder_state)
    decoder_state = encoder_state
    #처음 BOS 에 해당하는 index로 시작
    decoder_in = tf.expand_dims(tf.constant([word2idx_fr["BOS"]]), axis=0)
    pred_sent_fr = []
    #sequence 진행
    while True:
        #예상 단어
        decoder_pred, decoder_state = decoder(decoder_in, decoder_state)
        decoder_pred = tf.argmax(decoder_pred, axis=-1)
        pred_word = idx2word_fr[decoder_pred.numpy()[0][0]]
        #문장에 추가
        pred_sent_fr.append(pred_word)
        #마지막
        if pred_word == "EOS":
            break
        decoder_in = decoder_pred
    print("predicted: ", "".join(pred_sent_fr))

#BiLingual Evaluation Understudy 점수 / 테스트 dataset 전반에 걸쳐 수행
def evaluate_bleu_score(encoder, decoder, test_dataset, word2idx_fr, idx2word_fr):
    bleu_scores = []
    smooth_fn = SmoothingFunction()
    for encoder_in, decoder_in, decoder_out in test_dataset:
        encoder_state = encoder.init_state(BATCH_SIZE)
        encoder_out, encoder_state = encoder(encoder_in, encoder_state)
        decoder_state = encoder_state

    decoder_pred, decoder_state = decoder(decoder_in, decoder_state)

    decoder_out = decoder_out.numpy()
    decoder_pred = tf.argmax(decoder_pred, axis=-1).numpy()

    for i in range(decoder_out.shape[0]):
        ref_sent = [idx2word_fr[j] for j in decoder_out[j].tolist() if j > 0]
        hyp_sent = [idx2word_fr[j] for j in decoder_pred_[j].tolist() if j > 0]
        #EOS 제거
        ref_sent = ref_sent[0:-1]
        hyp_sent = hyp_sent[0:-1]
        bleu_score = sentence_bleu([ref_sent], hyp_sent, smoothing_function=smooth_fn.method1)
        bleu_scores.append(bleu_score)
    
    return np.mean(np.array(bleu_scores))

In [11]:
#Train
optimizer = tf.keras.optimizers.Adam()
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

num_epochs = 250
eval_scores = []
for e in range(num_epochs):
    encoder_state = encoder.init_state(BATCH_SIZE)
    print(len(train_dataset))
    for batch, data in enumerate(train_dataset):
        encoder_in, decoder_in, decoder_out = data
        loss = train_step(encoder_in, decoder_in, decoder_out, encoder_state)
    print("\nEpochs : {}, loss : {:.4f}".format(e + 1, loss.numpy()))

    if e % 10 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    
    predict(encoder, decoder, BATCH_SIZE, sents_en, data_en, sents_fr_out, word2idx_fr, idx2word_fr)
    eval_score = evaluate_bleu_score(encoder, decoder, test_dataset, word2idx_fr, idx2word_fr)
    print(type(eval_score))
    print("eval score : {:.3e}".format(eval_score))

checkpoint.save(file_prefix=checkpoint_prefix)

351

Epochs : 1, loss : 0.3081
입력 :  allofusstoodup.
출력 :  nousnoussommestoutesmisesdebout.EOS


KeyboardInterrupt: ignored