### Machine Translation (seq2seq) with RNN
Data preprocessing

In [36]:
import re # regular expression for input data preprocessing
import numpy as np
import tensorflow as tf
import unicodedata # for Français data preprocessing

from tensorflow.keras.layers import Embedding, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [37]:
num_samples = 33000

In [38]:
def to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

In [39]:
def preprocessing(sentence):
  sent = to_ascii(sentence.lower())

  # Insert whitespace between words and puncutation
  # e.g. "I am a student." => "I am a student ."
  sent = re.sub(r"([?.!,¿])", r" \1", sent)

  # Replace with whitespace except (a-z, A-Z, ".", "?", "!", ",")
  sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)

  # Replace multiple whitespaces with single one
  sent = re.sub(r"\s+", " ", sent)
  return sent

In [40]:
sentence_eng = u"Have you had a dinner       ?" # u: unicode
sentence_frn = u"déjà diné       ?"

print(sentence_eng)
print(preprocessing(sentence_eng))
print(sentence_frn)
print(preprocessing(sentence_frn))

Have you had a dinner       ?
have you had a dinner ?
déjà diné       ?
deja dine ?


In [41]:
def load_preprocessed_data():
    encoder_input, decoder_input, decoder_target = [], [], []

    with open("fra.txt", "r") as lines:
        for i, line in enumerate(lines):
            src_line, tar_line, _ = line.strip().split('\t')
            src_lines = [w for w in preprocessing(src_line).split()] # delimiter

            tar_line = preprocessing(tar_line)
            tar_line_in = [w for w in ("<sos> " + tar_line).split()]
            tar_line_out = [w for w in (tar_line + " <eos>").split()]

            encoder_input.append(src_line)
            decoder_input.append(tar_line_in)
            decoder_target.append(tar_line_out)

            if i == num_samples - 1:
                break
        
        return encoder_input, decoder_input, decoder_target

In [42]:
sents_en_in, sents_fra_in, sents_fra_out = load_preprocessed_data()
print('Encoder input : ', sents_en_in[:5])
print('Decoder input : ', sents_fra_in[:5])
print('Decoder label : ', sents_fra_out[:5])

Encoder input :  ['Go.', 'Go.', 'Go.', 'Hi.', 'Hi.']
Decoder input :  [['<sos>', 'va', '!'], ['<sos>', 'marche', '.'], ['<sos>', 'bouge', '!'], ['<sos>', 'salut', '!'], ['<sos>', 'salut', '.']]
Decoder label :  [['va', '!', '<eos>'], ['marche', '.', '<eos>'], ['bouge', '!', '<eos>'], ['salut', '!', '<eos>'], ['salut', '.', '<eos>']]


In [43]:
# words to vector
tokenizer_en = Tokenizer(filters="", lower=False)
tokenizer_en.fit_on_texts(sents_en_in)
encoder_input = tokenizer_en.texts_to_sequences(sents_en_in)
encoder_input = pad_sequences(encoder_input, padding="post")
# print(encoder_input[:5])

tokenizer_fra = Tokenizer(filters="", lower=False)
tokenizer_fra.fit_on_texts(sents_fra_in)
tokenizer_fra.fit_on_texts(sents_fra_out)

decoder_input = tokenizer_fra.texts_to_sequences(sents_fra_in)
decoder_input = pad_sequences(decoder_input, padding="post")

decoder_target = tokenizer_fra.texts_to_sequences(sents_fra_out)
decoder_target = pad_sequences(decoder_target, padding="post")

In [44]:
# For word embedding
src_vocab_size = len(tokenizer_en.word_index) + 1
tar_vocab_size = len(tokenizer_fra.word_index) + 1
print("English vocabulary size : {:d}, French vocabulary size : {:d}".format(src_vocab_size, tar_vocab_size))

English vocabulary size : 7383, French vocabulary size : 8153


In [45]:
src_to_index = tokenizer_en.word_index # word to index number
index_to_src = tokenizer_en.index_word # index number to word
tar_to_index = tokenizer_fra.word_index
index_to_tar = tokenizer_fra.index_word

In [48]:
# shuffle indices for better training
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print('Random sequence :',indices)

Random sequence : [11872 18806 20048 ...  1818 21895 28841]


In [50]:
encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]

In [52]:
n_of_val = int(33000*0.1)
print('Validation data size :',n_of_val)

Validation data size : 3300


In [53]:
encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

Model Training

In [54]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking
from tensorflow.keras.models import Model

In [56]:
embedding_dim = 64
 # 위에서 단어를 tokenize 한 결과 (go -> 82) 는 단어 사이 유사성을 고려하지 않은 데이터. 
 # 비슷한 데이터는 비슷한 값을 갖도록 embedding 을 구해주면 성능이 좋아진다.
 # 82 를 64 dimension vector 로 변환시킬 예정

hidden_units = 64 # z_i 가 64개 있음

In [57]:
# Encoder
# 학습을 통해서 단어 사이 유사성을 고려하여 embedding table 을 만든다.
# ex) embedding table size: 10000 * 64 (10000: 단어 개수, 64: dimension)
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(src_vocab_size, embedding_dim)(encoder_inputs)
enc_masking = Masking(mask_value=0.0)(enc_emb) # zero padding 을 masking 함

encoder_lstm = LSTM(hidden_units, return_state=True)
 # return state: state_c 가 필요해서
encoder_outputs, state_h, state_c = encoder_lstm(enc_masking)
encoder_states = [state_h, state_c]

2023-07-21 11:09:52.708957: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-21 11:09:52.803882: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [58]:
# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(tar_vocab_size, hidden_units) 
dec_emb = dec_emb_layer(decoder_inputs)
dec_masking = Masking(mask_value=0.0)(dec_emb)

decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True) 
 # return sequence: 번역 후 나오는 단어들 (<sos>->person, person->wearing, ... 인 경우 return sequence: person, wewaring, ...)
decoder_outputs, _, _ = decoder_lstm(dec_masking, initial_state=encoder_states)
 # initial state 로 encoder_states 를 사용함 (h_0 로 사용됨)

decoder_dense = Dense(tar_vocab_size, activation='softmax')
 # tar_vocab_size: 영어로 번역하는 경우, 사용되는 영어 단어 개수가 10000개라고 하면
 # output 값이 어떤 단어가 될 지는 각 단어에 속할 확률을 계산 후 가장 높은 확률값을 가진 데이터로 변환하는데
 # 이를 softmax 를 사용한 dense layer 를 통해서 구한다

decoder_outputs = decoder_dense(decoder_outputs)

In [59]:
# Model inputs and outputs
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

In [61]:
model.fit(x=[encoder_input_train, decoder_input_train], y=decoder_target_train, \
          validation_data=([encoder_input_test, decoder_input_test], decoder_target_test),
          batch_size=128, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f415843e140>

Translation

In [62]:
# Encoder
encoder_model = Model(encoder_inputs, encoder_states)

# Design decoder for translation
decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reusing embedding layer
dec_emb2 = dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

# Next word prediction
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Modified decoder
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

2023-07-21 11:32:18.908237: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-21 11:32:18.909630: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-21 11:32:18.910698: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [63]:
def decode_sequence(input_seq):
  states_value = encoder_model.predict(input_seq) # context vector

  # Create an integer for <sos>
  target_seq = np.zeros((1,1))
  target_seq[0, 0] = tar_to_index['<sos>']

  stop_condition = False
  decoded_sentence = ''

  while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    # output token: 10000개 단어를 대상으로 각 단어에 속할 확률

    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    # 10000 개의 확률 중 가장 큰 확률 값을 가진 index
    sampled_char = index_to_tar[sampled_token_index]
    # 가장 확률 값이 컸던 데이터를 캐릭터로 변환

    decoded_sentence += ' '+sampled_char
    # decoded sentence 끝에 단어를 추가

    if (sampled_char == '<eos>' or len(decoded_sentence) > 50):
        stop_condition = True

    target_seq = np.zeros((1,1))
    target_seq[0, 0] = sampled_token_index 
    # 현재 타임스텝의 output(sampled token index) 을 다음 스텝의 input(target seq) 으로 사용

    states_value = [h, c]

  return decoded_sentence

In [64]:
def seq_to_src(input_seq):
  sentence = ''
  for encoded_word in input_seq:
    if(encoded_word != 0):
      sentence = sentence + index_to_src[encoded_word] + ' '
  return sentence

def seq_to_tar(input_seq):
  sentence = ''
  for encoded_word in input_seq:
    if(encoded_word != 0 and encoded_word != tar_to_index['<sos>'] and encoded_word != tar_to_index['<eos>']):
      sentence = sentence + index_to_tar[encoded_word] + ' '
  return sentence

In [65]:
for seq_index in [3, 50, 100, 300, 1001]:
  input_seq = encoder_input_train[seq_index: seq_index + 1]
  decoded_sentence = decode_sequence(input_seq)

  print("Input :",seq_to_src(encoder_input_train[seq_index]))
  print("Label :",seq_to_tar(decoder_input_train[seq_index]))
  print("Output :",decoded_sentence[1:-5])
  print("-"*50)



2023-07-21 11:37:46.597122: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-21 11:37:46.598525: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-21 11:37:46.599635: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Input : You're nuts! 
Label : vous etes dingues ! 
Output : tu es ? 
--------------------------------------------------
Input : Go make popcorn. 
Label : va faire du pop corn . 
Output : soyez ! 
--------------------------------------------------
Input : Keep it short. 
Label : soyez bref . 
Output : soyez calme . 
--------------------------------------------------
Input : Take care! 
Label : prends bien soin de toi . 
Output : soyez prudente ! 
--------------------------------------------------
Input : I will try again. 
Label : j essaierai a nouveau . 
Output : je suis . 
--------------------------------------------------


In [66]:
for seq_index in [3, 50, 100, 300, 1001]:
  input_seq = encoder_input_test[seq_index: seq_index + 1]
  decoded_sentence = decode_sequence(input_seq)

  print("Input :",seq_to_src(encoder_input_test[seq_index]))
  print("Label :",seq_to_tar(decoder_input_test[seq_index]))
  print("Output :",decoded_sentence[1:-5])
  print("-"*50)

Input : Tom's repulsive. 
Label : tom est repoussant . 
Output : tom est la . 
--------------------------------------------------
Input : No way! 
Label : c est pas possible ! 
Output : le chien ! 
--------------------------------------------------
Input : I plan on winning. 
Label : je prevois de gagner . 
Output : j ai pas . 
--------------------------------------------------
Input : Help! I can't swim. 
Label : a l aide ! je ne sais pas nager . 
Output : ce n est pas pas . 
--------------------------------------------------
Input : I can't risk that. 
Label : je ne peux pas risquer cela . 
Output : je ne suis pas . 
--------------------------------------------------
