# Neural Translation with Sequence-to-sequence  

어떤 시퀀스를 다른 시퀀스로 변환하는 seq2seq이 사용되는 가장 대표적인 task가 Neural Translation입니다. 이번 실습을 통해 seq2seq을 구현하는 방법 및 모델의 특성을 익혀 보도록 하겠습니다.  

이번 실습에서 시도해 볼 task는 일반적인 언어간 번역 task 대신, 자연어로 표현된 날짜 표현을 숫자로 표현된 정형화된 날짜 표현으로 '번역'하는 task입니다. 예를 들어 'June 11th, 2019'라는 문자열을 '2019-06-11' 로 번역하는 방식입니다. 언뜻 간단해 보이지만, 세계 각국의 언어로 표현된 날짜 표현을 특정 포맷으로 정확하게 변환해 내야 하므로 아주 간단한 문제는 아닐 것입니다.  

(참고)  
https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html  
https://medium.com/datalogue/attention-in-keras-1892773a4f22  
https://medium.com/@jbetker/implementing-seq2seq-with-attention-in-keras-63565c8e498c  
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html  
https://github.com/neonbjb/ml-notebooks/blob/master/keras-seq2seq-with-attention/keras_translate_notebook.ipynb

In [1]:
import os
import json
import pandas as pd
import numpy as np
import random
import unicodedata
import re
import time
import shutil
from collections import Counter

# Start by importing all the things we'll need.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, CuDNNLSTM, Flatten, TimeDistributed, Dropout, LSTMCell, RNN
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K

# This enables the Jupyter backend on some matplotlib installations.
%matplotlib notebook
import matplotlib.pyplot as plt
# Turn off interactive plots. iplt doesn't work well with Jupyter.
# plt.ioff()

In [2]:
random.seed(1984)

## Load Dataset 

이번에 다루게 될 데이터셋은 keras에서 제공하는 사용하기 편한 데이터셋이 아닙니다. 우선 데이터셋을 열어서 분석해 보겠습니다.

In [3]:
# csv 포맷의 데이터 경로를 지정합니다.
train_dataset_filepath = 'datasets/nmt_date/nmt_date_train.csv'
test_dataset_filepath = 'datasets/nmt_date/nmt_date_test.csv'

In [4]:
df = pd.read_csv(train_dataset_filepath, header=None, names=['X', 'Y'])
df.head(10)

Unnamed: 0,X,Y
0,"12, sept. 2010",2010-09-12
1,ДЕКАБРЬ 14 1992,1992-12-14
2,8 JUIN 1999,1999-06-08
3,11.03.00,2000-03-11
4,16 Mwere wa gatandatũ 1979,1979-06-16
5,"ਫ਼ਰਵਰੀ 1, 1988",1988-02-01
6,19.06.91,1991-06-19
7,12 january 1981,1981-01-12
8,"30 Nzeli, 2015",2015-09-30
9,8 Ado 1996,1996-07-08


In [5]:
df.describe()

Unnamed: 0,X,Y
count,500000,500000
unique,442175,18058
top,29 12 98,2010-01-02
freq,11,50


## Build Vocabulary Dictionary

In [6]:
for i in range(10):
    x = df.iat[i,0]
    print(tuple(x))

('1', '2', ',', ' ', 's', 'e', 'p', 't', '.', ' ', '2', '0', '1', '0')
('Д', 'Е', 'К', 'А', 'Б', 'Р', 'Ь', ' ', '1', '4', ' ', '1', '9', '9', '2')
('8', ' ', 'J', 'U', 'I', 'N', ' ', '1', '9', '9', '9')
('1', '1', '.', '0', '3', '.', '0', '0')
('1', '6', ' ', 'M', 'w', 'e', 'r', 'e', ' ', 'w', 'a', ' ', 'g', 'a', 't', 'a', 'n', 'd', 'a', 't', 'ũ', ' ', '1', '9', '7', '9')
('ਫ', '਼', 'ਰ', 'ਵ', 'ਰ', 'ੀ', ' ', '1', ',', ' ', '1', '9', '8', '8')
('1', '9', '.', '0', '6', '.', '9', '1')
('1', '2', ' ', 'j', 'a', 'n', 'u', 'a', 'r', 'y', ' ', '1', '9', '8', '1')
('3', '0', ' ', 'N', 'z', 'e', 'l', 'i', ',', ' ', '2', '0', '1', '5')
('8', ' ', 'A', 'd', 'o', ' ', '1', '9', '9', '6')


In [7]:
x_corpus = df.iloc[:,0] 
y_corpus = df.iloc[:,1] 
x_corpus.head(10)

0                12, sept. 2010
1               ДЕКАБРЬ 14 1992
2                   8 JUIN 1999
3                      11.03.00
4    16 Mwere wa gatandatũ 1979
5                ਫ਼ਰਵਰੀ 1, 1988
6                      19.06.91
7               12 january 1981
8                30 Nzeli, 2015
9                    8 Ado 1996
Name: X, dtype: object

In [8]:
x_corpus_list = x_corpus.values.tolist()
y_corpus_list = y_corpus.values.tolist()
len(x_corpus_list)

500000

In [9]:
x_char_list = np.concatenate([list(tuple(x)) for x in x_corpus_list], axis=0)
y_char_list = np.concatenate([list(tuple(y)) for y in y_corpus_list], axis=0)
print(len(x_char_list))
print(len(y_char_list))

6664404
5000000


In [10]:
counter_x = Counter(x_char_list)
len(counter_x)

1397

In [11]:
counter_y = Counter(y_char_list)
len(counter_y)

11

In [12]:
counter_x.most_common(100)

[(' ', 945905),
 ('1', 673748),
 ('0', 482926),
 ('9', 459492),
 ('2', 447864),
 ('7', 210627),
 ('8', 210598),
 (',', 168348),
 ('a', 141133),
 ('3', 134137),
 ('.', 133501),
 ('e', 132063),
 ('5', 110015),
 ('4', 109633),
 ('6', 109468),
 ('r', 94754),
 ('u', 78046),
 ('i', 74986),
 ('n', 70189),
 ('o', 67375),
 ('m', 63269),
 ('A', 61107),
 ('t', 54158),
 ('b', 53513),
 ('/', 51559),
 ('s', 47457),
 ('E', 45737),
 ('M', 42798),
 ('d', 40798),
 ('l', 39755),
 ('N', 32898),
 ('R', 32118),
 ('O', 31984),
 ('y', 28159),
 ('j', 27377),
 ('U', 26873),
 ('J', 26653),
 ('I', 26202),
 ('g', 25923),
 ('p', 25635),
 ('S', 25287),
 ('k', 23773),
 ('c', 22859),
 ('D', 22006),
 ('v', 21642),
 ('T', 20996),
 ('w', 20081),
 ('B', 19927),
 ('ر', 19120),
 ('h', 17626),
 ('ا', 16066),
 ('و', 15757),
 ('L', 15437),
 ('-', 14604),
 ('а', 13169),
 ('ي', 12620),
 ('K', 12250),
 ('ب', 11551),
 ('م', 11414),
 ('ن', 11039),
 ('F', 10836),
 ('f', 10807),
 ('P', 10623),
 ('Y', 10500),
 ('р', 10361),
 ('G', 986

In [13]:
counter_y.most_common()

[('-', 1000000),
 ('0', 962000),
 ('1', 880922),
 ('2', 541166),
 ('9', 538203),
 ('7', 242869),
 ('8', 242439),
 ('3', 167482),
 ('5', 142626),
 ('4', 141321),
 ('6', 140972)]

In [14]:
x_vocab = ['PAD', 'BOS', 'EOS', 'UNK']
x_vocab = x_vocab + list(Counter(dict(counter_x.most_common())))
x_vocab[:10]
# x_vocab[-10:]

['PAD', 'BOS', 'EOS', 'UNK', ' ', '1', '0', '9', '2', '7']

In [15]:
y_vocab = ['PAD', 'BOS', 'EOS', 'UNK']
y_vocab = y_vocab + list(Counter(dict(counter_y.most_common())))
y_vocab

['PAD',
 'BOS',
 'EOS',
 'UNK',
 '-',
 '0',
 '1',
 '2',
 '9',
 '7',
 '8',
 '3',
 '5',
 '4',
 '6']

In [16]:
idx2char_x = dict(enumerate(x_vocab))
idx2char_x

{0: 'PAD',
 1: 'BOS',
 2: 'EOS',
 3: 'UNK',
 4: ' ',
 5: '1',
 6: '0',
 7: '9',
 8: '2',
 9: '7',
 10: '8',
 11: ',',
 12: 'a',
 13: '3',
 14: '.',
 15: 'e',
 16: '5',
 17: '4',
 18: '6',
 19: 'r',
 20: 'u',
 21: 'i',
 22: 'n',
 23: 'o',
 24: 'm',
 25: 'A',
 26: 't',
 27: 'b',
 28: '/',
 29: 's',
 30: 'E',
 31: 'M',
 32: 'd',
 33: 'l',
 34: 'N',
 35: 'R',
 36: 'O',
 37: 'y',
 38: 'j',
 39: 'U',
 40: 'J',
 41: 'I',
 42: 'g',
 43: 'p',
 44: 'S',
 45: 'k',
 46: 'c',
 47: 'D',
 48: 'v',
 49: 'T',
 50: 'w',
 51: 'B',
 52: 'ر',
 53: 'h',
 54: 'ا',
 55: 'و',
 56: 'L',
 57: '-',
 58: 'а',
 59: 'ي',
 60: 'K',
 61: 'ب',
 62: 'م',
 63: 'ن',
 64: 'F',
 65: 'f',
 66: 'P',
 67: 'Y',
 68: 'р',
 69: 'G',
 70: 'W',
 71: '月',
 72: 'C',
 73: 'á',
 74: 'z',
 75: '་',
 76: 'س',
 77: 'ل',
 78: 'H',
 79: 'V',
 80: 'е',
 81: 'н',
 82: 'ت',
 83: 'ی',
 84: 'ɔ',
 85: 'ɛ',
 86: 'ŋ',
 87: 'т',
 88: '\u200f',
 89: 'А',
 90: 'र',
 91: 'я',
 92: 'в',
 93: 'с',
 94: 'é',
 95: 'أ',
 96: '́',
 97: 'м',
 98: 'í',
 99: 'л

In [17]:
char2idx_x = {char:index for index, char in enumerate(x_vocab)}
char2idx_x

{'PAD': 0,
 'BOS': 1,
 'EOS': 2,
 'UNK': 3,
 ' ': 4,
 '1': 5,
 '0': 6,
 '9': 7,
 '2': 8,
 '7': 9,
 '8': 10,
 ',': 11,
 'a': 12,
 '3': 13,
 '.': 14,
 'e': 15,
 '5': 16,
 '4': 17,
 '6': 18,
 'r': 19,
 'u': 20,
 'i': 21,
 'n': 22,
 'o': 23,
 'm': 24,
 'A': 25,
 't': 26,
 'b': 27,
 '/': 28,
 's': 29,
 'E': 30,
 'M': 31,
 'd': 32,
 'l': 33,
 'N': 34,
 'R': 35,
 'O': 36,
 'y': 37,
 'j': 38,
 'U': 39,
 'J': 40,
 'I': 41,
 'g': 42,
 'p': 43,
 'S': 44,
 'k': 45,
 'c': 46,
 'D': 47,
 'v': 48,
 'T': 49,
 'w': 50,
 'B': 51,
 'ر': 52,
 'h': 53,
 'ا': 54,
 'و': 55,
 'L': 56,
 '-': 57,
 'а': 58,
 'ي': 59,
 'K': 60,
 'ب': 61,
 'م': 62,
 'ن': 63,
 'F': 64,
 'f': 65,
 'P': 66,
 'Y': 67,
 'р': 68,
 'G': 69,
 'W': 70,
 '月': 71,
 'C': 72,
 'á': 73,
 'z': 74,
 '་': 75,
 'س': 76,
 'ل': 77,
 'H': 78,
 'V': 79,
 'е': 80,
 'н': 81,
 'ت': 82,
 'ی': 83,
 'ɔ': 84,
 'ɛ': 85,
 'ŋ': 86,
 'т': 87,
 '\u200f': 88,
 'А': 89,
 'र': 90,
 'я': 91,
 'в': 92,
 'с': 93,
 'é': 94,
 'أ': 95,
 '́': 96,
 'м': 97,
 'í': 98,
 'л': 9

In [18]:
idx2char_y = dict(enumerate(y_vocab))
idx2char_y

{0: 'PAD',
 1: 'BOS',
 2: 'EOS',
 3: 'UNK',
 4: '-',
 5: '0',
 6: '1',
 7: '2',
 8: '9',
 9: '7',
 10: '8',
 11: '3',
 12: '5',
 13: '4',
 14: '6'}

In [19]:
char2idx_y = {char:index for index, char in enumerate(y_vocab)}
char2idx_y

{'PAD': 0,
 'BOS': 1,
 'EOS': 2,
 'UNK': 3,
 '-': 4,
 '0': 5,
 '1': 6,
 '2': 7,
 '9': 8,
 '7': 9,
 '8': 10,
 '3': 11,
 '5': 12,
 '4': 13,
 '6': 14}

In [20]:
with open(os.path.join('datasets/nmt_date', 'x_vocab.json'), 'w') as f_x:
    json.dump(char2idx_x, f_x)

In [21]:
with open(os.path.join('datasets/nmt_date', 'y_vocab.json'), 'w') as f_y:
    json.dump(char2idx_y, f_y)

### word corpus -> indexed corpus

In [22]:
def convert_sentence_to_indexed_corpus(corpus, char2idx):
    indexed_corpus = [char2idx['BOS']]
    indexed_corpus = indexed_corpus + [char2idx[char] if char in char2idx else char2idx['UNK'] for char in tuple(corpus)]
    indexed_corpus = indexed_corpus + [char2idx_x['EOS']]
    return indexed_corpus

In [23]:
indexed_x_corpus_list = []
for doc in x_corpus_list:
    indexed_x_corpus_list.append(convert_sentence_to_indexed_corpus(doc, char2idx_x))
#     indexed_x_corpus = [char2idx_x['BOS']]
#     indexed_x_corpus = indexed_x_corpus + [char2idx_x[char] if char in char2idx_x else char2idx_x['UNK'] for char in tuple(doc)]
#     indexed_x_corpus = indexed_x_corpus + [char2idx_x['EOS']]
#     indexed_x_corpus_list.append(indexed_x_corpus)

In [24]:
x_corpus_list[0]

'12, sept. 2010'

In [25]:
indexed_x_corpus_list[0]

[1, 5, 8, 11, 4, 29, 15, 43, 26, 14, 4, 8, 6, 5, 6, 2]

In [26]:
char2idx_x['1']

5

In [27]:
char2idx_x['s']

29

In [28]:
indexed_y_corpus_list = []
for doc in y_corpus_list:
    indexed_y_corpus_list.append(convert_sentence_to_indexed_corpus(doc, char2idx_y))
#     indexed_y_corpus = [char2idx_y['BOS']]
#     indexed_y_corpus = indexed_y_corpus + [char2idx_y[char] if char in char2idx_y else char2idx_y['UNK'] for char in tuple(doc)]
#     indexed_y_corpus = indexed_y_corpus + [char2idx_y['EOS']]
#     indexed_y_corpus_list.append(indexed_y_corpus)
#     indexed_y_corpus_list.append([char2idx_y[char] if char in char2idx_y else char2idx_y['UNK'] for char in tuple(doc)])

In [29]:
y_corpus_list[0]

'2010-09-12'

In [30]:
indexed_y_corpus_list[0]

[1, 7, 5, 6, 5, 4, 5, 8, 4, 6, 7, 2]

In [31]:
max_x_corpus_length = max([len(doc) for doc in indexed_x_corpus_list])
max_x_corpus_length

68

In [32]:
max_y_corpus_length = max([len(doc) for doc in indexed_y_corpus_list])
max_y_corpus_length

12

## make (x_train, y_train), (x_test, y_test)  

In [33]:
input_data = tf.keras.preprocessing.sequence.pad_sequences(indexed_x_corpus_list, maxlen=max_x_corpus_length, padding="post")
output_data = tf.keras.preprocessing.sequence.pad_sequences(indexed_y_corpus_list, maxlen=max_y_corpus_length, padding="post")
teacher_data = output_data

target_data = [[teacher_data[n][i+1] for i in range(len(teacher_data[n])-1)] for n in range(len(teacher_data))]
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, maxlen=max_y_corpus_length, padding="post")
target_data = target_data.reshape((target_data.shape[0], target_data.shape[1], 1))

print(input_data.shape)
print(teacher_data.shape)
print(target_data.shape)

(500000, 68)
(500000, 12)
(500000, 12, 1)


In [34]:
teacher_data[0]

array([1, 7, 5, 6, 5, 4, 5, 8, 4, 6, 7, 2])

In [35]:
target_data[0]

array([[7],
       [5],
       [6],
       [5],
       [4],
       [5],
       [8],
       [4],
       [6],
       [7],
       [2],
       [0]])

In [36]:
BUFFER_SIZE = len(x_corpus_list)
BATCH_SIZE = 32
embedding_dim = 16
units = 32
x_vocab_size = len(idx2char_y)
y_vocab_size = len(idx2char_y)
len_input = max_x_corpus_length
len_target = max_y_corpus_length

## build seq2seq Model

In [37]:
# Create the Encoder layers first.
encoder_inputs = Input(shape=(len_input,))
encoder_emb = Embedding(input_dim=x_vocab_size, output_dim=embedding_dim)
encoder_lstm = CuDNNLSTM(units=units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_emb(encoder_inputs))
encoder_states = [state_h, state_c]

Instructions for updating:
Colocations handled automatically by placer.


In [38]:
# Now create the Decoder layers.
decoder_inputs = Input(shape=(None,))
decoder_emb = Embedding(input_dim=y_vocab_size, output_dim=embedding_dim)
decoder_lstm = CuDNNLSTM(units=units, return_sequences=True, return_state=True)
decoder_lstm_out, _, _ = decoder_lstm(decoder_emb(decoder_inputs), initial_state=encoder_states)
# Two dense layers added to this model to improve inference capabilities.
decoder_d1 = Dense(units, activation="relu")
decoder_d2 = Dense(y_vocab_size, activation="softmax")
# Drop-out is added in the dense layers to help mitigate overfitting in this part of the model. Astute developers
# may want to add the same mechanism inside the LSTMs.
decoder_out = decoder_d2(Dropout(rate=.4)(decoder_d1(Dropout(rate=.4)(decoder_lstm_out))))

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [39]:
model = Model([encoder_inputs, decoder_inputs], decoder_out)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 68)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 68, 16)       240         input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 16)     240         input_2[0][0]                    
__________________________________________________________________________________________________
cu_dnnlstm

In [40]:
model.compile(optimizer=tf.train.AdamOptimizer(), loss="sparse_categorical_crossentropy", metrics=['sparse_categorical_accuracy'])

In [41]:

# Note, we use 20% of our data for validation.
# epochs = 10
epochs = 1
history = model.fit([input_data, teacher_data], target_data,
                 batch_size=BATCH_SIZE,
                 epochs=epochs,
                 validation_split=0.2)

Train on 400000 samples, validate on 100000 samples
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
 37280/400000 [=>............................] - ETA: 2:08 - loss: 1.2017 - sparse_categorical_accuracy: 0.5476

KeyboardInterrupt: 

## Inference 

In [69]:
# Create the encoder model from the tensors we previously declared.
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 68)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 68, 16)            240       
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     [(None, 68, 32), (None, 3 6400      
Total params: 6,640
Trainable params: 6,640
Non-trainable params: 0
_________________________________________________________________


In [71]:
# Generate a new set of tensors for our new inference decoder. Note that we are using new tensors, 
# this does not preclude using the same underlying layers that we trained on. (e.g. weights/biases).
inf_decoder_inputs = Input(shape=(None,), name="inf_decoder_inputs")
# We'll need to force feed the two state variables into the decoder each step.
state_input_h = Input(shape=(units,), name="state_input_h")
state_input_c = Input(shape=(units,), name="state_input_c")
decoder_res, decoder_h, decoder_c = decoder_lstm(
    decoder_emb(inf_decoder_inputs), 
    initial_state=[state_input_h, state_input_c])
inf_decoder_out = decoder_d2(decoder_d1(decoder_res))
inf_model = Model(inputs=[inf_decoder_inputs, state_input_h, state_input_c], 
                  outputs=[inf_decoder_out, decoder_h, decoder_c])
inf_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inf_decoder_inputs (InputLayer) (None, None)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 16)     240         inf_decoder_inputs[0][0]         
__________________________________________________________________________________________________
state_input_h (InputLayer)      (None, 32)           0                                            
__________________________________________________________________________________________________
state_input_c (InputLayer)      (None, 32)           0                                            
__________________________________________________________________________________________________
cu_dnnlstm

In [89]:
# Converts the given sentence (just a string) into a vector of char IDs
# Output is 1-D: [timesteps/chars]
def sentence_to_vector(sentence):
    pre = convert_sentence_to_indexed_corpus(sentence, char2idx_x)
    vec = np.zeros(len_input)
#     sentence_list = [char2idx_x[s] for s in pre.split(' ')]
    for i,w in enumerate(pre):
        vec[i] = w
    return vec

In [98]:
# Given an input string, an encoder model (infenc_model) and a decoder model (infmodel),
# return a translated string.
def translate(input_sentence, infenc_model, infmodel, attention=False):
    sv = sentence_to_vector(input_sentence)
    # Reshape so we can use the encoder model. New shape=[samples,sequence length]
    sv = sv.reshape(1,len(sv))
    [emb_out, sh, sc] = infenc_model.predict(x=sv)
    
    i = 0
    start_vec = char2idx_y['BOS']
    stop_vec = char2idx_y['EOS']
    # We will continuously feed cur_vec as an input into the decoder to produce the next word,
    # which will be assigned to cur_vec. Start it with "<start>".
    cur_vec = np.zeros((1,1))
    cur_vec[0,0] = start_vec
    cur_word = 'BOS'
    output_sentence = ""
    # Start doing the feeding. Terminate when the model predicts an "<end>" or we reach the end
    # of the max target language sentence length.
    while cur_word != 'EOS' and i < (len_target-1):
        i += 1
        if cur_word != 'BOS':
            output_sentence = output_sentence + cur_word
        x_in = [cur_vec, sh, sc]
        # This will allow us to accomodate attention models, which we will talk about later.
        if attention:
            x_in += [emb_out]
        [nvec, sh, sc] = infmodel.predict(x=x_in)
        # The output of the model is a massive softmax vector with one spot for every possible word. Convert
        # it to a word ID using argmax().
        cur_vec[0,0] = np.argmax(nvec[0,0])
        cur_word = idx2char_y[np.argmax(nvec[0,0])]
    return output_sentence

In [99]:
translate("June 11th, 2019", encoder_model, inf_model)

'1994-06-24'