# Bytepairencoding seq2seq model in keras that translates english <-> german

As a next step I take the model used for the [toy problem of adding/subtracting numbers](SimpleModelForAddingAndSubstraction.ipynb) and train it with english/german data for machine translation.

As trainings set I use the [European Parliament Proceedings Parallel Corpus 1996-2011](http://statmt.org/europarl/) German-English corpus.

In [1]:
# technical detail so that an instance (maybe running in a different window)
# doesn't take all the GPU memory resulting in some strange error messages
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5
set_session(tf.Session(config=config))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import math
import matplotlib.pyplot as plt
import os
import re
import tarfile

from gensim.models import KeyedVectors
import keras
import keras.layers as L
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import requests
import sentencepiece as spm
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm

# Fixing random state ensure reproducible results
RANDOM_STATE=42
np.random.seed(RANDOM_STATE)
tf.set_random_seed(RANDOM_STATE)

In [3]:
# START = '^'
# END = '\n'

MAX_INPUT_LENGTH = 25 #50
MAX_TARGET_LENGTH = 35 #65
LATENT_DIM = 512
EMBEDDING_DIM = 100
BPE_MERGE_OPERATIONS = 1000
EPOCHS = 20
BATCH_SIZE = 128
DROPOUT = 0.5
TEST_SIZE = 500
EMBEDDING_TRAINABLE = True
LEARNING_RATE = 5e-4

## Download and explore data

In [4]:
def download_file(fname, url):
    print(f"Downloading {fname} from {url} ...")
    response = requests.get(url, stream=True)

    total_size = int(response.headers.get('content-length', 0)); 
    block_size = 1024

    download = tqdm(
        response.iter_content(block_size),
        total=math.ceil(total_size // block_size),
        unit='KB',
        unit_scale=True
    )
    with open(f"{fname}", "wb") as handle:
        for data in download:
            handle.write(data)

PATH = 'data'
INPUT_LANG = 'en'
TARGET_LANG = 'de'
LANGUAGES = [INPUT_LANG, TARGET_LANG]
BPE_URL = {lang: f'http://cosyne.h-its.org/bpemb/data/{lang}/' for lang in LANGUAGES}
BPE_MODEL_NAME = {lang: f'{lang}.wiki.bpe.op{BPE_MERGE_OPERATIONS}.model' for lang in LANGUAGES}
BPE_WORD2VEC_NAME = {lang: f'{lang}.wiki.bpe.op{BPE_MERGE_OPERATIONS}.d{EMBEDDING_DIM}.w2v.bin' for lang in LANGUAGES}
DOWNLOAD_FILES = {
    'de-en.tgz': 'http://statmt.org/europarl/v7/de-en.tgz',
    BPE_MODEL_NAME[INPUT_LANG]: f'{BPE_URL[INPUT_LANG]}/{BPE_MODEL_NAME[INPUT_LANG]}',
    BPE_WORD2VEC_NAME[INPUT_LANG] + '.tar.gz': f'{BPE_URL[INPUT_LANG]}/{BPE_WORD2VEC_NAME[INPUT_LANG]}' + '.tar.gz',
    BPE_MODEL_NAME[TARGET_LANG]: f'{BPE_URL[TARGET_LANG]}/{BPE_MODEL_NAME[TARGET_LANG]}',
    BPE_WORD2VEC_NAME[TARGET_LANG] + '.tar.gz': f'{BPE_URL[TARGET_LANG]}/{BPE_WORD2VEC_NAME[TARGET_LANG]}' + '.tar.gz',
}
os.makedirs(PATH, exist_ok=True)

for name, url in DOWNLOAD_FILES.items():
    fname = os.path.join(PATH, name)
    exists = os.path.exists(fname)
    size = os.path.getsize(fname) if exists else -1
    if exists and size > 0:
        print(f'{name} already downloaded ({size / 2**20:3.1f} MB)')
        continue
    download_file(fname, url)
    if (re.search(r'\.(tgz|tar\.gz)$', fname)):
        tar = tarfile.open(fname, "r:gz")
        tar.extractall(path=PATH)
        tar.close()
        print(f'Extracted {fname} ...')


de-en.tgz already downloaded (188.6 MB)
en.wiki.bpe.op1000.model already downloaded (0.2 MB)
en.wiki.bpe.op1000.d100.w2v.bin.tar.gz already downloaded (0.7 MB)
de.wiki.bpe.op1000.model already downloaded (0.2 MB)
de.wiki.bpe.op1000.d100.w2v.bin.tar.gz already downloaded (0.5 MB)


In [5]:
# Following https://github.com/bheinzerling/bpemb/blob/master/preprocess_text.sh
# (ignoring urls as there shouldn't be any in parliament discussions)
def preprocess(line):
    line = re.sub(r'\d+', '0', line)
    line = re.sub(r'\s+', ' ', line)  # keep newlines, but strip together all other whitespaces
    return line.lower().strip()

def read_corpus_lines(language):
    return [preprocess(line) for line in open(f'{PATH}/europarl-v7.de-en.{language}', 'r').readlines()]
    
pd.set_option('max_colwidth', 60)
df = pd.DataFrame(data={
    'input_texts': read_corpus_lines('en'),
    'target_texts': read_corpus_lines('de'), 
})
# df.target_texts = START + df.target_texts + END

In [6]:
len(df)
df.target_texts = df.target_texts  # encode a start symbol (doesn't occur in texts)
df['input_length'] = df.input_texts.apply(len)
df['target_length'] = df.target_texts.apply(len)
df.head()

1920209

Unnamed: 0,input_texts,target_texts,input_length,target_length
0,resumption of the session,wiederaufnahme der sitzungsperiode,25,34
1,i declare resumed the session of the european parliament...,"ich erkläre die am freitag, dem 0. dezember unterbrochen...",203,217
2,"although, as you will have seen, the dreaded 'millennium...","wie sie feststellen konnten, ist der gefürchtete ""millen...",191,185
3,you have requested a debate on this subject in the cours...,im parlament besteht der wunsch nach einer aussprache im...,105,110
4,"in the meantime, i should like to observe a minute' s si...",heute möchte ich sie bitten - das ist auch der wunsch ei...,232,217


In [7]:
# x = df.input_length
# logbins = np.logspace(1,5,20)
# plt.hist(x, bins=logbins)
# plt.xscale('log')
# plt.show();

In [8]:
non_empty = (df.input_length > 1) & (df.target_length > 1)  # there are empty phrases like '\n' --> 'Frau Präsidentin\n'
short_inputs = (df.input_length < MAX_INPUT_LENGTH) & (df.target_length < MAX_TARGET_LENGTH)
sum(non_empty & short_inputs)
df = df[non_empty & short_inputs]

34392

In [9]:
input_pretrained_bpe = KeyedVectors.load_word2vec_format(os.path.join(PATH, BPE_WORD2VEC_NAME[INPUT_LANG]), binary=True)
target_pretrained_bpe = KeyedVectors.load_word2vec_format(os.path.join(PATH, BPE_WORD2VEC_NAME[TARGET_LANG]), binary=True)
sp_input = spm.SentencePieceProcessor()
sp_input.Load(os.path.join(PATH, BPE_MODEL_NAME[INPUT_LANG]))
subwords = sp_input.EncodeAsPieces("this is a test")
print(subwords)
sp_target = spm.SentencePieceProcessor()
sp_target.Load(os.path.join(PATH, BPE_MODEL_NAME[TARGET_LANG]))
subwords = sp_target.EncodeAsPieces("das ist ein test")
print(subwords)

True

['▁this', '▁is', '▁a', '▁t', 'est']


True

['▁das', '▁ist', '▁ein', '▁te', 'st']


In [10]:
input_wordvec_index = dict({
    word: index 
    for index, word 
    in enumerate(['<pad>', '<s>', '</s>'] + input_pretrained_bpe.wv.index2word)  # haven't found start/stop tokens, so add them manually
})
input_unk_index = input_wordvec_index['<unk>']

target_wordvec_index = dict({
    word: index 
    for index, word 
    in enumerate(['<pad>', '<s>', '</s>'] + target_pretrained_bpe.wv.index2word)  # haven't found start/stop tokens, so add them manually
})
target_unk_index = target_wordvec_index['<unk>']

def subword_indices(text, unk_index, sp, wordvec_index):
    subwords = ['<s>'] + sp.EncodeAsPieces(text) + ['</s>']  # automatic add start/stop index
    return [wordvec_index.get(subword, unk_index) for subword in subwords]

def input_subword_indices(text):
    return subword_indices(text, input_unk_index, sp_input, input_wordvec_index)

def target_subword_indices(text):
    return subword_indices(text, target_unk_index, sp_target, target_wordvec_index)

FULL_EMBEDDING_DIM = EMBEDDING_DIM + 2
input_embedding_matrix = np.zeros((len(input_wordvec_index), FULL_EMBEDDING_DIM))
input_embedding_matrix[0, :] = 1e-6 * np.random.standard_normal(FULL_EMBEDDING_DIM)  # pad symbol as close to zero
input_embedding_matrix[1, -1] = 1  # one hot encode start symbol
input_embedding_matrix[2, -2] = 1  # one hot encode stop symbol
input_embedding_matrix[3:, :-2] = input_pretrained_bpe.wv.vectors

target_embedding_matrix = np.zeros((len(target_wordvec_index), FULL_EMBEDDING_DIM))
target_embedding_matrix[0, :] = 1e-6 * np.random.standard_normal(FULL_EMBEDDING_DIM)  # pad symbol as close to zero
target_embedding_matrix[1, -1] = 1  # one hot encode start symbol
target_embedding_matrix[2, -2] = 1  # one hot encode stop symbol
target_embedding_matrix[3:, :-2] = target_pretrained_bpe.wv.vectors

df['input_sequences'] = df.input_texts.apply(input_subword_indices)
df['target_sequences'] = df.target_texts.apply(target_subword_indices)

  after removing the cwd from sys.path.
  # This is added back by InteractiveShellApp.init_path()


In [11]:
input_embedding_matrix[:4, -8:]
target_embedding_matrix[:4, -8:]

array([[-3.92108153e-07, -1.46351495e-06,  2.96120277e-07,
         2.61055272e-07,  5.11345664e-09, -2.34587133e-07,
        -1.41537074e-06, -4.20645323e-07],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00],
       [-1.54319003e-01,  1.55878007e-01,  5.60858011e-01,
        -1.23772003e-01,  1.91783994e-01,  3.10420003e-02,
         0.00000000e+00,  0.00000000e+00]])

array([[-8.83857436e-07,  1.53725106e-07,  5.82087184e-08,
        -1.14297030e-06,  3.57787360e-07,  5.60784526e-07,
         1.08305124e-06,  1.05380205e-06],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00],
       [-1.57903999e-01, -3.65200013e-01, -2.83556014e-01,
        -8.73659998e-02,  5.86996019e-01, -2.23167002e-01,
         0.00000000e+00,  0.00000000e+00]])

In [12]:
# corpus = pd.concat([df.input_texts, df.target_texts])
# corpus = df.target_texts

In [13]:
# tokenizer = keras.preprocessing.text.Tokenizer(num_words=100, filters=None, char_level=True, oov_token='~')
# tokenizer.fit_on_texts(corpus)
# # df['input_sequences'] = tokenizer.texts_to_sequences(df.input_texts)
# df['target_sequences'] = tokenizer.texts_to_sequences(df.target_texts)

In [14]:
# list(reversed(sorted(tokenizer.word_counts.items(), key=lambda d: d[1])))
# sum(1 for w, count in tokenizer.word_counts.items() if count > 1000)

In [15]:
max_len_input = df.input_sequences.apply(len).max()
max_len_target = df.target_sequences.apply(len).max()
nr_input_tokens = len(input_wordvec_index)  
nr_target_tokens = len(target_wordvec_index)  #len(tokenizer.word_index) + 1  # add 0 padding not in word_index contained

# one hot encoded y_t_output wouldn't fit into memory any longer
# so need to train/validate on batches generated on the fly
def create_batch_generator(samples_ids):
    
    def batch_generator():
        nr_batches = np.ceil(len(samples_ids) / BATCH_SIZE)
        while True:
            shuffled_ids = np.random.permutation(samples_ids)
            batch_splits = np.array_split(shuffled_ids, nr_batches)
            for batch_ids in batch_splits:
                batch_X = pad_sequences(df.iloc[batch_ids].input_sequences, padding='post', maxlen=max_len_input)
                batch_y = pad_sequences(df.iloc[batch_ids].target_sequences, padding='post', maxlen=max_len_target)
                batch_y_t_output = keras.utils.to_categorical(batch_y[:,1:], num_classes=nr_target_tokens)
                batch_x_t_input = batch_y[:,:-1]
                yield ([batch_X, batch_x_t_input], batch_y_t_output)
    
    return batch_generator()

In [16]:
train_ids, val_ids = train_test_split(np.arange(df.shape[0]), test_size=0.1)

In [17]:
nr_input_tokens, nr_target_tokens
# len(tokenizer.word_index)
len(train_ids), len(val_ids)

(1832, 1369)

(30952, 3440)

In [18]:
encoder_gru = L.Bidirectional(
    L.GRU(LATENT_DIM // 2, dropout=DROPOUT, return_state=True, name='encoder_gru'),
    name='encoder_bidirectional'
)
decoder_gru = L.GRU(LATENT_DIM, dropout=DROPOUT, return_sequences=True, return_state=True, name='decoder_gru')
decoder_dense = L.Dense(nr_target_tokens, activation='softmax', name='decoder_outputs')

input_embedding = L.Embedding(
    nr_input_tokens,
    FULL_EMBEDDING_DIM,
    mask_zero=True,
    weights=[input_embedding_matrix],
    name='input_embedding',
    trainable=EMBEDDING_TRAINABLE,
)
target_embedding = L.Embedding(
    nr_target_tokens,
    FULL_EMBEDDING_DIM,
    mask_zero=True,
    weights=[target_embedding_matrix],
    name='target_embedding',
    trainable=EMBEDDING_TRAINABLE,
)

encoder_inputs = L.Input(shape=(max_len_input, ), dtype='int32', name='encoder_inputs')
encoder_embeddings = input_embedding(encoder_inputs)
_, encoder_state_1, encoder_state_2 = encoder_gru(encoder_embeddings)
encoder_states = L.concatenate([encoder_state_1, encoder_state_2])

decoder_inputs = L.Input(shape=(max_len_target-1, ), dtype='int32', name='decoder_inputs')
decoder_mask = L.Masking(mask_value=0)(decoder_inputs)
decoder_embeddings_inputs = target_embedding(decoder_mask)
decoder_embeddings_outputs, _ = decoder_gru(decoder_embeddings_inputs, initial_state=encoder_states) 
decoder_outputs = decoder_dense(decoder_embeddings_outputs)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

inference_encoder_model = Model(encoder_inputs, encoder_states)
    
inference_decoder_state_inputs = L.Input(shape=(LATENT_DIM, ), dtype='float32', name='inference_decoder_state_inputs')
inference_decoder_embeddings_outputs, inference_decoder_states = decoder_gru(
    decoder_embeddings_inputs, initial_state=inference_decoder_state_inputs
)
inference_decoder_outputs = decoder_dense(inference_decoder_embeddings_outputs)

inference_decoder_model = Model(
    [decoder_inputs, inference_decoder_state_inputs], 
    [inference_decoder_outputs, inference_decoder_states]
)

In [19]:
model.summary()
inference_decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, 20)           0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, 21)           0                                            
__________________________________________________________________________________________________
input_embedding (Embedding)     (None, 20, 102)      186864      encoder_inputs[0][0]             
__________________________________________________________________________________________________
masking_1 (Masking)             (None, 21)           0           decoder_inputs[0][0]             
__________________________________________________________________________________________________
encoder_bi

In [20]:
model.compile(optimizer=keras.optimizers.Adam(clipnorm=1.), loss='categorical_crossentropy')

In [21]:
train_generator = create_batch_generator(train_ids)
val_generator = create_batch_generator(val_ids)
model.fit_generator(
    train_generator,
    steps_per_epoch=np.ceil(len(train_ids) / BATCH_SIZE),
    epochs=20,
    validation_data=val_generator,
    validation_steps=np.ceil(len(val_ids) / BATCH_SIZE),
)
#model.fit([X, x_t_input], y_t_output, validation_split=0.1, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f3e619a20f0>

In [22]:
def decode_sequence(input_seq):
    states_value = inference_encoder_model.predict(input_seq)
    
    # tokens = {idx: token for (token, idx) in tokenizer.word_index.items()}
    tokens = {idx: token for (token, idx) in target_wordvec_index.items()}
    start_token_idx = target_wordvec_index['<s>']
    end_token_idx = target_wordvec_index['</s>']
    
    target_seq = np.zeros((1, max_len_target-1))
    target_seq[0, 0] = start_token_idx
    
    decoded_sequence = [] 
    for i in range(max_len_target):
        output_tokens, output_states = inference_decoder_model.predict(
            [target_seq, states_value]
        )
        
        # greedy search
        sampled_token_idx = np.argmax(output_tokens[0, 0, :])
        if sampled_token_idx == end_token_idx:
            break
        sampled_token = tokens.get(sampled_token_idx, '~')
        decoded_sequence.append(sampled_token)
            
        target_seq[0, 0] = sampled_token_idx
        states_value = output_states
    
    return sp_target.DecodePieces(decoded_sequence)

In [23]:
def predict(sentence):
    # print(sentence),
    # print(preprocess(sentence))
    # print(subword_indices(preprocess(sentence)))
    return decode_sequence(keras.preprocessing.sequence.pad_sequences(
        [input_subword_indices(preprocess(sentence))],
        padding='post',
        maxlen=max_len_input,
    ))

In [24]:
# Performance on some examples:
EXAMPLES = [
    'Hello.',
    'You are welcome.',
    'How do you do?',
    'I hate mondays.',
    'I am a programmer.',
    'Data is the new oil.',
    'It could be worse.',
    "I am on top of it.",
    "N° Uno",
    "Awesome!",
    "Put your feet up!",
    "From the start till the end!",
    "From dusk till dawn.",
]
for en in [sentence + '\n' for sentence in EXAMPLES]:
    print(f"{en!r} --> {predict(en)!r}")

'Hello.\n' --> 'helfen.'
'You are welcome.\n' --> 'sie sind willkommen.'
'How do you do?\n' --> 'wie können sie mir das?'
'I hate mondays.\n' --> 'ich habe meine antwort.'
'I am a programmer.\n' --> 'ich bin ein einprogramm.'
'Data is the new oil.\n' --> 'das ist eine richtige ortlage.'
'It could be worse.\n' --> 'das wäre gut.'
'I am on top of it.\n' --> 'ich bin damit im gegenteil.'
'N° Uno\n' --> 'nigero'
'Awesome!\n' --> 'eindeugend!'
'Put your feet up!\n' --> 'schangen sie sie!'
'From the start till the end!\n' --> 'im gegenteil!'
'From dusk till dawn.\n' --> 'der kaudität bleiben.'


In [25]:
# Performance on training set:
for en, de in df[['input_texts', 'target_texts']][1:20].values.tolist():
    print(f"Original {en!r}, got {predict(en)!r}, exp: {de!r}")

Original 'relating to wednesday:', got 'zum mittwoch:', exp: 'zum mittwoch:'
Original 'that was the decision.', got 'das war die einzige.', exp: 'das war der beschluß.'
Original 'we have agreed to this.', got 'darauf haben wir uns recht.', exp: 'wir haben dem zugestimmt.'
Original 'it is not a lot to ask.', got 'es ist nicht einfach.', exp: 'das ist nicht zuviel verlangt.'
Original 'thank you very much.', got 'vielen dank.', exp: 'vielen dank.'
Original 'that did not happen.', got 'das ist nicht geschehen.', exp: 'dazu kam es nicht.'
Original 'the debate is closed.', got 'die aussprache ist geschlossen.', exp: 'die aussprache ist geschlossen.'
Original 'the debate is closed.', got 'die aussprache ist geschlossen.', exp: 'die aussprache ist geschlossen.'
Original 'the debate is closed.', got 'die aussprache ist geschlossen.', exp: 'die aussprache ist geschlossen.'
Original 'what is the result?', got 'was ist das ergebnis?', exp: 'was sind die folgen?'
Original 'the debate is closed.', g

In [26]:
# Performance on validation set
val_df = df.iloc[val_ids]
for en, de in val_df[['input_texts', 'target_texts']][1:20].values.tolist():
    print(f"Original {en!r}, got {predict(en)!r}, exp: {de!r}")

Original 'voting time', got 'abstimmungsstunde', exp: 'abstimmungsstunde'
Original 'thank you.', got 'vielen dank.', exp: 'danke.'
Original 'it lives!', got 'es ist vorbei!', exp: 'es lebt!'
Original 'thank you, mr caudron.', got 'vielen dank, herr carha.', exp: 'ich danke herrn caudron.'
Original 'i think they can.', got 'ich glaube nicht.', exp: 'ich meine, ja.'
Original 'marek siwiec: 0 votes', got 'frau jamil wurtzide: 0 stimmen', exp: 'marek siwiec: 0 stimmen'
Original '(applause)', got '(beifall)', exp: '(beifall)'
Original 'why do we have to do so?', got 'warum tun wir das?', exp: 'warum müssen wir dies tun?'
Original 'riddle me that.', got 'das sagte ich.', exp: 'das ist mir ein rätsel.'
Original '- before the vote:', got '- vor der abstimmung:', exp: '- vor der abstimmung:'
Original 'very few.', got 'sehr gut.', exp: 'über sehr wenige.'
Original 'zimbabwe', got 'simbabwe', exp: 'simbabwe'
Original 'applause', got 'beifall', exp: 'beifall'
Original 'welcome', got 'begrüßung', e

In [None]:
import spacy
try:
    from spacy.lang.de import German
except ModuleNotFoundError:
    spacy.cli.download('de')
    from spacy.lang.de import German
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

parser = German()
chencherry = SmoothingFunction()  # to handle short sequences, see also http://www.nltk.org/_modules/nltk/translate/bleu_score.html#SmoothingFunction.method3

def remove_spaces_and_puncts(tokens):
     return [token.orth_ for token in tokens if not (token.is_space or token.is_punct)]  

bleu_scores = np.zeros(TEST_SIZE)
nist_scores = np.zeros(TEST_SIZE)

for i in tqdm(range(TEST_SIZE)):
    pred_tokens = remove_spaces_and_puncts(parser(predict(df.iloc[i].input_texts)))
    ref_tokens = remove_spaces_and_puncts(parser(df.iloc[i].target_texts))
    bleu_scores[i] = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=chencherry.method3)
    
print("Average bleu score:", bleu_scores.mean())

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

In [None]:
## Conclusion

# It doesn't work perfect, but fine enough to show that seq2seq works in some way. I wouldn't be surprised if the mean average error is better than average human bias for calculating without any tools.
# For improvements and further discussions I'll move to a real problem (translating) and main steps will be:
# * Bytepairencoding/Word embeddings
# * Beam Search
# * Attention models