# Char level transformer for Machine translation
To do translation, we'll use a sequence-to-sequence model: an encoder encodes the input sentence, and a decoder uses both the encoder's output and the raw output (target) sequence to predict the translation.

This is similar to the char-level transformer for language modeling, but we add an encoder, and when generating text we use the top-1 beam-search output in place of the top-k output.

<img src='images/transformer.png' width=400>

<a href="https://www.kaggle.com/hungnm/englishvietnamese-translation">English-Vietnamese dataset HungNM (Kaggle)</a>

<a href="https://github.com/vietai/SAT">English-Vietnamese dataset VietAI SAT</a>

<a href="https://nlp.stanford.edu/projects/nmt/">English-Vietnamese dataset IWSLT 15</a>

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

import logging
# Configure the root logger once for the whole notebook: INFO and above, with a
# "date time - LEVEL - logger.name -   message" prefix on every record.
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

# IPython magics (not plain Python): autoreload mode 2 re-imports edited modules
# before each cell executes; `%matplotlib inline` renders figures in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
class CharDataset(Dataset):
    '''
    Character-level parallel corpus for sequence-to-sequence training.

    Item ``idx`` is a pair of ``torch.long`` index tensors ``(x, y)``:

    * ``x`` -- characters of ``self.x[idx]`` followed by <eos>, then padded with <pad>
      (or truncated, keeping a final <eos>) to exactly ``sequence_len`` entries.
    * ``y`` -- <sos>, followed by ``self.y[idx]`` processed the same way as ``x``;
      ``y`` therefore has ``sequence_len + 1`` entries.

    The vocabulary is <pad>, <sos>, <eos> (indices 0, 1, 2) followed by every distinct
    character found in ``x`` and ``y``, in sorted order.
    '''

    def __init__(self, x, y, sequence_len, encoder=None):
        '''
        Args:
            x: sequence of source sentences (strings).
            y: sequence of target sentences (strings), aligned with ``x``.
            sequence_len: fixed number of entries every padded sentence is brought to.
            encoder: stored on the instance as ``self.encoder``; not used by this class.
        '''
        # Scan the corpus sentence by sentence. This yields the same character set and
        # character count as joining everything into one string, without allocating a
        # corpus-sized string plus a concatenated copy of both sentence lists.
        charset = set()
        data_size = 0
        for corpus in (x, y):
            for sentence in corpus:
                charset.update(sentence)
                data_size += len(sentence)

        # Special tokens come first so that <pad> is index 0.
        chars = ['<pad>', '<sos>', '<eos>'] + sorted(charset)
        vocab_size = len(chars)

        print('data has %d characters, %d unique chars, %d sentences.' % (data_size, len(chars), len(x)))
        # The value printed here is simply the sequence_len argument.
        print('sentence length nine_nine_percentile: %d' % (sequence_len))

        self.x, self.y = x, y
        self.ch2i = {ch: i for i, ch in enumerate(chars)}
        self.i2ch = {i: ch for i, ch in enumerate(chars)}
        self.vocab_size = vocab_size
        self.sequence_len = sequence_len
        self.encoder = encoder

    def __len__(self):
        # Number of sentence pairs; x and y are expected to have the same length.
        return len(self.x)

    def __getitem__(self, idx):
        '''
        Return the ``(source, target)`` index tensors for sentence pair ``idx``.

        Raises:
            KeyError: if a sentence contains a character missing from the vocabulary.
        '''
        sos, eos = self.ch2i['<sos>'], self.ch2i['<eos>']

        source = self.padding([self.ch2i[ch] for ch in self.x[idx]] + [eos])
        # <sos> is prepended after padding, so the target is one entry longer.
        target = [sos] + self.padding([self.ch2i[ch] for ch in self.y[idx]] + [eos])

        x = torch.tensor(source, dtype=torch.long)
        y = torch.tensor(target, dtype=torch.long)

        return x, y

    def padding(self, string):
        '''
        Bring a list of token indices to exactly ``self.sequence_len`` entries.

        Shorter lists are right-padded with the <pad> index; lists that are already
        ``sequence_len`` long or longer are cut to ``sequence_len - 1`` entries and
        closed with <eos>. The input list is not modified.
        '''
        missing = self.sequence_len - len(string)
        if missing > 0:
            return string + [self.ch2i['<pad>']] * missing
        return string[:self.sequence_len - 1] + [self.ch2i['<eos>']]

In [3]:
# Lower length bound, handed to pre_processing as `min_length`.
min_len = 5
# Upper length bound: handed to pre_processing as `max_length` and to CharDataset
# as the fixed padded sentence length.
sequence_len = 128

In [None]:
# # process and save data
# from utils.pre_processing import *
# from utils.utils import *
# en, vi = list(), list()
# paths = ["data/vietaisat/","data/hungnm/","data/iwslt15/"]
# for path in paths:
#     x = open(path+"en.txt", encoding='utf-8').read().split("\n")
#     y = open(path+"vi.txt", encoding='utf-8').read().split("\n")
#     x,y = pre_processing(x, y, min_length=min_len, max_length=sequence_len) # remove sentence less than 4 characters
#     en += x
#     vi += y
    
# path = "data/cleaned/"
# pickle(path+"en", en)
# pickle(path+"vi", vi)
# # nine_nine_percentile = int(np.percentile([len(sen) for sen in vi],99))

In [20]:
# Load saved data
from utils.utils import *
from utils.pre_processing import *

# Read the cached English / Vietnamese sentence lists back from disk.
path = "data/cleaned/"
en, vi = pickle(path + "en"), pickle(path + "vi")

# Run the pairs through pre_processing with the notebook's length bounds.
en, vi = pre_processing(en, vi, min_length=min_len, max_length=sequence_len)

# Preview the last four pairs (final pair first) as "english | vietnamese".
for i in range(-1, -5, -1):
    print(en[i], vi[i], sep=' | ')

Some last sentences
 | 
Didier Sornette How we can predict the next financial crisis | Paul Pholeros Làm sao để bớt nghèo khổ ? Hãy sửa nhà
Thank you very much for your time . | Rất cảm ơn đã lắng nghe .
It s manmade and can be overcome and eradicated by the actions of human beings .  | Nó là do con người và có thể ngăn chặn và diệt trừ bởi hành động của con người . 


In [21]:

# Vietnamese is the source side (x) and English the target side (y).
dataset = CharDataset(x=vi, y=en, sequence_len=sequence_len)

# Peek at the first (source, target) tensor pair and the character -> index table.
print('sample tensors ', next(iter(dataset)))
print("vocab: ", dataset.ch2i)

data has 292892120 characters, 206 unique chars, 2448155 sentences.
sentence length nine_nine_percentile: 128
sample tensors  (tensor([ 69,  54,  59,   3,  67,  66,  54,   3,  57,  97,  59,  52,   3, 107,
        139,  65,   3,  59,  52, 115, 177,  54,   3,  62,  66,  93,  65,   3,
         63,  89,  48,   3,  65,  63,  60,  59,  52,   3,  65, 187,   3,  48,
         53, 169,  54,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]), tensor([ 1, 34, 57, 50, 46, 64, 50,  3, 61, 66, 65,  3, 65, 53, 50,  3, 49, 66,
        64, 65, 61, 46, 59,  3, 54, 59,  3, 65, 53, 50,  3, 47, 63, 60,

# Building model

In [22]:
from model.encode_decode_transformer import Transformer, TransformerConfig
from utils.trainer import Trainer, TrainerConfig
# Training configuration. The device falls back to CPU when no CUDA device is
# available, so the notebook still runs on machines without a GPU.
tconfig = TrainerConfig(
    max_epochs=1,
    batch_size=16,
    learning_rate=6e-4,
    grad_norm_clip=1.0,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    lr_decay=True,
    warmup_tokens=5000,
    ckpt_n_print_iter=4000,
    ckpt_path='checkpoint/transformer_vn_en_char',
)

# Model configuration: vocabulary size and sequence length come from the dataset,
# and the model is placed on the same device as the trainer.
mconfig = TransformerConfig(
    vocab_size=dataset.vocab_size,
    sequence_len=dataset.sequence_len,
    embed_dim=256,
    n_block=8,
    n_head=8,
    device=tconfig.device,
)

In [23]:
# Instantiate the encoder-decoder Transformer from mconfig; the constructor logs
# the parameter count (see the INFO line in the output below).
model = Transformer(mconfig)

10/04/2021 11:58:26 - INFO - model.encode_decode_transformer -   number of parameters: 1.487002e+07


In [24]:
# Vietnamese probe sentences; they are handed to the Trainer as `test_dataset`.
sentences = [
    "hôm nay trông bạn thật đẹp!",
    "đưa tôi một chai nước, tôi khát khô cổ rồi.",
    "thời tiết hôm nay thật đẹp!",
    "bạn đã ăn sáng chưa?",
    "chúng ta sẽ khởi hành vào rạng sáng mai.",
]
# Bundle model, training dataset and trainer config. The raw Vietnamese strings in
# `sentences` are passed as test_dataset, and collate=None is passed explicitly
# (how the Trainer uses either is defined in utils.trainer -- TODO confirm).
trainer = Trainer(model, dataset, tconfig, test_dataset=sentences, collate=None)

In [25]:
# load pre-trained weights
# from utils.utils import pickle
# model.load_state_dict(pickle(tconfig.ckpt_path)) # load

In [None]:
# Run training with the settings in tconfig (max_epochs=1). Presumably long-running
# on the full corpus and writes checkpoints to tconfig.ckpt_path -- verify in utils.trainer.
trainer.train()

In [None]:
# Hand-written Vietnamese inputs for a qualitative check of the translations.
samples = [
    "Hôm nay cái áo bạn mặc trông thật đẹp, nó bao tiền vậy?",
    "Nhà tôi có 1 con zombie trà sữa",
    "nó đáng yêu nhưng rất sợ sấm và bóng tối",
    "Hôm nay nhìn Amy không khác gì tranh vẽ",
    "dù Amy chưa ăn sáng",
    "Chúng ta sẽ khởi hành vào rạng sáng mai, hãy chuẩn bị kỹ.",
]
# Generate output for the hand-written samples with the model. `dataset` is passed
# along as well (presumably for its character <-> index tables -- verify in
# model.encode_decode_transformer).
result = model.generate_output(samples, dataset, top_k=5, print_process=True)
print(result)

In [None]:
# Benchmark on the IWSLT'15 tst2013 files: generate output for the Vietnamese side
# so it can be compared with the English side.
path = "data/iwslt15/"

# Context managers close each file as soon as it has been read; the previous bare
# open(...).read() left both handles open until garbage collection.
with open(path+"tst2013.en.txt", encoding='utf-8') as en_file:
    en = en_file.read().split("\n")
with open(path+"tst2013.vi.txt", encoding='utf-8') as vi_file:
    vi = vi_file.read().split("\n")

# NOTE: from here on `en` / `vi` hold the test split, not the training corpus.
# The same length bounds as for the training data are applied.
en,vi = pre_processing(en, vi, min_length=min_len, max_length=sequence_len)

result = model.generate_output(vi, dataset, top_k=5, print_process=True)


In [None]:
from utils.utils import *
# Score the generated output in `result` against the English sentences in `en`.
# bleu_score returns three values; the last two are presumably the reference and
# candidate sequences it actually scored -- verify in utils.utils.
score, references, candidates = bleu_score(en, result)
print(score)

In [None]:
# One line per test sentence: source  |  reference  |  model output.
for i in range(len(vi)):
    print(vi[i], en[i], result[i], sep="  |  ")