# WordPiece encoding for machine translation

In [2]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

import logging
# Root logger: INFO and above, each record rendered as
# "MM/DD/YYYY HH:MM:SS - LEVEL - logger-name -   message".
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# Fixed number of token ids per encoded sentence; handed to CharDataset below,
# which pads or truncates every sentence to this length.
sequence_len = 128
# NOTE(review): min_len is not referenced anywhere else in the visible notebook —
# confirm whether it is still needed.
min_len = 10

In [3]:
from utils.utils import *

# Load the cleaned parallel corpus. `pickle` here is the helper star-imported
# from utils.utils, not the standard-library module.
path = "data/cleaned/"
en, vi = (pickle(path + lang) for lang in ("en", "vi"))

# Eyeball the last four aligned sentence pairs, newest first.
for offset in range(1, 5):
    print(en[-offset], '|', vi[-offset])

 | 
Didier Sornette How we can predict the next financial crisis | Paul Pholeros L√†m sao ƒë·ªÉ b·ªõt ngh√®o kh·ªï ? H√£y s·ª≠a nh√†
Thank you very much for your time . | R·∫•t c·∫£m ∆°n ƒë√£ l·∫Øng nghe .
It s manmade and can be overcome and eradicated by the actions of human beings .  | N√≥ l√† do con ng∆∞·ªùi v√† c√≥ th·ªÉ ngƒÉn ch·∫∑n v√† di·ªát tr·ª´ b·ªüi h√†nh ƒë·ªông c·ªßa con ng∆∞·ªùi . 


In [4]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

# WordPiece model with whitespace pre-tokenization; unknown pieces map to "<unk>".
tokenizer = Tokenizer(WordPiece(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

# The special tokens are listed first so they take the lowest ids in this order
# (the outputs further down show <pad>=0, <sos>=1, <eos>=2, <unk>=3).
# NOTE(review): the name `trainer` is rebound to the model Trainer later in the
# notebook, so the tokenizer-training cell must run before that rebinding.
trainer = WordPieceTrainer(
    vocab_size=1000,
    special_tokens=["<pad>", "<sos>", "<eos>", "<unk>"],
    show_progress=True,
)

In [7]:
# Learn one shared WordPiece vocabulary from both sides of the corpus.
files = ["data/cleaned/" + lang for lang in ("en", "vi")]
tokenizer.train(files, trainer)

In [8]:
from tokenizers import decoders
output = tokenizer.encode("Hello, y'all! How are you üòÅ ?")
# Show the word pieces and their vocabulary ids for the sample sentence.
print(output.tokens, output.ids)
# Attach the WordPiece decoder so that decode() joins "##" continuation pieces
# back into whole words.
tokenizer.decoder = decoders.WordPiece()
# NOTE: `output` is rebound here from the encoding object to the decoded string.
output = tokenizer.decode(output.ids)
print(output)

['He', '##ll', '##o', ',', 'y', "'", 'all', '!', 'H', '##ow', 'are', 'you', '<unk>', '?'] [723, 459, 213, 6, 70, 5, 739, 4, 26, 467, 606, 457, 3, 18]
Hello, y'all! How are you?


In [35]:
# Write the trained tokenizer to a JSON file under data/.
tokenizer.save("data/tokenizer-cleaned.json")

In [9]:
class CharDataset(Dataset):
    '''
    Map-style dataset over a parallel corpus that yields (source, target) id tensors.

    Every sentence is tokenized with ``encoder`` and laid out as follows:

    * source: ``tokens + <eos>``, padded with ``<pad>`` or truncated to exactly
      ``sequence_len`` ids;
    * target: ``<sos>`` followed by ``tokens + <eos>`` padded/truncated to
      ``sequence_len`` ids, i.e. the target tensor holds ``sequence_len + 1`` ids.

    Args:
        x: indexable collection of source sentences (strings).
        y: indexable collection of target sentences, aligned with ``x``.
        sequence_len: number of ids each sentence is padded/truncated to.
        encoder: trained tokenizer exposing ``get_vocab``, ``get_vocab_size`` and
            ``encode(text).ids``. Required, even though the signature defaults to
            ``None`` for backward compatibility.

    Raises:
        ValueError: if ``encoder`` is ``None``.
    '''

    def __init__(self, x, y, sequence_len, encoder=None):
        if encoder is None:
            # Fail early with a clear message instead of an AttributeError below.
            raise ValueError("CharDataset requires a trained tokenizer; got encoder=None")

        print('%d sentences.' % (len(x)))

        self.x, self.y = x, y
        # token -> id, ordered by id so printing the vocabulary is readable.
        vocab = encoder.get_vocab(with_added_tokens=True)
        self.ch2i = dict(sorted(vocab.items(), key=lambda item: item[1]))
        # id -> token, the inverse mapping.
        self.i2ch = {i: token for token, i in self.ch2i.items()}
        self.vocab_size = encoder.get_vocab_size(with_added_tokens=True)
        self.sequence_len = sequence_len
        self.encoder = encoder

    def __len__(self):
        # x and y are expected to be aligned, so the length of x is used for both.
        return len(self.x)

    def __getitem__(self, idx):
        '''Return ``(source_ids, target_ids)`` as ``torch.long`` tensors for pair ``idx``.'''
        sos, eos = self.ch2i['<sos>'], self.ch2i['<eos>']

        indx = self.padding(self.encoder.encode(self.x[idx]).ids + [eos])
        # <sos> is prepended after padding, so the target is one id longer than the source.
        indy = [sos] + self.padding(self.encoder.encode(self.y[idx]).ids + [eos])

        x = torch.tensor(indx, dtype=torch.long)
        y = torch.tensor(indy, dtype=torch.long)
        return x, y

    def padding(self, string):
        '''
        Pad or truncate a list of token ids to exactly ``sequence_len`` entries.

        Shorter lists are right-padded with the ``<pad>`` id taken from the
        vocabulary (falling back to 0 if the vocabulary has no ``<pad>`` entry).
        Lists of ``sequence_len`` ids or more are cut to ``sequence_len - 1`` ids
        and closed with ``<eos>`` so every sequence still ends properly.
        '''
        pad_id = self.ch2i.get('<pad>', 0)
        missing = self.sequence_len - len(string)
        if missing > 0:
            return string + [pad_id] * missing
        return string[:self.sequence_len - 1] + [self.ch2i['<eos>']]

In [11]:
# nine_nine_percentile = int(np.percentile([len(sen) for sen in data],99))
# Vietnamese is the source side (x) and English the target side (y).
dataset = CharDataset(x=vi, y=en, sequence_len=sequence_len, encoder=tokenizer)

# Sanity check: first encoded pair, then the id-ordered vocabulary.
print('sample tensors ', dataset[0])
print("vocab: ", dataset.ch2i)

2448155 sentences.
sample tensors  (tensor([ 69, 432,  67, 214, 221,  57, 827, 107, 861, 523, 452, 287, 208,  63,
        463, 521,  65, 330, 410, 749,   2,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]), tensor([  1,  34, 443, 442, 209,  61, 481, 411,  49, 625, 219, 415, 438, 411,
         47, 495, 437,  48, 934, 496, 208,   2,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        

# Building model

In [12]:
from model.encode_decode_transformer import Transformer, TransformerConfig
from utils.trainer import Trainer, TrainerConfig
# Training hyper-parameters. The device falls back to the CPU when no CUDA device
# is available, so the notebook can still be run on a machine without a GPU.
# NOTE(review): assumes TrainerConfig/TransformerConfig accept 'cpu' as a device — confirm.
tconfig = TrainerConfig(max_epochs=1, batch_size=16, learning_rate=6e-4, grad_norm_clip=1.0,
                        device='cuda' if torch.cuda.is_available() else 'cpu',
                        lr_decay=True, warmup_tokens=5000, ckpt_n_print_iter=4000,
                        ckpt_path='checkpoint/transformer_vn_en_wordpiece')

# Model size is tied to the dataset (vocabulary size and sequence length); the
# model is placed on the same device as the trainer.
mconfig = TransformerConfig(vocab_size=dataset.vocab_size, sequence_len=dataset.sequence_len, embed_dim=256,
                            n_block=8, n_head=8, device=tconfig.device)

In [13]:
# Build the Transformer from mconfig; the model module logs its parameter count
# on construction (see the INFO line in the output).
model = Transformer(mconfig)

10/03/2021 19:09:59 - INFO - model.encode_decode_transformer -   number of parameters: 1.527654e+07


In [14]:
sentences = ["h√¥m nay tr√¥ng b·∫°n th·∫≠t ƒë·∫πp!",
           "ƒë∆∞a t√¥i m·ªôt chai n∆∞·ªõc, t√¥i kh√°t kh√¥ c·ªï r·ªìi.",
           "th·ªùi ti·∫øt h√¥m nay th·∫≠t ƒë·∫πp!",
           "b·∫°n ƒë√£ ƒÉn s√°ng ch∆∞a?",
           "ch√∫ng ta s·∫Ω kh·ªüi h√†nh v√†o r·∫°ng s√°ng mai."
            ]
# Wire the model, dataset and training config together; `sentences` is passed as
# test_dataset (presumably the sentences translated in the periodic progress
# output — TODO confirm against utils.trainer).
# NOTE(review): this rebinds `trainer`, which earlier held the WordPieceTrainer
# passed to tokenizer.train(...); re-running the tokenizer-training cell after
# this one would hand it the wrong object.
trainer = Trainer(model, dataset, tconfig, test_dataset=sentences, collate=None)

In [1]:
# # load pre-trained weights
# from utils.utils import pickle
# model.load_state_dict(pickle(tconfig.ckpt_path)) # load

In [None]:
# Run training with the settings in tconfig (max_epochs=1, batch_size=16).
trainer.train()

epoch: 1 | train loss: 900.69436  | lr: 1.920000e-06:   0%|                                 | 0/153009 [00:00<?, ?it/s]

[]


epoch: 1 | train loss: 3.22835  | lr: 5.991366e-04:   3%|‚ñå                    | 4000/153009 [15:56<10:21:23,  4.00it/s]

["She's going to me!", "There's going to help me....................................................................................................................", "That's a lot of your friends..................................................................................................................", 'Have you?', "We've got a lot of the books................................................................................................................."]


epoch: 1 | train loss: 2.84345  | lr: 5.962545e-04:   5%|‚ñà‚ñè                    | 8000/153009 [31:15<8:47:03,  4.59it/s]

["You're going to feel you, you're going to feel.", "When I'm a few moment, I'm going to feel.", "What's a few morning, you're going to go.", "What's you going to do?", "We'll be a few morning."]


epoch: 1 | train loss: 2.65123  | lr: 5.913671e-04:   8%|‚ñà‚ñã                   | 12000/153009 [46:31<9:06:39,  4.30it/s]

["Don't find you!", "My mother, I'm a little book.", "It's always because you're gone.", 'Are you going to go?', "We'll see the first time."]


epoch: 1 | train loss: 2.53527  | lr: 5.864802e-04:  10%|‚ñà‚ñà                   | 14960/153009 [58:06<9:03:08,  4.24it/s]

In [14]:
samples = ["H√¥m nay c√°i √°o b·∫°n m·∫∑c tr√¥ng th·∫≠t ƒë·∫πp, n√≥ bao ti·ªÅn v·∫≠y?",
           "Nh√† t√¥i c√≥ 1 con zombie tr√† s·ªØa",
           "n√≥ ƒë√°ng y√™u nh∆∞ng r·∫•t s·ª£ s·∫•m v√† b√≥ng t·ªëi",
           "H√¥m nay nh√¨n Amy kh√¥ng kh√°c g√¨ tranh v·∫Ω",
           "d√π Amy ch∆∞a ƒÉn s√°ng",
           "Ch√∫ng ta s·∫Ω kh·ªüi h√†nh v√†o r·∫°ng s√°ng mai, h√£y chu·∫©n b·ªã k·ªπ."
          ]
# Translate the ad-hoc samples with the trained model. `dataset` is passed along
# (presumably for its tokenizer/vocabulary) and top_k=5 presumably restricts
# decoding to the 5 most likely tokens per step — TODO confirm in the model code.
result = model.generate_output(samples, dataset, top_k=5, print_process=True)
print(result)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:07<00:00,  1.28s/it]

['What time do you look interested, how much?', 'My house has one of these zombies .', 'It seems to be very frightened and', "Today 's looking like Amy 's nothing 's going to bring up with you .", "Amy hasn't eaten breakfast", "We're going to sit in tomorrow morning, please."]





In [None]:
# Benchmark on the IWSLT'15 tst2013 set (scored with BLEU in the next cell).
# NOTE: `path`, `en` and `vi` are rebound here and now refer to the test set,
# not the training corpus loaded at the top of the notebook.
path = "data/iwslt15/"

# Context managers make sure both files are closed once they have been read.
with open(path + "tst2013.en.txt", encoding='utf-8') as en_file:
    en = en_file.read().split("\n")
with open(path + "tst2013.vi.txt", encoding='utf-8') as vi_file:
    vi = vi_file.read().split("\n")

# Filter the pairs with the length bounds below.
# NOTE(review): the previous comment said "less than 4 characters" while
# min_length=5 is passed; the exact rule and unit live in pre_processing — confirm.
en, vi = pre_processing(en, vi, min_length=5, max_length=70)

# Translate every Vietnamese test sentence.
result = model.generate_output(vi, dataset, top_k=5, print_process=True)


In [16]:
from utils.utils import *
# Score the model translations (`result`) against the English references (`en`).
# bleu_score also returns the references and candidates it used.
score, references, candidates = bleu_score(en, result)
print(score)

15.91292962059849


In [20]:
# Side-by-side dump: source | reference | model translation.
for i, source in enumerate(vi):
    print(source, " | ", en[i], " | ", result[i])

T√¥i ƒë√£ r·∫•t t·ª± h√†o v·ªÅ ƒë·∫•t n∆∞·ªõc t√¥i .  |  And I was very proud .  |  I've been very proud of my country.
Gia ƒë√¨nh c·ªßa t√¥i kh√¥ng ngh√®o , v√† b·∫£n th√¢n t√¥i th√¨ ch∆∞a t·ª´ng ph·∫£i ch·ªãu ƒë√≥i .  |  My family was not poor , and myself , I had never experienced hunger .  |  My family doesn't leave, and myself hasn't ever been there.
Nh∆∞ng v√†o m·ªôt ng√†y c·ªßa nƒÉm 1995 , m·∫π t√¥i mang v·ªÅ nh√† m·ªôt l√° th∆∞ t·ª´ m·ªôt ng∆∞·ªùi ch·ªã em c√πng ch·ªó l√†m v·ªõi m·∫π .  |  But one day , in 1995 , my mom brought home a letter from a coworker s sister .  |  But on the day of 1995, my mother brought home from a sister from a sister with his mother.
T·∫•t c·∫£ c√πng n·∫±m tr√™n s√†n , v√† c∆° th·ªÉ ch√∫ng t√¥i y·∫øu ƒë·∫øn c√≥ th·ªÉ c·∫£m th·∫•y nh∆∞ c√°i ch·∫øt ƒëang ƒë·∫øn r·∫•t g·∫ßn .  |  We are lying on the floor together , and our bodies are so weak we are ready to die .   |  All right over the floor, and our bodies can feel like the death is very close.
T√¥i

T√¥i th·∫•y c·∫£nh gi·ªëng nh∆∞ 1 gia ƒë√¨nh ƒëang ƒë√°nh c√° tr√™n thuy·ªÅn , 2 anh trai , v√†i ƒë·ª©a nh·ªè h∆°n , c√≥ v·∫ª h·ª£p l√≠ nh·ªâ ?  |  I saw what seemed to be a family fishing on a boat , two older brothers , some younger kids , makes sense right ?  |  I feel like a family's fishing on the boat, two boys, and some smaller than smaller, doesn't it matter?
Sai . H·ªç ƒë·ªÅu b·ªã n√¥ l·ªá .  |  Wrong . They were all enslaved .  |  Sai. They're both slaved.
ƒê·ª©a tr·∫ª n√†y 8 tu·ªïi .  |  This young child is eight years old .  |  This child is eight years old.
Em run r·∫©y khi thuy·ªÅn c·ªßa ch√∫ng t√¥i ƒë·∫øn g·∫ßn , c·ª© s·ª£ thuy·ªÅn ƒë√® qua c√°i canoe b√© x√≠u c·ªßa em .  |  He was trembling when our boat approached , frightened it would run over his tiny canoe .  |  You're running when our boat came near, kidnapping through your boyfriend's little girl.
C·∫≠u b√© khi·∫øp ƒë·∫£m v√¨ s·ª£ r∆°i xu·ªëng n∆∞·ªõc .  |  He was petrified he would be knocked in the water .  |  T