# **1. Load Data**

In [1]:
import torch
MODELNAME = "iwslt15-en-vi-rnn.model"
EPOCH = 10
BATCHSIZE = 128
LR = 0.0001
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
import re
import html
import contractions
import requests
from torchtext.data.utils import get_tokenizer
from pyvi.ViTokenizer import tokenize

class DataLoader:
    def __init__(self, url_en, url_vi):
        # function to preprocessing
        self.__tokenizer_en = get_tokenizer('spacy', language='en_core_web_sm')
        self.__tokenizer_vi = lambda text: list(
            map(lambda word: re.sub('_', ' ', word), tokenize(text).split()))

        self.__check_dict = { # bổ xung
            ' \'s': '\'s',
            '& lt ;': '<',
            '& gt ;': '>',
            "<[^<]+>":'',
            ' +': ' ',
        }

        #last run
        self.__en_data = self.__load_data(url_en, 'en')
        self.__vi_data = self.__load_data(url_vi, 'vi')

    def __text_preprocessing(self, text: str, language: str = 'en'):
        text = html.unescape(text)
        for pattern, repl in self.__check_dict.items():
            text = re.sub(pattern, repl, text)

        if language == 'en':
            text = contractions.fix(text)
            return self.__tokenizer_en(text)

        return self.__tokenizer_vi(text)

    def __load_data(self, url, language: str):
        return [self.__text_preprocessing(line, language) for line in requests.get(url).text.splitlines()]

    @property
    def vi(self):
        return self.__vi_data

    @property
    def en(self):
        return self.__en_data


In [3]:
url = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/"

train = DataLoader(url +'train.en',url +'train.vi')
test = DataLoader(url + 'tst2013.en',url + 'tst2013.vi')


In [4]:
for i in range(10,20):
    print(i,train.en[i])
    print(i,train.vi[i])

print(len(train.en))

10 ['Over', '15,000', 'scientists', 'go', 'to', 'San', 'Francisco', 'every', 'year', 'for', 'that', '.']
10 ['Mỗi', 'năm', ',', 'hơn', '15,000', 'nhà', 'khoa học', 'đến', 'San Francisco', 'để', 'tham dự', 'hội nghị', 'này', '.']
11 ['And', 'every', 'one', 'of', 'those', 'scientists', 'is', 'in', 'a', 'research', 'group', ',', 'and', 'every', 'research', 'group', 'studies', 'a', 'wide', 'variety', 'of', 'topics', '.']
11 ['Mỗi một', 'khoa học', 'gia', 'đều', 'thuộc', 'một', 'nhóm', 'nghiên cứu', ',', 'và', 'mỗi', 'nhóm', 'đều', 'nghiên cứu', 'rất', 'nhiều', 'đề tài', 'đa dạng', '.']
12 ['For', 'us', 'at', 'Cambridge', ',', 'it', 'is', 'as', 'varied', 'as', 'the', 'El', 'Niño', 'oscillation', ',', 'which', 'affects', 'weather', 'and', 'climate', ',', 'to', 'the', 'assimilation', 'of', 'satellite', 'data', ',', 'to', 'emissions', 'from', 'crops', 'that', 'produce', 'biofuels', ',', 'which', 'is', 'what', 'I', 'happen', 'to', 'study', '.']
12 ['Với', 'chúng tôi', ',', 'tại', 'Cambridge', '

# **2. Vocab**

In [5]:
from typing import Iterator, List, Optional
from torchtext.vocab import build_vocab_from_iterator

class Language:
    def __init__(self, train_iter: Iterator, min_freq:int = 1,specials: Optional[List[str]] = None, default_idx:int = 0):
        self.__make_vocab(train_iter,min_freq,specials,default_idx)

    def __yield_tokens(self, data):
        for line in data:
            yield line  

    def __make_vocab(self, train_iter: Iterator, min_freq:int = 1,specials: Optional[List[str]] = None, default_idx:int = 0):
        self.__vocab = build_vocab_from_iterator(self.__yield_tokens(train_iter), min_freq, specials)
        self.__vocab.set_default_index(default_idx)

    @property
    def name(self):
        return self.__name

    @property
    def vocab(self):
        return self.__vocab

In [6]:
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3
specials = ["<unk>", "<pad>", "<sos>", "<eos>"]

Vi = Language(train.vi,3,specials,UNK_IDX)
En = Language(train.en,3,specials,UNK_IDX)


In [7]:
print("vocab size vi:", len(Vi.vocab.get_itos()))
print("vocab size en:", len(En.vocab.get_itos()))
for word in Vi.vocab.get_itos()[:20]:
    print(word,Vi.vocab[word])

vocab size vi: 17843
vocab size en: 23972
<unk> 0
<pad> 1
<sos> 2
<eos> 3
, 4
. 5
là 6
và 7
một 8
tôi 9
những 10
của 11
có 12
đó 13
không 14
bạn 15
trong 16
này 17
đã 18
người 19


# **3. Data Preprocessing**

In [8]:
from torchtext.vocab.vocab import Vocab
def data_preprocessing(data:List[List[str]],vocab:Vocab):
    rr = []
    idx2word = vocab.get_itos()
    for line in data:
        tkl = ['<sos>']
        for word in line:
            tkl.append(idx2word[vocab[word]])
        tkl.append('<eos>')
        rr.append(tkl)
    return rr

In [9]:
train_en_prep = data_preprocessing(train.en, En.vocab)
train_vi_prep = data_preprocessing(train.vi, Vi.vocab)
test_en_prep = data_preprocessing(test.en, En.vocab)

In [10]:
for i in range(5,10):
    print(train_en_prep[i])
    print(train_vi_prep[i])
    print(test_en_prep[i])

['<sos>', 'Recently', 'the', 'headlines', 'looked', 'like', 'this', 'when', 'the', '<unk>', 'Panel', 'on', 'Climate', 'Change', ',', 'or', 'IPCC', ',', 'put', 'out', 'their', 'report', 'on', 'the', 'state', 'of', 'understanding', 'of', 'the', 'atmospheric', 'system', '.', '<eos>']
['<sos>', 'Các', 'tiêu đề', 'gần', 'đây', 'trông', 'như', 'thế', 'này', 'khi', 'Ban', '<unk>', 'Biến đổi', 'khí hậu', '<unk>', 'chính phủ', ',', 'gọi', 'tắt', 'là', 'IPCC', 'đưa', 'ra', 'bài', 'nghiên cứu', 'của', 'họ', 'về', 'hệ thống', 'khí quyển', '.', '<eos>']
['<sos>', 'My', 'family', 'was', 'not', 'poor', ',', 'and', 'myself', ',', 'I', 'had', 'never', 'experienced', 'hunger', '.', '<eos>']
['<sos>', 'That', 'report', 'was', 'written', 'by', '620', 'scientists', 'from', '40', 'countries', '.', '<eos>']
['<sos>', 'Nghiên cứu', 'được', 'viết', 'bởi', '620', 'nhà', 'khoa học', 'từ', '40', 'quốc gia', 'khác', 'nhau', '.', '<eos>']
['<sos>', 'But', 'one', 'day', ',', 'in', '1995', ',', 'my', 'mom', 'brought'

In [11]:
train_data = list(zip(train_en_prep, train_vi_prep))
train_data.sort(key = lambda x: (len(x[0]), len(x[1])))
test_data = list(zip(test_en_prep, test.en, test.vi))

for i in range(5):
  print(train_data[i])

for i in range(5):   
  print(test_data[i])

(['<sos>', '<eos>'], ['<sos>', '<eos>'])
(['<sos>', '<eos>'], ['<sos>', '<eos>'])
(['<sos>', '<eos>'], ['<sos>', '<eos>'])
(['<sos>', '<eos>'], ['<sos>', '<eos>'])
(['<sos>', '<eos>'], ['<sos>', '<eos>'])
(['<sos>', 'When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '"', 'Nothing', 'To', '<unk>', '.', '"', '<eos>'], ['When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '"', 'Nothing', 'To', 'Envy', '.', '"'], ['Khi', 'tôi', 'còn', 'nhỏ', ',', 'Tôi', 'nghĩ', 'rằng', 'BắcTriều', 'Tiên', 'là', 'đất nước', 'tốt', 'nhất', 'trên', 'thế giới', 'và', 'tôi', 'thường', 'hát', 'bài', '"', 'Chúng ta', 'chẳng', 'có', 'gì', 'phải', 'ghen tị', '.', '"'])
(['<sos>', 'And', 'I', 'was', 'very', 'proud', '.', '<eos>'], ['And', 'I', 'was', 'very', 'proud', '

In [12]:
from typing import Tuple

def make_batch(data:List[Tuple], batchsize:int):
    bb = []
    ben = []
    bvi = []
    for en, vi in data: 
        ben.append(en)
        bvi.append(vi)
        if len(ben) >= batchsize:
            bb.append((ben, bvi))
            ben = []
            bvi = []
    if len(ben) > 0:
        bb.append((ben, bvi))
    return bb


In [13]:
train_data = make_batch(train_data, BATCHSIZE)

In [14]:
for i in range(5):
    print(train_data[i])

([['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>']

In [15]:
def padding_batch(b):
    maxlen = max([len(x) for x in b])
    for tkl in b:
        for i in range(maxlen - len(tkl)):
            tkl.append('<pad>')

def padding(bb):
    for ben, bvi in bb:
        padding_batch(ben)
        padding_batch(bvi)

In [16]:
padding(train_data)

In [17]:
for i in range(3):
  print(train_data[i])

([['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>']

In [18]:
def text_pipeline(data:List[List[str]], vocab: Vocab):
    return [[vocab[word] for word in word_lst] for word_lst in data]

train_data = [(text_pipeline(ben,En.vocab),text_pipeline(bvi,En.vocab)) for ben, bvi in train_data]

In [19]:
for i in range (3): 
    print(train_data[i]) 

([[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]

# **4. Model**

In [20]:
import torch.nn.functional as F 

class RNNEncDec(torch.nn.Module):
    def __init__(self, vocab_x:Vocab, vocab_y:Vocab):
        super(RNNEncDec, self).__init__()
        num_emb_x = len(vocab_x.get_itos())
        num_emb_y = len(vocab_y.get_itos())
        
        self.encemb = torch.nn.Embedding(num_emb_x, 300, padding_idx=vocab_x['<pad>'])
        self.encrnn = torch.nn.Linear(300, 300) 
        self.decemb = torch.nn.Embedding(num_emb_y, 300, padding_idx=vocab_y['<pad>'])
        self.decrnn = torch.nn.Linear(300, 300)
        self.decout = torch.nn.Linear(300, num_emb_y)
  
    def forward(self,x):
        x, y = x[0], x[1]
        print(x.size())
        print(y.size())
        #enc
        e_x = self.encemb(x)
        n_x = e_x.size()[0]
        h = torch.zeros(300, dtype=torch.float32).to(DEVICE)
        for i in range(n_x):
            h = F.relu(e_x[i] + self.encrnn(h))
        #dec
        e_y = self.decemb(y)
        n_y = e_y.size()[0]
        loss = torch.tensor(0., dtype=torch.float32).to(DEVICE)
        for i in range (n_y-1):
            h = F.relu(e_y[i] + self.decrnn(h))
            loss += F.cross_entropy(self.decout(h), y[i+1])
        return loss

    def evaluate(self, x,  vocab_x:Vocab, vocab_y:Vocab):
        #enc
        e_x = self.encemb(x)
        n_x=e_x.size()[0]
        h = torch.zeros(300, dtype=torch.float32).to(DEVICE)
        for i in range (n_x):
            h = F.relu(e_x[i] + self.encrnn(h)) 
        #dec
        y = torch.tensor([vocab_x['<cls>']]).to(DEVICE)
        e_y= self.decemb(y)
        pred = []
        for i in range (30):
            h = F.relu(e_y + self.decrnn(h))
            pred_id = self.decout(h).squeeze().argmax() 
            if pred_id == vocab_y['<eos>']:
                break
            pred_y = vocab_y[pred_id][0]
            pred.append(pred_y) 
            y[0] = pred_id
            e_y= self.decemb(y)
        return pred

In [21]:
def train():
    model = RNNEncDec(En.vocab, Vi.vocab).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR) 
    for epoch in range(EPOCH):
        loss = 0
        step = 0
        for ben, bvi in train_data:
            ben = torch.tensor(ben, dtype=torch.int64).transpose(0,1).to(DEVICE) 
            bvi = torch.tensor(bvi, dtype=torch.int64).transpose(0,1).to(DEVICE)
            optimizer.zero_grad()
            batchloss = model((ben, bvi))
            batchloss.backward()
            optimizer.step() 
            loss = loss + batchloss.item()
            if step % 100 == 0:
                print("step:", step, "batch loss:", batchloss.item())
            step += 1
        print("epoch", epoch, ": loss", loss)
    torch.save(model.state_dict(), MODELNAME)

In [22]:
train()


torch.Size([2, 128])
torch.Size([3, 128])
step: 0 batch loss: 20.36084747314453
torch.Size([4, 128])
torch.Size([8, 128])
torch.Size([4, 128])
torch.Size([5, 128])


IndexError: index out of range in self