# **1. Load Data**

In [1]:
import torch
MODEL_NAME = "nlp.model"
EPOCH = 10
BATCHSIZE = 128
LR = 0.0001
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [65]:
import re
import html
import contractions
import requests
from torchtext.data.utils import get_tokenizer
from pyvi.ViTokenizer import tokenize

class DataLoader:
    def __init__(self, url_en:str, url_vi:str):
        # function to preprocessing
        DataLoader.__tokenizer_en = get_tokenizer('spacy', language='en_core_web_sm')
        DataLoader.__tokenizer_vi = lambda text: list(map(lambda word: re.sub('_', ' ', word), tokenize(text).split()))

        DataLoader.__check_dict = { # bổ xung
            ' \'s': '\'s',
            '& lt ;': '<',
            '& gt ;': '>',
            "<[^<]+>":'',
            ' +': ' ',
        }

        #last run
        self.__load_data(url_en, url_vi)

    def __text_preprocessing(self, text: str, language: str = 'en'):
        text = html.unescape(text)
        for pattern, repl in DataLoader.__check_dict.items():
            text = text.lower()
            text = re.sub(pattern, repl, text)

        if language == 'en':
            text = contractions.fix(text)
            return DataLoader.__tokenizer_en(text)

        return DataLoader.__tokenizer_vi(text)

    def __load_data(self, url_en:str, url_vi:str):
        data_en = requests.get(url_en).text.strip().splitlines()
        data_vi = requests.get(url_vi).text.strip().splitlines()
        self.__en_data = []
        self.__vi_data = []
        for en,vi in zip(data_en,data_vi):
            en = ["<sos>",*self.__text_preprocessing(en, 'en'), "<eos>"] 
            vi = ["<sos>",*self.__text_preprocessing(vi, 'vi'), "<eos>"] 
            if len(en) < 33 and len(vi) < 33:
                self.__en_data.append(en)
                self.__vi_data.append(vi)

    @property
    def vi(self):
        return self.__vi_data

    @property
    def en(self):
        return self.__en_data
    
    @property
    def data(self):
        '''return en_data, vi_data'''
        return list(zip(self.__en_data,self.__vi_data))


In [66]:
url = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/"

train = DataLoader(url +'train.en',url +'train.vi')
val = DataLoader(url + 'tst2012.en',url + 'tst2012.vi')
test = DataLoader(url + 'tst2013.en',url + 'tst2013.vi')


In [67]:
for en,vi in train.data[:5]:
    print(f"en: {en}")
    print(f"vi: {vi}")
    
len(train.data),len(val.data),len(test.data)


en: ['<sos>', 'rachel', 'pike', ':', 'the', 'science', 'behind', 'a', 'climate', 'headline', '<eos>']
vi: ['<sos>', 'khoa học', 'đằng', 'sau', 'một', 'tiêu đề', 'về', 'khí hậu', '<eos>']
en: ['<sos>', 'i', ' ', 'would', 'like', 'to', 'talk', 'to', 'you', 'today', 'about', 'the', 'scale', 'of', 'the', 'scientific', 'effort', 'that', 'goes', 'into', 'making', 'the', 'headlines', 'you', 'see', 'in', 'the', 'paper', '.', '<eos>']
vi: ['<sos>', 'tôi', 'muốn', 'cho', 'các', 'bạn', 'biết', 'về', 'sự', 'to lớn', 'của', 'những', 'nỗ lực', 'khoa học', 'đã', 'góp phần', 'làm nên', 'các', 'dòng', 'tít', 'bạn', 'thường', 'thấy', 'trên', 'báo', '.', '<eos>']
en: ['<sos>', 'they', 'are', 'both', 'two', 'branches', 'of', 'the', 'same', 'field', 'of', 'atmospheric', 'science', '.', '<eos>']
vi: ['<sos>', 'cả', 'hai', 'đều', 'là', 'một', 'nhánh', 'của', 'cùng', 'một', 'lĩnh vực', 'trong', 'ngành', 'khoa học', 'khí quyển', '.', '<eos>']
en: ['<sos>', 'that', 'report', 'was', 'written', 'by', '620', 'scie

(105184, 1319, 957)

# **2. Vocab**

In [68]:
from typing import Iterator
from torchtext.vocab import build_vocab_from_iterator
from typing import List
class Language:
    def __init__(self, train_iter: Iterator, min_freq:int = 1):
        Language.specials = ["<unk>", "<pad>", "<sos>", "<eos>"]
        self.__make_vocab(train_iter,min_freq)
    def __yield_tokens(self, data):
        for line in data:
            yield line  

    def __make_vocab(self, train_iter: Iterator, min_freq:int = 5):
        self.__vocab = build_vocab_from_iterator(self.__yield_tokens(train_iter), min_freq, self.specials)
        self.__vocab.set_default_index(0)
    
    @property
    def vocab(self):
        return self.__vocab
    
    def sentence_to_vector(self, sent:List[str]):
        return torch.tensor(self.__vocab.lookup_indices(sent),dtype = torch.int64)
    
    def vector_to_sentence(self, vector:List[int]):
        return self.__vocab.lookup_tokens(vector)

In [69]:
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3

Vi = Language(train.vi,3)
En = Language(train.en,3)


In [70]:
len(Vi.vocab.get_itos())


12131

In [71]:
len(En.vocab.get_itos())

15931

# **3. Data Preprocessing**

In [72]:
train_en = [En.sentence_to_vector(line) for line in train.en]
train_vi = [Vi.sentence_to_vector(line) for line in train.vi]


In [74]:
for en,vi in zip(train_en[:5],train_vi[:5]):
    print(f"en: {En.vector_to_sentence(en.tolist())}")
    print(f"vi: {Vi.vector_to_sentence(vi.tolist())}")

en: ['<sos>', 'rachel', 'pike', ':', 'the', 'science', 'behind', 'a', 'climate', 'headline', '<eos>']
vi: ['<sos>', 'khoa học', 'đằng', 'sau', 'một', 'tiêu đề', 'về', 'khí hậu', '<eos>']
en: ['<sos>', 'i', ' ', 'would', 'like', 'to', 'talk', 'to', 'you', 'today', 'about', 'the', 'scale', 'of', 'the', 'scientific', 'effort', 'that', 'goes', 'into', 'making', 'the', 'headlines', 'you', 'see', 'in', 'the', 'paper', '.', '<eos>']
vi: ['<sos>', 'tôi', 'muốn', 'cho', 'các', 'bạn', 'biết', 'về', 'sự', 'to lớn', 'của', 'những', 'nỗ lực', 'khoa học', 'đã', 'góp phần', 'làm nên', 'các', 'dòng', 'tít', 'bạn', 'thường', 'thấy', 'trên', 'báo', '.', '<eos>']
en: ['<sos>', 'they', 'are', 'both', 'two', 'branches', 'of', 'the', 'same', 'field', 'of', 'atmospheric', 'science', '.', '<eos>']
vi: ['<sos>', 'cả', 'hai', 'đều', 'là', 'một', 'nhánh', 'của', 'cùng', 'một', 'lĩnh vực', 'trong', 'ngành', 'khoa học', 'khí quyển', '.', '<eos>']
en: ['<sos>', 'that', 'report', 'was', 'written', 'by', '620', 'scie

In [82]:
from torch.nn.utils.rnn import pad_sequence
train_en = pad_sequence(train_en,batch_first= True,padding_value=UNK_IDX)
train_vi = pad_sequence(train_vi,batch_first= True,padding_value=UNK_IDX)


In [99]:
train_data = list(zip(train_en, train_vi))

In [100]:
from torch.utils.data import DataLoader as dl
train_data = list(dl(train_data,batch_size=BATCHSIZE,shuffle=True))

# **4. Model**

In [None]:
class Encoder(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder,self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
    
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)

In [None]:
import torch.nn.functional as F

class Decoder(torch.nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)

In [None]:
class AttnDecoderRNN(torch.nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=32):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = torch.nn.Embedding(self.output_size, self.hidden_size)
        self.attn = torch.nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = torch.nn.Dropout(self.dropout_p)
        self.gru = torch.nn.GRU(self.hidden_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=DEVICE)