# **1. Load Data**

In [8]:
import torch.nn.functional as F

import torch
# Notebook-wide settings: model name, epoch count, batch size and learning rate.
MODEL_NAME = "nlp.model"
EPOCH, BATCHSIZE = 10, 128
LR = 1e-4
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

['I', 'have', 'to', 'go', 'out']

In [10]:
import re
import html
import contractions
import requests
from torchtext.data.utils import get_tokenizer
from pyvi.ViTokenizer import tokenize

class DataLoader:
    """Download an English/Vietnamese parallel corpus and tokenize every line.

    Both URLs are fetched once, in the constructor.  The tokenized sentences
    are exposed through the read-only ``en`` and ``vi`` properties, each a
    list with one list of tokens per downloaded line.

    Args:
        url_en: URL of the English text file (one sentence per line).
        url_vi: URL of the Vietnamese text file (one sentence per line).

    Raises:
        requests.HTTPError: if either download answers with an error status.
        requests.Timeout: if the server does not respond in time.
    """

    # Seconds to wait on the server before a download is abandoned.
    _REQUEST_TIMEOUT = 60

    # Supplementary clean-up rules, applied in this order after HTML
    # unescaping.  The order matters: the spaced-out entities are rebuilt
    # into '<' / '>' before the tag-stripping rule runs.
    _CLEANUP_RULES = (
        (re.compile(r" 's"), "'s"),      # re-attach a detached "'s"
        (re.compile(r"& lt ;"), "<"),    # spaced-out "&lt;" entity
        (re.compile(r"& gt ;"), ">"),    # spaced-out "&gt;" entity
        (re.compile(r"<[^<]+>"), ""),    # drop anything that looks like a tag
        (re.compile(r" +"), " "),        # collapse runs of spaces
    )

    def __init__(self, url_en, url_vi):
        # Tokenizers used by the preprocessing step.
        self.__tokenizer_en = get_tokenizer('spacy', language='en_core_web_sm')
        # The underscores in pyvi's output are turned back into spaces so a
        # multi-syllable word stays a single token (e.g. 'khoa học').
        self.__tokenizer_vi = lambda text: [
            word.replace('_', ' ') for word in tokenize(text).split()
        ]

        # Must run last: loading relies on the tokenizers defined above.
        self.__en_data = self.__load_data(url_en, 'en')
        self.__vi_data = self.__load_data(url_vi, 'vi')

    def __text_preprocessing(self, text: str, language: str = 'en'):
        """Clean one raw line and return its tokens.

        English text additionally has its contractions expanded before it is
        tokenized; any other ``language`` value uses the Vietnamese tokenizer.
        """
        text = html.unescape(text)
        for pattern, repl in self._CLEANUP_RULES:
            text = pattern.sub(repl, text)

        if language == 'en':
            text = contractions.fix(text)
            return self.__tokenizer_en(text)

        return self.__tokenizer_vi(text)

    def __load_data(self, url, language: str):
        """Fetch ``url`` and return one token list per line of the response."""
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        # Fail loudly instead of tokenizing an HTTP error page as corpus data.
        response.raise_for_status()
        return [
            self.__text_preprocessing(line, language)
            for line in response.text.splitlines()
        ]

    @property
    def vi(self):
        """Tokenized Vietnamese sentences."""
        return self.__vi_data

    @property
    def en(self):
        """Tokenized English sentences."""
        return self.__en_data


In [11]:
# Common prefix of the corpus files downloaded below.
url = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/"

# Each DataLoader downloads and tokenizes one English/Vietnamese file pair.
train = DataLoader(f"{url}train.en", f"{url}train.vi")
test = DataLoader(f"{url}tst2013.en", f"{url}tst2013.vi")


In [12]:
# Print training sentences 10..14, English first and then its Vietnamese pair.
for idx in range(10, 15):
    for sentences in (train.en, train.vi):
        print(idx, sentences[idx])

# Number of tokenized English training sentences.
print(len(train.en))

10 ['Over', '15,000', 'scientists', 'go', 'to', 'San', 'Francisco', 'every', 'year', 'for', 'that', '.']
10 ['Mỗi', 'năm', ',', 'hơn', '15,000', 'nhà', 'khoa học', 'đến', 'San Francisco', 'để', 'tham dự', 'hội nghị', 'này', '.']
11 ['And', 'every', 'one', 'of', 'those', 'scientists', 'is', 'in', 'a', 'research', 'group', ',', 'and', 'every', 'research', 'group', 'studies', 'a', 'wide', 'variety', 'of', 'topics', '.']
11 ['Mỗi một', 'khoa học', 'gia', 'đều', 'thuộc', 'một', 'nhóm', 'nghiên cứu', ',', 'và', 'mỗi', 'nhóm', 'đều', 'nghiên cứu', 'rất', 'nhiều', 'đề tài', 'đa dạng', '.']
12 ['For', 'us', 'at', 'Cambridge', ',', 'it', 'is', 'as', 'varied', 'as', 'the', 'El', 'Niño', 'oscillation', ',', 'which', 'affects', 'weather', 'and', 'climate', ',', 'to', 'the', 'assimilation', 'of', 'satellite', 'data', ',', 'to', 'emissions', 'from', 'crops', 'that', 'produce', 'biofuels', ',', 'which', 'is', 'what', 'I', 'happen', 'to', 'study', '.']
12 ['Với', 'chúng tôi', ',', 'tại', 'Cambridge', '

# **2. Vocab**

In [13]:
from typing import Iterator, List, Optional
from torchtext.vocab import build_vocab_from_iterator

class Language:
    """Vocabulary wrapper built from an iterable of tokenized sentences.

    Args:
        train_iter: iterable yielding one list of tokens per sentence.
        min_freq: minimum frequency passed to ``build_vocab_from_iterator``.
        specials: special tokens passed to ``build_vocab_from_iterator``.
        default_idx: index returned by the vocabulary for unknown tokens.
        name: optional label for the language; ``None`` when not given.
    """

    def __init__(self, train_iter: Iterator, min_freq:int = 1,specials: Optional[List[str]] = None, default_idx:int = 0, name: Optional[str] = None):
        # Stored so the ``name`` property has something to return; it used to
        # read an attribute that was never assigned and always raised.
        self.__name = name
        self.__make_vocab(train_iter,min_freq,specials,default_idx)

    def __yield_tokens(self, data):
        """Yield the token lists of ``data`` one by one."""
        yield from data

    def __make_vocab(self, train_iter: Iterator, min_freq:int = 1,specials: Optional[List[str]] = None, default_idx:int = 0):
        """Build the vocabulary and set the index used for unknown tokens."""
        self.__vocab = build_vocab_from_iterator(self.__yield_tokens(train_iter), min_freq, specials)
        self.__vocab.set_default_index(default_idx)

    @property
    def name(self):
        """Label given at construction time, or ``None``."""
        return self.__name

    @property
    def word2index(self):
        """Token-to-index mapping, as returned by ``vocab.get_stoi()``."""
        return self.__vocab.get_stoi()

    @property
    def index2word(self):
        """Index-to-token list, as returned by ``vocab.get_itos()``."""
        return self.__vocab.get_itos()

    @property
    def vocab(self):
        """The underlying vocabulary object."""
        return self.__vocab

In [14]:
# Special tokens and their indices; each index is the token's position in `specials`.
specials = ["<unk>", "<pad>", "<sos>", "<eos>"]
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3

# One vocabulary per language, built from the training split with min_freq=3;
# unknown tokens resolve to UNK_IDX.
Vi = Language(train.vi, min_freq=3, specials=specials, default_idx=UNK_IDX)
En = Language(train.en, min_freq=3, specials=specials, default_idx=UNK_IDX)


# **3. Data Preprocessing**

In [15]:
from torchtext.vocab.vocab import Vocab
def data_preprocessing(data:List[List[str]],vocab:Vocab):
    """Wrap each sentence in ``<sos>``/``<eos>`` and normalise its tokens.

    Every token is looked up in ``vocab`` and mapped back to the string stored
    at that index, so a token the vocabulary does not contain comes back as
    whatever string sits at the vocabulary's default index.

    Args:
        data: tokenized sentences.
        vocab: vocabulary supporting ``vocab[token]`` and ``get_itos()``.

    Returns:
        A new list with one token list per input sentence.
    """
    itos = vocab.get_itos()

    def normalise(token):
        return itos[vocab[token]]

    return [['<sos>', *map(normalise, sentence), '<eos>'] for sentence in data]

In [41]:
# Add <sos>/<eos> and normalise tokens, each split against its own language's vocabulary.
train_en_prep = data_preprocessing(data=train.en, vocab=En.vocab)
train_vi_prep = data_preprocessing(data=train.vi, vocab=Vi.vocab)
test_en_prep = data_preprocessing(data=test.en, vocab=En.vocab)

In [18]:
# Display the first prepared English training sentence as the cell output.
train_en_prep[0]

['<sos>',
 'Rachel',
 'Pike',
 ':',
 'The',
 'science',
 'behind',
 'a',
 'climate',
 'headline',
 '<eos>']

In [14]:
# Sentence counts of the prepared splits: train EN, train VI, test EN.
for prepared in (train_en_prep, train_vi_prep, test_en_prep):
    print(len(prepared))

133317
133317
1268


In [42]:
# Pair each English sentence with its Vietnamese counterpart, ordered by
# English length and then Vietnamese length.
train_data = sorted(
    zip(train_en_prep, train_vi_prep),
    key=lambda pair: (len(pair[0]), len(pair[1])),
)
# Test triples: prepared English, raw English tokens, raw Vietnamese tokens.
test_data = list(zip(test_en_prep, test.en, test.vi))

len(train_data)

133317

In [24]:
from typing import Tuple

def make_batch(data:List[Tuple], batchsize:int):
    """Split ``data`` into consecutive batches of at most ``batchsize`` pairs.

    Args:
        data: ``(en, vi)`` pairs, in the order they should be batched.
        batchsize: maximum number of pairs per batch.

    Returns:
        A list of ``(en_batch, vi_batch)`` tuples, where both members are
        lists holding the corresponding halves of the pairs.  The final batch
        may be shorter; empty ``data`` gives an empty list.
    """
    pairs = list(data)
    # A batchsize below 1 closes a batch after every pair, i.e. acts like 1.
    step = max(batchsize, 1)

    batches = []
    for start in range(0, len(pairs), step):
        chunk = pairs[start:start + step]
        batches.append(([en for en, _ in chunk], [vi for _, vi in chunk]))
    return batches


In [46]:
# Group the sorted sentence pairs into batches of BATCHSIZE.
train_data = make_batch(data=train_data, batchsize=BATCHSIZE)

In [61]:
def padding_batch(b, pad_token='<pad>'):
    """Pad every token list in ``b`` in place to the batch's longest length.

    Args:
        b: list of token lists; each list is extended in place.
        pad_token: token appended to the shorter lists.

    Returns:
        None.  An empty batch is left untouched instead of raising.
    """
    # default=0 keeps an empty batch from raising ValueError in max().
    maxlen = max((len(tkl) for tkl in b), default=0)
    for tkl in b:
        tkl.extend([pad_token] * (maxlen - len(tkl)))

def padding(bb):
    """Pad both sides of every ``(en_batch, vi_batch)`` pair in ``bb`` in place.

    The English and the Vietnamese side of a batch are padded independently,
    each to its own longest sentence.
    """
    for batch_pair in bb:
        ben, bvi = batch_pair
        for side in (ben, bvi):
            padding_batch(side)

In [62]:
# Pad every batch in place so that, per side, all sentences in a batch share one length.
padding(train_data)


In [63]:
def text_pipeline(data:List[List[str]], vocab: Vocab):
    """Convert every token of every sentence to its index in ``vocab``.

    Args:
        data: tokenized sentences.
        vocab: mapping supporting ``vocab[token]``.

    Returns:
        A new list of index lists, parallel to ``data``.
    """
    encoded = []
    for word_lst in data:
        indices = []
        for word in word_lst:
            indices.append(vocab[word])
        encoded.append(indices)
    return encoded

# Numericalise each side with its own vocabulary: English batches with
# En.vocab, Vietnamese batches with Vi.vocab (previously both used En.vocab,
# so the Vietnamese indices came from the wrong vocabulary).
train_data = [(text_pipeline(ben, En.vocab), text_pipeline(bvi, Vi.vocab)) for ben, bvi in train_data]

# **4. Model**

In [67]:
class EncoderRNN(torch.nn.Module):
    """Single-step GRU encoder: embeds one token and advances the hidden state.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: width of both the embedding and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token.

        Args:
            input: index tensor holding a single token (the embedding is
                reshaped to ``(1, 1, hidden_size)``).
            hidden: previous GRU state, e.g. the result of ``initHidden()``.

        Returns:
            ``(output, hidden)`` exactly as returned by the GRU.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero ``(1, 1, hidden_size)`` initial state.

        The tensor is allocated on the device that holds this module's
        weights, so it always matches wherever the module was moved to.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

In [71]:
class DecoderRNN(torch.nn.Module):
    """Single-step GRU decoder without attention.

    Args:
        hidden_size: width of both the embedding and the GRU state.
        output_size: number of rows in the embedding table and width of the
            output layer.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(output_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        Args:
            input: index tensor holding a single token (the embedding is
                reshaped to ``(1, 1, hidden_size)``).
            hidden: previous GRU state.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            ``(1, output_size)`` and holds log-softmax scores, and ``hidden``
            is the new GRU state.
        """
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        # output[0] drops the sequence dimension before the projection.
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return an all-zero ``(1, 1, hidden_size)`` initial state.

        The tensor is allocated on the device that holds this module's
        weights, so it always matches wherever the module was moved to.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

In [72]:
class AttnDecoderRNN(torch.nn.Module):
    """Single-step GRU decoder with attention over a fixed number of positions.

    Args:
        hidden_size: width of the embedding, the GRU state and each encoder
            output row.
        output_size: number of rows in the embedding table and width of the
            output layer.
        dropout_p: dropout probability applied to the embedded input.
        max_length: number of positions the attention layer scores;
            ``encoder_outputs`` passed to ``forward`` must have exactly this
            many rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length:int = 7):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = torch.nn.Embedding(self.output_size, self.hidden_size)
        self.attn = torch.nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = torch.nn.Dropout(self.dropout_p)
        self.gru = torch.nn.GRU(self.hidden_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step while attending over ``encoder_outputs``.

        Args:
            input: index tensor holding a single token (the embedding is
                reshaped to ``(1, 1, hidden_size)``).
            hidden: previous GRU state of shape ``(1, 1, hidden_size)``.
            encoder_outputs: tensor of shape ``(max_length, hidden_size)``.

        Returns:
            ``(output, hidden, attn_weights)``: log-softmax scores of shape
            ``(1, output_size)``, the new GRU state, and the attention
            weights of shape ``(1, max_length)``.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention scores come from the current input and the previous state.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Weighted sum of the encoder outputs: (1, 1, L) x (1, L, H).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero ``(1, 1, hidden_size)`` initial state.

        The tensor is allocated on the device that holds this module's
        weights, so it always matches wherever the module was moved to.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

NameError: name 'MAX_LENGTH' is not defined