# **1. Load Data**

In [18]:
import torch.nn.functional as F

import torch
MODEL_NAME = "nlp.model"
EPOCH = 10
BATCHSIZE = 128
LR = 0.0001
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [1]:
import re
import html
import contractions
import requests
from torchtext.data.utils import get_tokenizer
from pyvi.ViTokenizer import tokenize

class DataLoader:
    def __init__(self, url_en, url_vi):
        # function to preprocessing
        self.__tokenizer_en = get_tokenizer('spacy', language='en_core_web_sm')
        self.__tokenizer_vi = lambda text: list(map(lambda word: re.sub('_', ' ', word), tokenize(text).split()))

        self.__check_dict = { # bổ xung
            ' \'s': '\'s',
            '& lt ;': '<',
            '& gt ;': '>',
            "<[^<]+>":'',
            ' +': ' ',
        }

        #last run
        self.__en_data = self.__load_data(url_en, 'en')
        self.__vi_data = self.__load_data(url_vi, 'vi')

    def __text_preprocessing(self, text: str, language: str = 'en'):
        text = html.unescape(text)
        for pattern, repl in self.__check_dict.items():
            text = re.sub(pattern, repl, text)

        if language == 'en':
            text = contractions.fix(text)
            return self.__tokenizer_en(text)

        return self.__tokenizer_vi(text)

    def __load_data(self, url, language: str):
        return [self.__text_preprocessing(line, language) for line in requests.get(url).text.splitlines()]

    @property
    def vi(self):
        return self.__vi_data

    @property
    def en(self):
        return self.__en_data


In [2]:
url = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/"

train = DataLoader(url +'train.en',url +'train.vi')
test = DataLoader(url + 'tst2013.en',url + 'tst2013.vi')


In [3]:
for i in range(10,20):
    print(i,train.en[i])
    print(i,train.vi[i])

print(len(train.en))

10 ['Over', '15,000', 'scientists', 'go', 'to', 'San', 'Francisco', 'every', 'year', 'for', 'that', '.']
10 ['Mỗi', 'năm', ',', 'hơn', '15,000', 'nhà', 'khoa học', 'đến', 'San Francisco', 'để', 'tham dự', 'hội nghị', 'này', '.']
11 ['And', 'every', 'one', 'of', 'those', 'scientists', 'is', 'in', 'a', 'research', 'group', ',', 'and', 'every', 'research', 'group', 'studies', 'a', 'wide', 'variety', 'of', 'topics', '.']
11 ['Mỗi một', 'khoa học', 'gia', 'đều', 'thuộc', 'một', 'nhóm', 'nghiên cứu', ',', 'và', 'mỗi', 'nhóm', 'đều', 'nghiên cứu', 'rất', 'nhiều', 'đề tài', 'đa dạng', '.']
12 ['For', 'us', 'at', 'Cambridge', ',', 'it', 'is', 'as', 'varied', 'as', 'the', 'El', 'Niño', 'oscillation', ',', 'which', 'affects', 'weather', 'and', 'climate', ',', 'to', 'the', 'assimilation', 'of', 'satellite', 'data', ',', 'to', 'emissions', 'from', 'crops', 'that', 'produce', 'biofuels', ',', 'which', 'is', 'what', 'I', 'happen', 'to', 'study', '.']
12 ['Với', 'chúng tôi', ',', 'tại', 'Cambridge', '

# **2. Vocab**

In [9]:
from typing import Iterator, List, Optional
from torchtext.vocab import build_vocab_from_iterator

class Language:
    def __init__(self, train_iter: Iterator, min_freq:int = 1,specials: Optional[List[str]] = None, default_idx:int = 0):
        self.__make_vocab(train_iter,min_freq,specials,default_idx)

    def __yield_tokens(self, data):
        for line in data:
            yield line  

    def __make_vocab(self, train_iter: Iterator, min_freq:int = 1,specials: Optional[List[str]] = None, default_idx:int = 0):
        self.__vocab = build_vocab_from_iterator(self.__yield_tokens(train_iter), min_freq, specials)
        self.__vocab.set_default_index(default_idx)

    @property
    def name(self):
        return self.__name

    @property
    def word2index(self):
        return self.__vocab.get_stoi()

    @property
    def index2word(self):
        return self.__vocab.get_itos()

    @property
    def vocab(self):
        return self.__vocab

In [10]:
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3
specials = ["<unk>", "<pad>", "<sos>", "<eos>"]

Vi = Language(train.vi,3,specials,UNK_IDX)
En = Language(train.en,3,specials,UNK_IDX)


In [112]:
len(En.index2word)


23972

In [111]:
len(Vi.index2word)

17843

# **3. Data Preprocessing**

In [12]:
from torchtext.vocab.vocab import Vocab
def data_preprocessing(data:List[List[str]],vocab:Vocab):
    rr = []
    idx2word = vocab.get_itos()
    for line in data:
        tkl = ['<sos>']
        for word in line:
            tkl.append(idx2word[vocab[word]])
        tkl.append('<eos>')
        rr.append(tkl)
    return rr

In [13]:
train_en_prep = data_preprocessing(train.en, En.vocab)
train_vi_prep = data_preprocessing(train.vi, Vi.vocab)
test_en_prep = data_preprocessing(test.en, En.vocab)

In [14]:
print(len(train_en_prep))
print(len(train_vi_prep))
print(len(test_en_prep))

133317
133317
1268


In [40]:
train_data = list(zip(train_en_prep, train_vi_prep))
train_data.sort(key = lambda x: (len(x[0]), len(x[1])))
test_data = list(zip(test_en_prep, test.en, test.vi))

len(train_data)

133317

In [41]:
from typing import Tuple

def make_batch(data:List[Tuple], batchsize:int):
    bb = []
    ben = []
    bvi = []
    for en, vi in data: 
        ben.append(en)
        bvi.append(vi)
        if len(ben) >= batchsize:
            bb.append((ben, bvi))
            ben = []
            bvi = []
    if len(ben) > 0:
        bb.append((ben, bvi))
    return bb


In [42]:
train_data = make_batch(train_data, BATCHSIZE)

In [43]:
def padding_batch(b):
    maxlen = max([len(x) for x in b])
    for tkl in b:
        for i in range(maxlen - len(tkl)):
            tkl.append('<pad>')

def padding(bb):
    for ben, bvi in bb:
        padding_batch(ben)
        padding_batch(bvi)

In [44]:
padding(train_data)


In [45]:
def text_pipeline(data:List[List[str]], vocab: Vocab):
    return [[vocab[word] for word in word_lst] for word_lst in data]

train_data = [(text_pipeline(ben,En.vocab),text_pipeline(bvi,En.vocab)) for ben, bvi in train_data]

# **4. Model**

In [124]:
class LSTM(torch.nn.Module):
    def __init__(self, vocablist_x, vocabidx_x, vocablist_y, vocabidx_y):
        super(LSTM, self).__init__()
        self.num_embed_x = len(vocablist_x)
        self.num_embed_y = len(vocabidx_y)

        self.encemb = torch.nn.Embedding(self.num_embed_x, 256, padding_idx = vocabidx_x['<pad>'])
        self.dropout = torch.nn.Dropout(0.5)
        self.enclstm = torch.nn.LSTM(256,516,2,dropout=0.5)
        
        self.decemb = torch.nn.Embedding(self.num_embed_x, 256, padding_idx = vocabidx_y['<pad>'])
        self.declstm = torch.nn.LSTM(256,516,2,dropout=0.5)
        self.decout = torch.nn.Linear(516, self.num_embed_y)
  
    def forward(self,x):
        x, y = x[0], x[1]
        # print(x.shape)
        # print(y.shape)

        e_x = self.dropout(self.encemb(x))
        
        outenc,(hidden,cell) = self.enclstm(e_x)

        n_y=y.shape[0]
        print('n_y: ',y.shape)
        # outputs = torch.zeros(n_y,BATCHSIZE,self.num_embed_x).to(DEVICE)
        loss = torch.tensor(0.,dtype=torch.float32).to(DEVICE)
        for i in range(n_y-1):
            input = y[i]
            input = input.unsqueeze(0)
            input = self.dropout(self.decemb(input))
            outdec, (hidden,cell) = self.declstm(input,(hidden,cell))
            output = self.decout(outdec.squeeze(0))
            input = y[i+1]
            print(input.shape,output.shape)
            loss += F.cross_entropy(output, y[i+1])
        return loss

    def evaluate(self,x,vocablist_y,vocabidx_y):
        e_x = self.dropout(self.encemb(x))
        outenc,(hidden,cell)=self.enclstm(e_x)
        
        y = torch.tensor([vocabidx_y['<cls>']]).to(DEVICE)
        pred=[]
        for i in range(30):
            input = y
            input = input.unsqueeze(0)
            input = self.dropout(self.decemb(input))
            outdec,(hidden,cell)= self.declstm(input,(hidden,cell))
            output = self.decout(outdec.squeeze(0))  
            pred_id = output.squeeze().argmax().item()
            if pred_id == vocabidx_y['<eos>']:
                break
            pred_y = vocablist_y[pred_id]
            pred.append(pred_y)
            y[0]=pred_id
            input=y
        return pred  

In [125]:
def train_LMST():
    model = LSTM(En.index2word, En.word2index, Vi.index2word, Vi.word2index).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR) 
    for epoch in range(EPOCH):
        loss = 0
        step = 0
        for ben, bvi in train_data:
            ben = torch.tensor(ben, dtype=torch.int64).transpose(0,1).to(DEVICE) 
            bvi = torch.tensor(bvi, dtype=torch.int64).transpose(0,1).to(DEVICE)
            optimizer.zero_grad()
            batchloss = model((ben, bvi))
            batchloss.backward()
            optimizer.step() 
            loss = loss + batchloss.item()
            if step % 100 == 0:
                print("step:", step, "batch loss:", batchloss.item())
            step += 1
        print("epoch", epoch, ": loss", loss)
    torch.save(model.state_dict(), MODEL_NAME)

In [126]:
train_LMST()

n_y:  torch.Size([3, 128])
torch.Size([128]) torch.Size([128, 17843])
torch.Size([128]) torch.Size([128, 17843])
step: 0 batch loss: 19.568538665771484
n_y:  torch.Size([5, 128])
torch.Size([128]) torch.Size([128, 17843])


IndexError: Target 19889 is out of bounds.