# **1. Data**

## 1.1 read data

In [1]:
import re
import html
import contractions
import requests

class DataLoader:
    """Download a parallel English/Vietnamese corpus and split it into sentences.

    Each URL is fetched over HTTP and read line by line. Every line is
    cleaned and sentence-tokenized, so the ``en`` and ``vi`` properties each
    return a list with one entry per input line, where each entry is a list
    of sentence strings.
    """

    # Collapses runs of spaces; compiled once because it is applied per line.
    _MULTI_SPACE = re.compile(r' +')

    # Seconds to wait for the server (connect / between reads) before giving
    # up, so a stalled connection cannot hang the notebook forever.
    _REQUEST_TIMEOUT = 60

    def __init__(self, url_en, url_vi):
        """Fetch and preprocess both sides of the corpus.

        Args:
            url_en: URL of the English text file.
            url_vi: URL of the Vietnamese text file.
        """
        self.__en_data = self.__load_data(url_en, 'en')
        self.__vi_data = self.__load_data(url_vi, 'vi')

    def __text_preprocessing(self, text: str, language: str = 'en'):
        """Clean one line of text and split it into sentences.

        HTML entities are unescaped and repeated spaces are collapsed. For
        English, contractions are additionally expanded.

        Args:
            text: A single raw line from the corpus.
            language: ``'en'`` for English; any other value is handled as
                Vietnamese.

        Returns:
            The list of sentences found in ``text``.
        """
        text = html.unescape(text)
        text = self._MULTI_SPACE.sub(' ', text)
        # Tokenizers are imported lazily so that only the library needed for
        # the requested language has to be installed/loaded.
        if language == 'en':
            # NLTK's sentence tokenizer defaults to English.
            from nltk import sent_tokenize
            text = contractions.fix(text)
        else:
            # underthesea is the Vietnamese NLP toolkit.
            from underthesea import sent_tokenize

        return list(sent_tokenize(text))

    def __load_data(self, url, language: str):
        """Download ``url`` and preprocess every line of the response.

        Args:
            url: Location of the text file to download.
            language: Language code forwarded to the line preprocessing.

        Returns:
            One list of sentences per line of the downloaded text.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        # Fail loudly instead of tokenizing an HTML error page as corpus data.
        response.raise_for_status()
        return [
            self.__text_preprocessing(line, language)
            for line in response.text.splitlines()
        ]

    @property
    def vi(self):
        """Preprocessed Vietnamese data (list of sentence lists)."""
        return self.__vi_data

    @property
    def en(self):
        """Preprocessed English data (list of sentence lists)."""
        return self.__en_data

In [2]:
# Base location of the IWSLT'15 English-Vietnamese files.
url = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/"

# Training split and the tst2013 split, each as an (English, Vietnamese) pair.
train = DataLoader(f"{url}train.en", f"{url}train.vi")
test = DataLoader(f"{url}tst2013.en", f"{url}tst2013.vi")

In [3]:
# Sanity check: show the first Vietnamese entry and the number of English lines.
first_vi_entry = train.vi[0]
print(first_vi_entry)
en_line_count = len(train.en)
print(en_line_count)

['Khoa học đằng sau một tiêu đề về khí hậu']
133317


## 1.2 data preprocessing

### a) vocab

In [6]:
from typing import Iterator, List, Optional
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from underthesea import word_tokenize

class Language:
    """Bundle a language code, its word tokenizer and its vocabulary.

    ``'en'`` uses the spaCy ``en_core_web_sm`` tokenizer obtained through
    torchtext; any other name is treated as ``'vi'`` and uses underthesea's
    ``word_tokenize``. The vocabulary stays ``None`` until ``make_vocab`` is
    called.
    """

    def __init__(self, name: str = 'en'):
        """Select the tokenizer matching ``name`` and start with no vocabulary."""
        is_english = name == 'en'
        self.__name = 'en' if is_english else 'vi'
        self.__tokenizer = (
            get_tokenizer('spacy', language='en_core_web_sm')
            if is_english
            else word_tokenize
        )
        self.__vocab = None

    def __yield_tokens(self, data):
        """Yield the tokenized form of every sentence of every line in ``data``."""
        for line in data:
            yield from map(self.__tokenizer, line)

    def make_vocab(self, train_iter: Iterator, min_freq: int = 1, specials: Optional[List[str]] = None, default_idx: int = 0):
        """Build the vocabulary from ``train_iter`` and set its default index.

        Args:
            train_iter: Iterable of lines, each line an iterable of sentences.
            min_freq: Minimum frequency passed to the vocabulary builder.
            specials: Special tokens passed to the vocabulary builder.
            default_idx: Index set as the vocabulary's default index.
        """
        token_stream = self.__yield_tokens(train_iter)
        built = build_vocab_from_iterator(token_stream, min_freq=min_freq, specials=specials)
        self.__vocab = built
        built.set_default_index(default_idx)

    @property
    def name(self):
        """Language code: ``'en'`` or ``'vi'``."""
        return self.__name

    @property
    def vocab(self):
        """The built vocabulary, or ``None`` if ``make_vocab`` has not run."""
        return self.__vocab

    @property
    def tokenizer(self):
        """Callable that splits a sentence into tokens."""
        return self.__tokenizer

In [7]:
# Special tokens; each *_IDX constant equals the token's position in `specials`.
specials = ["<unk>", "<pad>", "<sos>", "<eos>"]
UNK_IDX = 0
PAD_IDX = 1
SOS_IDX = 2
EOS_IDX = 3

# Build one vocabulary per language from the training split, with a minimum
# frequency of 3 and <unk> as the default index.
Vi = Language('vi')
En = Language('en')
Vi.make_vocab(train.vi, min_freq=3, specials=specials, default_idx=UNK_IDX)
En.make_vocab(train.en, min_freq=3, specials=specials, default_idx=UNK_IDX)

In [13]:
# Report vocabulary sizes, then show the first five Vietnamese entries with
# their indices.
vi_itos = Vi.vocab.get_itos()
print("vocab size vi:", len(vi_itos))
print("vocab size en:", len(En.vocab.get_itos()))
for word in vi_itos[:5]:
    print(word, Vi.vocab[word])

vocab size vi: 18141
vocab size en: 23974
<unk> 0
<pad> 1
<sos> 2
<eos> 3
, 4


### b) data preprocessing

In [None]:
from spacy import Vocab

def data_preprocessing(train_data : List[str], vocab: Vocab):
    prep_data = []
    