# **Requirements**

In [1]:
# Third-party packages used by the cells below: `contractions` and `pyvi` are
# imported in the tokenizer cell, which also loads the spaCy model
# `en_core_web_sm`. Uncomment the matching line if one of them is missing from
# the current environment.
# NOTE(review): the tokenizer cell asks for `en_core_web_sm`, while the first
# command below downloads the `en` shortcut - confirm both resolve to the same
# model with the installed spaCy version.
# !python -m spacy download en
# !python -m spacy download vi
# %pip install underthesea
# %pip install contractions
%pip install pyvi

Note: you may need to restart the kernel to use updated packages.


# **1. Data**

## 1.1 read data

In [2]:
import requests
# Base URL of the IWSLT'15 English-Vietnamese files hosted by Stanford NLP.
url = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/"

languages = ['en', 'vi']


def _download_lines(filename, timeout=60):
    """Download ``url + filename`` and return its content as a list of lines.

    Args:
        filename: Name of the corpus file, appended to ``url``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the cell forever.

    Returns:
        The decoded body split with ``str.splitlines()``.

    Raises:
        requests.HTTPError: If the server answers with an error status. Without
            this check an HTML error page would silently be used as sentences.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url + filename, timeout=timeout)
    response.raise_for_status()
    # Decode explicitly instead of trusting the charset guessed by `requests`:
    # when the response headers carry no charset the guess may be ISO-8859-1 or
    # a statistical detection, either of which can garble Vietnamese text.
    # Assumes the corpus files are UTF-8 encoded - TODO confirm.
    response.encoding = 'utf-8'
    return response.text.splitlines()


# Each split maps a language code to its list of sentences; the 'en' and 'vi'
# lists are presumably line-aligned translations of each other.
train_data = {
    language: _download_lines(f'train.{language}') for language in languages
}
test = {
    language: _download_lines(f'tst2013.{language}') for language in languages
}


## 1.2 data preprocessing

### a) vocab

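- Build one vocabulary per language from the training sentences, keeping the special tokens and every token that occurs at least 3 times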

In [3]:
from torchtext.data.utils import get_tokenizer
import html
import contractions
import re
from pyvi.ViTokenizer import tokenize
def _tokenize_vi(text):
    """Split Vietnamese ``text`` into tokens with pyvi.

    The output of pyvi's ``tokenize`` is split on whitespace, and every
    underscore inside a resulting token is replaced by a space.
    """
    return [token.replace('_', ' ') for token in tokenize(text).split()]


# Maps a language code to a callable turning a sentence into a list of tokens.
tokenizer = {
    'en': get_tokenizer('spacy', language='en_core_web_sm'),
    'vi': _tokenize_vi,
}

def text_preprocessing(text, language):
    """Normalise a raw sentence before it is tokenized.

    HTML entities are unescaped, English contractions are expanded with
    ``contractions.fix`` (only when ``language == 'en'``), and every run of
    spaces is collapsed into a single space.

    Args:
        text: The raw sentence.
        language: Language code; ``'en'`` enables contraction expansion, any
            other value skips it.

    Returns:
        The cleaned sentence.
    """
    cleaned = html.unescape(text)
    if language == 'en':
        cleaned = contractions.fix(cleaned)
    return re.sub(' +', ' ', cleaned)

def yield_tokens(train_data, language='en'):
    """Lazily yield the token list of every sentence of one language.

    Args:
        train_data: Mapping from language code to an iterable of sentences.
        language: Key selecting both the sentences in ``train_data`` and the
            callable in the module-level ``tokenizer`` mapping.

    Yields:
        The result of tokenizing each preprocessed sentence, in order.
    """
    yield from (
        tokenizer[language](text_preprocessing(sentence, language))
        for sentence in train_data[language]
    )


In [4]:
from torchtext.vocab import build_vocab_from_iterator

# Special tokens shared by both vocabularies; the index constants mirror the
# order in which the tokens are listed.
specials = ["<unk>", "<pad>", "<sos>", "<eos>"]
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3

# One vocabulary per language, built from the tokenized training sentences.
vocabs = {}
for language in languages:
    vocabs[language] = build_vocab_from_iterator(
        yield_tokens(train_data, language),
        min_freq=3,  # drop tokens seen fewer than 3 times
        specials=specials,
    )
    # Use the <unk> index as the default index of the vocabulary.
    vocabs[language].set_default_index(UNK_IDX)


- check vocab

In [5]:
# Print the number of entries held by each vocabulary.
for language in languages:
    print(f"vocab size {language}:", len(vocabs[language]))

vocab size en: 23973
vocab size vi: 17843
