# 1. Necessary Imports

In [1]:
!pip install contractions
!pip install pyvi
import torch
from torch import nn, optim
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.metrics import bleu_score
from torchtext.data.utils import get_tokenizer
from pyvi import ViTokenizer
import re
import html
import contractions
import requests
from typing import Iterator,List, Optional

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 7.2 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 52.4 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.21
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[K     |███████████████████████████████

# 2. Data Preparation & Pre-processing

In [2]:
!python -m spacy download en_core_web_sm --quiet

[K     |████████████████████████████████| 12.8 MB 4.8 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
class Data:
  """Parallel English/Vietnamese corpus, downloaded and tokenized on construction.

  Each side is fetched over HTTP, split into one sentence per line, cleaned
  (HTML entities decoded, lower-cased, regex substitutions from ``check_dict``
  applied) and tokenized. The results are exposed through the read-only
  ``en`` and ``vi`` properties as lists of token lists.
  """

  # Regex pattern -> replacement, applied in this order to every lower-cased line.
  check_dict = {
    ' \'s': '\'s',   # remove the space in front of a detached 's
    '& lt ;': '<',   # "&lt;" entity that is still written with spaces
    '& gt ;': '>',   # "&gt;" entity that is still written with spaces
    "<[^<]+>":'',    # drop anything that now looks like a markup tag
    ' +': ' ',       # collapse runs of spaces
  }

  # language code -> callable(str) returning a list of tokens.
  # Filled in by the first instance and shared by all later ones.
  tokenizer = None

  def __init__(self, url_en:str, url_vi:str):
    """Download and preprocess both sides of the corpus.

    Args:
      url_en: URL of the English file, one sentence per line.
      url_vi: URL of the Vietnamese file, one sentence per line.

    Raises:
      requests.HTTPError: if either download returns an error status.
    """
    if Data.tokenizer is None:
      # Build the tokenizers once instead of once per split.
      Data.tokenizer = {
        'vi': Data._tokenize_vi,
        'en': get_tokenizer('spacy', language='en_core_web_sm')
      }
    data_en = Data._fetch_lines(url_en)
    data_vi = Data._fetch_lines(url_vi)
    self.__data_en = [self.__text_preprocessing(en, 'en') for en in data_en]
    self.__data_vi = [self.__text_preprocessing(vi, 'vi') for vi in data_vi]

  @staticmethod
  def _tokenize_vi(text: str) -> List[str]:
    """Segment Vietnamese text; ViTokenizer joins multi-syllable words with '_'."""
    return [word.replace('_', ' ') for word in ViTokenizer.tokenize(text).split()]

  @staticmethod
  def _split_lines(text: str) -> List[str]:
    """Split a corpus file into sentences on '\\n' (or '\\r\\n') only.

    ``str.splitlines`` would also break on characters such as U+2028 or
    U+0085; a sentence containing one would become two entries and shift
    every following sentence out of alignment with the other language.
    """
    text = text.strip()
    if not text:
      return []
    return text.replace('\r\n', '\n').split('\n')

  @staticmethod
  def _fetch_lines(url: str) -> List[str]:
    """Download ``url`` and return its lines, failing loudly on HTTP errors."""
    response = requests.get(url)
    # Without this an error page would be tokenized as if it were corpus text.
    response.raise_for_status()
    return Data._split_lines(response.text)

  def __text_preprocessing(self, text: str, language: str = 'en'):
    """Clean one sentence and return its tokens.

    Args:
      text: raw sentence.
      language: 'en' additionally expands contractions and uses the English
        tokenizer; any other value uses the Vietnamese tokenizer.
    """
    # Lower-case once; every replacement in check_dict is case-neutral.
    text = html.unescape(text).lower()
    for pattern, repl in Data.check_dict.items():
      text = re.sub(pattern, repl, text)

    if language == 'en':
      text = re.sub(' +', ' ', contractions.fix(text))
      return self.tokenizer['en'](text)
    return self.tokenizer['vi'](text)

  @property
  def en(self):
    """Tokenized English sentences (list of token lists)."""
    return self.__data_en

  @property
  def vi(self):
    """Tokenized Vietnamese sentences (list of token lists)."""
    return self.__data_vi
  


In [4]:
# Base URL under which the English/Vietnamese corpus files are hosted.
url = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/"

# Each Data(...) call downloads one parallel split and tokenizes both sides.
train_data = Data(url +'train.en',url +'train.vi')
# The tst2012 split is left disabled; no validation data is loaded.
# val_data = Data(url + 'tst2012.en',url + 'tst2012.vi')
test_data = Data(url + 'tst2013.en',url + 'tst2013.vi')

In [5]:
# Spot-check the preprocessing: print the first five tokenized Vietnamese training sentences.
for text in train_data.vi[:5]:
  print(text)

['khoa học', 'đằng', 'sau', 'một', 'tiêu đề', 'về', 'khí hậu']
['trong', '4', 'phút', ',', 'chuyên gia', 'hoá học', 'khí quyển', 'rachel', 'pike', 'giới thiệu', 'sơ lược', 'về', 'những', 'nỗ lực', 'khoa học', 'miệt mài', 'đằng', 'sau', 'những', 'tiêu đề', 'táo bạo', 'về', 'biến đổi', 'khí hậu', ',', 'cùng', 'với', 'đoàn', 'nghiên cứu', 'của', 'mình', '-', '-', 'hàng', 'ngàn', 'người', 'đã', 'cống hiến', 'cho', 'dự án', 'này', '-', '-', 'một', 'chuyến', 'bay', 'mạo hiểm', 'qua', 'rừng già', 'để', 'tìm kiếm', 'thông tin', 'về', 'một', 'phân tử', 'then chốt', '.']
['tôi', 'muốn', 'cho', 'các', 'bạn', 'biết', 'về', 'sự', 'to lớn', 'của', 'những', 'nỗ lực', 'khoa học', 'đã', 'góp phần', 'làm nên', 'các', 'dòng', 'tít', 'bạn', 'thường', 'thấy', 'trên', 'báo', '.']
['có', 'những', 'dòng', 'trông', 'như', 'thế', 'này', 'khi', 'bàn', 'về', 'biến đổi', 'khí hậu', ',', 'và', 'như', 'thế', 'này', 'khi', 'nói', 'về', 'chất lượng', 'không khí', 'hay', 'khói', 'bụi', '.']
['cả', 'hai', 'đều', 'là', '

In [6]:
class Language:
  """English and Vietnamese vocabularies built from one tokenized corpus.

  Both vocabularies start with the special tokens "<unk>", "<pad>", "<sos>",
  "<eos>" (in that order) and map every unknown token to "<unk>".
  """

  def __init__(self, data: "Data", min_freq:int = 1):
    """Build both vocabularies.

    Args:
      data: object exposing tokenized sentences as ``data.en`` and ``data.vi``.
      min_freq: minimum number of occurrences for a token to get its own entry.
    """
    specials = ["<unk>", "<pad>", "<sos>", "<eos>"]

    self.__vocab_en = build_vocab_from_iterator(self.__yield_tokens(data.en,'en'), min_freq, specials)
    # Index 0 is "<unk>" (first special), so unknown tokens resolve to it.
    self.__vocab_en.set_default_index(0)

    self.__vocab_vi = build_vocab_from_iterator(self.__yield_tokens(data.vi,'vi'), min_freq, specials)
    self.__vocab_vi.set_default_index(0)

  def __yield_tokens(self, data:List[List[str]] , language:str = 'en'):
    """Yield one token list per sentence; ``language`` is accepted but unused."""
    for line in data:
      yield line

  @property
  def en(self):
    """The English vocabulary."""
    return self.__vocab_en

  @property
  def vi(self):
    """The Vietnamese vocabulary."""
    return self.__vocab_vi

  def text_pipeline(self, data:List[List[str]], language:str = 'en') -> List[List[str]]:
    """Wrap sentences in <sos>/<eos> and replace unknown tokens with <unk>.

    Args:
      data: tokenized sentences.
      language: 'en' or 'vi', selecting which vocabulary is applied.

    Returns:
      One token list per input sentence. The output is still made of token
      strings, not integer ids.

    Raises:
      ValueError: if ``language`` is neither 'en' nor 'vi'.
    """
    # Use this instance's vocabularies rather than a notebook-level global.
    if language == 'en':
      lookup = self.en
    elif language == 'vi':
      lookup = self.vi
    else:
      raise ValueError(f"unsupported language {language!r}; expected 'en' or 'vi'")

    # Resolve the marker ids from the vocabulary instead of assuming 2 and 3.
    sos, eos = lookup.lookup_indices(['<sos>', '<eos>'])
    # The ids -> tokens round trip is what turns unknown words into "<unk>".
    return [lookup.lookup_tokens([sos, *lookup.lookup_indices(line), eos]) for line in data]

In [7]:
# Build both vocabularies from the training split only; tokens seen fewer than 3 times are left out.
vocab = Language(train_data, min_freq=3)

In [8]:
# Report the vocabulary sizes; each count includes the four special tokens.
print(f"Unique tokens in source (en) vocabulary: {len(vocab.en)}")
print(f"Unique tokens in target (vi) vocabulary: {len(vocab.vi)}")

Unique tokens in source (en) vocabulary: 21928
Unique tokens in target (vi) vocabulary: 16085


In [9]:
# Wrap every sentence in <sos>/<eos> and replace out-of-vocabulary words with <unk>.
# The results are still lists of token strings, not integer ids.
# The test split is passed through the same vocabulary that was built from the training data.
train_en_prep = vocab.text_pipeline(train_data.en,'en')
train_vi_prep = vocab.text_pipeline(train_data.vi,'vi')
test_en_prep = vocab.text_pipeline(test_data.en,'en')
test_vi_prep = vocab.text_pipeline(test_data.vi,'vi')

In [10]:
# Spot-check the first five prepared sentences: English and Vietnamese training sides, then English test side.
for i in range(5):
  print(f'train en: {train_en_prep[i]}')
  print(f'train vi: {train_vi_prep[i]}')
  print(f'test en: {test_en_prep[i]}')

train en: ['<sos>', 'rachel', 'pike', ':', 'the', 'science', 'behind', 'a', 'climate', 'headline', '<eos>']
train vi: ['<sos>', 'khoa học', 'đằng', 'sau', 'một', 'tiêu đề', 'về', 'khí hậu', '<eos>']
test en: ['<sos>', 'when', 'i', 'was', 'little', ',', 'i', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'i', 'grew', 'up', 'singing', 'a', 'song', 'called', '"', 'nothing', 'to', 'envy', '.', '"', '<eos>']
train en: ['<sos>', 'in', '4', 'minutes', ',', 'atmospheric', 'chemist', 'rachel', 'pike', 'provides', 'a', 'glimpse', 'of', 'the', 'massive', 'scientific', 'effort', 'behind', 'the', 'bold', 'headlines', 'on', 'climate', 'change', ',', 'with', 'her', 'team', '--', 'one', 'of', 'thousands', 'who', 'contributed', '--', 'taking', 'a', 'risky', 'flight', 'over', 'the', 'rainforest', 'in', 'pursuit', 'of', 'data', 'on', 'a', 'key', 'molecule', '.', '<eos>']
train vi: ['<sos>', 'trong', '4', 'phút', ',', 'chuyên gia', 'hoá học', 'khí quyển', 'rachel', 'p

In [11]:
# Pair each prepared English sentence with its Vietnamese counterpart, then sort by
# (source length, target length) -- presumably so that batches cut from neighbouring pairs need little padding.
# NOTE: train_data and test_data are rebound here from Data objects to plain lists, so this cell
# cannot be re-run on its own (test_data.en no longer exists after the last line has run once).
train_data = list(zip(train_en_prep,train_vi_prep))
train_data.sort(key = lambda x: (len(x[0]), len(x[1])))
# Test triples keep the unmodified token lists next to the prepared source: (prepared en, raw en, raw vi).
test_data = list(zip(test_en_prep, test_data.en, test_data.vi))

In [12]:
# First five training pairs after sorting (the shortest ones come first).
for i in range(5):
  print(train_data[i])

# First five test triples: (prepared en, raw en, raw vi).
for i in range(5):   
  print(test_data[i])

(['<sos>', '<eos>'], ['<sos>', '<eos>'])
(['<sos>', '<eos>'], ['<sos>', '<eos>'])
(['<sos>', '<eos>'], ['<sos>', '<eos>'])
(['<sos>', '<eos>'], ['<sos>', '<eos>'])
(['<sos>', '<eos>'], ['<sos>', '<eos>'])
(['<sos>', 'when', 'i', 'was', 'little', ',', 'i', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'i', 'grew', 'up', 'singing', 'a', 'song', 'called', '"', 'nothing', 'to', 'envy', '.', '"', '<eos>'], ['when', 'i', 'was', 'little', ',', 'i', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'i', 'grew', 'up', 'singing', 'a', 'song', 'called', '"', 'nothing', 'to', 'envy', '.', '"'], ['khi', 'tôi', 'còn', 'nhỏ', ',', 'tôi', 'nghĩ', 'rằng', 'bắctriều', 'tiên', 'là', 'đất nước', 'tốt', 'nhất', 'trên', 'thế giới', 'và', 'tôi', 'thường', 'hát', 'bài', '"', 'chúng ta', 'chẳng', 'có', 'gì', 'phải', 'ghen tị', '.', '"'])
(['<sos>', 'and', 'i', 'was', 'very', 'proud', '.', '<eos>'], ['and', 'i', 'was', 'very', 'proud', '.

In [13]:
def make_batch(data, batchsize = 32):
  """Cut an iterable of (en, vi) pairs into consecutive batches.

  Args:
    data: iterable yielding 2-tuples (source sentence, target sentence).
    batchsize: number of pairs after which a batch is closed.

  Returns:
    A list of (en_batch, vi_batch) tuples, each holding two parallel lists.
    Input order is preserved and the last batch may be shorter than the rest.
  """
  batches = []
  src_rows, tgt_rows = [], []
  for src, tgt in data:
    src_rows.append(src)
    tgt_rows.append(tgt)
    if len(src_rows) < batchsize:
      continue
    # Batch is full: store it and start collecting the next one.
    batches.append((src_rows, tgt_rows))
    src_rows, tgt_rows = [], []
  # Whatever is left over forms a final, smaller batch.
  if src_rows:
    batches.append((src_rows, tgt_rows))
  return batches


In [14]:
# Number of sentence pairs per training batch.
BATCH_SIZE = 128
# train_data is rebound again: from a list of sentence pairs to a list of (en_batch, vi_batch) tuples.
train_data = make_batch(train_data, BATCH_SIZE)


In [15]:
# Show the first five batches in full; each print emits every sentence of a batch.
for i in range(5):
  print(train_data[i])


([['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>']], [['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>'], ['<sos>', '<eos>

In [16]:
def padding_batch(b, pad_token = '<pad>'):
  """Right-pad every token list in ``b``, in place, to the length of the longest one.

  Args:
    b: list of token lists; the inner lists are extended in place.
    pad_token: value appended as filler (defaults to the '<pad>' token).

  Returns:
    None. An empty batch is left untouched instead of raising.
  """
  if not b:
    # max() of an empty sequence would raise ValueError.
    return
  maxlen = max(len(tokens) for tokens in b)
  for tokens in b:
    tokens.extend([pad_token] * (maxlen - len(tokens)))

def padding(bb):
  """Pad both sides of every batch in ``bb`` in place.

  Args:
    bb: iterable of (en_batch, vi_batch) pairs. Each side is padded
      independently, so the two sides of one batch may end up with
      different lengths.
  """
  for batch_pair in bb:
    src_batch, tgt_batch = batch_pair
    for side in (src_batch, tgt_batch):
      padding_batch(side)

# Pad every training batch in place; train_data itself is not rebound by this call.
padding(train_data)

In [17]:
# Replace tokens with vocabulary ids. Training batches become (en_id_rows, vi_id_rows);
# test triples only convert the prepared source and keep the raw token lists.
train_data = [
  (list(map(vocab.en.lookup_indices, ben)), list(map(vocab.vi.lookup_indices, bvi)))
  for ben, bvi in train_data
]
test_data = [
  (vocab.en.lookup_indices(enprep), en, vi)
  for enprep, en, vi in test_data
]

In [18]:
# Spot-check the id conversion on the first three training batches and test triples.
for i in range (3): 
  print(train_data[i]) 
for i in range (3): 
  print(test_data[i]) 

([[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]], [[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]])
([[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]], [[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3],

In [19]:
# File the trained weights are saved to and loaded from.
MODEL_NAME = "LSTM.model"
# Presumably the intended number of training epochs.
EPOCH = 30
# Learning rate handed to the Adam optimizer.
LR = 1e-3
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
from torch.nn.functional import cross_entropy
class LSTM(torch.nn.Module):
  """Two-layer LSTM encoder-decoder for sequence-to-sequence translation.

  The encoder reads the source sentence and its final (hidden, cell) state
  initialises the decoder, which is trained with teacher forcing.

  Note: the decoder embedding is sized with the TARGET vocabulary. Weights
  saved while it was sized with the source vocabulary cannot be loaded
  unless both vocabularies have the same number of entries.

  Args:
    vocablist_x: source vocabulary as a list (only its length is used).
    vocabidx_x: source token -> id mapping; must contain '<pad>'.
    vocablist_y: target vocabulary as a list (only its length is used).
    vocabidx_y: target token -> id mapping; must contain '<pad>'.
  """
  def __init__(self, vocablist_x, vocabidx_x, vocablist_y, vocabidx_y):
    super(LSTM, self).__init__()
    # Plain attribute (not a parameter/buffer): target pad id ignored by the loss.
    self.pad_idx_y = vocabidx_y['<pad>']

    self.encemb = torch.nn.Embedding(len(vocablist_x), 256, padding_idx = vocabidx_x['<pad>'])
    self.dropout = torch.nn.Dropout(0.5)
    self.enclstm = torch.nn.LSTM(256,516,2,dropout=0.5)

    # The decoder embeds TARGET tokens, so it needs one row per target entry.
    self.decemb = torch.nn.Embedding(len(vocablist_y), 256, padding_idx = self.pad_idx_y)
    self.declstm = torch.nn.LSTM(256,516,2,dropout=0.5)
    self.decout = torch.nn.Linear(516, len(vocabidx_y))

  def forward(self,x):
    """Return the teacher-forced training loss for one batch.

    Args:
      x: pair (src, tgt) of integer tensors laid out as (sequence, batch).

    Returns:
      A scalar tensor: the sum over target positions of the mean
      cross-entropy at that position, with padded targets ignored.
    """
    src, tgt = x[0], x[1]

    _, state = self.enclstm(self.dropout(self.encemb(src)))

    loss = torch.zeros((), dtype=torch.float32, device=src.device)
    if tgt.shape[0] < 2:
      # Nothing to predict; the LSTM also rejects zero-length sequences.
      return loss

    # With teacher forcing every decoder input is known up front, so the
    # decoder can consume the whole shifted target in a single call.
    dec_in = self.dropout(self.decemb(tgt[:-1]))
    dec_out, _ = self.declstm(dec_in, state)
    logits = self.decout(dec_out)

    # Keep the per-position mean (then sum) so loss values stay comparable.
    for step_logits, step_target in zip(logits, tgt[1:]):
      loss = loss + cross_entropy(step_logits, step_target, ignore_index=self.pad_idx_y)
    return loss

  def evaluate(self,x,vocablist_y,vocabidx_y,max_len=50):
    """Greedily decode a translation for a single source sentence.

    Args:
      x: integer tensor of source ids laid out as (sequence, 1).
      vocablist_y: target id -> token list used to spell out the result.
      vocabidx_y: target token -> id mapping; must contain '<sos>' and '<eos>'.
      max_len: maximum number of tokens to generate.

    Returns:
      The predicted target tokens, without '<sos>' and '<eos>'.
    """
    was_training = self.training
    # Dropout must be inactive during decoding, so force eval mode
    # (and restore the previous mode afterwards).
    self.eval()
    try:
      with torch.no_grad():
        _, (hidden, cell) = self.enclstm(self.encemb(x))

        eos_idx = vocabidx_y['<eos>']
        # Shape (1, 1): one time step, one sentence.
        token = torch.tensor([[vocabidx_y['<sos>']]], dtype=torch.int64, device=x.device)
        pred = []
        for _ in range(max_len):
          outdec, (hidden, cell) = self.declstm(self.decemb(token), (hidden, cell))
          output = self.decout(outdec.squeeze(0))
          pred_id = output.squeeze().argmax().item()
          if pred_id == eos_idx:
            break
          pred.append(vocablist_y[pred_id])
          # Feed the prediction back in as the next decoder input.
          token = torch.full_like(token, pred_id)
        return pred
    finally:
      self.train(was_training)

In [34]:
from numpy import False_
def train_LMST(epochs = 10, pre_train = False):
  """Train the LSTM translation model and save its weights to MODEL_NAME.

  Args:
    epochs: number of passes over ``train_data``. This is the first
      positional parameter, so pass ``pre_train`` by keyword.
    pre_train: when true, start from the weights stored in MODEL_NAME
      instead of a freshly initialised model.

  Prints the summed batch loss after every epoch; the weights are written
  once, after the last epoch.
  """
  model = LSTM(vocab.en.get_itos(), vocab.en, vocab.vi.get_itos(), vocab.vi).to(DEVICE)
  optimizer = torch.optim.Adam(model.parameters(), lr=LR)
  if pre_train:
    # map_location lets a checkpoint written on a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(MODEL_NAME, map_location=DEVICE))
  # Make the mode explicit so dropout is active regardless of earlier calls.
  model.train()
  for epoch in range(epochs):
    loss = 0
    for ben, bvi in train_data:
      # Batches are stored as (batch, sequence); the model expects (sequence, batch).
      ben = torch.tensor(ben, dtype=torch.int64).transpose(0,1).to(DEVICE)
      bvi = torch.tensor(bvi, dtype=torch.int64).transpose(0,1).to(DEVICE)
      optimizer.zero_grad()
      batchloss = model((ben, bvi))
      batchloss.backward()
      optimizer.step()
      loss = loss + batchloss.item()
    print("epoch", epoch, ": loss", loss)
  torch.save(model.state_dict(), MODEL_NAME)

In [22]:
# Train from scratch for the configured number of epochs rather than the function's default of 10.
train_LMST(epochs=EPOCH)

epoch 0 : loss 467680.1049180031
epoch 1 : loss 420743.4804139137
epoch 2 : loss 395885.1951649189
epoch 3 : loss 377262.96259868145
epoch 4 : loss 362846.37199339643
epoch 5 : loss 350933.68852835894
epoch 6 : loss 340627.39025726914
epoch 7 : loss 331864.4622730613
epoch 8 : loss 324464.27799577266
epoch 9 : loss 317471.6290681809
epoch 10 : loss 311385.270949617
epoch 11 : loss 305820.2343057394
epoch 12 : loss 300873.89078941196
epoch 13 : loss 296320.78486964107
epoch 14 : loss 292208.3124485314
epoch 15 : loss 288366.8290441334
epoch 16 : loss 284903.79029307514
epoch 17 : loss 281661.36960694194
epoch 18 : loss 278765.9623745084
epoch 19 : loss 275991.3950408101
epoch 20 : loss 273413.89816597104
epoch 21 : loss 271077.42956490815
epoch 22 : loss 268820.6395789981
epoch 23 : loss 266909.9334579855
epoch 24 : loss 264674.5834614709
epoch 25 : loss 262828.5807994604
epoch 26 : loss 261100.90756383166
epoch 27 : loss 259368.92650842667
epoch 28 : loss 257715.67145892978
epoch 29 : 

In [None]:
# Continue training from the saved checkpoint. pre_train has to be passed by keyword:
# a positional True binds to `epochs`, which trains a fresh model for one epoch and overwrites the checkpoint.
train_LMST(pre_train=True)

epoch 0 : loss 274399.25209573656
epoch 1 : loss 266435.32926535606


In [28]:
from torchtext.data.metrics import bleu_score
def test_LMST(verbose = True):
  """Translate the test set with the saved model and print the corpus BLEU score.

  Args:
    verbose: when true (default), print source, reference and translation
      for every test sentence.
  """
  model = LSTM(vocab.en.get_itos(), vocab.en, vocab.vi.get_itos(), vocab.vi).to(DEVICE)
  # map_location lets a checkpoint written on a GPU load on a CPU-only machine.
  model.load_state_dict(torch.load(MODEL_NAME, map_location=DEVICE))
  model.eval()
  # get_itos() builds a new list on each call; fetch it once, not per sentence.
  itos_vi = vocab.vi.get_itos()
  ref = []
  pred = []
  # Inference only: no gradients are needed.
  with torch.no_grad():
    for enprep, en, vi in test_data:
      # Shape (sequence, 1): a batch holding a single sentence.
      src = torch.tensor([enprep], dtype=torch.int64).transpose(0, 1).to(DEVICE)
      p = model.evaluate(src, itos_vi, vocab.vi)
      if verbose:
        print("INPUT", en)
        print("REF", vi)
        print("MT", p)
      # Multi-syllable tokens contain spaces; join + split scores BLEU on single syllables.
      ref.append([' '.join(vi).split()])
      pred.append(' '.join(p).split())
  print("total:", len(test_data))
  bleu = bleu_score(pred, ref)
  print("bleu:", bleu)

In [29]:
# Translate the test set with the saved weights and print the BLEU score.
test_LMST()


INPUT ['when', 'i', 'was', 'little', ',', 'i', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'i', 'grew', 'up', 'singing', 'a', 'song', 'called', '"', 'nothing', 'to', 'envy', '.', '"']
REF ['khi', 'tôi', 'còn', 'nhỏ', ',', 'tôi', 'nghĩ', 'rằng', 'bắctriều', 'tiên', 'là', 'đất nước', 'tốt', 'nhất', 'trên', 'thế giới', 'và', 'tôi', 'thường', 'hát', 'bài', '"', 'chúng ta', 'chẳng', 'có', 'gì', 'phải', 'ghen tị', '.', '"']
MT ['khi', 'tôi', 'nghĩ', 'về', 'cuộc sống', ',', 'tôi', 'nghĩ', 'mình', 'là', 'một', 'nhà', 'kinh tế học', ',', 'và', 'tôi', 'đã', 'nói', 'rằng', ',', '"', 'đừng', 'lo', 'là', '"', 'tôi', 'đang', 'ở', 'trên', 'đỉnh', 'núi', '"', '.']
INPUT ['and', 'i', 'was', 'very', 'proud', '.']
REF ['tôi', 'đã', 'rất', 'tự hào', 'về', 'đất nước', 'tôi', '.']
MT ['và', 'tôi', 'rất', 'tự hào', '.']
INPUT ['in', 'school', ',', 'we', 'spent', 'a', 'lot', 'of', 'time', 'studying', 'the', 'history', 'of', 'kim', 'il', '-', 'sung', ',', 'but', 'we', '