In [3]:
import requests
import torch
import torch.nn.functional as F
import torchtext

In [4]:
url = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/'
train_en =[line.split() for line in requests.get(url+'train.en').text.splitlines()]
train_vi =[line.split() for line in requests.get(url+'train.vi').text.splitlines()]
test_en =[line.split() for line in requests.get(url+'tst2013.en').text.splitlines()]
test_vi =[line.split() for line in requests.get(url+'tst2013.vi').text.splitlines()]

In [5]:
for i in range(10):
    print(train_en[i])
    print(train_vi[i])
print('# of line',len(train_en),len(train_vi),len(test_en),len(test_vi))

['Rachel', 'Pike', ':', 'The', 'science', 'behind', 'a', 'climate', 'headline']
['Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu']
['In', '4', 'minutes', ',', 'atmospheric', 'chemist', 'Rachel', 'Pike', 'provides', 'a', 'glimpse', 'of', 'the', 'massive', 'scientific', 'effort', 'behind', 'the', 'bold', 'headlines', 'on', 'climate', 'change', ',', 'with', 'her', 'team', '--', 'one', 'of', 'thousands', 'who', 'contributed', '--', 'taking', 'a', 'risky', 'flight', 'over', 'the', 'rainforest', 'in', 'pursuit', 'of', 'data', 'on', 'a', 'key', 'molecule', '.']
['Trong', '4', 'phút', ',', 'chuyên', 'gia', 'hoá', 'học', 'khí', 'quyển', 'Rachel', 'Pike', 'giới', 'thiệu', 'sơ', 'lược', 'về', 'những', 'nỗ', 'lực', 'khoa', 'học', 'miệt', 'mài', 'đằng', 'sau', 'những', 'tiêu', 'đề', 'táo', 'bạo', 'về', 'biến', 'đổi', 'khí', 'hậu', ',', 'cùng', 'với', 'đoàn', 'nghiên', 'cứu', 'của', 'mình', '--', 'hàng', 'ngàn', 'người', 'đã', 'cống', 'hiến', 'cho', 'dự', 'án', 'này', '--', 'một

In [6]:
def make_vocab(train_data, min_freg):
    vocab = {}
    for tokenlist in train_data:
        for token in tokenlist:
            if token not in vocab:
                vocab[token] = 0
            vocab[token] += 1
    vocablist = [('<unk>',0), ('<pad>,0'),('<cls>',0),('<eos>',0)]
    vocabidx = {}
    for token, freg in vocab.items():
        if freg >= min_freg:
            idx = len(vocablist)
            vocablist.append((token,freg))
            vocabidx[token]=idx
    vocabidx['<unk>'] =0
    vocabidx['<pad>'] =0
    vocabidx['<cls>'] =0
    vocabidx['<eos>'] =0
    return vocablist, vocabidx

In [7]:
vocablist_en,vocabidx_en = make_vocab(train_en,3)
vocablist_vi,vocabidx_vi = make_vocab(train_vi,3)
print('vocab size en:',len(vocablist_en))
print('vocab size vi:',len(vocablist_vi))

vocab size en: 24420
vocab size vi: 10666


In [8]:
def preprocess(data, vocabidx):
    rr = []
    for tokenlist in data:
        tkl = ['<cls>']
        for token in tokenlist:
            tkl.append(token if  token in vocabidx else '<unk>')
        tkl.append('<eos>')
        rr.append(tkl)
    return rr
train_en_prep = preprocess(train_en, vocabidx_en)
train_vi_prep = preprocess(train_vi, vocabidx_vi)
test_en_prep = preprocess(test_en, vocabidx_en)
for i in range(5):
    print(train_en_prep[i])
    print(train_vi_prep[i])
    print(test_en_prep[i])

['<cls>', 'Rachel', 'Pike', ':', 'The', 'science', 'behind', 'a', 'climate', 'headline', '<eos>']
['<cls>', 'Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu', '<eos>']
['<cls>', 'When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', '<unk>', '.', '&quot;', '<eos>']
['<cls>', 'In', '4', 'minutes', ',', 'atmospheric', 'chemist', 'Rachel', 'Pike', 'provides', 'a', 'glimpse', 'of', 'the', 'massive', 'scientific', 'effort', 'behind', 'the', 'bold', 'headlines', 'on', 'climate', 'change', ',', 'with', 'her', 'team', '--', 'one', 'of', 'thousands', 'who', 'contributed', '--', 'taking', 'a', 'risky', 'flight', 'over', 'the', 'rainforest', 'in', 'pursuit', 'of', 'data', 'on', 'a', 'key', 'molecule', '.', '<eos>']
['<cls>', 'Trong', '4', 'phút', ',', 'chuyên', 'gia', 'hoá', 'học', 'khí', 'quyển', 'Rachel', 'Pike', 'giới', 'thiệu

In [9]:
train_data = list(zip(train_en_prep,train_vi_prep))
train_data.sort(key = lambda x : (len(x[0]),len(x[1])))
test_data = list(zip(test_en_prep,test_en,test_vi))
for i in range(5):
    print(train_data[i])
    print(test_data[i])

(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', 'When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', '<unk>', '.', '&quot;', '<eos>'], ['When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', 'Envy', '.', '&quot;'], ['Khi', 'tôi', 'còn', 'nhỏ', ',', 'Tôi', 'nghĩ', 'rằng', 'BắcTriều', 'Tiên', 'là', 'đất', 'nước', 'tốt', 'nhất', 'trên', 'thế', 'giới', 'và', 'tôi', 'thường', 'hát', 'bài', '&quot;', 'Chúng', 'ta', 'chẳng', 'có', 'gì', 'phải', 'ghen', 'tị', '.', '&quot;'])
(['<cls>', '<eos>'], ['<cls>', '<eos>'])
(['<cls>', 'And', 'I', 'was', 'very', 'proud', '.', '<eos>'], ['And', 'I', 'was', 'very', 'proud', '.'], ['Tôi', 'đã', 'rất', 'tự', 'hào', 'về', 'đất', 'nước', 'tôi', '.'])
(['<cls>

In [10]:
def make_batch(data, batchsize=64):
    bb = []
    ben = []
    bvi = []
    for en,vi in data:
        ben.append(en)
        bvi.append(vi)
        if len(ben) >= batchsize:
            bb.append((ben, bvi))
            ben = []
            bvi = []
    if len(ben) > 0:
        bb.append((ben,bvi))
    return bb
train_data = make_batch(train_data, 64)
for i in range(5):
    print(train_data[i])

([['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>']

In [11]:
def padding_batch(b):
    maxlen=max([len(x) for x in b])
    for tkl in b:
        for i in range(maxlen-len(tkl)):
            tkl.append('<pad>')

def padding(bb):
    for ben,bvi in bb:
        padding_batch(ben)
        padding_batch(bvi)
padding(train_data)
for i in range(3):
    print(train_data[i])

([['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>'], ['<cls>', '<eos>']

In [12]:
train_data = [([[vocabidx_en[token] for token in tokenlist] for tokenlist in ben],
               [[vocabidx_vi[token] for token in tokenlist] for tokenlist in bvi])for ben,bvi in train_data]
test_data = [([vocabidx_en[token] for token in enprep],en, vi) for enprep, en, vi in test_data]
for i in range(3):
    print(train_data[i])
    print(test_data[i])

([[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 

In [13]:
EPOCH = 20
LR = 0.001
MODELNAME ='iwslt15-en-vi-rnn-lr-0.001.model'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [14]:
class RNNEncDec(torch.nn.Module):
    def __init__(self,vocablist_x,vocabidx_x,vocablist_y,vocabidx_y):
        super(RNNEncDec,self).__init__()
        self.dropout = torch.nn.Dropout(0.5)
        self.encemb = torch.nn.Embedding(len(vocablist_x),512, padding_idx=vocabidx_x['<pad>'])
        self.encrnn = torch.nn.LSTM(512,512,3,dropout = 0.5,  bidirectional=True)
        self.decemb = torch.nn.Embedding(len(vocablist_y),512, padding_idx=vocabidx_y['<pad>'])
        self.decrnn = torch.nn.LSTM(512,512,3,dropout = 0.5, bidirectional=True)
        self.decout = torch.nn.Linear(1024,len(vocablist_y))

    def forward(self,x):
        #encoder
        x,y = x[0],x[1]
        e_x = self.encemb(x)
        n_x = e_x.size()[0]
        h = torch.zeros((6,x.size(1),512), dtype=torch.float32).to(DEVICE)
        c = torch.zeros((6,x.size(1),512), dtype=torch.float32).to(DEVICE)
        for i in range(n_x):
            output,(h,c)=self.encrnn(torch.unsqueeze(e_x[i],0),(h,c)) 
        output = torch.squeeze(output)
        #decoder
        e_y = self.decemb(y)
        n_y = e_y.size()[0]
        loss = torch.tensor(0.,dtype=torch.float32).to(DEVICE)
        for i in range(n_y-1):
            out,(h,c) = self.decrnn(torch.unsqueeze(e_y[i],0),(h,c))
            out = torch.squeeze(out)
            loss += F.cross_entropy(self.decout(out), y[i+1])
        return loss

    def evaluate(self,x,vocablist_y,vocabidx_y):
        e_x = self.encemb(x)
        n_x = e_x.size()[0]
        h = torch.zeros((6,x.size(1),512), dtype=torch.float32).to(DEVICE)
        c = torch.zeros((6,x.size(1),512), dtype=torch.float32).to(DEVICE)
        for i in range(n_x):
            #h = F.relu(e_x[i] + self.encrnn(h))
            output,(h,c)=self.encrnn(torch.unsqueeze(e_x[i],0),(h,c))
        y = torch.tensor([vocabidx_y['<cls>']]).to(DEVICE)
        e_y = self.decemb(y)
        pred = []
        for i in range(50):
            #h = F.relu(e_y+self.decrnn(h))
            out,(h,c) = self.decrnn(torch.unsqueeze(e_y,0),(h,c))
            out = out.view(1,1024)
            pred_id = self.decout(out).squeeze().argmax()
            if pred_id == vocabidx_y['<eos>']:
                break
            pred_y = vocablist_y[pred_id][0]
            pred.append(pred_y)
            y[0] = pred_id
            e_y = self.decemb(y)
        return pred

In [15]:
def train():
    model = RNNEncDec(vocablist_en,vocabidx_en,vocablist_vi,vocabidx_vi).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr = LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)
    for epoch in range(EPOCH):
        loss = 0
        step = 0
        for ben, bvi in train_data:
            ben = torch.tensor(ben,dtype=torch.int64).transpose(0,1).to(DEVICE)
            bvi = torch.tensor(bvi,dtype=torch.int64).transpose(0,1).to(DEVICE)
            optimizer.zero_grad()
            batchloss =model((ben,bvi))
            batchloss.backward()
            optimizer.step()
            loss = loss + batchloss.item()
            if step %100 == 0:
                print('step:',step,'batchloss:',batchloss.item())
            step+=1
        print('epoch',epoch,': loss',loss)
        scheduler.step()
    torch.save(model.state_dict(),MODELNAME)

In [16]:
def test():
    total = 0
    correct = 0
    model = RNNEncDec(vocablist_en,vocabidx_en,vocablist_vi,vocabidx_vi).to(DEVICE)
    model.load_state_dict(torch.load(MODELNAME))
    model.eval()
    ref = []
    pred =[]
    for enprep, en, vi in test_data:
        input =  torch.tensor([enprep], dtype=torch.int64).transpose(0,1).to(DEVICE)
        p = model.evaluate(input, vocablist_vi,vocabidx_vi)
        print('INPUT',en)
        print('REF',vi)
        print('MT',p)
        ref.append([vi])
        pred.append(p)
    bleu = torchtext.data.metrics.bleu_score(pred,ref)
    print('total:',len(test_data))
    print('bleu:',bleu)


In [17]:
import time
start = time.time()
train()
print("Completed:", time.time()-start)

step: 0 batchloss: 9.266225814819336
step: 100 batchloss: 34.314476013183594
step: 200 batchloss: 49.59475326538086
step: 300 batchloss: 57.393310546875
step: 400 batchloss: 61.893211364746094
step: 500 batchloss: 66.69588470458984
step: 600 batchloss: 72.14045715332031
step: 700 batchloss: 75.55307006835938
step: 800 batchloss: 95.92140197753906
step: 900 batchloss: 65.07380676269531
step: 1000 batchloss: 81.87042236328125
step: 1100 batchloss: 94.94229125976562
step: 1200 batchloss: 89.69611358642578
step: 1300 batchloss: 154.37596130371094
step: 1400 batchloss: 125.00069427490234
step: 1500 batchloss: 101.5946044921875
step: 1600 batchloss: 135.1498565673828
step: 1700 batchloss: 153.57229614257812
step: 1800 batchloss: 170.25401306152344
step: 1900 batchloss: 192.5818328857422
step: 2000 batchloss: 214.02255249023438
epoch 0 : loss 226939.85603809357
step: 0 batchloss: 2.603612184524536
step: 100 batchloss: 21.731826782226562
step: 200 batchloss: 31.183595657348633
step: 300 batchl

In [19]:
test()

INPUT ['When', 'I', 'was', 'little', ',', 'I', 'thought', 'my', 'country', 'was', 'the', 'best', 'on', 'the', 'planet', ',', 'and', 'I', 'grew', 'up', 'singing', 'a', 'song', 'called', '&quot;', 'Nothing', 'To', 'Envy', '.', '&quot;']
REF ['Khi', 'tôi', 'còn', 'nhỏ', ',', 'Tôi', 'nghĩ', 'rằng', 'BắcTriều', 'Tiên', 'là', 'đất', 'nước', 'tốt', 'nhất', 'trên', 'thế', 'giới', 'và', 'tôi', 'thường', 'hát', 'bài', '&quot;', 'Chúng', 'ta', 'chẳng', 'có', 'gì', 'phải', 'ghen', 'tị', '.', '&quot;']
MT ['Khi', 'tôi', 'còn', 'nhỏ', ',', 'tôi', 'là', 'một', 'đứa', 'trẻ', 'nghèo', 'nhất', 'trên', 'thế', 'giới', ',', 'và', 'tôi', 'gọi', 'bà', 'ấy', 'là', 'một', 'người', 'đàn', 'ông', 'hát', 'opera', '.']
INPUT ['And', 'I', 'was', 'very', 'proud', '.']
REF ['Tôi', 'đã', 'rất', 'tự', 'hào', 'về', 'đất', 'nước', 'tôi', '.']
MT ['Và', 'tôi', 'đã', 'rất', 'tự', 'hào', 'về', 'bản', 'thân', '.']
INPUT ['In', 'school', ',', 'we', 'spent', 'a', 'lot', 'of', 'time', 'studying', 'the', 'history', 'of', 'Kim', 