# Machine Translation Task (Portuguese to English)


### Software Requirements
- Python (>=3.6)
- PyTorch (>=1.2.0) 
- Jupyter (latest)
- torchtext
- NLTK

## Import libraries, dataset and prepare necessary functions

In [0]:
import unicodedata
import string
import re
import random
import time
import datetime
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import torchtext
from torchtext.datasets import TranslationDataset

import spacy
import numpy as np

import math, copy, time
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

In [0]:
def inference(model, file_name, src_vocab, trg_vocab, attention= True, batch_size = 128, max_trg_len = 64):
    '''
    Function for translation inference

    Input: 
    model: translation model;
    file_name: the directoy of test file that the first column is target reference, and the second column is source language;
    trg_vocab: Target torchtext Field
    attention: the model returns attention weights or not.
    max_trg_len: the maximal length of translation text (optinal), default = 64

    Output:
    Corpus BLEU score.
    '''
    from nltk.translate.bleu_score import corpus_bleu
    from nltk.translate.bleu_score import sentence_bleu
    from torchtext.data import TabularDataset
    from torchtext.data import Iterator

    # convert index to text string
    def convert_itos(convert_vocab, token_ids):
        list_string = []
        for i in token_ids:
            if i == convert_vocab.vocab.stoi['<eos>']:
                break
            else:
                token = convert_vocab.vocab.itos[i]
                list_string.append(token)
        return list_string

    test = TabularDataset(
      path=file_name, # the root directory where the data lies
      format='tsv',
      skip_header=True, # if your tsv file has a header, make sure to pass this to ensure it doesn't get proceesed as data!
      fields=[('SRC', src_vocab), ('TRG', trg_vocab)])

    test_iter = Iterator(
    dataset = test, # we pass in the datasets we want the iterator to draw data from
    sort = False,batch_size=batch_size,
    sort_key=None,
    shuffle=False,
    sort_within_batch=False,
    device = device,
    train=False
    )
  
    model.eval()
    all_trg = []
    all_translated_trg = []

    TRG_PAD_IDX = trg_vocab.vocab.stoi[trg_vocab.pad_token]

    with torch.no_grad():
    
        for i, batch in enumerate(test_iter):

            src = batch.SRC
            #src = [src len, batch size]

            trg = batch.TRG
            #trg = [trg len, batch size]

            batch_size = trg.shape[1]

            # create a placeholder for traget language with shape of [max_trg_len, batch_size] where all the elements are the index of <pad>. Then send to device
            trg_placeholder = torch.Tensor(max_trg_len, batch_size)
            trg_placeholder.fill_(TRG_PAD_IDX)
            trg_placeholder = trg_placeholder.long().to(device)
            if attention == True:
              output,_ = model(src, trg_placeholder, 0) #turn off teacher forcing
            else:
              output = model(src, trg_placeholder) #turn off teacher forcing
            # get translation results, we ignor first token <sos> in both translation and target sentences. 
            # output_translate = [(trg len - 1), batch, output dim] output dim is size of target vocabulary.
            output_translate = output[1:]
            # store gold target sentences to a list 
            all_trg.append(trg[1:].cpu())

            # Choose top 1 word from decoder's output, we get the probability and index of the word
            prob, token_id = output_translate.data.topk(1)
            translation_token_id = token_id.squeeze(2).cpu()

            # store gold target sentences to a list 
            all_translated_trg.append(translation_token_id)
      
    all_gold_text = []
    all_translated_text = []
    for i in range(len(all_trg)): 
        cur_gold = all_trg[i]
        cur_translation = all_translated_trg[i]
        for j in range(cur_gold.shape[1]):
            gold_convered_strings = convert_itos(trg_vocab,cur_gold[:,j])
            gold_convered_strings = [c.lower() for c in gold_convered_strings]

            trans_convered_strings = convert_itos(trg_vocab,cur_translation[:,j])
            trans_convered_strings = [c.lower() for c in trans_convered_strings]

            all_gold_text.append(gold_convered_strings)
            all_translated_text.append(trans_convered_strings)

    corpus_all_gold_text = [[item] for item in all_gold_text]
    corpus_bleu_score = corpus_bleu(corpus_all_gold_text, all_translated_text)  
    return corpus_bleu_score

In [5]:
manual_seed = 77
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
n_gpu = torch.cuda.device_count()
if n_gpu > 0:
    torch.cuda.manual_seed(manual_seed)

cuda


In [6]:
!python -m spacy download en_core_web_sm
!python -m spacy download pt_core_news_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Collecting pt_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz (21.2MB)
[K     |████████████████████████████████| 21.2MB 5.6MB/s 
Building wheels for collected packages: pt-core-news-sm
  Building wheel for pt-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for pt-core-news-sm: filename=pt_core_news_sm-2.2.5-cp36-none-any.whl size=21186282 sha256=1044924428db1344af734311a64b606590712673b0dfd8a57a1a22f88b9e5e25
  Stored in directory: /tmp/pip-ephem-wheel-cache-sgqkdzno/wheels/ea/94/74/ec9be8418e9231b471be5dc7e1b45dd670019a376a6b5bc1c0
Successfully built pt-core-news-sm
Installing collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_ne

In [0]:
import pt_core_news_sm
import en_core_web_sm

spacy_pt = pt_core_news_sm.load()
spacy_en = en_core_web_sm.load()

In [0]:
def tokenize_pt(text):
    """
    Tokenizes PT text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_pt.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [0]:
SRC = torchtext.data.Field(tokenize = tokenize_pt, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
TRG = torchtext.data.Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [0]:
train, val= torchtext.data.TabularDataset.splits(
    path='./drive/My Drive/Colab Notebooks/lab4/data/machine_translation', train='mt-train-pt2en.tsv',validation='mt-valid-pt2en.tsv', 
    format='tsv', skip_header=True, fields=[('SRC', SRC), ('TRG', TRG)])
test = torchtext.data.TabularDataset(
    path='./drive/My Drive/Colab Notebooks/lab4/data/machine_translation/mt-test-pt2en_onlySRCpt.tsv',
    format='tsv', skip_header=True, fields=[('SRC', SRC)])


In [12]:
print(f"Number of training examples: {len(train.examples)}")
print(f"Number of validation examples: {len(val.examples)}")
print(f"Number of testing examples: {len(test.examples)}")

Number of training examples: 85918
Number of validation examples: 4772
Number of testing examples: 4738


In [0]:
import pickle

with open("./drive/My Drive/Colab Notebooks/lab4/SRC.field","rb")as f:
     pt_vec = pickle.load(f).vocab.vectors


In [0]:
# # try pre-trained embeddings for initialization

fasttext_pt_path = "/content/drive/My Drive/Colab Notebooks/lab4/pretrained/fasttext_pt_word2vec.pt"
glove_en_path = "/content/drive/My Drive/Colab Notebooks/lab4/pretrained/glove_word2vec.pt"
google_en_path = "/content/drive/My Drive/Colab Notebooks/lab4/pretrained/google_word2vec.pt"

#src_emb = torch.load(fasttext_pt_path)
trg_emb = torch.load(glove_en_path)

src_emb = torch.load(fasttext_pt_path)
#src_emb = pt_vec

In [0]:
TRG.build_vocab(train, min_freq = 2)
SRC.build_vocab(train, min_freq = 2)

In [15]:
print(f"Unique tokens in source (pt) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (pt) vocabulary: 7891
Unique tokens in target (en) vocabulary: 2320


In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [0]:
train_batch_size = 16
val_batch_size = 256

train_iter, val_iter = torchtext.data.BucketIterator.splits(
    (train, val), # we pass in the datasets we want the iterator to draw data from
    batch_sizes=(train_batch_size, val_batch_size),device = device,
    sort_key=lambda x: len(x.SRC), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False)


test_iter = torchtext.data.Iterator(test,
    sort = False, batch_size=val_batch_size,
    sort_key=None,
    shuffle=False,
    sort_within_batch=False,
    device = device,
    train=False)


In [18]:
# batch example of training data
for batch in train_iter:
    src = batch.SRC
    trg = batch.TRG
    print('tensor size of source language:', src.shape)
    print('tensor size of target language:', trg.shape)
    print('the tensor of first example in target language:', trg[:,0])
    break

tensor size of source language: torch.Size([14, 16])
tensor size of target language: torch.Size([15, 16])
the tensor of first example in target language: tensor([  2,  27,  10,   8, 116, 308,   4,   3,   1,   1,   1,   1,   1,   1,
          1], device='cuda:0')


In [0]:
import pickle
with open("/content/drive/My Drive/Colab Notebooks/lab4/data/machine_translation/TRG.Field","wb")as f:
     pickle.dump(TRG,f)

with open("/content/drive/My Drive/Colab Notebooks/lab4/data/machine_translation/SRC.Field", "wb")as f:
     pickle.dump(SRC,f)

# Try with seq2seq Attention Model

In [0]:
def train_attn(model, iterator, optimizer, criterion, clip):
    manual_seed = 77
    torch.manual_seed(manual_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(manual_seed)
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.SRC
        trg = batch.TRG
        
        optimizer.zero_grad()
        
        output,_ = model(src, trg, teacher_forcing_ratio = TEACH_FORCING_RATE)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    bleu = inference(model, "/content/drive/My Drive/Colab Notebooks/lab4/data/machine_translation/mt-train-pt2en.tsv", SRC, TRG, True, train_batch_size, 64)
    return epoch_loss / len(iterator), bleu

def evaluate_attn(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.SRC
            trg = batch.TRG

            output,_ = model(src, trg, 0) #turn off teacher forcing
            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    bleu = inference(model, "/content/drive/My Drive/Colab Notebooks/lab4/data/machine_translation/mt-valid-pt2en.tsv", SRC, TRG, True, val_batch_size, 64)
    return epoch_loss / len(iterator), bleu

In [0]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, n_layers, dropout, bidirection):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dropout = dropout
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)#.from_pretrained(src_emb, freeze = False)
        self.lstm = nn.LSTM(emb_dim, enc_hid_dim, n_layers, dropout=dropout, bidirectional = bidirection)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.lstm(embedded)
       
        # outputs are always from the top hidden layer, if bidirectional outputs are concatenated.
        # outputs shape [sequence_length, batch_size, hidden_dim * num_directions]
        # hidden is of shape [num_layers * num_directions, batch_size, hidden_size]
        # cell is of shape [num_layers * num_directions, batch_size, hidden_size]
        
        return outputs, (hidden, cell)

In [0]:
class Attention(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, dec_hidden, encoder_outputs):
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * num_directions]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat encoder hidden state src_len-1 times
        hidden = dec_hidden.unsqueeze(1)
        
        encoder_outputs = encoder_outputs.permute(1, 2, 0)
        
        #hidden = [batch size, 1, dec hid dim]
        #encoder_outputs = [batch size, enc hid dim * num_directions, src_len]
        
        # attention scoring function : S dot H_encoder
        attention = torch.bmm(hidden, encoder_outputs).squeeze(1)
   
        # attention = [batch size, src len]
        return F.softmax(attention, dim=1)

In [0]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout, attention):
        super().__init__()
        
        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)#.from_pretrained(trg_emb, freeze = False)
        
        self.rnn = nn.LSTM(emb_dim, dec_hid_dim, n_layers, dropout=dropout)

        self.fc_mid = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim)
        
        self.fc_out = nn.Linear(dec_hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell, encoder_outputs):
             
        #input = [batch size], current token
        #hidden = [batch size, dec hid dim], previous decoder hidden state
        #cell = [batch size, dec hid dim], previous decoder cell state
        #encoder_outputs = [src len, batch size, enc hid dim * num_directions], all encoder hidden states (H)
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]

        # Now, get the current hidden state of the decoder

        dec_output, (dec_hidden, dec_cell) = self.rnn(embedded, (hidden.unsqueeze(0), cell.unsqueeze(0)))

        dec_hidden = dec_hidden.squeeze(0)
        # dec_hidden = [batch size, dec hid dim]

        dec_cell = dec_cell.squeeze(0)
        # dec_cell = [batch size, dec hid dim]

        # Use the current hidden state of the decoder, get the attention probabilities

        attention_weights = self.attention(dec_hidden, encoder_outputs)
        # attention weights = [batch size, src_len]

        ## We need to create weighted context vector
        
        attention_weights = attention_weights.unsqueeze(1)
        # attention weights = [batch size, 1, src_len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # encoder_outputs = [batch size, src len, enc hid dim * num_directions]
        # perform weighted sum of encoder hidden states to get attention output
        weighted = torch.bmm(attention_weights, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * num_directions]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * num_directions]

        weighted = weighted.squeeze(0)

        # weighted = [batch size, enc hid dim * num_directions]

        ## concatenate the weighted context vector with the current decoder hidden state

        conc = torch.cat((weighted, dec_hidden), dim = 1)

        # conc = [batch size, enc hid dim * num_directions + dec hid dim]

        ## Pass through the f_mid Linear layer to get S'

        S_new = self.fc_mid(conc)

        # S_new = [batch size, dec hid dim]

        ## Then pass the f_out Linear layer to get predictions

        prediction = self.fc_out(S_new)

        # prediction = [batch size, output dim]

        return prediction, dec_hidden, dec_cell, attention_weights

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        # save the encoder-decoder attention weights
        # all_attention_weights = [batch_size, trg len-1, src len ]
        all_attention_weights = torch.zeros(trg.shape[1], trg.shape[0]-1, src.shape[0])
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, (hidden, cell) = self.encoder(src)
          
        hidden = torch.mean(encoder_outputs, dim=0)
        cell = torch.cat((cell[0, :, :].unsqueeze(0), cell[1, :, :].unsqueeze(0)), dim = 2).squeeze(0)

        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, hidden, cell, attention_weights = self.decoder(input, hidden, cell, encoder_outputs)
            
            # all_attention_weights[t-1] = [src len, batch size]
            all_attention_weights[:,t-1,:] = attention_weights.squeeze(1)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs,all_attention_weights

In [27]:
# hyperparameter define

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
N_LAYERS = 1
BI_DIRECTION = True
ENC_HID_DIM = 512
DEC_HID_DIM = 2 * ENC_HID_DIM #(you should figure out this hyper-parameter). 
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.8
TEACH_FORCING_RATE = 0.7
LEARNING_RT = 0.0012
WEIGHT_DECAY = 0
MAX_EPOCH = 10

attn = Attention()
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT, BI_DIRECTION)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)
optimizer = optim.Adam(model.parameters(),lr=LEARNING_RT, weight_decay=WEIGHT_DECAY)

TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
print('<pad> token index: ',TRG_PAD_IDX)
## we will ignore the pad token in true target set
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)


<pad> token index:  1


  "num_layers={}".format(dropout, num_layers))
  "num_layers={}".format(dropout, num_layers))


In [28]:
def init_weights(m):
    manual_seed = 77
    torch.manual_seed(manual_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(manual_seed)
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.05)
        else:
            nn.init.constant_(param.data, 0)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7891, 256)
    (lstm): LSTM(256, 512, dropout=0.5, bidirectional=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention()
    (embedding): Embedding(2320, 256)
    (rnn): LSTM(256, 1024, dropout=0.8)
    (fc_mid): Linear(in_features=2048, out_features=1024, bias=True)
    (fc_out): Linear(in_features=1024, out_features=2320, bias=True)
    (dropout): Dropout(p=0.8, inplace=False)
  )
)

In [29]:
manual_seed = 77
torch.manual_seed(manual_seed)
if n_gpu > 0:
    torch.cuda.manual_seed(manual_seed)


best_valid_loss = float('inf')
CLIP = 1

for epoch in range(MAX_EPOCH):
    
    start_time = time.time()
    
    train_loss,bleu_train = train_attn(model, train_iter, optimizer, criterion, CLIP)
    valid_loss, bleu = evaluate_attn(model, val_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Create checkpoint at end of each epoch
    state_dict_model = model.state_dict() 
    state = {
        'epoch': epoch,
        'state_dict': state_dict_model,
        'optimizer': optimizer.state_dict()
        }

    torch.save(state, "/content/drive/My Drive/Colab Notebooks/lab4/ckpt/mt_"+str(epoch+1)+".pt")

    print(f'Epoch: {epoch+1:02} | Tim0e: {epoch_mins}m {epoch_secs}s')
    print(f'\t Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    print(f'\t Train BLEU:{bleu_train:.3f} |  Val. BLEU: {bleu:7.3f}')

Epoch: 01 | Tim0e: 6m 50s
	 Train Loss: 1.238 | Train PPL:   3.447
	 Val. Loss: 5.201 |  Val. PPL: 181.509
	 Train BLEU:0.900 |  Val. BLEU:   0.259
Epoch: 02 | Tim0e: 6m 54s
	 Train Loss: 0.214 | Train PPL:   1.239
	 Val. Loss: 7.091 |  Val. PPL: 1201.357
	 Train BLEU:0.962 |  Val. BLEU:   0.273
Epoch: 03 | Tim0e: 6m 45s
	 Train Loss: 0.153 | Train PPL:   1.165
	 Val. Loss: 7.705 |  Val. PPL: 2218.617
	 Train BLEU:0.975 |  Val. BLEU:   0.277
Epoch: 04 | Tim0e: 6m 45s
	 Train Loss: 0.133 | Train PPL:   1.142
	 Val. Loss: 9.431 |  Val. PPL: 12463.416
	 Train BLEU:0.984 |  Val. BLEU:   0.271
Epoch: 05 | Tim0e: 6m 46s
	 Train Loss: 0.128 | Train PPL:   1.136
	 Val. Loss: 11.020 |  Val. PPL: 61110.444
	 Train BLEU:0.986 |  Val. BLEU:   0.266
Epoch: 06 | Tim0e: 7m 0s
	 Train Loss: 0.128 | Train PPL:   1.137
	 Val. Loss: 12.231 |  Val. PPL: 204988.977
	 Train BLEU:0.989 |  Val. BLEU:   0.263
Epoch: 07 | Tim0e: 6m 58s
	 Train Loss: 0.132 | Train PPL:   1.141
	 Val. Loss: 13.226 |  Val. PPL: 55

KeyboardInterrupt: ignored

In [31]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
N_LAYERS = 1
BI_DIRECTION = True
ENC_HID_DIM = 512
DEC_HID_DIM = 2 * ENC_HID_DIM #(you should figure out this hyper-parameter). 
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.8
TEACH_FORCING_RATE = 0.6
LEARNING_RT = 0.0011
WEIGHT_DECAY = 0
MAX_EPOCH = 10

attn = Attention()
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT, BI_DIRECTION)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attn)

model_best = Seq2Seq(enc, dec, device).to(device)

model_best.load_state_dict(torch.load('/content/drive/My Drive/Colab Notebooks/lab4/ckpt/mt_3.pt')['state_dict'])

def translation_inference(token_id):
  sent = []
  for i in token_id:
    if i == TRG.vocab.stoi["<eos>"]:
      break
    else:
      token = TRG.vocab.itos[i]
      sent.append(token)
  
  return sent


def generate_translations(model, eval_iter, trg_vocab, attention = True, max_trg_len = 64):
  '''
    Function for generating translation by model inference

    Input: 
    model: translation model;
    eval_iter: iterator over the evaluation data
    trg_vocab: Target torchtext Field
    attention: the model returns attention weights or not.
    max_trg_len: the maximal length of translation text (optional), default = 64

    Output:
    List of translated sentences
  '''
  model.eval()
  all_translation_word_ids = []
  all_gold_sents = []
  for batch in eval_iter:
    src = batch.SRC
    #src = [src len, batch size]
    batch_size = src.shape[1]

    # gold = batch.TRG

    # for i in range(src.shape[1]):
    #   gold_id = gold[1:, i].cpu().numpy()
    #   gold_sent = translation_inference(gold_id)
    #   all_gold_sents.append(gold_sent)



    # create a placeholder for target language with shape of [max_trg_len, batch_size] where all the elements are the index of <pad>. Then send to device
    trg_placeholder = torch.Tensor(max_trg_len, batch_size)
    trg_placeholder.fill_(TRG_PAD_IDX)
    trg_placeholder = trg_placeholder.long().to(device)
    if attention == True:
      output,_ = model(src, trg_placeholder) #turn off teacher forcing
    else:
      output = model(src, trg_placeholder) #turn off teacher forcing
    # get translation results, we ignore first token <sos> in both translation and target sentences. 
    # output_translate = [(trg len - 1), batch, output dim] output dim is size of target vocabulary.
    output_translate = output[1:]

    # Choose top 1 word from decoder's output, we get the probability and index of the word
    prob, token_id = output_translate.data.topk(1)
    translation_token_id = token_id.squeeze(2).cpu()

    # store gold target sentences to a list 
    all_translation_word_ids.append(translation_token_id)
  
  all_translation_text = []
  for i in range(len(all_translation_word_ids)):
    cur_translation_batch = all_translation_word_ids[i]
    for j in range(cur_translation_batch.shape[1]):
      trans_convered_strings = convert_itos(trg_vocab, cur_translation_batch[:,j])
      all_translation_text.append(' '.join(trans_convered_strings)) # convert list of words to text
  
  return all_translation_text #, all_gold_sents

def convert_itos(convert_vocab, token_ids):
    list_string = []
    for i in token_ids:
        if i == convert_vocab.vocab.stoi['<eos>']:
            break
        else:
            token = convert_vocab.vocab.itos[i]
            list_string.append(token)
    return list_string

translation = generate_translations(model, test_iter, TRG, attention = True, max_trg_len = 64)
print(translation[:5])


  "num_layers={}".format(dropout, num_layers))
  "num_layers={}".format(dropout, num_layers))


['do you have a ?', 'the bear did not laugh .', 'she has been here since an an hour .', 'try try the on on these jeans .', 'i hope to meet rest of .']
