# Paraphrase generation of Portugese


### Software Requirements
- Python (>=3.6)
- PyTorch (>=1.2.0) 
- Jupyter (latest)
- torchtext
- NLTK

In [2]:
# required libraries

import unicodedata
import string
import re
import random
import time
import datetime
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import torchtext
from torchtext.datasets import TranslationDataset

import spacy
import numpy as np
from nltk.translate.bleu_score import corpus_bleu

# set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Data preprocessing

In [3]:
'''
tokenization code
'''

!python -m spacy download pt_core_news_sm

import pt_core_news_sm

spacy_pt = pt_core_news_sm.load() # just need tokenizer from one language as both source and target are from the same language

def tokenize_pt(text):
    """
    Tokenizes Portuguese text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_pt.tokenizer(text)]

'''
define field
'''
SRC = torchtext.data.Field(tokenize = tokenize_pt, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
TRG = torchtext.data.Field(tokenize = tokenize_pt, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

'''
load the data
'''
train_data = torchtext.data.TabularDataset(
    path='/content/drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/pp-train.tsv', 
    format='tsv', skip_header=True, fields=[('SRC', SRC), ('TRG', TRG)])
valid_data = torchtext.data.TabularDataset(
    path='/content/drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/pp-valid.tsv', 
    format='tsv', skip_header=True, fields=[('SRC', SRC), ('TRG', TRG)])
test_data = torchtext.data.TabularDataset(
    path='/content/drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/pp-test-onlySRC.tsv', 
    format='tsv', skip_header=True, fields=[('SRC', SRC)]) # blind test data (that is, no targets)

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

'''
build the vocabulary
'''
TRG.build_vocab(train_data, min_freq=2)
SRC.build_vocab(train_data, min_freq=2)
print(f"Unique tokens in source vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target vocabulary: {len(TRG.vocab)}")

'''
create the iterator
'''
train_iter = torchtext.data.BucketIterator(train_data, batch_size=16, device=device, sort_key=lambda x: len(x.SRC), sort_within_batch=True)
valid_iter = torchtext.data.BucketIterator(valid_data, batch_size=256, device=device, sort_key=lambda x: len(x.SRC), sort_within_batch=True)
test_iter = torchtext.data.Iterator(test_data, batch_size=256, device=device, sort=False, sort_key=None, shuffle=False, sort_within_batch=False)

'''
print sample batch
'''
# print first batch of training data
print('training batch')
for batch in train_iter:
    src = batch.SRC
    trg = batch.TRG
    print('tensor size of source language:', src.shape)
    print('tensor size of target language:', trg.shape)
    break

# print first batch of validation data
print('validation batch')
for batch in valid_iter:
    src = batch.SRC
    trg = batch.TRG
    print('tensor size of source language:', src.shape)
    print('tensor size of target language:', trg.shape)
    break

# print first batch of test data
print('(blind) test batch')
for batch in test_iter:
    src = batch.SRC
    print('tensor size of source language:', src.shape)
    break

# save the field
import pickle
with open("./drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/TRG.Field","wb")as f:
     pickle.dump(TRG,f)

with open("./drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/SRC.Field","wb")as f:
     pickle.dump(SRC,f)



Collecting pt_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz (21.2MB)
[K     |████████████████████████████████| 21.2MB 42.6MB/s 
Building wheels for collected packages: pt-core-news-sm
  Building wheel for pt-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for pt-core-news-sm: filename=pt_core_news_sm-2.2.5-cp36-none-any.whl size=21186282 sha256=f097afc6be44626dfef27a88104cb8ed371969bfa59710798fc56976348bab9c
  Stored in directory: /tmp/pip-ephem-wheel-cache-8moqg9t0/wheels/ea/94/74/ec9be8418e9231b471be5dc7e1b45dd670019a376a6b5bc1c0
Successfully built pt-core-news-sm
Installing collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_news_sm')
Number of training examples: 85171
Number of validation examples: 4724
Number of testing examples: 4692

In [0]:
# Alternate pre-trained models

# load pre-trained embeddings

ptwiki_trg = "/content/drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/pp_ptwiki_trg_vec.pt"
ptwiki_src = "/content/drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/pp_ptwiki_src_vec.pt"

fastt_trg = "/content/drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/pp_fastt_trg_vec.pt"
fastt_src = "/content/drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/pp_fastt_src_vec.pt"


trg_vec = torch.load(fastt_trg)
src_vec = torch.load(fastt_src)

## Attention Seq2seq model

In [0]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, n_layers, dropout, bidirection):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dropout = dropout
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)#.from_pretrained(src_vec, freeze = False)
        self.lstm = nn.LSTM(emb_dim, enc_hid_dim, n_layers, dropout=dropout, bidirectional = bidirection)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.lstm(embedded)
       
        # outputs are always from the top hidden layer, if bidirectional outputs are concatenated.
        # outputs shape [sequence_length, batch_size, hidden_dim * num_directions]
        # hidden is of shape [num_layers * num_directions, batch_size, hidden_size]
        # cell is of shape [num_layers * num_directions, batch_size, hidden_size]
        
        return outputs, (hidden, cell)

class Attention(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, dec_hidden, encoder_outputs):
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * num_directions]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat encoder hidden state src_len-1 times
        hidden = dec_hidden.unsqueeze(1)
        
        encoder_outputs = encoder_outputs.permute(1, 2, 0)
        
        #hidden = [batch size, 1, dec hid dim]
        #encoder_outputs = [batch size, enc hid dim * num_directions, src_len]
        
        # attention scoring function : S dot H_encoder
        attention = torch.bmm(hidden, encoder_outputs).squeeze(1)
   
        # attention = [batch size, src len]
        return F.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, n_layers, dropout, attention):
        super().__init__()
        
        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)#.from_pretrained(trg_vec, freeze = False)
        
        self.rnn = nn.LSTM(emb_dim, dec_hid_dim, n_layers, dropout=dropout)

        self.fc_mid = nn.Linear(enc_hid_dim * 2 + dec_hid_dim, dec_hid_dim)
        
        self.fc_out = nn.Linear(dec_hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell, encoder_outputs):
             
        #input = [batch size], current token
        #hidden = [batch size, dec hid dim], previous decoder hidden state
        #cell = [batch size, dec hid dim], previous decoder cell state
        #encoder_outputs = [src len, batch size, enc hid dim * num_directions], all encoder hidden states (H)
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]

        # Now, get the current hidden state of the decoder

        dec_output, (dec_hidden, dec_cell) = self.rnn(embedded, (hidden.unsqueeze(0), cell.unsqueeze(0)))

        dec_hidden = dec_hidden.squeeze(0)
        # dec_hidden = [batch size, dec hid dim]

        dec_cell = dec_cell.squeeze(0)
        # dec_cell = [batch size, dec hid dim]

        # Use the current hidden state of the decoder, get the attention probabilities

        attention_weights = self.attention(dec_hidden, encoder_outputs)
        # attention weights = [batch size, src_len]

        ## We need to create weighted context vector
        
        attention_weights = attention_weights.unsqueeze(1)
        # attention weights = [batch size, 1, src_len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # encoder_outputs = [batch size, src len, enc hid dim * num_directions]
        # perform weighted sum of encoder hidden states to get attention output
        weighted = torch.bmm(attention_weights, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * num_directions]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * num_directions]

        weighted = weighted.squeeze(0)

        # weighted = [batch size, enc hid dim * num_directions]

        ## concatenate the weighted context vector with the current decoder hidden state

        conc = torch.cat((weighted, dec_hidden), dim = 1)

        # conc = [batch size, enc hid dim * num_directions + dec hid dim]

        ## Pass through the f_mid Linear layer to get S'

        S_new = self.fc_mid(conc)

        # S_new = [batch size, dec hid dim]

        ## Then pass the f_out Linear layer to get predictions

        prediction = self.fc_out(S_new)

        # prediction = [batch size, output dim]

        return prediction, dec_hidden, dec_cell, attention_weights

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        # save the encoder-decoder attention weights
        # all_attention_weights = [batch_size, trg len-1, src len ]
        all_attention_weights = torch.zeros(trg.shape[1], trg.shape[0]-1, src.shape[0])
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, (hidden, cell) = self.encoder(src)
          
        hidden = torch.mean(encoder_outputs, dim=0)
        cell = torch.cat((cell[0, :, :].unsqueeze(0), cell[1, :, :].unsqueeze(0)), dim = 2).squeeze(0)

        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, hidden, cell, attention_weights = self.decoder(input, hidden, cell, encoder_outputs)
            
            # all_attention_weights[t-1] = [src len, batch size]
            all_attention_weights[:,t-1,:] = attention_weights.squeeze(1)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs,all_attention_weights


In [0]:
def inference(model, file_name, src_vocab, trg_vocab, attention= True, batch_size = 128, max_trg_len = 64):
    '''
    Function for translation inference

    Input: 
    model: translation model;
    file_name: the directoy of test file that the first column is target reference, and the second column is source language;
    trg_vocab: Target torchtext Field
    attention: the model returns attention weights or not.
    max_trg_len: the maximal length of translation text (optinal), default = 64

    Output:
    Corpus BLEU score.
    '''
    from nltk.translate.bleu_score import corpus_bleu
    from nltk.translate.bleu_score import sentence_bleu
    from torchtext.data import TabularDataset
    from torchtext.data import Iterator

    # convert index to text string
    def convert_itos(convert_vocab, token_ids):
        list_string = []
        for i in token_ids:
            if i == convert_vocab.vocab.stoi['<eos>']:
                break
            else:
                token = convert_vocab.vocab.itos[i]
                list_string.append(token)
        return list_string

    test = TabularDataset(
      path=file_name, # the root directory where the data lies
      format='tsv',
      skip_header=True, # if your tsv file has a header, make sure to pass this to ensure it doesn't get proceesed as data!
      fields=[('SRC', src_vocab), ('TRG', trg_vocab)])

    test_iter = Iterator(
    dataset = test, # we pass in the datasets we want the iterator to draw data from
    sort = False,batch_size=batch_size,
    sort_key=None,
    shuffle=False,
    sort_within_batch=False,
    device = device,
    train=False
    )
  
    model.eval()
    all_trg = []
    all_translated_trg = []

    TRG_PAD_IDX = trg_vocab.vocab.stoi[trg_vocab.pad_token]

    with torch.no_grad():
    
        for i, batch in enumerate(test_iter):

            src = batch.SRC
            #src = [src len, batch size]

            trg = batch.TRG
            #trg = [trg len, batch size]

            batch_size = trg.shape[1]

            # create a placeholder for traget language with shape of [max_trg_len, batch_size] where all the elements are the index of <pad>. Then send to device
            trg_placeholder = torch.Tensor(max_trg_len, batch_size)
            trg_placeholder.fill_(TRG_PAD_IDX)
            trg_placeholder = trg_placeholder.long().to(device)
            if attention == True:
              output,_ = model(src, trg_placeholder, 0) #turn off teacher forcing
            else:
              output = model(src, trg_placeholder) #turn off teacher forcing
            # get translation results, we ignor first token <sos> in both translation and target sentences. 
            # output_translate = [(trg len - 1), batch, output dim] output dim is size of target vocabulary.
            output_translate = output[1:]
            # store gold target sentences to a list 
            all_trg.append(trg[1:].cpu())

            # Choose top 1 word from decoder's output, we get the probability and index of the word
            prob, token_id = output_translate.data.topk(1)
            translation_token_id = token_id.squeeze(2).cpu()

            # store gold target sentences to a list 
            all_translated_trg.append(translation_token_id)
      
    all_gold_text = []
    all_translated_text = []
    for i in range(len(all_trg)): 
        cur_gold = all_trg[i]
        cur_translation = all_translated_trg[i]
        for j in range(cur_gold.shape[1]):
            gold_convered_strings = convert_itos(trg_vocab,cur_gold[:,j])
            gold_convered_strings = [c.lower() for c in gold_convered_strings]

            trans_convered_strings = convert_itos(trg_vocab,cur_translation[:,j])
            trans_convered_strings = [c.lower() for c in trans_convered_strings]

            all_gold_text.append(gold_convered_strings)
            all_translated_text.append(trans_convered_strings)

    corpus_all_gold_text = [[item] for item in all_gold_text]
    corpus_bleu_score = corpus_bleu(corpus_all_gold_text, all_translated_text)  
    return corpus_bleu_score

## Model training

In [7]:
'''
hyperparameters
'''
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
N_LAYERS = 1
CLIP = 1
BI_DIRECTION = True
ENC_HID_DIM = 512
DEC_HID_DIM = 2 * ENC_HID_DIM #(you should figure out this hyper-parameter). 
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.8
TEACH_FORCING_RATE = 0.7
LEARNING_RT = 0.001
WEIGHT_DECAY = 0.0000
MAX_EPOCH = 10

train_batch_size = 16
val_batch_size = 256
'''
instantiate the model
'''
attn = Attention()
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT, BI_DIRECTION)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)
optimizer = optim.Adam(model.parameters(),lr=LEARNING_RT, weight_decay=WEIGHT_DECAY)

TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
print('<pad> token index: ',TRG_PAD_IDX)
## we will ignore the pad token in true target set
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

'''
initialize the model weights
'''
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
model.apply(init_weights)


'''
calculate the number of parameters
'''
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters')

# convert index to text string
def convert_itos(convert_vocab, token_ids):
    list_string = []
    for i in token_ids:
        if i == convert_vocab.vocab.stoi['<eos>']:
            break
        else:
            token = convert_vocab.vocab.itos[i]
            list_string.append(token)
    return list_string


'''
Full training helper functions
'''

def train_attn(model, iterator, optimizer, criterion, clip):
    manual_seed = 77
    torch.manual_seed(manual_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(manual_seed)
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.SRC
        trg = batch.TRG
        
        optimizer.zero_grad()
        
        output,_ = model(src, trg, teacher_forcing_ratio = TEACH_FORCING_RATE)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    bleu = inference(model, "/content/drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/pp-train.tsv", SRC, TRG, True, train_batch_size, 64)
    return epoch_loss / len(iterator), bleu

def evaluate_attn(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.SRC
            trg = batch.TRG

            output,_ = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    bleu = inference(model, "/content/drive/My Drive/Colab Notebooks/lab4/data/paraphrase_generation/pp-valid.tsv", SRC, TRG, True, val_batch_size, 64)
    return epoch_loss / len(iterator), bleu



def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

'''
kickstart full training
'''

manual_seed = 77
torch.manual_seed(manual_seed)
n_gpu = torch.cuda.device_count()
if n_gpu > 0:
    torch.cuda.manual_seed(manual_seed)


best_valid_loss = float('inf')
CLIP = 1

for epoch in range(MAX_EPOCH):
    
    start_time = time.time()
    
    train_loss,bleu_train = train_attn(model, train_iter, optimizer, criterion, CLIP)
    valid_loss, bleu = evaluate_attn(model, valid_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Create checkpoint at end of each epoch
    state_dict_model = model.state_dict() 
    state = {
        'epoch': epoch,
        'state_dict': state_dict_model,
        'optimizer': optimizer.state_dict()
        }

    torch.save(state, "/content/drive/My Drive/Colab Notebooks/lab4/ckpt/pp_"+str(epoch+1)+".pt")

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\t Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    print(f'\t Train BLEU:{bleu_train:.3f} |  Val. BLEU: {bleu:7.3f}')


  "num_layers={}".format(dropout, num_layers))
  "num_layers={}".format(dropout, num_layers))


<pad> token index:  1
The model has 16,446,444 trainable parameters
Epoch: 01 | Time: 6m 39s
	 Train Loss: 1.761 | Train PPL:   5.820
	 Val. Loss: 5.134 |  Val. PPL: 169.752
	 Train BLEU:0.802 |  Val. BLEU:   0.230
Epoch: 02 | Time: 6m 43s
	 Train Loss: 0.266 | Train PPL:   1.304
	 Val. Loss: 6.548 |  Val. PPL: 697.534
	 Train BLEU:0.928 |  Val. BLEU:   0.239
Epoch: 03 | Time: 6m 50s
	 Train Loss: 0.137 | Train PPL:   1.147
	 Val. Loss: 8.199 |  Val. PPL: 3638.703
	 Train BLEU:0.960 |  Val. BLEU:   0.232
Epoch: 04 | Time: 6m 51s
	 Train Loss: 0.108 | Train PPL:   1.115
	 Val. Loss: 8.744 |  Val. PPL: 6273.482
	 Train BLEU:0.969 |  Val. BLEU:   0.247
Epoch: 05 | Time: 6m 48s
	 Train Loss: 0.096 | Train PPL:   1.101
	 Val. Loss: 9.880 |  Val. PPL: 19533.863
	 Train BLEU:0.979 |  Val. BLEU:   0.250
Epoch: 06 | Time: 6m 47s
	 Train Loss: 0.088 | Train PPL:   1.091
	 Val. Loss: 10.963 |  Val. PPL: 57699.342
	 Train BLEU:0.983 |  Val. BLEU:   0.251
Epoch: 07 | Time: 6m 50s
	 Train Loss: 0.08

## Preparing Predictions from Test Data

In [10]:
'''
load fields saved during preprocessing
'''
# with open("./drive/My Drive/Colab Notebooks/ckpt_pp_lab4/TRG.Field","rb") as f:
#      TRG_saved = pickle.load(f)

# with open("./drive/My Drive/Colab Notebooks/ckpt_pp_lab4/SRC.Field","rb") as f:
#      SRC_saved = pickle.load(f)

TRG_saved = TRG
SRC_saved = SRC

'''
instantiate the model
'''
attn = Attention()
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT, BI_DIRECTION)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attn)

model_best = Seq2Seq(enc, dec, device).to(device)

'''
load the checkpoint corresponding to the best epoch (usually epoch with highest validation BLEU score)
'''
model_best.load_state_dict(torch.load('/content/drive/My Drive/Colab Notebooks/lab4/ckpt/pp_7.pt')['state_dict'])
model_best = model_best.to(device)

'''
generate paraphrases for all the sentences in test data
'''

def translation_inference(token_id):
  sent = []
  for i in token_id:
    if i == TRG.vocab.stoi["<eos>"]:
      break
    else:
      token = TRG.vocab.itos[i]
      sent.append(token)
  
  return sent

def generate_paraphrases(model, eval_iter, trg_vocab, attention = True, max_trg_len = 64):
  '''
    Function for generating paraphrases by model inference

    Input: 
    model: paraphrase generation model;
    eval_iter: iterator over the evaluation data
    trg_vocab: Target torchtext Field
    attention: the model returns attention weights or not.
    max_trg_len: the maximal length of paraphrase text (optional), default = 64

    Output:
    List of predicted paraphrases
  '''
  model.eval()
  all_translation_word_ids = []
  all_gold_sents = []
  for batch in eval_iter:
    src = batch.SRC
    #src = [src len, batch size]
    batch_size = src.shape[1]

    # gold = batch.TRG

    # for i in range(src.shape[1]):
    #   gold_id = gold[1:, i].cpu().numpy()
    #   gold_sent = translation_inference(gold_id)
    #   all_gold_sents.append(gold_sent)


    # create a placeholder for target language with shape of [max_trg_len, batch_size] where all the elements are the index of <pad>. Then send to device
    trg_placeholder = torch.Tensor(max_trg_len, batch_size)
    trg_placeholder.fill_(TRG_PAD_IDX)
    trg_placeholder = trg_placeholder.long().to(device)
    if attention == True:
      output,_ = model(src, trg_placeholder, 0) #turn off teacher forcing
    else:
      output = model(src, trg_placeholder, 0) #turn off teacher forcing
    # get translation results, we ignore first token <sos> in both translation and target sentences. 
    # output_translate = [(trg len - 1), batch, output dim] output dim is size of target vocabulary.
    output_translate = output[1:]

    # Choose top 1 word from decoder's output, we get the probability and index of the word
    prob, token_id = output_translate.data.topk(1)
    translation_token_id = token_id.squeeze(2).cpu()

    # store gold target sentences to a list 
    all_translation_word_ids.append(translation_token_id)
  
  all_translation_text = []
  for i in range(len(all_translation_word_ids)):
    cur_translation_batch = all_translation_word_ids[i]
    for j in range(cur_translation_batch.shape[1]):
      trans_convered_strings = convert_itos(trg_vocab, cur_translation_batch[:,j])
      all_translation_text.append(' '.join(trans_convered_strings)) # convert list of words to text
  
  return all_translation_text #, all_gold_sents

# translate all the sentences in the test set      
test_predictions = generate_paraphrases(model, test_iter, TRG_saved, attention = True, max_trg_len = 64)
print(test_predictions[:10])
#print(gold[:10])

  "num_layers={}".format(dropout, num_layers))
  "num_layers={}".format(dropout, num_layers))


['há muitas estudantes tinha completar food .', 'por que você está escrito para o lugar ?', 'onde você fez nossos bagagem para a estudante ?', 'o rei comprou uma copo na caixa .', 'eu preciso abrir a porta .', 'o cavalo está mudar de verdade !', 'meu irmão sempre queria um médico .', 'há muitas pintores n esta trem , não há ?', 'é uma casa estas últimos uma pergunta .', 'onde está a estação de metrô é aqui ?']
