In [1]:
from data import generate_batches
from data import prepare_data
from data import data_to_index
from data import DEP_LABELS
from data import random_batch

from model.encoder import Encoder
from model.decoder import Decoder_luong

from BLEU import BLEU

from utils import time_since

from evaluator import Evaluator

import torch
import torch.nn as nn
from torch.nn import functional
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

import numpy as np
import time
import random

#from validation import Evaluator

%load_ext autoreload
%autoreload 2

In [2]:
USE_CUDA = True
MAX_LENGTH = 100
DIR_FILES = 'data/translation/train/'
DIR_RESULTS = 'results/step_1'
SPLIT_TRAIN = 0.7
SPLIT_VALID = 0.15
# The rest is for test

# Reading the data

In [3]:
input_lang, output_lang, pairs = prepare_data('en', 'spa', max_length=MAX_LENGTH, dir=DIR_FILES)

Reading lines...
Read 115244 sentence pairs
Filtered to 113821 pairs
Creating vocab...
Indexed 13299 words in input language, 25383 words in output


In [4]:
pairs_train = np.array(pairs[:90000])
pairs_test = np.array(pairs[90000:])

# Train

In [5]:
def train(input_batches, target_batches, \
          encoder, decoder, encoder_optimizer, decoder_optimzier, \
          criterion, max_length=MAX_LENGTH, train=True):
    
    if train:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
    loss = 0
    
    batch_size = input_batches.size()[1]
    
    encoder_hidden = encoder.init_hidden(batch_size)
    encoder_outputs, encoder_hidden = encoder(input_batches, encoder_hidden)
    
    decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size))   
    decoder_hidden = encoder_hidden
    
    # set the start of the sentences of the batch
    decoder_input = Variable(torch.LongTensor([input_lang.vocab.stoi['<sos>']] * batch_size))

    # store the decoder outputs to estimate the loss
    all_decoder_outputs = Variable(torch.zeros(target_batches.size()[0], batch_size, len(output_lang.vocab.stoi)))
    
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()
        decoder_context = decoder_context.cuda()  
    
    if train:
        use_teacher_forcing = random.random() < teacher_forcing_ratio
    else:
        use_teacher_forcing = False

    if use_teacher_forcing:        
        # Use targets as inputs
        for di in range(target_batches.shape[0]):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs)
            
            all_decoder_outputs[di] = decoder_output
            decoder_input = target_batches[di]
    else:        
        # Use decoder output as inputs
        for di in range(target_batches.shape[0]):            
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) 
            
            all_decoder_outputs[di] = decoder_output
            
            # Greedy approach, take the word with highest probability
            topv, topi = decoder_output.data.topk(1)            
            decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze())
            if USE_CUDA: decoder_input = decoder_input.cuda()
    
    loss = nn.NLLLoss()(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))          
    
    if train:
        loss.backward()
        torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
        encoder_optimizer.step()
        decoder_optimizer.step()
    
    return loss.data[0] 

# Model

In [6]:
attn_model = 'general'
hidden_size = 512
emb_size = 300
n_layers = 2
seed = 12
dropout_p = 0.1
teacher_forcing_ratio = 0.5
clip = 5.0

n_epochs = 6
batch_size = 128

In [7]:
torch.manual_seed(seed)

<torch._C.Generator at 0x7f5b9ec365d0>

In [8]:
encoder = Encoder(len(input_lang.vocab.stoi), hidden_size, emb_size, n_layers, dropout_p, input_lang, USE_CUDA)
decoder = Decoder_luong(attn_model, hidden_size, len(output_lang.vocab.stoi), emb_size, 2 * n_layers, dropout_p, output_lang, USE_CUDA)

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    
learning_rate = 0.001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [None]:
# Keep track of time elapsed and running averages
start = time.time()
train_losses = []
validation_losses = []
validation_bleu = []

plot_every = 5
print_every = 5
validate_loss_every = 25
best_bleu = 0
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [None]:
for epoch in range(1, n_epochs): 
    # Shuffle data
    id_aux = np.random.permutation(np.arange(len(pairs_train)))
    pairs_train = pairs_train[id_aux]
    
    # Get the batches for this epoch
    input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_train, USE_CUDA=USE_CUDA)
    
    
    for batch_ix, (input_batch, target_var) in enumerate(zip(input_batches, target_batches)):
        encoder.train()
        decoder.train()
        
        #[input_var, _, _, _, _, _, _, _] = input_batch
        input_var = input_batch

        # Run the train function
        loss = train(input_var, target_var,\
                 encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 
                 max_length=MAX_LENGTH, train=True)
        #loss = train_luong(input_var, target_var, input_var.size(1), True)
            
        torch.cuda.empty_cache()

        # Keep track of loss
        print_loss_total += loss
        plot_loss_total += loss

        if batch_ix == 0: continue

        if batch_ix % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, batch_ix / len(input_batches) * 100, print_loss_avg)
            train_losses.append(loss)
            print(print_summary)
    
    input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_test, USE_CUDA=USE_CUDA)
    print_loss_total = 0
    for input_batch, target_var in zip(input_batches, target_batches):
    
        encoder.eval()
        decoder.eval()
    
        #[input_var, _, _, _, _, _, _, _] = input_batch
        input_var = input_batch
        # Run the train function
        loss = train(input_var, target_var,\
                 encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,\
                 max_length=MAX_LENGTH, train=False)
        
        print_loss_total += loss
    val_loss = print_loss_total / len(input_batches)
    validation_losses.append(val_loss)
    # Evaluating Bleu
    evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, True)
    candidates, references = evaluator.get_candidates_and_references(pairs_test, k_beams=1)
    bleu = BLEU(candidates, [references])
    if bleu[0] > best_bleu:
        best_bleu = bleu[0]
        torch.save(encoder.state_dict(), f'{DIR_RESULTS}/encoder.pkl')
        torch.save(decoder.state_dict(), f'{DIR_RESULTS}/decoder.pkl')
    validation_bleu.append(bleu)
    print(f'val_loss: {val_loss:.4f} - bleu: {bleu[0]}')

    # Prevent overflow gpu memory
    del evaluator



0m 11s (- 0m 58s) (1 0%) 8.3761
0m 13s (- 1m 5s) (1 1%) 4.4608
0m 14s (- 1m 12s) (1 2%) 3.6885
0m 15s (- 1m 19s) (1 2%) 3.2955
0m 17s (- 1m 25s) (1 3%) 3.4218
0m 18s (- 1m 31s) (1 4%) 3.3350
0m 19s (- 1m 36s) (1 4%) 3.3729
0m 20s (- 1m 42s) (1 5%) 3.1769
0m 21s (- 1m 48s) (1 6%) 3.3809
0m 22s (- 1m 53s) (1 7%) 3.2700
0m 23s (- 1m 58s) (1 7%) 3.4059
0m 24s (- 2m 4s) (1 8%) 3.3437
0m 25s (- 2m 9s) (1 9%) 3.3882
0m 27s (- 2m 15s) (1 9%) 3.0557
0m 28s (- 2m 20s) (1 10%) 3.1397
0m 29s (- 2m 26s) (1 11%) 3.0511
0m 30s (- 2m 32s) (1 12%) 3.0105
0m 31s (- 2m 37s) (1 12%) 3.0048
0m 32s (- 2m 43s) (1 13%) 2.9412
0m 33s (- 2m 49s) (1 14%) 2.9091
0m 34s (- 2m 54s) (1 14%) 3.0265
0m 36s (- 3m 0s) (1 15%) 2.8018
0m 37s (- 3m 6s) (1 16%) 2.8553
0m 38s (- 3m 12s) (1 17%) 2.6358
0m 39s (- 3m 17s) (1 17%) 2.8155
0m 40s (- 3m 24s) (1 18%) 2.5965
0m 41s (- 3m 29s) (1 19%) 2.8305
0m 43s (- 3m 35s) (1 19%) 2.6909
0m 44s (- 3m 41s) (1 20%) 2.5381
0m 45s (- 3m 46s) (1 21%) 2.6937
0m 46s (- 3m 52s) (1 22%) 2.6

A Jupyter Widget

In [12]:
validation_losses

[tensor(2.8253, device='cuda:0'),
 tensor(2.4174, device='cuda:0'),
 tensor(2.3385, device='cuda:0'),
 tensor(2.5791, device='cuda:0'),
 tensor(2.4425, device='cuda:0')]

In [13]:
train_losses

[tensor(4.2680, device='cuda:0'),
 tensor(4.3280, device='cuda:0'),
 tensor(3.2405, device='cuda:0'),
 tensor(3.0685, device='cuda:0'),
 tensor(3.4384, device='cuda:0'),
 tensor(3.1535, device='cuda:0'),
 tensor(3.4878, device='cuda:0'),
 tensor(3.2988, device='cuda:0'),
 tensor(3.5391, device='cuda:0'),
 tensor(3.2383, device='cuda:0'),
 tensor(3.5126, device='cuda:0'),
 tensor(3.6423, device='cuda:0'),
 tensor(3.3424, device='cuda:0'),
 tensor(2.8781, device='cuda:0'),
 tensor(3.3466, device='cuda:0'),
 tensor(3.3214, device='cuda:0'),
 tensor(2.6270, device='cuda:0'),
 tensor(3.1198, device='cuda:0'),
 tensor(2.9170, device='cuda:0'),
 tensor(2.9819, device='cuda:0'),
 tensor(2.7829, device='cuda:0'),
 tensor(2.6241, device='cuda:0'),
 tensor(2.6977, device='cuda:0'),
 tensor(2.3243, device='cuda:0'),
 tensor(3.0008, device='cuda:0'),
 tensor(2.7750, device='cuda:0'),
 tensor(3.4579, device='cuda:0'),
 tensor(2.9167, device='cuda:0'),
 tensor(2.5323, device='cuda:0'),
 tensor(2.8233

In [None]:
evaluator = Evaluator(encoder, decoder, input_lang, output_lang, 
                      MAX_LENGTH, USE_CUDA)
candidates, references = evaluator.get_candidates_and_references(pairs_test[:10000], k_beams=2)
len(candidates), len(references)

A Jupyter Widget

In [12]:
BLEU(candidates, [references]) 

(0.28063523097173265,
 [0.6573135078342698,
  0.37567686039915427,
  0.22488307382629802,
  0.13494545201862276],
 0.953820572858132)