In [1]:
from data import generate_batches
from data import prepare_data

from model.encoder import Encoder
from model.decoder import Decoder_luong

from BLEU import BLEU

from utils import time_since


from evaluator_dis import Evaluator

import torch
import torch.nn as nn
from torch.nn import functional
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

import numpy as np
import time
import random
import os
#from validation import Evaluator

%load_ext autoreload
%autoreload 2

In [2]:
USE_CUDA = True
MAX_LENGTH = 100
DIR_FILES = 'data/disambiguation/sense-eval2/'
DIR_TRAIN = os.path.join(DIR_FILES, 'train')
DIR_TEST = os.path.join(DIR_FILES, 'test')
DIR_RESULTS = 'results/step_1'
SPLIT_TRAIN = 0.7
SPLIT_VALID = 0.15
# The rest is for test

# Reading the data

In [3]:
input_lang, output_lang, pairs = prepare_data('en', 'spa', max_length=MAX_LENGTH, dir=DIR_TRAIN)

Reading lines...
Read 9080 sentence pairs
Filtered to 9057 pairs
Creating vocab...
Indexed 23102 words in input language, 23688 words in output


In [4]:
def read_test(dir):
    pairs = []
    with open(os.path.join(dir, 'test.raw'), 'r') as file:
        for line in file.readlines():
            pairs.append(line.replace('\n', ''))
    
    return pairs

In [5]:
pairs_train = np.array(pairs)
pairs_test = read_test(DIR_TEST)

# Train

In [6]:
def train(input_batches, target_batches, \
          encoder, decoder, encoder_optimizer, decoder_optimzier, \
          criterion, max_length=MAX_LENGTH, train=True):
    
    if train:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
    loss = 0
    
    batch_size = input_batches.size()[1]
    
    encoder_hidden = encoder.init_hidden(batch_size)
    encoder_outputs, encoder_hidden = encoder(input_batches, encoder_hidden)
    
    decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size))   
    decoder_hidden = encoder_hidden
    
    # set the start of the sentences of the batch
    decoder_input = Variable(torch.LongTensor([input_lang.vocab.stoi['<sos>']] * batch_size))

    # store the decoder outputs to estimate the loss
    all_decoder_outputs = Variable(torch.zeros(target_batches.size()[0], batch_size, len(output_lang.vocab.stoi)))
    
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()
        decoder_context = decoder_context.cuda()  
    
    if train:
        use_teacher_forcing = random.random() < teacher_forcing_ratio
    else:
        use_teacher_forcing = False

    if use_teacher_forcing:        
        # Use targets as inputs
        for di in range(target_batches.shape[0]):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs)
            
            all_decoder_outputs[di] = decoder_output
            decoder_input = target_batches[di]
    else:        
        # Use decoder output as inputs
        for di in range(target_batches.shape[0]):            
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) 
            
            all_decoder_outputs[di] = decoder_output
            
            # Greedy approach, take the word with highest probability
            topv, topi = decoder_output.data.topk(1)            
            decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze())
            if USE_CUDA: decoder_input = decoder_input.cuda()
    
    loss = nn.NLLLoss()(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))          
    
    if train:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
        encoder_optimizer.step()
        decoder_optimizer.step()
    
    return loss.item() 

# Model

In [7]:
attn_model = 'general'
hidden_size = 512
emb_size = 300
n_layers = 2
seed = 12
dropout_p = 0.1
teacher_forcing_ratio = 0.5
clip = 5.0

n_epochs = 30
batch_size = 64

In [8]:
torch.manual_seed(seed)
np.random.seed(seed)

In [9]:
encoder = Encoder(len(input_lang.vocab.stoi), hidden_size, emb_size, n_layers, dropout_p, input_lang, USE_CUDA)
decoder = Decoder_luong(attn_model, hidden_size, len(output_lang.vocab.stoi), emb_size, 2 * n_layers, dropout_p, output_lang, USE_CUDA)

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    
learning_rate = 0.001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [None]:
# Keep track of time elapsed and running averages
start = time.time()
train_losses = []
validation_losses = []
validation_acc = []

plot_every = 5
print_every = 5
validate_loss_every = 25
best_acc = 0
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [None]:
for epoch in range(1, n_epochs): 
    # Shuffle data
    id_aux = np.random.permutation(np.arange(len(pairs_train)))
    pairs_train = pairs_train[id_aux]
    
    # Get the batches for this epoch
    input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_train, USE_CUDA=USE_CUDA)
    print_loss_total = 0
    
    for batch_ix, (input_var, target_var) in enumerate(zip(input_batches, target_batches)):
        encoder.train()
        decoder.train()
        
        #[input_var, _, _, _, _, _, _, _] = input_batch

        # Run the train function
        loss = train(input_var, target_var,\
                 encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 
                 max_length=MAX_LENGTH, train=True)
        #loss = train_luong(input_var, target_var, input_var.size(1), True)
            
        torch.cuda.empty_cache()

        # Keep track of loss
        print_loss_total += loss
        plot_loss_total += loss

        if batch_ix == 0: continue

        if batch_ix % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, batch_ix / len(input_batches) * 100, print_loss_avg)
            train_losses.append(loss)
            print(print_summary)
    
    #input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_test, USE_CUDA=USE_CUDA)
    #print_loss_total = 0
    encoder.eval()
    decoder.eval()
    #for input_batch, target_var in zip(input_batches, target_batches):
    
    #    encoder.eval()
    #    decoder.eval()
    
    #    #[input_var, _, _, _, _, _, _, _] = input_batch
    #    input_var = input_batch
    #    # Run the train function
    #    loss = train(input_var, target_var,\
    #             encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,\
    #             max_length=MAX_LENGTH, train=False)
        
    #    print_loss_total += loss
    #val_loss = print_loss_total / len(input_batches)
    #validation_losses.append(val_loss)
    # Evaluating acc
    evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, True)
    acc = evaluator.evaluate_acc(pairs_test, k_beams=1)
    if acc > best_acc:
        best_bleu = acc
        torch.save(encoder.state_dict(), f'{DIR_RESULTS}/encoder.pkl')
        torch.save(decoder.state_dict(), f'{DIR_RESULTS}/decoder.pkl')
    validation_acc.append(acc)
    print(f'------------- acc: {acc}')

    # Prevent overflow gpu memory
    del evaluator

0m 12s (- 6m 11s) (1 3%) 8.6818
0m 25s (- 12m 14s) (1 7%) 4.1682
0m 36s (- 17m 49s) (1 10%) 3.4351
0m 47s (- 22m 50s) (1 14%) 3.5046
0m 59s (- 28m 33s) (1 17%) 3.3789
1m 9s (- 33m 34s) (1 21%) 3.4257
1m 19s (- 38m 34s) (1 24%) 3.1339
1m 30s (- 43m 46s) (1 28%) 3.3261
1m 44s (- 50m 28s) (1 31%) 2.9057
1m 55s (- 55m 56s) (1 35%) 3.0947
2m 8s (- 61m 59s) (1 38%) 3.0848
2m 18s (- 66m 55s) (1 42%) 3.2857
2m 30s (- 72m 50s) (1 45%) 2.8325
2m 40s (- 77m 38s) (1 49%) 3.0030
2m 49s (- 81m 52s) (1 52%) 3.4222
2m 57s (- 86m 1s) (1 56%) 3.6910
3m 9s (- 91m 24s) (1 59%) 3.2429
3m 23s (- 98m 27s) (1 63%) 2.9688
3m 36s (- 104m 37s) (1 66%) 2.9571
3m 46s (- 109m 33s) (1 70%) 3.1424
4m 0s (- 116m 11s) (1 73%) 2.7055
4m 10s (- 121m 11s) (1 77%) 3.1694
4m 20s (- 125m 47s) (1 80%) 3.1557
4m 33s (- 132m 21s) (1 84%) 2.6898
4m 42s (- 136m 39s) (1 88%) 2.9978
4m 54s (- 142m 22s) (1 91%) 2.9846
5m 9s (- 149m 31s) (1 95%) 2.4678
5m 19s (- 154m 33s) (1 98%) 2.9141
------------- acc: 0.0
13m 30s (- 189m 5s) (2 3

In [None]:
evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, True)
acc = evaluator.evaluate_acc(pairs_test, k_beams=1, verbose=True)

In [12]:
acc

0.0

In [14]:
p = ['Pop Art is an example .', '1', 'art_pop_art10600']
int(p[1])

1

In [13]:
train_losses

[tensor(4.2680, device='cuda:0'),
 tensor(4.3280, device='cuda:0'),
 tensor(3.2405, device='cuda:0'),
 tensor(3.0685, device='cuda:0'),
 tensor(3.4384, device='cuda:0'),
 tensor(3.1535, device='cuda:0'),
 tensor(3.4878, device='cuda:0'),
 tensor(3.2988, device='cuda:0'),
 tensor(3.5391, device='cuda:0'),
 tensor(3.2383, device='cuda:0'),
 tensor(3.5126, device='cuda:0'),
 tensor(3.6423, device='cuda:0'),
 tensor(3.3424, device='cuda:0'),
 tensor(2.8781, device='cuda:0'),
 tensor(3.3466, device='cuda:0'),
 tensor(3.3214, device='cuda:0'),
 tensor(2.6270, device='cuda:0'),
 tensor(3.1198, device='cuda:0'),
 tensor(2.9170, device='cuda:0'),
 tensor(2.9819, device='cuda:0'),
 tensor(2.7829, device='cuda:0'),
 tensor(2.6241, device='cuda:0'),
 tensor(2.6977, device='cuda:0'),
 tensor(2.3243, device='cuda:0'),
 tensor(3.0008, device='cuda:0'),
 tensor(2.7750, device='cuda:0'),
 tensor(3.4579, device='cuda:0'),
 tensor(2.9167, device='cuda:0'),
 tensor(2.5323, device='cuda:0'),
 tensor(2.8233

In [None]:
evaluator = Evaluator(encoder, decoder, input_lang, output_lang, 
                      MAX_LENGTH, USE_CUDA)
candidates, references = evaluator.get_candidates_and_references(pairs_test[:10000], k_beams=2)
len(candidates), len(references)

A Jupyter Widget

In [12]:
BLEU(candidates, [references]) 

(0.28063523097173265,
 [0.6573135078342698,
  0.37567686039915427,
  0.22488307382629802,
  0.13494545201862276],
 0.953820572858132)