In [1]:
from data import generate_batches
from data import prepare_data

from model.encoder import Encoder
from model.decoder import Decoder_luong

from BLEU import BLEU

from utils import time_since


from evaluator_dis import Evaluator

import torch
import torch.nn as nn
from torch.nn import functional
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

import numpy as np
import time
import random
import os
#from validation import Evaluator

%load_ext autoreload
%autoreload 2

In [2]:
USE_CUDA = True
MAX_LENGTH = 100
DIR_FILES = 'data/disambiguation/sense-eval2/'
DIR_TRAIN = os.path.join(DIR_FILES, 'train')
DIR_TEST = os.path.join(DIR_FILES, 'test')
DIR_RESULTS = 'results/step_1'
SPLIT_TRAIN = 0.7
SPLIT_VALID = 0.15
# The rest is for test

# Reading the data

In [3]:
input_lang, output_lang, pairs_train, pairs_test= prepare_data(max_length=MAX_LENGTH, dir_train=DIR_TRAIN, dir_test=DIR_TEST)

Reading lines...
Read 9080 train pairs
Read 4151 test pairs
Filtered to 9058 pairs
Creating vocab...
Indexed 28967 words in input language, 23810 words in output


In [4]:
pairs_train = np.array(pairs_train)

# Train

In [5]:
def train(input_batches, target_batches, \
          encoder, decoder, encoder_optimizer, decoder_optimzier, \
          criterion, max_length=MAX_LENGTH, train=True):
    
    if train:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
    loss = 0
    
    batch_size = input_batches.size()[1]
    
    encoder_hidden = encoder.init_hidden(batch_size)
    encoder_outputs, encoder_hidden = encoder(input_batches, encoder_hidden)
    
    decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size))   
    decoder_hidden = encoder_hidden
    
    # set the start of the sentences of the batch
    decoder_input = Variable(torch.LongTensor([input_lang.vocab.stoi['<sos>']] * batch_size))

    # store the decoder outputs to estimate the loss
    all_decoder_outputs = Variable(torch.zeros(target_batches.size()[0], batch_size, len(output_lang.vocab.stoi)))
    
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()
        decoder_context = decoder_context.cuda()  
    
    if train:
        use_teacher_forcing = random.random() < teacher_forcing_ratio
    else:
        use_teacher_forcing = False

    if use_teacher_forcing:        
        # Use targets as inputs
        for di in range(target_batches.shape[0]):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs)

            all_decoder_outputs[di] = decoder_output
            decoder_input = target_batches[di]
    else:        
        # Use decoder output as inputs
        for di in range(target_batches.shape[0]):            
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) 

            all_decoder_outputs[di] = decoder_output
            
            # Greedy approach, take the word with highest probability
            topv, topi = decoder_output.data.topk(1)            
            decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze())
            if USE_CUDA: decoder_input = decoder_input.cuda()
    
    loss = nn.NLLLoss()(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))          
    
    if train:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
        encoder_optimizer.step()
        decoder_optimizer.step()
    
    return loss.item() 

# Model

In [6]:
attn_model = 'general'
hidden_size = 512
emb_size = 300
n_layers = 2
seed = 12
dropout_p = 0.2
teacher_forcing_ratio = 0.5
clip = 5.0

n_epochs = 40
batch_size = 100

In [7]:
torch.manual_seed(seed)
np.random.seed(seed)

In [8]:
encoder = Encoder(len(input_lang.vocab.stoi), hidden_size, emb_size, n_layers, dropout_p, USE_CUDA=USE_CUDA)
decoder = Decoder_luong(attn_model, hidden_size, len(output_lang.vocab.stoi), emb_size, 2 * n_layers, dropout_p, USE_CUDA=USE_CUDA)

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    
learning_rate = 0.001
encoder_optimizer = optim.Adam(encoder.parameters())
decoder_optimizer = optim.Adam(decoder.parameters())
criterion = nn.NLLLoss()

In [11]:
# Keep track of time elapsed and running averages
start = time.time()
train_losses = []
validation_losses = []
validation_acc = []

plot_every = 5
start_eval = 15
print_every = 5
validate_loss_every = 25
best_acc = 0
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [None]:
for epoch in range(1, n_epochs): 
    # Shuffle data
    id_aux = np.random.permutation(np.arange(len(pairs_train)))
    pairs_train = pairs_train[id_aux]
    
    # Get the batches for this epoch
    input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_train, USE_CUDA=USE_CUDA)
    print_loss_total = 0
    
    for batch_ix, (input_var, target_var) in enumerate(zip(input_batches, target_batches)):
        encoder.train()
        decoder.train()

        #[input_var, _, _, _, _, _, _, _] = input_batch

        # Run the train function
        loss = train(input_var, target_var,\
                 encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 
                 max_length=MAX_LENGTH, train=True)
        #loss = train_luong(input_var, target_var, input_var.size(1), True)
            
        torch.cuda.empty_cache()

        # Keep track of loss
        print_loss_total += loss
        plot_loss_total += loss

        if batch_ix == 0: continue

        if batch_ix % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, batch_ix / len(input_batches) * 100, print_loss_avg)
            train_losses.append(loss)
            print(print_summary)
    
    #input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_test, USE_CUDA=USE_CUDA)
    #print_loss_total = 0
    encoder.eval()
    decoder.eval()
    #for input_batch, target_var in zip(input_batches, target_batches):
    
    #    encoder.eval()
    #    decoder.eval()
    
    #    #[input_var, _, _, _, _, _, _, _] = input_batch
    #    input_var = input_batch
    #    # Run the train function
    #    loss = train(input_var, target_var,\
    #             encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,\
    #             max_length=MAX_LENGTH, train=False)
        
    #    print_loss_total += loss
    #val_loss = print_loss_total / len(input_batches)
    #validation_losses.append(val_loss)
    # Evaluating acc
    if epoch > start_eval:
        evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, True)
        acc = evaluator.evaluate_acc(pairs_test, k_beams=1)
        if acc > best_acc:
            best_acc = acc
            torch.save(encoder.state_dict(), f'{DIR_RESULTS}/encoder.pkl')
            torch.save(decoder.state_dict(), f'{DIR_RESULTS}/decoder.pkl')
            print('Saving weights')
        validation_acc.append(acc)
        print(f'------------- acc: {acc}')

        # Prevent overflow gpu memory
        del evaluator



0m 8s (- 5m 17s) (1 5%) 6.3810
0m 13s (- 8m 39s) (1 10%) 3.7557
0m 18s (- 12m 0s) (1 16%) 4.7232
0m 23s (- 15m 1s) (1 21%) 3.6604
0m 27s (- 17m 53s) (1 27%) 3.3066
0m 32s (- 21m 21s) (1 32%) 2.9788
0m 39s (- 25m 29s) (1 38%) 2.7311
0m 43s (- 28m 18s) (1 43%) 3.6644
0m 48s (- 31m 20s) (1 49%) 3.0944
0m 53s (- 34m 33s) (1 54%) 3.0938
0m 57s (- 37m 40s) (1 60%) 3.1346
1m 2s (- 40m 30s) (1 65%) 3.1511
1m 7s (- 44m 0s) (1 71%) 2.7032
1m 12s (- 47m 18s) (1 76%) 2.7951
1m 17s (- 50m 12s) (1 82%) 3.0322
1m 22s (- 53m 19s) (1 87%) 2.8750
1m 26s (- 56m 32s) (1 93%) 2.6936
1m 30s (- 59m 0s) (1 98%) 3.0109
1m 37s (- 30m 51s) (2 5%) 3.0260
1m 42s (- 32m 26s) (2 10%) 2.5431
1m 47s (- 33m 56s) (2 16%) 2.8300
1m 51s (- 35m 12s) (2 21%) 2.8959
1m 55s (- 36m 38s) (2 27%) 2.7332
2m 0s (- 38m 5s) (2 32%) 2.8308
2m 5s (- 39m 42s) (2 38%) 2.5709
2m 10s (- 41m 14s) (2 43%) 2.6037
2m 15s (- 43m 2s) (2 49%) 2.2764
2m 20s (- 44m 34s) (2 54%) 2.9316
2m 25s (- 46m 9s) (2 60%) 2.5993
2m 30s (- 47m 33s) (2 65%) 3.0

In [None]:
evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, True)
acc = evaluator.evaluate_acc(pairs_test, k_beams=3, verbose=True)



['pop', 'art_art10600', 'is', 'an', 'example', '.', '<eos>']
----- ID
0
----- tokens input
pop art is an example .
----- output predecido
pop art_art10600 is an example . <eos>
----- respuesta
['art_pop_art10600']

--- hints:  0   --- acc: 0.0
['reform', 'and', 'alan', 'arts', 'funding', 'and', 'organisation', '.', '<eos>']
----- ID
1
----- tokens input
reform and decentralise arts funding and organisation .
----- output predecido
reform and alan arts funding and organisation . <eos>
----- respuesta
['art_arts10900']

--- hints:  0   --- acc: 0.0
['you', 'you', 'notice', 'how', 'all', 'art_art10600', 'on', 'on', 'people', 'in', 'trouble', '.', '<eos>']
----- ID
2
----- tokens input
you ever notice how all art focuses on people in trouble .
----- output predecido
you you notice how all art_art10600 on on people in trouble . <eos>
----- respuesta
['art_art10600']

--- hints:  1   --- acc: 0.0002409058058299205
['search', 'books', 'and', 'classes', 'teach', 'the', 'art_art10900', 'of', 'o

In [36]:
del evaluator