In [1]:
from data import generate_batches
from data import prepare_data

from model.encoder import Encoder
from model.decoder import Decoder_luong

from BLEU import BLEU

from utils import time_since


from evaluator_dis import Evaluator

import torch
import torch.nn as nn
from torch.nn import functional
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

import numpy as np
import time
import random
import os
#from validation import Evaluator

%load_ext autoreload
%autoreload 2

In [2]:
# --- Run configuration -------------------------------------------------------
# Use the GPU only when one is actually available, so the notebook still runs
# on CPU-only machines instead of failing on the first `.cuda()` call.
USE_CUDA = torch.cuda.is_available()

# Sentence-length limit handed to prepare_data, train and the Evaluator.
MAX_LENGTH = 100

# Dataset locations (train / test live in sub-directories of DIR_FILES).
DIR_FILES = 'data/disambiguation/sense-eval2/'
DIR_TRAIN = os.path.join(DIR_FILES, 'train')
DIR_TEST = os.path.join(DIR_FILES, 'test')

# Directory the training loop writes the best encoder/decoder weights to.
DIR_RESULTS = 'results/step_1'

# Train / validation fractions; the rest is for test.
# NOTE(review): no cell visible in this notebook reads these two values —
# confirm whether a split is still intended or remove them.
SPLIT_TRAIN = 0.7
SPLIT_VALID = 0.15

# Reading the data

In [3]:
# Read the training corpus from DIR_TRAIN and build the input / output
# vocabularies; sentences are limited via max_length=MAX_LENGTH.
input_lang, output_lang, pairs = prepare_data(
    'en',
    'spa',
    max_length=MAX_LENGTH,
    dir=DIR_TRAIN,
)

Reading lines...
Read 9080 sentence pairs
Filtered to 9058 pairs
Creating vocab...
Indexed 23115 words in input language, 23811 words in output


In [5]:
# Keep the training pairs as a NumPy array so the training loop can reshuffle
# them each epoch by indexing with a permutation.
pairs_train = np.array(pairs)
# FIXME(review): `read_test` is neither defined nor imported in any cell shown
# in this notebook (execution counts jump from In [3] to In [5]), so unless it
# is provided elsewhere this line raises NameError on a fresh kernel.
pairs_test = read_test(DIR_TEST)

# Train

In [6]:
def train(input_batches, target_batches, \
          encoder, decoder, encoder_optimizer, decoder_optimzier, \
          criterion, max_length=MAX_LENGTH, train=True):
    """Run one batch through the encoder/decoder and return its loss.

    When ``train`` is true the gradients are reset, back-propagated, clipped
    to the notebook-level ``clip`` norm and both optimizers take a step.
    When it is false the batch is only scored (no gradients are tracked and
    teacher forcing is never used).

    Args:
        input_batches: Source batch tensor; dimension 1 is read as the batch
            size.
        target_batches: Target batch tensor; dimension 0 is the number of
            decoding steps. It is flattened and compared against the stacked
            decoder outputs.
        encoder: Encoder module exposing ``init_hidden(batch_size)``.
        decoder: Decoder module exposing ``hidden_size`` and ``output_size``;
            called once per decoding step.
        encoder_optimizer: Optimizer for the encoder parameters.
        decoder_optimzier: Optimizer for the decoder parameters. The name is
            misspelled; it is kept as-is so existing keyword callers keep
            working.
        criterion: Loss applied to the flattened decoder outputs and targets
            (the notebook passes ``nn.NLLLoss()``).
        max_length: Unused; kept for interface compatibility.
        train: Whether to update the model (True) or only score the batch.

    Returns:
        float: The loss of this batch.

    Reads the notebook globals ``USE_CUDA``, ``input_lang``, ``output_lang``,
    ``teacher_forcing_ratio`` and ``clip``.
    """
    # The body used to reference `decoder_optimizer`, which did not match the
    # misspelled parameter and therefore silently resolved to the notebook
    # global. Bind the argument to that name so the optimizer that was passed
    # in is the one that is actually used.
    decoder_optimizer = decoder_optimzier

    if train:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

    n_steps = target_batches.size()[0]
    batch_size = input_batches.size()[1]

    # Skip autograd bookkeeping when the batch is only being scored.
    with torch.set_grad_enabled(bool(train)):
        encoder_hidden = encoder.init_hidden(batch_size)
        encoder_outputs, encoder_hidden = encoder(input_batches, encoder_hidden)

        decoder_context = torch.zeros(batch_size, decoder.hidden_size)
        decoder_hidden = encoder_hidden

        # Every sequence in the batch starts decoding from <sos>.
        # NOTE(review): the index is looked up in input_lang's vocabulary even
        # though it is fed to the decoder — confirm both vocabularies map
        # '<sos>' to the same index.
        decoder_input = torch.LongTensor([input_lang.vocab.stoi['<sos>']] * batch_size)

        # One slot per decoding step; filled below and scored in a single call.
        all_decoder_outputs = torch.zeros(n_steps, batch_size, len(output_lang.vocab.stoi))

        if USE_CUDA:
            decoder_input = decoder_input.cuda()
            all_decoder_outputs = all_decoder_outputs.cuda()
            decoder_context = decoder_context.cuda()

        # Teacher forcing is decided once per batch and only while training.
        use_teacher_forcing = bool(train) and random.random() < teacher_forcing_ratio

        for di in range(n_steps):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs)

            all_decoder_outputs[di] = decoder_output

            if use_teacher_forcing:
                # Feed the ground-truth token of this step.
                decoder_input = target_batches[di]
            else:
                # Greedy approach: feed back the highest-scoring token.
                # reshape(-1) keeps one entry per sequence even for a batch of
                # one (a bare squeeze() would drop that dimension), and the
                # indices stay on the device the decoder output lives on.
                topv, topi = decoder_output.detach().topk(1)
                decoder_input = topi.reshape(-1)

        # Use the criterion supplied by the caller instead of a hard-coded one.
        loss = criterion(all_decoder_outputs.view(-1, decoder.output_size),
                         target_batches.contiguous().view(-1))

    if train:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
        encoder_optimizer.step()
        decoder_optimizer.step()

    return loss.item()

# Model

In [9]:
# --- Model architecture ---
attn_model = 'general'          # attention variant handed to Decoder_luong
emb_size = 300
hidden_size = 1024
n_layers = 2
dropout_p = 0.1

# --- Optimisation ---
n_epochs = 40
batch_size = 100
clip = 5.0                      # gradient-norm limit applied inside train()
teacher_forcing_ratio = 0.5     # threshold for the teacher-forcing draw in train()

# --- Reproducibility ---
seed = 12

In [8]:
# Seed every random number generator this notebook draws from:
#   torch  -> randomness inside the PyTorch models
#   numpy  -> the per-epoch np.random.permutation shuffle of the training pairs
#   random -> the random.random() teacher-forcing draw inside train(), which
#             was previously left unseeded and made runs non-repeatable
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [10]:
# Encoder over the input vocabulary.
encoder = Encoder(
    len(input_lang.vocab.stoi), hidden_size, emb_size, n_layers,
    dropout_p, input_lang, USE_CUDA,
)
# Decoder over the output vocabulary.
# NOTE(review): it is built with 2 * n_layers — presumably to match the shape
# of the hidden state returned by the encoder; confirm against model/encoder.
decoder = Decoder_luong(
    attn_model, hidden_size, len(output_lang.vocab.stoi), emb_size,
    2 * n_layers, dropout_p, output_lang, USE_CUDA,
)

# Move both networks to the GPU when it is in use.
if USE_CUDA:
    encoder, decoder = encoder.cuda(), decoder.cuda()

# One Adam optimizer per network, sharing the same learning rate.
learning_rate = 0.001
encoder_optimizer, decoder_optimizer = (
    optim.Adam(net.parameters(), lr=learning_rate) for net in (encoder, decoder)
)
criterion = nn.NLLLoss()

In [None]:
# Wall-clock reference passed to time_since() in the progress lines.
start = time.time()

# Histories collected over the run.
train_losses, validation_losses, validation_acc = [], [], []

# Reporting cadence; print_every is counted in batches by the training loop.
print_every = plot_every = 5
validate_loss_every = 25

# Highest accuracy seen so far (decides when weights are saved).
best_acc = 0

# Running loss sums: print_loss_total is reset every print_every,
# plot_loss_total every plot_every.
print_loss_total = plot_loss_total = 0

In [None]:
# The best weights are written into DIR_RESULTS; create it up front so the
# first torch.save cannot fail after a full epoch of training.
os.makedirs(DIR_RESULTS, exist_ok=True)

# range(1, n_epochs) stopped one epoch short; run epochs 1..n_epochs inclusive.
for epoch in range(1, n_epochs + 1):
    # Shuffle data
    id_aux = np.random.permutation(np.arange(len(pairs_train)))
    pairs_train = pairs_train[id_aux]

    # Get the batches for this epoch
    input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_train, USE_CUDA=USE_CUDA)
    print_loss_total = 0
    print_loss_count = 0  # batches accumulated since the last progress line

    # Back to training mode; the evaluation at the end of each epoch switches
    # both networks to eval().
    encoder.train()
    decoder.train()

    for batch_ix, (input_var, target_var) in enumerate(zip(input_batches, target_batches)):
        # Run the train function
        loss = train(input_var, target_var,
                     encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
                     max_length=MAX_LENGTH, train=True)

        torch.cuda.empty_cache()

        # Keep track of loss
        print_loss_total += loss
        print_loss_count += 1
        plot_loss_total += loss

        if batch_ix == 0: continue

        if batch_ix % print_every == 0:
            # Divide by the number of batches actually accumulated: the first
            # window of an epoch holds print_every + 1 losses (batch 0 is
            # accumulated but never reported on its own).
            print_loss_avg = print_loss_total / print_loss_count
            print_loss_total = 0
            print_loss_count = 0
            print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, batch_ix / len(input_batches) * 100, print_loss_avg)
            # Record the windowed average that is printed, not just the loss
            # of the last batch in the window.
            train_losses.append(print_loss_avg)
            print(print_summary)

    encoder.eval()
    decoder.eval()

    # Evaluating acc
    # NOTE(review): the checkpoint is selected on pairs_test, so the reported
    # test accuracy is optimistic; validation_losses is never filled in.
    # Consider a held-out validation split for model selection.
    evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, USE_CUDA)
    acc = evaluator.evaluate_acc(pairs_test, k_beams=1)
    if acc > best_acc:
        best_acc = acc
        torch.save(encoder.state_dict(), f'{DIR_RESULTS}/encoder.pkl')
        torch.save(decoder.state_dict(), f'{DIR_RESULTS}/decoder.pkl')
        print('Saving weights')
    validation_acc.append(acc)
    print(f'------------- acc: {acc}')

    # Prevent overflow gpu memory
    del evaluator



4154m 33s (- 6231m 50s) (16 5%) 0.2855
4154m 40s (- 6232m 0s) (16 10%) 0.2224
4154m 47s (- 6232m 11s) (16 16%) 0.1910
4154m 55s (- 6232m 22s) (16 21%) 0.2061
4155m 2s (- 6232m 33s) (16 27%) 0.2113
4155m 10s (- 6232m 45s) (16 32%) 0.1890
4155m 18s (- 6232m 57s) (16 38%) 0.0859
4155m 25s (- 6233m 8s) (16 43%) 0.3058
4155m 33s (- 6233m 19s) (16 49%) 0.1012
4155m 39s (- 6233m 29s) (16 54%) 0.1621
4155m 46s (- 6233m 40s) (16 60%) 0.2343
4155m 53s (- 6233m 50s) (16 65%) 0.1759
4156m 2s (- 6234m 4s) (16 71%) 0.2482
4156m 9s (- 6234m 14s) (16 76%) 0.1244
4156m 16s (- 6234m 25s) (16 82%) 0.1531
4156m 24s (- 6234m 36s) (16 87%) 0.2695
4156m 33s (- 6234m 49s) (16 93%) 0.2628
4156m 38s (- 6234m 58s) (16 98%) 0.1589
------------- acc: 0.2500602264514575
4160m 8s (- 5628m 25s) (17 5%) 0.1273
4160m 15s (- 5628m 34s) (17 10%) 0.2639
4160m 22s (- 5628m 45s) (17 16%) 0.1893
4160m 30s (- 5628m 54s) (17 21%) 0.2094
4160m 37s (- 5629m 4s) (17 27%) 0.1812
4160m 44s (- 5629m 14s) (17 32%) 0.0738
4160m 51s (-

In [21]:
# Display the current value of the training loop's epoch counter.
epoch

16

In [16]:
# Re-run the accuracy evaluation on the test pairs with verbose=True to get
# per-example output. The device flag follows USE_CUDA (it was a hard-coded
# True), matching how the Evaluator is built for the BLEU computation.
evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, USE_CUDA)
acc = evaluator.evaluate_acc(pairs_test, k_beams=1, verbose=True)



['gametocide', '_', 'is', 'an', 'an', 'example', '.', '<eos>']
----- ID
0
----- tokens input
Pop Art is an example .
----- output predecido
gametocide _ is an an example . <eos>
----- respuesta
['art_pop_art10600']

--- hints:  0   --- acc: 0.0
['and', 'and', 'authority', 'art', '_', 'art10400', 'funding', 'and', 'organisation', '.', '<eos>']
----- ID
1
----- tokens input
Reform and decentralise arts funding and organisation .
----- output predecido
and and authority art _ art10400 funding and organisation . <eos>
----- respuesta
['art_arts10900']

--- hints:  0   --- acc: 0.0
['mostbrokers', 'ever', 'notice', 'how', 'all', 'art', '_', 'art10400', 'issues', 'on', 'people', 'in', 'trouble', '.', '<eos>']
----- ID
2
----- tokens input
You ever notice how all art focuses on people in trouble .
----- output predecido
mostbrokers ever notice how all art _ art10400 issues on people in trouble . <eos>
----- respuesta
['art_art10600']

--- hints:  0   --- acc: 0.0
['buoncamino', 'books', 'and'

KeyboardInterrupt: 

In [17]:
# Display the first training pair (after whatever shuffling has been applied
# to pairs_train so far).
pairs_train[0]

array(['clearly the action of a madman ; no possible sense in it .',
       'clearly the action of a madman ; no possible sense _ sense10904 in it .'],
      dtype='<U582')

In [14]:
# Scratch check: the second field of a three-field record is a numeric string
# that int() can parse.
# NOTE(review): presumably mirrors the layout of a pairs_test entry — confirm,
# or drop this cell from the final notebook.
p = ['Pop Art is an example .', '1', 'art_pop_art10600']
int(p[1])

1

In [None]:
# Fresh evaluator on the current encoder/decoder.
evaluator = Evaluator(
    encoder,
    decoder,
    input_lang,
    output_lang,
    MAX_LENGTH,
    USE_CUDA,
)
# Candidates and references for at most the first 10000 test pairs,
# requested with k_beams=2.
candidates, references = evaluator.get_candidates_and_references(
    pairs_test[:10000],
    k_beams=2,
)
# Show how many of each were produced.
len(candidates), len(references)

A Jupyter Widget

In [12]:
# Score the candidates against a single reference set.
# NOTE(review): the displayed result is a 3-tuple — presumably (BLEU score,
# per-n-gram precisions, brevity penalty); confirm against the BLEU module.
BLEU(candidates, [references]) 

(0.28063523097173265,
 [0.6573135078342698,
  0.37567686039915427,
  0.22488307382629802,
  0.13494545201862276],
 0.953820572858132)