In [1]:
import torch
import torch.nn as nn
from torch.nn import functional
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

import numpy as np
import time
import random
import os

os.chdir('../')

%load_ext autoreload
%autoreload 2

In [2]:
from model.encoder import Encoder
from model.decoder_v2 import Decoder_luong

from src.utils import time_since
from src.data import prepare_data
from src.data_loader import get_loader
from src.evaluator import Evaluator

In [3]:
USE_CUDA = True
MAX_LENGTH = 100
DIR_FILES = 'data/disambiguation/'
DIR_TRAIN = os.path.join(DIR_FILES, 'all')
DIR_TEST = os.path.join(DIR_FILES, 'test')
DIR_RESULTS = 'results/'
SPLIT_TRAIN = 0.7
SPLIT_VALID = 0.15
# The rest is for test

# Reading the data

In [4]:
input_lang, output_lang, pairs_train, pairs_test= prepare_data('all_rouge_map', 'verbs_selected', max_length=MAX_LENGTH, dir_train=DIR_TRAIN, dir_test=DIR_TEST)

Reading lines...
Read 66620 train pairs
Reading lines...
Read 140 test pairs
Filtered to 66380 pairs
Creating vocab...
Indexed 69224 words in input language, 83423 words in output


In [5]:
pairs_train = np.array(pairs_train)
pairs_test = np.array(pairs_test)

# Train

In [6]:
def train(input_batches, target_batches, \
          encoder, decoder, encoder_optimizer, decoder_optimzier, \
          criterion, max_length=MAX_LENGTH, train=True):
    
    if train:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
    loss = 0
    
    batch_size = input_batches.size()[1]
    
    encoder_hidden = encoder.init_hidden(batch_size)
    encoder_outputs, encoder_hidden = encoder(input_batches, encoder_hidden)
    
    decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size))   
    decoder_hidden = encoder_hidden
    
    # set the start of the sentences of the batch
    decoder_input = Variable(torch.LongTensor([input_lang.vocab.stoi['<sos>']] * batch_size))

    # store the decoder outputs to estimate the loss
    all_decoder_outputs = Variable(torch.zeros(target_batches.size()[0], batch_size, len(output_lang.vocab.stoi)))
    
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()
        decoder_context = decoder_context.cuda()  
    
    if train:
        use_teacher_forcing = random.random() < teacher_forcing_ratio
    else:
        use_teacher_forcing = False

    if use_teacher_forcing:        
        # Use targets as inputs
        for di in range(target_batches.shape[0]):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs)

            all_decoder_outputs[di] = decoder_output
            decoder_input = target_batches[di]
    else:        
        # Use decoder output as inputs
        for di in range(target_batches.shape[0]):            
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) 

            all_decoder_outputs[di] = decoder_output
            
            # Greedy approach, take the word with highest probability
            topv, topi = decoder_output.data.topk(1)            
            decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze())
            if USE_CUDA: decoder_input = decoder_input.cuda()
    
    loss = nn.NLLLoss()(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))          
    
    if train:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
        encoder_optimizer.step()
        decoder_optimizer.step()
    
    return loss.item() 

In [7]:
torch.__version__

'1.0.1.post2'

# Model

In [8]:
attn_model = 'general'
hidden_size = 256
emb_size = 300
n_layers = 2
seed = 12
dropout_p = 0.2
teacher_forcing_ratio = 0.5
clip = 5.0

n_epochs = 40
batch_size = 50

In [10]:
torch.manual_seed(seed)
np.random.seed(seed)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
encoder = Encoder(len(input_lang.vocab.stoi), hidden_size, emb_size, n_layers, dropout_p, USE_CUDA=USE_CUDA)
decoder = Decoder_luong(attn_model, hidden_size, len(output_lang.vocab.stoi), emb_size, 2 * n_layers, dropout_p, USE_CUDA=USE_CUDA)

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    
learning_rate = 0.001
encoder_optimizer = optim.Adam(encoder.parameters())
decoder_optimizer = optim.Adam(decoder.parameters())
criterion = nn.NLLLoss()

In [13]:
# Keep track of time elapsed and running averages
start = time.time()
train_losses = []
validation_losses = []
validation_acc = []

plot_every = 5
start_eval = 15
print_every = 5
validate_loss_every = 25
best_acc = 0
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [None]:
train_loader = get_loader(pairs_train, input_lang.vocab.stoi, output_lang.vocab.stoi, batch_size=batch_size)

In [None]:
for epoch in range(1, n_epochs): 
    # Shuffle data
    id_aux = np.random.permutation(np.arange(len(pairs_train)))
    pairs_train = pairs_train[id_aux]
    
    encoder.train()
    decoder.train()
    print_loss_total = 0
    # Get the batches for this epoch
    
    for batch_ix, (input_var, _, target_var, _) in enumerate(train_loader):
        # Transfer to GPU
        input_var, target_var = input_var.to(device), target_var.to(device)

        # Run the train function
        loss = train(input_var, target_var,\
                 encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 
                 max_length=MAX_LENGTH, train=True)
        #loss = train_luong(input_var, target_var, input_var.size(1), True)
            
        torch.cuda.empty_cache()

        # Keep track of loss
        print_loss_total += loss
        plot_loss_total += loss

        if batch_ix == 0: continue

        if batch_ix % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, batch_ix, print_loss_avg)
            train_losses.append(loss)
            print(print_summary)
    
    #input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_test, USE_CUDA=USE_CUDA)
    #print_loss_total = 0
    encoder.eval()
    decoder.eval()
    #for input_batch, target_var in zip(input_batches, target_batches):
    
    #    encoder.eval()
    #    decoder.eval()
    
    #    #[input_var, _, _, _, _, _, _, _] = input_batch
    #    input_var = input_batch
    #    # Run the train function
    #    loss = train(input_var, target_var,\
    #             encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,\
    #             max_length=MAX_LENGTH, train=False)
        
    #    print_loss_total += loss
    #val_loss = print_loss_total / len(input_batches)
    #validation_losses.append(val_loss)
    # Evaluating acc
    if epoch > start_eval:
        evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, True)
        acc = evaluator.evaluate_acc(pairs_test, k_beams=1)
        if acc > best_acc:
            best_acc = acc
            torch.save(encoder.state_dict(), f'{DIR_RESULTS}/encoder.pkl')
            torch.save(decoder.state_dict(), f'{DIR_RESULTS}/decoder.pkl')
            print('Saving weights')
        validation_acc.append(acc)
        print(f'------------- acc: {acc}')

        # Prevent overflow gpu memory
        del evaluator

0m 19s (- 12m 47s) (1 5%) 8.9294
0m 36s (- 24m 2s) (1 10%) 4.5356
0m 52s (- 34m 12s) (1 15%) 3.7169
1m 9s (- 44m 55s) (1 20%) 3.4474
1m 26s (- 56m 21s) (1 25%) 3.3652
1m 42s (- 66m 49s) (1 30%) 3.2042
1m 59s (- 77m 43s) (1 35%) 2.9129
2m 18s (- 89m 45s) (1 40%) 3.0333
2m 33s (- 99m 51s) (1 45%) 3.2498
2m 51s (- 111m 19s) (1 50%) 2.7783
3m 7s (- 121m 38s) (1 55%) 2.9454
3m 24s (- 132m 37s) (1 60%) 3.1151
3m 39s (- 142m 50s) (1 65%) 3.1249
3m 56s (- 153m 26s) (1 70%) 3.2791
4m 15s (- 165m 47s) (1 75%) 2.7932
4m 29s (- 175m 11s) (1 80%) 3.1662
4m 44s (- 185m 5s) (1 85%) 3.1078
5m 0s (- 195m 14s) (1 90%) 3.1366
5m 15s (- 205m 2s) (1 95%) 3.0716
5m 31s (- 215m 11s) (1 100%) 3.0004
5m 49s (- 227m 16s) (1 105%) 2.8469
6m 5s (- 237m 23s) (1 110%) 2.9899
6m 22s (- 248m 43s) (1 115%) 2.8820
6m 37s (- 258m 39s) (1 120%) 3.1196
6m 52s (- 268m 4s) (1 125%) 3.0404
7m 9s (- 278m 55s) (1 130%) 2.9314
7m 26s (- 290m 5s) (1 135%) 2.6563
7m 43s (- 301m 16s) (1 140%) 2.8155
8m 0s (- 312m 8s) (1 145%) 2.88

In [11]:
best_acc

0.03444953023367863

In [None]:
del encoder
del decoder

In [20]:
torch.cuda.empty_cache()

encoder = Encoder(len(input_lang.vocab.stoi), hidden_size, emb_size, n_layers, dropout_p, USE_CUDA=USE_CUDA)
decoder = Decoder_luong(attn_model, hidden_size, len(output_lang.vocab.stoi), emb_size, 2 * n_layers, dropout_p, USE_CUDA=USE_CUDA)

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    
encoder.load_state_dict(torch.load(f'{DIR_RESULTS}/encoder.pkl'))
decoder.load_state_dict(torch.load(f'{DIR_RESULTS}/decoder.pkl'))

In [29]:
encoder.eval()
decoder.eval()

Decoder_luong(
  (embedding): Embedding(23810, 300)
  (rnn): GRU(300, 512, num_layers=4, dropout=0.2)
  (attn): Attention(
    (linear_out): Linear(in_features=1024, out_features=512, bias=True)
  )
  (out): Linear(in_features=512, out_features=23810, bias=True)
)

In [None]:
evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, True)
acc = evaluator.evaluate_acc(pairs_test, k_beams=3, verbose=False)



In [24]:
del evaluator

In [26]:
acc

0.3640086726090099

In [27]:
best_acc

0.37099494097807756

In [None]:
9