In [1]:
from data import generate_batches
from data import prepare_data
from data import data_to_index
from data import DEP_LABELS

from model.graph import Sintactic_GCN
from model.encoder import Encoder
from model.decoder import Decoder_luong

from BLEU import BLEU

from utils import time_since

import torch
import torch.nn as nn
from torch.nn import functional
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

from stanfordcorenlp import StanfordCoreNLP 

import numpy as np
import time

from validation import Evaluator

%load_ext autoreload
%autoreload 2

In [2]:
USE_CUDA = True
MAX_LENGTH = 100

SPLIT_TRAIN = 0.7
SPLIT_VALID = 0.15
# The rest is for test

# Reading the data

Prepare vocabulary and pairs for the data

In [3]:
input_lang, output_lang, pairs = prepare_data('en', 'spa', max_length=MAX_LENGTH)

Reading lines...
Read 118964 sentence pairs
Filtered to 85785 pairs
Creating vocab...
Indexed 12436 words in input language, 22765 words in output


Splitting pairs into test, val and test

In [4]:
#np.shuffle(pairs)
pairs_train = pairs[:int(len(pairs) * SPLIT_TRAIN)]
pairs_valid = pairs[int(len(pairs) * SPLIT_TRAIN):int(len(pairs) * (SPLIT_TRAIN + SPLIT_VALID))]
pairs_test = pairs[int(len(pairs) * (SPLIT_TRAIN + SPLIT_VALID)):]

In [5]:
len(pairs_train), len(pairs_valid), len(pairs_test)

(60049, 12868, 12868)

Get the adjacency matrix for the pairs

In [6]:
nlp = StanfordCoreNLP(r'/home/krivas/stanford-corenlp-full-2018-02-27/')

In [7]:
from tqdm import tqdm
def get_adjacency_matrix(pairs):
    arr_dep = []
    for pair in tqdm(pairs):
        arr_dep.append(nlp.dependency_parse(pair[0]))
    return np.array(arr_dep)

In [8]:
arr_dep_train = get_adjacency_matrix(pairs_train)
arr_dep_valid = get_adjacency_matrix(pairs_valid)
arr_dep_test = get_adjacency_matrix(pairs_test)

100%|██████████| 60049/60049 [07:22<00:00, 135.68it/s]
100%|██████████| 12868/12868 [02:01<00:00, 106.01it/s]
100%|██████████| 12868/12868 [02:26<00:00, 87.54it/s]


Converting words to index in pairs

In [9]:
pairs_train = data_to_index(pairs_train, input_lang, output_lang)
pairs_valid = data_to_index(pairs_valid, input_lang, output_lang)
pairs_test = data_to_index(pairs_test, input_lang, output_lang)

# Training

In [10]:
def pass_batch_luong(batch_size, input_batches, target_batches, train=True, adj_arc_in=None, adj_arc_out=None, adj_lab_in=None, adj_lab_out=None, mask_in=None, mask_out=None, mask_loop=None):
        
    hidden = encoder.init_hidden(batch_size)

    encoder_outputs, encoder_hidden = encoder(input_batches, hidden)
    decoder_input = Variable(torch.LongTensor([input_lang.vocab.stoi["<sos>"]] * batch_size))
    
    decoder_hidden = encoder_hidden
    decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size)) 
    
    all_decoder_outputs = Variable(torch.zeros(target_batches.data.size()[0], batch_size, len(output_lang.vocab.itos)))

    if USE_CUDA:
        all_decoder_outputs = all_decoder_outputs.cuda()
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()
    
    if train:
        use_teacher_forcing = np.random.random() < tf_ratio
    else:
        use_teacher_forcing = False
    
    if use_teacher_forcing:        
        # Use targets as inputs
        for di in range(target_batches.shape[0]):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs)
            
            all_decoder_outputs[di] = decoder_output
            decoder_input = target_batches[di]
    else:        
        # Use decoder output as inputs
        for di in range(target_batches.shape[0]):            
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) 
            
            all_decoder_outputs[di] = decoder_output
            
            # Greedy approach, take the word with highest probability
            topv, topi = decoder_output.data.topk(1)            
            decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze())
            if USE_CUDA: decoder_input = decoder_input.cuda()
        
    del decoder_output
    del decoder_hidden
        
    return all_decoder_outputs, target_batches

def train_luong(input_batches, target_batches, batch_size, train=True, adj_arc_in=None, adj_arc_out=None, adj_lab_in=None, adj_lab_out=None, mask_in=None, mask_out=None, mask_loop=None):
    
    # Zero gradients of both optimizers
    if train:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

    loss = 0 # Added onto for each word
    all_decoder_outputs, target_batches = pass_batch_luong(batch_size, input_batches, target_batches, train, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)
    
    # Loss calculation and backpropagation
    loss = criterion(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))
    
    if train:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        if gcn1:
            torch.nn.utils.clip_grad_norm_(gcn1.parameters(), clip)
            gcn1_optimizer.step()

    del all_decoder_outputs
    del target_batches
    
    return loss.item()

# Model

In [11]:
# Configure models
hidden_size_rnn = 512
hidden_size_graph = 512
emb_size=300
n_layers = 2
dropout = 0.1
batch_size = 50

# Configure training/optimization
clip = 10.0
learning_rate_graph = 0.0002
n_epochs = 20
print_every = 10
validate_loss_every = 50
validate_acc_every = 2 * validate_loss_every
tf_ratio = 0.5

In [12]:
# Initialize models
encoder = Encoder(len(input_lang.vocab.itos), hidden_size_rnn, emb_size, n_layers=n_layers, dropout=dropout, USE_CUDA=USE_CUDA)
decoder = Decoder_luong('general', hidden_size_graph, len(output_lang.vocab.itos), 300, n_layers=2 * n_layers, dropout=dropout, USE_CUDA=USE_CUDA)
gcn1 = Sintactic_GCN(hidden_size_rnn, hidden_size_graph, num_labels=len(DEP_LABELS))

# Initialize optimizers and criterion
encoder_optimizer = optim.Adam(encoder.parameters())
decoder_optimizer = optim.Adam(decoder.parameters())
gcn1_optimizer = optim.Adam(gcn1.parameters(), learning_rate_graph)

criterion = nn.NLLLoss()

# Move models to GPU
if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    gcn1 = gcn1.cuda()
    
# Keep track of time elapsed and running averages
start = time.time()
train_losses = []
validation_losses = []
validation_bleu = []

print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [None]:
for epoch in range(1, n_epochs): 
    # Shuffle data
    id_aux = np.random.permutation(np.arange(len(pairs_train)))
    pairs_train = pairs_train[id_aux]
    arr_dep_train = arr_dep_train[id_aux]
    
    # Get the batches for this epoch
    input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_train, return_dep_tree=True, arr_dep=arr_dep_train, max_degree=6, USE_CUDA=USE_CUDA)
    print_loss_total = 0
    for batch_ix, (input_batch, target_var) in enumerate(zip(input_batches, target_batches)):
    
        encoder.train()
        decoder.train()
        gcn1.train()
    
        [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch
        # Run the train function
        loss = train_luong(input_var, target_var, input_var.size(1), 
                    True, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)
            
        torch.cuda.empty_cache()

        # Keep track of loss
        print_loss_total += loss
        plot_loss_total += loss

        if batch_ix == 0: continue

        if batch_ix % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
            train_losses.append(loss)

            print(f'{time_since(start, batch_ix / len(input_batches))} ({batch_ix} {batch_ix / len(input_batches) * 100:.2f}%) train_loss: {print_loss_avg:.4f}')
    
    input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_valid, return_dep_tree=True, arr_dep=arr_dep_train, max_degree=6, USE_CUDA=USE_CUDA)
    print_loss_total = 0
    for input_batch, target_var in zip(input_batches, target_batches):
    
        encoder.eval()
        decoder.eval()
        gcn1.eval()
    
        [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch
        # Run the train function
        loss = train_luong(input_var, target_var, input_var.size(1), 
                     False, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)
        
        print_loss_total += loss
    val_loss = print_loss_total / len(input_batches)
    validation_losses.append(val_loss)
    # Evaluating Bleu
    evaluator = Evaluator(encoder, decoder, gcn1, None, input_lang, output_lang, MAX_LENGTH, True)
    candidates, references = evaluator.get_candidates_and_references(pairs_test, arr_dep_test, k_beams=1)
    bleu = BLEU(candidates, [references])
    validation_bleu.append(bleu)
    print(f'val_loss: {val_loss:.4f} - bleu: {bleu}', end=' ')

    # Prevent overflow gpu memory
    del evaluator

19m 22s (- 2308m 5s) (10 0.83%) train_loss: 1.1258
19m 25s (- 1147m 10s) (20 1.67%) train_loss: 1.0616
19m 28s (- 760m 7s) (30 2.50%) train_loss: 0.8672
19m 31s (- 566m 33s) (40 3.33%) train_loss: 0.9690
19m 33s (- 450m 22s) (50 4.16%) train_loss: 1.1023
19m 36s (- 372m 55s) (60 5.00%) train_loss: 1.0047
19m 39s (- 317m 30s) (70 5.83%) train_loss: 1.0243
19m 41s (- 276m 0s) (80 6.66%) train_loss: 0.9711
19m 44s (- 243m 43s) (90 7.49%) train_loss: 0.9698
19m 47s (- 217m 55s) (100 8.33%) train_loss: 0.9571
19m 50s (- 196m 46s) (110 9.16%) train_loss: 1.0412
19m 53s (- 179m 9s) (120 9.99%) train_loss: 1.1032
19m 56s (- 164m 14s) (130 10.82%) train_loss: 0.9723
19m 58s (- 151m 26s) (140 11.66%) train_loss: 0.9889
20m 1s (- 140m 20s) (150 12.49%) train_loss: 0.9124
20m 4s (- 130m 37s) (160 13.32%) train_loss: 0.9811
20m 7s (- 122m 2s) (170 14.15%) train_loss: 1.0140
20m 10s (- 114m 25s) (180 14.99%) train_loss: 1.0233
20m 13s (- 107m 35s) (190 15.82%) train_loss: 1.0557
20m 15s (- 101m 25s)

100%|██████████| 12867/12867 [05:23<00:00, 39.80it/s]


val_loss: 2.7739 - bleu: (0.0012412776617202854, [0.20323008945867704, 0.005821384730254749, 0.0009372354577337042, 3.7534015201276154e-05], 0.48870516398975156) 31m 2s (- 3697m 25s) (10 0.83%) train_loss: 0.5350
31m 5s (- 1835m 37s) (20 1.67%) train_loss: 0.5244
31m 7s (- 1215m 4s) (30 2.50%) train_loss: 0.5714
31m 10s (- 904m 46s) (40 3.33%) train_loss: 0.5122
31m 12s (- 718m 35s) (50 4.16%) train_loss: 0.5560
31m 15s (- 594m 26s) (60 5.00%) train_loss: 0.5952
31m 18s (- 505m 44s) (70 5.83%) train_loss: 0.5704
31m 20s (- 439m 12s) (80 6.66%) train_loss: 0.5518
31m 22s (- 387m 24s) (90 7.49%) train_loss: 0.4451
31m 25s (- 345m 56s) (100 8.33%) train_loss: 0.5614
31m 27s (- 312m 1s) (110 9.16%) train_loss: 0.4431
31m 30s (- 283m 46s) (120 9.99%) train_loss: 0.5971
31m 32s (- 259m 51s) (130 10.82%) train_loss: 0.5648
31m 34s (- 239m 20s) (140 11.66%) train_loss: 0.5915
31m 37s (- 221m 32s) (150 12.49%) train_loss: 0.5195
31m 39s (- 205m 58s) (160 13.32%) train_loss: 0.5234
31m 41s (- 19