In [39]:
from data import generate_batches
from data import prepare_data
from data import data_to_index
from data import DEP_LABELS
from data import random_batch

from model.encoder import Encoder
from model.decoder import Decoder_luong
from model.gcn import Gcn

from BLEU import BLEU

from utils import time_since

from evaluator import Evaluator

import torch
import torch.nn as nn
from torch.nn import functional
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

import numpy as np
import time
import random

#from validation import Evaluator

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
USE_CUDA = False
MAX_LENGTH = 100
DIR_FILES = 'data/translation/train/'
DIR_RESULTS = 'results/step_1'
SPLIT_TRAIN = 0.7
SPLIT_VALID = 0.15
# The rest is for test

# Reading the data

In [68]:
input_lang, output_lang, input_trees, _, pairs = prepare_data('eng', 'esp', dir=DIR_FILES, return_trees=True)

Reading lines...
Read 115244 sentence pairs
Filtered to 84144 pairs
Creating vocab...
Creating matrixes...
Indexed 12330 words in input language, 21913 words in output


In [69]:
input_matrixes = input_trees

In [70]:
pairs_train = np.array(pairs[:60000])
pairs_test = np.array(pairs[60000:])

matrixes_train = np.array(input_matrixes[:60000])
matrixes_test = np.array(input_matrixes[60000:])

# Train

In [132]:
def train(input_batches, target_batches, input_matrixes,\
          encoder, decoder, gcn, criterion, batch_ix, train=True):
    
    if train and (batch_ix % batch_size) == 0:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        gcn_optimizer.zero_grad()
        
    loss = 0
    
    encoder_hidden = encoder.init_hidden(1)
    encoder_outputs, encoder_hidden = encoder(input_batches, encoder_hidden)

    encoder_outputs = nn.LeakyReLU()(gcn(encoder_outputs.squeeze(1), input_matrixes).unsqueeze(1))
    decoder_hidden = encoder_hidden
    #print(encoder_outputs.shape, state.shape)
    
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))   
    #decoder_hidden = encoder_hidden
    # set the start of the sentences of the batch
    decoder_input = torch.LongTensor([input_lang.vocab.stoi['<sos>']] * 1)

    # store the decoder outputs to estimate the loss
    all_decoder_outputs = Variable(torch.zeros(target_batches.size()[0], 1, len(output_lang.vocab.stoi)))
    
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()
        decoder_context = decoder_context.cuda()  
    
    if train:
        use_teacher_forcing = random.random() < teacher_forcing_ratio
    else:
        use_teacher_forcing = False
    
    if use_teacher_forcing:        
        # Use targets as inputs
        for di in range(target_batches.shape[0]):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs)
            
            all_decoder_outputs[di] = decoder_output
            decoder_input = target_batches[di]
    else:        
        # Use decoder output as inputs
        for di in range(target_batches.shape[0]): 
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) 
            
            all_decoder_outputs[di] = decoder_output
            
            # Greedy approach, take the word with highest probability
            topv, topi = decoder_output.data.topk(1)            
            decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze(dim=0))
            if USE_CUDA: decoder_input = decoder_input.cuda()
    
    loss = nn.NLLLoss()(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))          
    
    if train and (batch_ix % batch_size) == 0:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(gcn.parameters(), clip)
        encoder_optimizer.step()
        decoder_optimizer.step()
        gcn_optimizer.step()
    elif train:
        loss.backward()
    
    return loss.item() 

# Model

In [121]:
attn_model = 'general'
hidden_size = 512
emb_size = 300
n_layers = 2
dropout_p = 0.1
seed = 12
teacher_forcing_ratio = 0.5
clip = 5.0

n_epochs = 20
batch_size = 128

In [122]:
torch.manual_seed(seed)
np.random.seed(seed)

In [123]:
import math
import torch
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module

class Gcn(Module):
    def __init__(self, in_features, out_features, bias=True):
        super(Gcn, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.init_params()
        
    def init_params(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)
            
    def forward(self, input, adj):
        # input: (seq_len x in_features)
        # adj: (seq_len x seq_len)
        support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output

In [129]:
encoder = Encoder(len(input_lang.vocab.stoi), hidden_size, emb_size, n_layers, dropout_p, input_lang, USE_CUDA)
decoder = Decoder_luong(attn_model, hidden_size, len(output_lang.vocab.stoi), emb_size, 2 * n_layers, dropout_p, output_lang, USE_CUDA)
gcn = Gcn(hidden_size, hidden_size)

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    gcn = gcn.cuda()
    
learning_rate = 0.001
encoder_optimizer = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()), lr=learning_rate)
decoder_optimizer = optim.Adam(filter(lambda p: p.requires_grad, decoder.parameters()), lr=learning_rate)
gcn_optimizer = optim.Adam(gcn.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [130]:
# Keep track of time elapsed and running averages
start = time.time()
train_losses = []
validation_losses = []
validation_bleu = []

plot_every = 5
print_every = 5
validate_loss_every = 25
best_bleu = 0
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [133]:
for epoch in range(1, n_epochs): 
    # Shuffle data
    #id_aux = np.random.permutation(np.arange(len(pairs_train)))
    #pairs_train = pairs_train[id_aux]
    
    # Get the batches for this epoch
    input_batches, input_matrixes, target_batches = generate_batches(input_lang, output_lang, 1, pairs_train, arr_dep=matrixes_train, USE_CUDA=USE_CUDA)    
    
    encoder.train()
    decoder.train()
    gcn.train()
        
    for batch_ix, (input_var, input_matrix, target_var) in enumerate(zip(input_batches, input_matrixes, target_batches)):
        
        # Run the train function
        input_matrix = np.array(input_matrix[0])
        degree = np.array(np.sum(input_matrix, axis=0))
        degree = np.matrix(np.diag(degree))
        
        input_matrix = torch.FloatTensor(np.linalg.inv(degree) * input_matrix)

        loss = train(input_var, target_var, input_matrix,\
                 encoder, decoder, gcn, criterion, batch_ix, train=True)
        #loss = train_luong(input_var, target_var, input_var.size(1), True)
            
        torch.cuda.empty_cache()

        # Keep track of loss
        print_loss_total += loss
        plot_loss_total += loss

        if batch_ix == 0: continue            

        if batch_ix % (print_every * batch_size)  == 0:
            print_loss_avg = print_loss_total / (print_every * batch_size)
            print_loss_total = 0
            print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, batch_ix / len(input_batches) * 100, print_loss_avg)
            train_losses.append(loss)
            print(print_summary)
    
    input_batches, input_matrixes, target_batches = generate_batches(input_lang, output_lang, 1, pairs_test, arr_dep=matrixes_test, USE_CUDA=USE_CUDA)
    
    encoder.eval()
    decoder.eval()
    gcn.eval()
    
    with torch.no_grad():
        print_loss_total = 0
        for batch_ix, (input_var, input_matrix, target_var) in enumerate(zip(input_batches, input_matrixes, target_batches)):
    
            # Run the train function
            input_matrix = np.array(input_matrix[0])
            degree = np.array(np.sum(input_matrix, axis=0))
            degree = np.matrix(np.diag(degree))

            input_matrix = torch.FloatTensor(np.linalg.inv(degree) * input_matrix)
            loss = train(input_var, target_var, input_matrix,\
                     encoder, decoder, gcn, criterion, batch_ix, train=False)

            print_loss_total += loss
            torch.cuda.empty_cache()
    val_loss = print_loss_total / len(input_batches)
    validation_losses.append(val_loss)
    # Evaluating Bleu
    #evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, True)
    #candidates, references = evaluator.get_candidates_and_references(pairs_test, k_beams=1)
    #bleu = BLEU(candidates, [references])
    #if bleu[0] > best_bleu:
    #    best_bleu = bleu[0]
    #    torch.save(encoder.state_dict(), f'{DIR_RESULTS}/encoder.pkl')
    #    torch.save(decoder.state_dict(), f'{DIR_RESULTS}/decoder.pkl')
    #validation_bleu.append(bleu)
    print(f'val_loss: {val_loss:.4f} - bleu: {0}')

    # Prevent overflow gpu memory
    #del evaluator

8m 58s (- 170m 39s) (1 1%) 8.6812
17m 2s (- 323m 51s) (1 2%) 7.7930
25m 8s (- 477m 42s) (1 3%) 7.1848
33m 11s (- 630m 43s) (1 4%) 7.0467
41m 15s (- 784m 2s) (1 5%) 6.8925
49m 20s (- 937m 24s) (1 6%) 6.9293
57m 24s (- 1090m 50s) (1 7%) 6.7174
65m 28s (- 1243m 55s) (1 8%) 6.5930
73m 33s (- 1397m 36s) (1 9%) 6.4553
81m 35s (- 1550m 15s) (1 10%) 6.4956
89m 41s (- 1704m 2s) (1 11%) 6.3985
97m 45s (- 1857m 17s) (1 12%) 6.4574
105m 48s (- 2010m 18s) (1 13%) 6.8812
113m 49s (- 2162m 44s) (1 14%) 6.5730
121m 51s (- 2315m 25s) (1 16%) 6.4642
129m 59s (- 2469m 44s) (1 17%) 6.3321
137m 58s (- 2621m 33s) (1 18%) 6.3489
146m 5s (- 2775m 36s) (1 19%) 6.2115
154m 12s (- 2929m 52s) (1 20%) 6.1239
162m 15s (- 3082m 54s) (1 21%) 6.1571
170m 21s (- 3236m 45s) (1 22%) 6.1340
178m 23s (- 3389m 30s) (1 23%) 6.0719
186m 25s (- 3542m 13s) (1 24%) 6.2250
194m 29s (- 3695m 12s) (1 25%) 6.1202
202m 35s (- 3849m 13s) (1 26%) 6.0625
210m 36s (- 4001m 40s) (1 27%) 6.1299
218m 41s (- 4155m 1s) (1 28%) 6.3176
226m 41s

RuntimeError: _indices not supported on torch.LongTensor

In [96]:
temp

array([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0., 1., 0., 0., 1., 1.],
       [0., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 1., 0., 1., 1., 1., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [95]:
D = np.array(np.sum(temp, axis=0))
print(D)
D = np.matrix(np.diag(D))
D

[2. 6. 2. 2. 2. 5. 2. 3. 2. 2.]


matrix([[2., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 6., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 2., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 2., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 2., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 5., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 2., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 3., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 2., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 2.]])

In [99]:
D**-1

matrix([[0.5       , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.16666667, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.5       , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.5       , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.5       ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.2       , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.5       , 0.        , 0.        , 0.        ],
        [0.        , 0.    

In [105]:
np.multiply(temp, np.linalg.inv(D))

matrix([[0.5       , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.16666667, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.5       , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.5       , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.5       ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.2       , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.5       , 0.        , 0.        , 0.        ],
        [0.        , 0.    

In [103]:
D**-1*temp

matrix([[0.5       , 0.5       , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.16666667, 0.16666667, 0.16666667, 0.        , 0.        ,
         0.16666667, 0.        , 0.        , 0.16666667, 0.16666667],
        [0.        , 0.5       , 0.5       , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.5       , 0.        ,
         0.5       , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.5       ,
         0.5       , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.2       , 0.        , 0.2       , 0.2       ,
         0.2       , 0.        , 0.2       , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.5       , 0.5       , 0.        , 0.        ],
        [0.        , 0.    

In [64]:
ix = 0
input_batches[ix].shape, input_matrixes[ix][0].shape

(torch.Size([7, 1]), (7, 7))

In [65]:
input_var.shape

torch.Size([8, 1])

In [66]:
input_matrix.shape

torch.Size([7, 7])

In [67]:
' '.join([input_lang.vocab.itos[w] for w in input_var])

'he gave me 10 ,000 yen . <eos>'

In [58]:
type(input_matrix)

torch.Tensor

In [None]:
evaluator = Evaluator(encoder, decoder, input_lang, output_lang, 
                      MAX_LENGTH, USE_CUDA)
candidates, references = evaluator.get_candidates_and_references(pairs_test[:10000], k_beams=2)
len(candidates), len(references)

In [12]:
BLEU(candidates, [references]) 

(0.28063523097173265,
 [0.6573135078342698,
  0.37567686039915427,
  0.22488307382629802,
  0.13494545201862276],
 0.953820572858132)

In [11]:
train_losses

[9.041672706604004,
 7.149496555328369,
 14.810012817382812,
 12.101698875427246,
 9.06682014465332,
 7.260944366455078,
 6.044005393981934,
 6.0010833740234375,
 7.1673264503479,
 6.06639289855957,
 6.229933261871338,
 8.623738288879395,
 5.882908344268799,
 4.927737236022949,
 5.671040058135986,
 7.489520072937012,
 5.501565456390381,
 5.914140224456787,
 6.912267208099365,
 6.224165916442871,
 7.6937079429626465,
 6.377658843994141,
 8.010522842407227,
 8.193879127502441,
 7.284544944763184,
 4.864162445068359,
 6.447579860687256,
 6.805881977081299,
 5.03970193862915,
 6.144567012786865,
 5.522188186645508,
 6.3946533203125,
 7.317024230957031,
 7.084739685058594,
 4.866414546966553,
 4.8789286613464355,
 6.360021114349365,
 5.258521556854248,
 7.594843864440918,
 5.99109411239624,
 6.1218085289001465,
 4.6263885498046875,
 6.505831241607666,
 6.49678897857666,
 6.661844253540039,
 5.703457832336426,
 6.080120086669922,
 5.556210994720459,
 4.3718485832214355,
 7.616245269775391,
 

In [13]:
pairs_test[480]

array(["tom wasn 't convinced it was a good idea .",
       'tom no estaba convencido de que fuera una buena idea .'],
      dtype='<U245')

In [14]:
pairs_train[80000]

array(['just act as if nothing has happened .',
       'haga de cuenta que nada ha ocurrido .'],
      dtype='<U245')