In [1]:
from data import generate_batches
from data import prepare_data
from data import data_to_index
from data import DEP_LABELS
from data import random_batch

from model.encoder import Encoder
from model.decoder import Decoder_luong
#from model.tree_lstm import Tree_lstm

from BLEU import BLEU

from utils import time_since

from evaluator import Evaluator

import torch
import torch.nn as nn
from torch.nn import functional
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

import numpy as np
import time
import random

#from validation import Evaluator

%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Child sum tree lstm
class Tree_lstm(nn.Module):
    def __init__(self, in_dim, mem_dim):
        super(Tree_lstm, self).__init__()
        self.in_dim = in_dim
        self.mem_dim = mem_dim
        self.ioux = nn.Linear(self.in_dim, 3 * self.mem_dim)
        self.iouh = nn.Linear(self.mem_dim, 3 * self.mem_dim)
        self.fx = nn.Linear(self.in_dim, self.mem_dim)
        self.fh = nn.Linear(self.mem_dim, self.mem_dim)

    def node_forward(self, inputs, child_c, child_h):
        child_h_sum = torch.sum(child_h, dim=0, keepdim=True)

        iou = self.ioux(inputs) + self.iouh(child_h_sum)
        i, o, u = torch.split(iou, iou.size(1) // 3, dim=1)
        i, o, u = F.sigmoid(i), F.sigmoid(o), F.tanh(u)

        f = F.sigmoid(
            self.fh(child_h) +
            self.fx(inputs).repeat(len(child_h), 1)
        )
        fc = torch.mul(f, child_c)

        c = torch.mul(i, u) + torch.sum(fc, dim=0, keepdim=True)
        h = torch.mul(o, F.tanh(c))
        return c, h

    def forward(self, tree, inputs):
        tree_acum = None
        for idx in range(tree.num_children):
            tree_acum = self.forward(tree.children[idx], inputs)
        
        if tree.num_children == 0:
            child_c = inputs[0].detach().new(1, self.mem_dim).fill_(0.).requires_grad_()
            child_h = inputs[0].detach().new(1, self.mem_dim).fill_(0.).requires_grad_()
        else:
            child_c, child_h = zip(* map(lambda x: x.state, tree.children))
            child_c, child_h = torch.cat(child_c, dim=0), torch.cat(child_h, dim=0)
        
        tree.state = self.node_forward(inputs[tree.idx - 1], child_c, child_h)
        if tree_acum:
            s, h = torch.cat((tree_acum[0], tree.state[0])), tree.state[1]
        else:
            s, h = tree.state
        return s, h

In [3]:
USE_CUDA = True
MAX_LENGTH = 100
DIR_FILES = 'data/translation/train/'
DIR_RESULTS = 'results/step_1'
SPLIT_TRAIN = 0.7
SPLIT_VALID = 0.15
SEED = 12
# The rest is for test

In [4]:
torch.cuda.set_device(1)

In [5]:
torch.manual_seed(SEED)
np.random.seed(SEED)

# Reading the data

In [6]:
input_lang, output_lang, trees, _, pairs = prepare_data('eng', 'esp', dir=DIR_FILES, return_trees=True)

Reading lines...
Read 115244 sentence pairs
Filtered to 83374 pairs
Creating vocab...
Creating trees...
Indexed 12248 words in input language, 22537 words in output


In [14]:
pairs_train = np.array(pairs[:60000])
pairs_test = np.array(pairs[60000:])

trees_train = np.array(trees[:60000])
trees_test = np.array(trees[60000:])

# Train

In [8]:
def train(input_batches, target_batches, input_tree,\
          encoder, decoder, tree, criterion, batch_ix, train=True):
    
    if train and (batch_ix % batch_size) == 0:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        tree_optimizer.zero_grad()
        
    loss = 0
    
    encoder_hidden = encoder.init_hidden(1)
    encoder_outputs, encoder_hidden = encoder(input_batches, encoder_hidden)

    state, tree_hidden = tree(input_tree[0], encoder_outputs)
    #encoder_outputs = torch.cat((encoder_outputs, state.unsqueeze(1)))
    #print(encoder_outputs.shape, state.shape)

    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))   
    decoder_hidden = torch.cat((encoder_hidden, tree_hidden.unsqueeze(1)))
    #decoder_hidden = encoder_hidden
    # set the start of the sentences of the batch
    decoder_input = torch.LongTensor([input_lang.vocab.stoi['<sos>']] * 1)

    # store the decoder outputs to estimate the loss
    all_decoder_outputs = Variable(torch.zeros(target_batches.size()[0], 1, len(output_lang.vocab.stoi)))
    
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()
        decoder_context = decoder_context.cuda()  
    
    if train:
        use_teacher_forcing = random.random() < teacher_forcing_ratio
    else:
        use_teacher_forcing = False
    
    if use_teacher_forcing:        
        # Use targets as inputs
        for di in range(target_batches.shape[0]):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, state.unsqueeze(1))
            
            all_decoder_outputs[di] = decoder_output
            decoder_input = target_batches[di]
    else:        
        # Use decoder output as inputs
        for di in range(target_batches.shape[0]): 
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) 
            
            all_decoder_outputs[di] = decoder_output
            
            # Greedy approach, take the word with highest probability
            topv, topi = decoder_output.data.topk(1)            
            decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze(dim=0))
            if USE_CUDA: decoder_input = decoder_input.cuda()
    
    loss = nn.NLLLoss()(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))          
    
    if train and (batch_ix % batch_size) == 0:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(tree.parameters(), clip)
        encoder_optimizer.step()
        decoder_optimizer.step()
        tree_optimizer.step()
    elif train:
        loss.backward()
    else:
        del all_decoder_outputs
        del encoder_outputs
        del decoder_hidden
        torch.cuda.empty_cache()
    
    return loss.data[0] 

# Model

In [15]:
attn_model = 'general'
hidden_size = 512
emb_size = 300
n_layers = 2
dropout_p = 0.1
seed = 12
teacher_forcing_ratio = 0.5
clip = 5.0

n_epochs = 20
batch_size = 128

In [16]:
encoder = Encoder(len(input_lang.vocab.stoi), hidden_size, emb_size, n_layers, dropout_p, input_lang, USE_CUDA)
decoder = Decoder_luong(attn_model, hidden_size, len(output_lang.vocab.stoi), emb_size, 2 * n_layers + 1, dropout_p, output_lang, USE_CUDA)
tree = Tree_lstm(hidden_size, hidden_size)

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    tree = tree.cuda()
    
learning_rate = 0.001
encoder_optimizer = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()), lr=learning_rate)
decoder_optimizer = optim.Adam(filter(lambda p: p.requires_grad, decoder.parameters()), lr=learning_rate)
tree_optimizer = optim.Adam(tree.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [17]:
# Keep track of time elapsed and running averages
start = time.time()
train_losses = []
validation_losses = []
validation_bleu = []

plot_every = 5
print_every = 5
validate_loss_every = 25
best_bleu = 0
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [None]:
for epoch in range(1, n_epochs): 
    # Shuffle data
    #id_aux = np.random.permutation(np.arange(len(pairs_train)))
    #pairs_train = pairs_train[id_aux]
    
    tree.train()
    encoder.train()
    decoder.train()
    # Get the batches for this epoch
    input_batches, input_trees, target_batches = generate_batches(input_lang, output_lang, 1, pairs_train, arr_dep=trees_train, USE_CUDA=USE_CUDA)    
    print_loss_total = 0
    for batch_ix, (input_var, input_tree, target_var) in enumerate(zip(input_batches, input_trees, target_batches)):
        
        # Run the train function
        loss = train(input_var, target_var, input_tree,\
                 encoder, decoder, tree, criterion, batch_ix, train=True)
        #loss = train_luong(input_var, target_var, input_var.size(1), True)
            
        torch.cuda.empty_cache()

        # Keep track of loss
        print_loss_total += loss

        if batch_ix == 0: continue            

        if batch_ix % (print_every * batch_size) == 0:
            print_loss_avg = print_loss_total / (print_every * batch_size)
            print_loss_total = 0
            print_summary = '%s (%d %d%%) %.4f' % (time_since(start, batch_ix / len(input_batches)), epoch, batch_ix / len(input_batches) * 100, print_loss_avg)
            train_losses.append(loss)
            print(print_summary)
    
    tree.eval()
    encoder.eval()
    decoder.eval()
    
    with torch.no_grad():
        input_batches, input_trees, target_batches = generate_batches(input_lang, output_lang, 1, pairs_test, arr_dep=trees_test, USE_CUDA=USE_CUDA)
        print_loss_total = 0
        for batch_ix, (input_var, input_tree, target_var) in enumerate(zip(input_batches, input_trees, target_batches)):

            # Run the train function
            loss = train(input_var, target_var, input_tree,\
                     encoder, decoder, tree, criterion, batch_ix, train=False)

            print_loss_total += loss
            del loss
            torch.cuda.empty_cache()
    val_loss = print_loss_total / len(input_batches)
    validation_losses.append(val_loss)
    # Evaluating Bleu
    #evaluator = Evaluator(encoder, decoder, input_lang, output_lang, MAX_LENGTH, True)
    #candidates, references = evaluator.get_candidates_and_references(pairs_test, k_beams=1)
    #bleu = BLEU(candidates, [references])
    #if bleu[0] > best_bleu:
    #    best_bleu = bleu[0]
    #    torch.save(encoder.state_dict(), f'{DIR_RESULTS}/encoder.pkl')
    #    torch.save(decoder.state_dict(), f'{DIR_RESULTS}/decoder.pkl')
    #validation_bleu.append(bleu)
    print(f'val_loss: {val_loss:.4f} - bleu: {0}')

    # Prevent overflow gpu memory
   # del evaluator



1m 10s (- 108m 31s) (1 1%) 9.2668
2m 9s (- 98m 44s) (1 2%) 8.5496
3m 7s (- 94m 21s) (1 3%) 7.7006
4m 5s (- 91m 48s) (1 4%) 7.4920
5m 2s (- 89m 37s) (1 5%) 7.2081
5m 58s (- 87m 21s) (1 6%) 6.9733
6m 56s (- 86m 3s) (1 7%) 6.9306
7m 54s (- 84m 41s) (1 8%) 6.9142
8m 53s (- 83m 41s) (1 9%) 6.8339
9m 50s (- 82m 25s) (1 10%) 6.8981
10m 48s (- 81m 15s) (1 11%) 6.7765
11m 44s (- 80m 2s) (1 12%) 6.6715
12m 44s (- 79m 7s) (1 13%) 6.7975
13m 43s (- 78m 11s) (1 14%) 6.6339
14m 43s (- 77m 16s) (1 16%) 6.5732
15m 42s (- 76m 17s) (1 17%) 6.3988
16m 41s (- 75m 21s) (1 18%) 6.4290
17m 39s (- 74m 18s) (1 19%) 6.4801
18m 36s (- 73m 13s) (1 20%) 6.4630
19m 36s (- 72m 17s) (1 21%) 6.7799
20m 38s (- 71m 29s) (1 22%) 6.8829
21m 36s (- 70m 26s) (1 23%) 6.3316
22m 34s (- 69m 26s) (1 24%) 6.3903
23m 35s (- 68m 32s) (1 25%) 6.4865
24m 34s (- 67m 35s) (1 26%) 6.4363
25m 33s (- 66m 35s) (1 27%) 6.4828
26m 32s (- 65m 36s) (1 28%) 6.6457
27m 33s (- 64m 41s) (1 29%) 6.9014
28m 33s (- 63m 44s) (1 30%) 6.7278
29m 34s (-

In [None]:
evaluator = Evaluator(encoder, decoder, input_lang, output_lang, 
                      MAX_LENGTH, USE_CUDA)
candidates, references = evaluator.get_candidates_and_references(pairs_test[:10000], k_beams=2)
len(candidates), len(references)

In [114]:
batch_ix

4428

In [77]:
loss = train(input_var, target_var, input_tree,\
                 encoder, decoder, tree, criterion, batch_ix, train=False)

torch.Size([5, 1, 512])




IndexError: index 5 is out of bounds for dimension 0 with size 5

In [120]:
batch_ix

4428

In [20]:
sentence = ' '.join([input_lang.vocab.itos[t] for t in input_var[:-1]])
len(sentence.split()), sentence, len(input_var)

(9, 'sometimes you gotta do what you gotta do .', 10)

In [34]:
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'data/lib/stanford-corenlp')
temp_tree = nlp.dependency_parse("hi everyone")

In [35]:
temp_tree

[('ROOT', 0, 2), ('dep', 2, 1)]

In [124]:
re.findall(r'[?|.|!|"]', 'asdasdasdasdas"""""')

['"', '"', '"', '"', '"']

In [125]:
asd = 'asdas"""""'

In [126]:
asd.replace('"', "'")

"asdas'''''"

In [119]:
temp_tree

[('ROOT', 0, 11),
 ('det', 2, 1),
 ('nsubj', 11, 2),
 ('punct', 8, 3),
 ('amod', 8, 4),
 ('punct', 8, 5),
 ('cc', 8, 6),
 ('punct', 8, 7),
 ('dep', 2, 8),
 ('punct', 8, 9),
 ('cop', 11, 10),
 ('punct', 11, 12)]

In [118]:
from utils import print_tree
print_tree(input_tree[0], 0)

arbol: 0 N hijos: 1
parent: 0 child 11

arbol: 11 N hijos: 3
parent: 11 child 2

arbol: 2 N hijos: 2
parent: 2 child 1

arbol: 2 N hijos: 2
parent: 2 child 8

arbol: 8 N hijos: 6
parent: 8 child 3

arbol: 8 N hijos: 6
parent: 8 child 4

arbol: 8 N hijos: 6
parent: 8 child 5

arbol: 8 N hijos: 6
parent: 8 child 6

arbol: 8 N hijos: 6
parent: 8 child 7

arbol: 8 N hijos: 6
parent: 8 child 9

arbol: 11 N hijos: 3
parent: 11 child 10

arbol: 11 N hijos: 3
parent: 11 child 12



In [121]:
len(pairs_test[4428][0].split())

8

In [31]:
for batch_ix, (input_var, input_tree, target_var) in enumerate(zip(input_batches, input_trees, target_batches)):
    
        encoder.eval()
        decoder.eval()
    
        #[input_var, _, _, _, _, _, _, _] = input_batch
        # Run the train function
        loss = train(input_var, target_var, input_tree,\
                 encoder, decoder, tree, criterion, batch_ix, train=False)
        
        print_loss_total += loss
        del loss
        torch.cuda.empty_cache()
val_loss = print_loss_total / len(input_batches)
validation_losses.append(val_loss)

print(f'val_loss: {val_loss:.4f} - bleu: {0}')




KeyboardInterrupt: 

In [22]:
BLEU(candidates, [references]) 

NameError: name 'candidates' is not defined

In [23]:
train_losses

[10.000017166137695,
 7.632211208343506,
 5.206856727600098,
 4.905932903289795,
 7.33738899230957,
 7.596729278564453,
 5.707835674285889,
 6.398375511169434,
 7.879125595092773,
 3.5190353393554688,
 5.947818279266357,
 6.372866630554199]

In [17]:
batch_ix

248