In [1]:
import string
import re
import random
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import gensim
from gensim.models import KeyedVectors
import numpy as np
from operator import add

In [2]:
# This class generates vocabulary and word2index and index2word on any corpus
class Language():
    def __init__(self, filename, filter_special_characters, word_count_threshold, special_symbols_list, pretrained_word2vec_path=None, word_embedding_dim=None):
        self.contents = self.read_file(filename)
        self.contents = self.normalize_file_contents(self.contents, filter_special_characters)
        self.vocabulary = self.generate_vocabulary(self.contents, word_count_threshold, special_symbols_list)
        self.word2index = self.generate_word2index(self.vocabulary)
        self.index2word = self.generate_index2word(self.vocabulary)
        self.filter_special_characters = filter_special_characters
        self.special_symbols_list = special_symbols_list
        if pretrained_word2vec_path is not None and word_embedding_dim is not None:
            self.word_embedding_dim = word_embedding_dim
            self.word_embeddings = self.initialize_word_vectors(pretrained_word2vec_path)
        else:
            self.word_embedding_dim = None
            self.word_embeddings = None
            
    # Returns contents of a file as list of sentences
    def read_file(self, filename):
        _file = open(filename,'r')
        contents = []
        for line in _file:
            contents.append(line)
        _file.close()
        return contents

    # Lowercase, trim, and remove filter_special_characters 
    def normalize_file_contents(self, contents, filter_special_characters):
        normalized_contents = []
        for line in contents:
            line = line.lower().strip()
            line = line.translate(None, filter_special_characters)
            normalized_contents.append(line.split())
        return normalized_contents

    # Returns the vocabulary- words below a threshold are dropped and special symbols are added(SOS, EOS)   
    def generate_vocabulary(self, contents, word_count_threshold, special_symbols_list):
        vocab = []
        for special_symbols in special_symbols_list:
            vocab.append(special_symbols)  

        counter = Counter()
        for line in contents:
            counter.update(line)

        for word,count in counter.iteritems():
            if count > word_count_threshold:
                vocab.append(word)
        return vocab

    # maps word to index
    def generate_word2index(self, vocabulary):
        word2index = {}
        for index, word in enumerate(vocabulary):
            word2index[word] = index
        return word2index

    # maps index to word
    def generate_index2word(self, vocabulary):
        index2word = {}
        for index, word in enumerate(vocabulary):
            index2word[index] = word
        return index2word
    
    def initialize_word_vectors(self, pretrained_word2vec_path):
        word2vec_model = KeyedVectors.load_word2vec_format(pretrained_word2vec_path, binary=True)
        word_vectors = np.random.uniform(-0.1, 0.1, (len(self.vocabulary), self.word_embedding_dim))
        for index, word in self.index2word.iteritems():
            if word in word2vec_model:
                word_vectors[index, :] = word2vec_model[word]
        return word_vectors

In [4]:
sourceLanguage = Language(filename= '../training_data/geo_tr.nl.tem',
                          filter_special_characters= string.punctuation.translate(None, '@'),
                          word_count_threshold= 1,
                          special_symbols_list=['<sos>','<eos>','<unk>'],
                          pretrained_word2vec_path='/Users/wyz0214/Downloads/GoogleNews-vectors-negative300.bin', 
                          word_embedding_dim=300)

In [5]:
# had to add unkown in sql vocab--> Check this
targetLanguage = Language(filename= '../training_data/geo_tr.sql.tem',
                          filter_special_characters= '',
                          word_count_threshold= 1,
                          special_symbols_list= ['<sos>','<eos>','<unk>'])

In [6]:
# This class is used to load the dataset
class Nl2SqlDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.dataframe = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        source = self.dataframe.iloc[idx, 0]
        target = self.dataframe.iloc[idx, 1]
        
        sample = {'nl': source, 'sql': target}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample

# Transformers:
# 1. Lowercase, trim, and remove filter_special_characters
# 2. add SOS, EOS and UNK to the sentence
# 3. converts sentence to index
# 4. to tensor

class Transformer(object):
    
    def __init__(self, sourceLanguage, targetLanguage):
        self.source_language = sourceLanguage
        self.target_language = targetLanguage

    def __call__(self, sample):
        nl, sql = sample['nl'], sample['sql']
        
        # 1. Lowercase, trim, and remove filter_special_characters
        nl  = self.normalize_line(nl, self.source_language.filter_special_characters)
        sql = self.normalize_line(sql, self.target_language.filter_special_characters)
        
        # 2. add SOS, EOS and UNK to the sentence
        nl = self.replace_unknown_words(nl, self.source_language, self.source_language.special_symbols_list[2])
        sql = self.replace_unknown_words(sql, self.target_language, self.target_language.special_symbols_list[2])
        nl = [self.source_language.special_symbols_list[0]] + nl + [self.source_language.special_symbols_list[1]]
        sql= [self.target_language.special_symbols_list[0]] + sql + [self.target_language.special_symbols_list[1]]
        
        # 3. converts sentence to index
        nl  = self.sentence2index(nl, self.source_language)
        sql = self.sentence2index(sql, self.target_language)
        
        # 4. to tensor
        nl = torch.LongTensor(nl)
        sql = torch.LongTensor(sql)
        
        return {'nl': nl, 'sql': sql}
    
    def normalize_line(self, line, filter_special_characters):
        line = line.lower().strip()
        line = line.translate(None, filter_special_characters)
        line = line.split()
        return line

    def replace_unknown_words(self, sentence, language, unknown_symbol):
        for idx, word in enumerate(sentence):
            if word not in language.vocabulary:
                sentence[idx] = unknown_symbol
        return sentence
    
    def sentence2index(self, sentence, language):
        new_sentence = []
        for word in sentence:
            new_sentence.append(language.word2index[word])
        return new_sentence

In [7]:
training_dataset = Nl2SqlDataset('../training_data/geo_train.tem.csv', transform=Transformer(sourceLanguage, targetLanguage))
dev_dataset = Nl2SqlDataset('../training_data/geo_dev.tem.csv', transform=Transformer(sourceLanguage, targetLanguage))
test_dataset = Nl2SqlDataset('../training_data/geo_test.tem.csv', transform=Transformer(sourceLanguage, targetLanguage))

In [8]:
class Encoder(nn.Module):
    def __init__(self, source_language, lstm_hidden_size, lstm_num_layers, dropout_prob):
        super(Encoder, self).__init__()
        self.num_embeddings = len(source_language.vocabulary)
        self.embedding_dim = source_language.word_embedding_dim
        self.lstm_hidden_size = lstm_hidden_size
        self.lstm_num_layers = lstm_num_layers
        
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
        self.bilstm = nn.LSTM(input_size=self.embedding_dim, 
                              hidden_size= lstm_hidden_size,
                              num_layers= lstm_num_layers,
                              bidirectional=True)
        self.dropout = nn.Dropout(dropout_prob)
        
        self.initialize_embeddings(source_language.word_embeddings)
        
    def initialize_embeddings(self, initial_word_embeddings):
        self.embedding.weight.data.copy_(torch.from_numpy(initial_word_embeddings))
    
    def forward(self, source_language_sentence):
        
        embedded  = self.embedding(source_language_sentence)
        embedded  = self.dropout(embedded)
        
        seq_len = len(source_language_sentence)
        bilstm_input = embedded.view(seq_len, 1, self.embedding_dim)
        
        output, (hidden_state, cell_state) = self.bilstm(bilstm_input)
        
        return output, hidden_state, cell_state

In [9]:
class AttnDecoder(nn.Module):
    def __init__(self, target_language, encoder, target_embedding_dim, lstm_num_layers, dropout_prob):
        super(AttnDecoder, self).__init__()
        self.num_embeddings = len(target_language.vocabulary)
        self.embedding_dim = target_embedding_dim
        self.lstm_hidden_size = encoder.lstm_hidden_size
        self.lstm_num_layers = lstm_num_layers
        
        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
        self.bilstm = nn.LSTM(input_size=self.embedding_dim + encoder.lstm_hidden_size*2,
                              hidden_size=self.lstm_hidden_size,
                              num_layers= lstm_num_layers,
                              bidirectional=True)
        self.output_layer = nn.Linear(self.lstm_hidden_size*2 + encoder.lstm_hidden_size*2, len(target_language.vocabulary))
        self.dropout = nn.Dropout(dropout_prob)
        self.softmax = nn.LogSoftmax()
        
    def forward(self, target_language_word, hidden_state, cell_state, encoder_outputs):
        
        #1 Embedding layer
        embedded = self.embedding(target_language_word)
        embedded  = self.dropout(embedded)
        bilstm_input = embedded.view(1, 1, self.embedding_dim)
       
        #2 Attention layer
        attn_weights = self.attn(hidden_state, encoder_outputs) #(1, 1, seqlen)
        context = torch.bmm(attn_weights, torch.transpose(encoder_outputs, 0, 1)) #(1,1,2*encoder_lstm_hidden_size)
        
        #3 Bilstm
        bilstm_input = torch.cat((bilstm_input, context), 2)
        bilstm_output, (hidden_state, cell_state) = self.bilstm(bilstm_input, (hidden_state, cell_state))
        
        #4 Output Layer
        output_layer_input = torch.cat((bilstm_output, context), 2)
        output_layer_input = torch.squeeze(output_layer_input, 0) #(1, self.lstm_hidden_size*2 + encoder.lstm_hidden_size*2)
        output = self.softmax(self.output_layer(output_layer_input)) #(1, len(target_language.vocabulary))
        
        return output, hidden_state, cell_state 
    
    def attn(self, hidden_state, encoder_outputs):
        seqlen = len(encoder_outputs)
        attn_energies = Variable(torch.zeros(seqlen))
        
        for i in range(seqlen):
            attn_energies[i] = torch.dot(hidden_state.view(1,-1), encoder_outputs[i])
        
        return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)

In [10]:
def train(source_language_sentence, target_language_sentence, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, clip_gradient):
    # set training to true for dropout layers
    encoder.train()
    decoder.train()
    
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    # Get size of input and target sentences
    input_length = source_language_sentence.size()[0]
    target_length = target_language_sentence.size()[0]
    
    # Run words through encoder
    encoder_outputs, encoder_hidden_state, encoder_cell_state = encoder(source_language_sentence)
    # encoder_outputs -> (input_length, 1, 2*encoder_hidden_size)
    # encoder_hidden_state -> (2, 1, encoder_hidden_size)

    # Prepare decoder input and output
    decoder_hidden_state = encoder_hidden_state
    decoder_cell_state = Variable(torch.zeros(decoder_hidden_state.shape[0], decoder_hidden_state.shape[1], decoder_hidden_state.shape[2]))
    
    #only using teacher forcing for now --> Check this
    loss = 0
    for i in range(target_length-1):
        decoder_output, decoder_hidden_state, decoder_cell_state = decoder(target_language_sentence[i].view(1,1), decoder_hidden_state, decoder_cell_state, encoder_outputs)
        loss += criterion(decoder_output, target_language_sentence[i+1])
    
    # Backpropagation
    loss.backward()
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip_gradient)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip_gradient)
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0] / target_length 

In [180]:
def evaluate(dataset, num_samples, source_language, target_language, encoder, decoder, criterion, verbose=False):
    # set training to false for dropout layers
    encoder.eval()
    decoder.eval()
    
    total_loss = 0
    for n_sample in range(num_samples):
        sample = random.choice(dataset)
        source_language_sentence, target_language_sentence = Variable(sample['nl']), Variable(sample['sql'])
        
        # Get size of input and target sentences
        input_length = source_language_sentence.size()[0]
        target_length = target_language_sentence.size()[0]

        # Run words through encoder
        encoder_outputs, encoder_hidden_state, encoder_cell_state = encoder(source_language_sentence)
        # encoder_outputs -> (input_length, 1, 2*encoder_hidden_size)
        # encoder_hidden_state -> (2, 1, encoder_hidden_size)

        # Prepare decoder input and output
        decoder_hidden_state = encoder_hidden_state
        decoder_cell_state = Variable(torch.zeros(decoder_hidden_state.shape[0], decoder_hidden_state.shape[1], decoder_hidden_state.shape[2]))
        
        loss = 0
        decoder_input = target_language_sentence[0].view(1,1)
        print(decoder_input)
        predicted_sentence = []
        for i in range(target_length-1):
            decoder_output, decoder_hidden_state, decoder_cell_state = decoder(decoder_input, decoder_hidden_state, decoder_cell_state, encoder_outputs)
            loss += criterion(decoder_output, target_language_sentence[i+1])
            
            # Choose top word from output
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]
            if ni == target_language.word2index['<eos>']:
                #print('!!!!!')
                break
            
            # Next input is chosen word
            decoder_input = Variable(torch.LongTensor([[ni]]))
            predicted_sentence.append(ni)
        
        total_loss += (loss.data[0] / target_length)
        if verbose:
            print 'Sample ' + str(n_sample)
            print 'Source sentence = ' + str(to_sentence(source_language_sentence.data.numpy(), source_language))
            print 'Target sentence = ' + str(to_sentence(target_language_sentence.data.numpy(), target_language))
            print 'Predicted sentence = ' + str(to_sentence(predicted_sentence, target_language))
            
    return total_loss/num_samples

def to_sentence(index_list ,language):
    sentence = []
    for index in index_list:
        sentence.append(language.index2word[index])
    return sentence

In [205]:
def evaluate_withBeam(dataset, num_samples, source_language, target_language, encoder, decoder, criterion, num_beam=1, verbose=False):
    # set training to false for dropout layers
    encoder.eval()
    decoder.eval()
    
    total_loss = 0
    for n_sample in range(num_samples):
        sample = random.choice(dataset)
        source_language_sentence, target_language_sentence = Variable(sample['nl']), Variable(sample['sql'])
        
        # Get size of input and target sentences
        input_length = source_language_sentence.size()[0]
        target_length = target_language_sentence.size()[0]

        # Run words through encoder
        encoder_outputs, encoder_hidden_state, encoder_cell_state = encoder(source_language_sentence)
        # encoder_outputs -> (input_length, 1, 2*encoder_hidden_size)
        # encoder_hidden_state -> (2, 1, encoder_hidden_size)

        # Prepare decoder input and output
        decoder_hidden_state = encoder_hidden_state
        decoder_cell_state = Variable(torch.zeros(decoder_hidden_state.shape[0], decoder_hidden_state.shape[1], decoder_hidden_state.shape[2]))
        
        decoder_input = target_language_sentence[0].view(1,1)
        
        
        loss = []
        predicted_sentences = []
        
        decoder_output, decoder_hidden_state, decoder_cell_state = decoder(decoder_input, decoder_hidden_state, decoder_cell_state, encoder_outputs)
            
        topv, topi = decoder_output.data.topk(num_beam)
        for m in range(num_beam):
            predicted_sentences.append([[topi[0][m]], -topv[0][m]])
            loss.append(criterion(decoder_output, target_language_sentence[1]))
        #print(predicted_sentences)
        
        for i in range(target_length-2): #target_length-1
            temp_loss = []
            temp_sentences = []
            for ct, n in enumerate(predicted_sentences):
                if n[0][-1] == '<eos>':
                    temp_sentences.append(n)
                    temp_loss.append(loss[ct])
                    continue
                    
                decoder_input = Variable(torch.LongTensor([[n[0][-1]]]))
                decoder_output, decoder_hidden_state, decoder_cell_state = decoder(decoder_input, decoder_hidden_state, decoder_cell_state, encoder_outputs)

                #print(decoder_output.data)
                #print(topi)

                topv, topi = decoder_output.data.topk(num_beam)
                for j in range(num_beam):
                    ni = topi[0][j]
                    wordLst = n[0][:]
                    temp_loss.append(loss[ct]+criterion(decoder_output, target_language_sentence[i+1]))
                    #print(ni)
                    if ni == target_language.word2index['<eos>']:
                        print(wordLst)
                        wordLst.append('<eos>')
                        temp_sentences.append([wordLst, float('Inf')])
                    else:
                        wordLst.append(ni)
                        temp_sentences.append([wordLst, n[1]-topv[0][j]])
            #print(temp_sentences)
            predicted_sentences = sorted(temp_sentences, key=lambda lst: lst[1])[:num_beam][:]
            print(predicted_sentences)
            predicted_ind = sorted(range(len(temp_sentences)), key=lambda i: temp_sentences[i])
            loss = [x for _, x in sorted(zip(predicted_ind, temp_loss), key=lambda pair: pair[0])][:num_beam]
            
        #print(predicted_sentences)
        predicted_sentence = sorted(predicted_sentences, key=lambda lst: lst[1])[0][0]
        if predicted_sentence[-1] == '<eos>':
            predicted_sentence = predicted_sentence[:-1]
        predicted_i = sorted(range(len(predicted_sentences)), key=lambda i: predicted_sentences[i])
        loss_ = [x for _, x in sorted(zip(predicted_i, loss), key=lambda pair: pair[0])][0]
            
        total_loss += (loss_.data[0] / target_length)
        if verbose:
            print 'Sample ' + str(n_sample)
            print 'Source sentence = ' + str(to_sentence(source_language_sentence.data.numpy(), source_language))
            print 'Target sentence = ' + str(to_sentence(target_language_sentence.data.numpy(), target_language))
            print 'Predicted sentence = ' + str(to_sentence(predicted_sentence, target_language))
            
    return total_loss/num_samples

In [206]:
test_loss_beam = evaluate_withBeam(test_dataset, 10, sourceLanguage, targetLanguage, encoder, decoder, criterion, num_beam=1, verbose=True)



[[[56, 54], 0.3299456719250884]]
[[[56, 54, 58], 0.3299702196381986]]
[[[56, 54, 58, 69], 0.33041607023915276]]
[[[56, 54, 58, 69, 46], 0.33073886652709916]]
[[[56, 54, 58, 69, 46, 64], 0.3328757959534414]]
[[[56, 54, 58, 69, 46, 64, 84], 0.33350016607437283]]
[[[56, 54, 58, 69, 46, 64, 84, 5], 0.3574521156260744]]
[[[56, 54, 58, 69, 46, 64, 84, 5, 36], 0.36073526937980205]]
[56, 54, 58, 69, 46, 64, 84, 5, 36]
[[[56, 54, 58, 69, 46, 64, 84, 5, 36, '<eos>'], inf]]
Sample 0
Source sentence = ['<sos>', 'what', 'is', 'the', 'lowest', 'point', 'in', 'state@0', 'in', '<unk>', '<eos>']
Target sentence = ['<sos>', 'select', 'highlow.lowest_point', 'from', 'highlow', 'where', 'highlow.state_name', '=', 'state@0', ';', '<eos>']
Predicted sentence = ['select', 'highlow.lowest_point', 'from', 'highlow', 'where', 'highlow.state_name', '=', 'state@0', ';']
[[[56, 35], 0.38689592983791954]]
[[[56, 35, 58], 0.38924827673690743]]
[[[56, 35, 58, 33], 0.38943239506261307]]
[[[56, 35, 58, 33, 46], 0.42994

[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58], 3.5570292273350788]]
[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33], 3.5787308053859306]]
[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33, 82], 3.6074427930452657]]
[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33, 82, 36], 3.676321019027455]]
[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33, 82, 36]
[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33, 82, 36, '<eos>'], inf]]
[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33, 82, 36, '<eos>'], inf]]
[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33, 82, 36, '<eos>'], inf]]
[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33, 82, 36, '<eos>'], inf]]
[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33, 82, 36, '<eos>'], inf]]
[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33, 82, 36, '<eos>'], inf]]
[[[56, 78, 58, 33, 46, 40, 84, 9, 56, 34, 9, 40, 82, 58, 33, 82, 36, '<eos>'], inf]]

In [20]:
# initialize models, optimizers, and a loss function (criterion).
learning_rate = 0.0005  ##
batch_size = 200  ##
num_epochs = 100  ##
clip_gradient = 5.0

encoder_lstm_hidden_size = 100
encoder_lstm_num_layers=1  #--
encoder_dropout_prob = 0.05  ##

decoder_target_embedding_dim=100
decoder_lstm_num_layers=1  #--
decoder_dropout_prob = 0.05  ##

encoder = Encoder(source_language= sourceLanguage, 
                  lstm_hidden_size= encoder_lstm_hidden_size, 
                  lstm_num_layers= encoder_lstm_num_layers,
                  dropout_prob= encoder_dropout_prob)
decoder = AttnDecoder(target_language= targetLanguage, 
                      encoder= encoder, 
                      target_embedding_dim= decoder_target_embedding_dim, 
                      lstm_num_layers= decoder_lstm_num_layers,
                      dropout_prob= decoder_dropout_prob)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [21]:
# Start training
for n_epochs in range(num_epochs):
    training_loss = 0
    for batch in range(batch_size):
        sample = random.choice(training_dataset)
        source_language_sentence, target_language_sentence = Variable(sample['nl']), Variable(sample['sql'])
        training_loss += train(source_language_sentence, target_language_sentence, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, clip_gradient)
    
    training_loss = training_loss/batch_size
    dev_loss = evaluate(dev_dataset, 10, sourceLanguage, targetLanguage, encoder, decoder, criterion)
    print 'Training loss after ' + str(n_epochs) + ' epoch is ' + str(training_loss) + ' dev loss= ' + str(dev_loss)



Training loss after 0 epoch is 2.27607825993 dev loss= 1.15467170247
Training loss after 1 epoch is 1.20679311445 dev loss= 1.20678237979
Training loss after 2 epoch is 0.943579580331 dev loss= 1.19761471711
Training loss after 3 epoch is 0.730745500779 dev loss= 1.71075533092
Training loss after 4 epoch is 0.703775370299 dev loss= 1.79269359937
Training loss after 5 epoch is 0.573016143514 dev loss= 1.86106495036
Training loss after 6 epoch is 0.530724371274 dev loss= 1.33312898146
Training loss after 7 epoch is 0.542422477901 dev loss= 0.746056127876
Training loss after 8 epoch is 0.571043197164 dev loss= 1.68412802186
Training loss after 9 epoch is 0.524221282079 dev loss= 1.53416181212
Training loss after 10 epoch is 0.449801435331 dev loss= 0.694984872877
Training loss after 11 epoch is 0.546943508527 dev loss= 1.36338791149
Training loss after 12 epoch is 0.519769422816 dev loss= 2.20712772115
Training loss after 13 epoch is 0.500848457615 dev loss= 1.34798843869
Training loss af

In [181]:
# Evaluating on small sample of test set
test_loss = evaluate(test_dataset, 10, sourceLanguage, targetLanguage, encoder, decoder, criterion, verbose=True)

Variable containing:
 0
[torch.LongTensor of size 1x1]

Sample 0
Source sentence = ['<sos>', 'what', 'states', 'have', 'cities', 'named', 'city@0', '<eos>']
Target sentence = ['<sos>', 'select', 'city.state_name', 'from', 'city', 'where', 'city.city_name', '=', 'city@0', ';', '<eos>']
Predicted sentence = ['select', 'city.state_name', 'from', 'city', 'where', 'city.city_name', '=', 'city@0', ';']
Variable containing:
 0
[torch.LongTensor of size 1x1]

Sample 1
Source sentence = ['<sos>', 'which', 'state', 'has', 'the', 'highest', 'elevation', '<eos>']
Target sentence = ['<sos>', 'select', 'highlow.state_name', 'from', 'highlow', 'where', 'highlow.highest_elevation', '=', '(', 'select', 'max', '(', 'highlow.highest_elevation', ')', 'from', 'highlow', ')', ';', '<eos>']
Predicted sentence = ['select', 'state.state_name', 'from', 'state', 'where', 'state.density', '=', '(', 'select', 'max', '(', 'highlow.highest_elevation', ')', 'from', 'highlow', ')', ';']
Variable containing:
 0
[torch.



Sample 7
Source sentence = ['<sos>', 'how', 'many', 'states', 'does', 'state@0', 'border', '<eos>']
Target sentence = ['<sos>', 'select', 'count', '(', 'border_info.border', ')', 'from', 'border_info', 'where', 'border_info.state_name', '=', 'state@0', ';', '<eos>']
Predicted sentence = ['select', 'count', '(', 'border_info.border', ')', 'from', 'border_info', 'where', 'border_info.state_name', '=', 'state@0', ';']
Variable containing:
 0
[torch.LongTensor of size 1x1]

Sample 8
Source sentence = ['<sos>', 'which', 'states', 'does', 'the', 'river@0', 'run', 'through', '<eos>']
Target sentence = ['<sos>', 'select', 'river.traverse', 'from', 'river', 'where', 'river.river_name', '=', 'river@0', ';', '<eos>']
Predicted sentence = ['select', 'river.traverse', 'from', 'river', 'where', 'river.river_name', '=', 'river@0', ';']
Variable containing:
 0
[torch.LongTensor of size 1x1]

Sample 9
Source sentence = ['<sos>', 'what', 'is', 'the', 'most', '<unk>', 'state', 'in', 'the', 'united', 'sta

In [23]:
test_loss

2.1233193239597012

In [None]:
#Incomplete Beam search-- fill it up or throw it and implement from scratch
class State(object):
    def __init__(self, decoder_input, decoder_hidden_state, decoder_cell_state, encoder_outputs, current_score, decoded_sentence):
            self.decoder_input = decoder_input
            self.decoder_hidden_state = decoder_hidden_state
            self.decoder_cell_state = decoder_cell_state
            self.encoder_outputs = encoder_outputs
            self.current_score = current_score
            self.decoded_sentence = decoded_sentence

class BeamSearch(object):
    def __init__(self, beam_size, decoder):
        self.beam_size = beam_size
        self.decoder = decoder
        self.current_states = []
        self.next_states = []
    
    # initialize first state in current state
    def initialize_beam_search(self, decoder_input, decoder_hidden_state, decoder_cell_state, encoder_outputs):
    
    #performs one step of beam search
    def forward(self):
        for state in self.current_states:
            decoder_output, decoder_hidden_state, decoder_cell_state = self.decoder(state.decoder_input, 
                                                                                    state.decoder_hidden_state, 
                                                                                    state.decoder_cell_state,
                                                                                    state.encoder_outputs)
            self.next_states.append(self.get_next_states(decoder_output, decoder_hidden_state, decoder_cell_state, state.encoder_outputs, state.current_score, state.decoded_sentence))
        
        self.current_states[:] = get_best_states(self.beam_size, self.next_states)
        self.next_states[:] = []
        
    #computes the next states
    def get_next_states():
    
    #sorts the states based on score
    def get_best_states():
    
    def search(self):
        self.initialize_beam_search()
        # call forward in loop to perform one step of beam search 
        # do untill EOS is obtained

In [28]:
from math import log
from numpy import array
from numpy import argmax

# beam search
def beam_search_decoder(data, k):
    sequences = [[list(), 1.0]]
    # walk over each step in sequence
    for row in data:
        all_candidates = list()
        # expand each current candidate
        for i in range(len(sequences)):
            seq, score = sequences[i]
            for j in range(len(row)):
                candidate = [seq + [j], score * -log(row[j])]
                all_candidates.append(candidate)
        # order all candidates by score
        ordered = sorted(all_candidates, key=lambda tup:tup[1])
        # select k best
        sequences = ordered[:k]
    return sequences

# define a sequence of 10 words over a vocab of 5 words
data = [[0.1, 0.2, 0.3, 0.4, 0.5],
		[0.5, 0.4, 0.3, 0.2, 0.1],
		[0.1, 0.2, 0.3, 0.4, 0.5],
		[0.5, 0.4, 0.3, 0.2, 0.1],
		[0.1, 0.2, 0.3, 0.4, 0.5],
		[0.5, 0.4, 0.3, 0.2, 0.1],
		[0.1, 0.2, 0.3, 0.4, 0.5],
		[0.5, 0.4, 0.3, 0.2, 0.1],
		[0.1, 0.2, 0.3, 0.4, 0.5],
		[0.5, 0.4, 0.3, 0.2, 0.1]]
data = array(data)
# decode sequence
result = beam_search_decoder(data, 3)
# print result
for seq in result:
    print(seq)

[[4, 0, 4, 0, 4, 0, 4, 0, 4, 0], 0.025600863289563108]
[[4, 0, 4, 0, 4, 0, 4, 0, 4, 1], 0.03384250043584397]
[[4, 0, 4, 0, 4, 0, 4, 0, 3, 0], 0.03384250043584397]


In [64]:
a = torch.FloatTensor([[1,2,3,4,5,6,7,8.9,10,11,12]])
b = []
b.append(a)
print(b)
print(a)

[

Columns 0 to 7 
  1.0000   2.0000   3.0000   4.0000   5.0000   6.0000   7.0000   8.9000

Columns 8 to 10 
 10.0000  11.0000  12.0000
[torch.FloatTensor of size 1x11]
]


Columns 0 to 7 
  1.0000   2.0000   3.0000   4.0000   5.0000   6.0000   7.0000   8.9000

Columns 8 to 10 
 10.0000  11.0000  12.0000
[torch.FloatTensor of size 1x11]



In [74]:
a = [[[1,2,3],-1], [[1,2,4],-2], [[2,2,3],-3], [[0,2,3],1]]

In [90]:
k = sorted(a, key=lambda lst: lst[1])[:3]
k

[[[2, 2, 3], -3], [[1, 2, 4], -2], [[1, 2, 3], -1]]

In [198]:
float('inf') >1e100

True