In [1]:
from collections import Counter
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F

# Data Acquisition

For this assignment, you must download the data and extract it into `data/`. The dataset contains two files, both containing a single caption on each line. We should have 415,795 sentences in the training captions and 500 sentences in the validation captions.

To download the data, run the following directly on your server: `wget https://s3-us-west-2.amazonaws.com/cpsc532l-data/a3_data.zip`

In [2]:
# Load the data into memory.
train_sentences = [line.strip() for line in open("data3/mscoco_train_captions.txt").readlines()]
val_sentences = [line.strip() for line in open("data3/mscoco_val_captions.txt").readlines()]

train_sentences = [x for x in train_sentences if x] 
val_sentences = [x for x in val_sentences if x] 
print(len(train_sentences))
print(len(val_sentences))
print(train_sentences[0])
print(train_sentences[1])
print(train_sentences[2])

414143
500
A very clean and well decorated empty bathroom
A panoramic view of a kitchen and all of its appliances.
A blue and white bathroom with butterfly themed wall tiles.


# Preprocessing

The code provided below creates word embeddings for you to use. After creating the vocabulary, we construct both one-hot embeddings and word2vec embeddings. 

All of the packages utilized should be installed on your Azure servers, however you will have to download an NLTK corpus. To do this, follow the instructions below:

1. SSH to your Azure server
2. Open up Python interpreter
3. `import nltk`
4. `nltk.download()`

    You should now see something that looks like:

    ```
    >>> nltk.download()
    NLTK Downloader
    ---------------------------------------------------------------------------
        d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
    ---------------------------------------------------------------------------
    Downloader> 

    ```

5. `d punkt`
6. Provided the download finished successfully, you may now exit out of the Python interpreter and close the SSH connection.

Please look through the functions provided below **carefully**, as you will need to use all of them at some point in your assignment.

In [33]:
sentences = train_sentences

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

# Create the vocabulary. Note that we add an <UNK> token to represent words not in our vocabulary.
vocabularySize = 1000
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<UNK>"] + [e[0] for e in word_counts.most_common(vocabularySize-1)]
word2index = {word:index for index,word in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabularySize)

# Build the word2vec embeddings
wordEncodingSize = 300
filtered_sentences = [[word for word in sentence if word in word2index] for sentence in sentences]
w2v = Word2Vec(filtered_sentences, min_count=0, size=wordEncodingSize)
w2v_embeddings = np.concatenate((np.zeros((1, wordEncodingSize)), w2v.wv.syn0))

# Define the max sequence length to be the longest sentence in the training data. 
maxSequenceLength = max([len(sentence) for sentence in sentences])

def numberize(sentence):
    numberized = [word2index.get(word, 0) for word in sentence]
    return numberized

def one_hot(sentence):
    numberized = numberize(sentence)
    # Represent each word as it's one-hot embedding
    one_hot_embedded = one_hot_embeddings[numberized]
    
    return one_hot_embedded


print(sentences[1])
print(numberize(sentences[1]))
print(one_hot(sentences[1]))


['<SOS>', 'a', 'panoramic', 'view', 'of', 'a', 'kitchen', 'and', 'all', 'of', 'its', 'appliances', '.', '<EOS>']
[2, 1, 0, 174, 6, 1, 63, 10, 319, 6, 157, 616, 4, 3]
[[ 0.  0.  1. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


# 1. Building a Language Decoder

We now implement a language decoder. For now, we will have the decoder take a single training sample at a time (as opposed to batching). For our purposes, we will also avoid defining the embeddings as part of the model and instead pass in embedded inputs. While this is sometimes useful, as it learns/tunes the embeddings, we avoid doing it for the sake of simplicity and speed.

Remember to use LSTM hidden units!

In [80]:
hidden_size = 300
input_size = vocabularySize
output_size = vocabularySize


class DecoderLSTM(nn.Module):
    def __init__(self, hidden_size, input_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        

    def forward(self, inputs, hidden,state):
        output,(hidden,state) = self.lstm(inputs,(hidden,state))
        output = self.out(output)
        return output,hidden,state

    def initHidden(self):
        h = Variable(torch.zeros(1, 1, self.hidden_size))
        c = Variable(torch.zeros(1, 1, self.hidden_size))
        return h.cuda(), c.cuda()


decoder = DecoderLSTM(hidden_size, input_size, output_size) 
decoder.cuda()
decoder.train()

decoder

DecoderLSTM (
  (lstm): LSTM(1000, 300)
  (out): Linear (300 -> 1000)
)

# 2. Training a Language Decoder

We must now train the language decoder we implemented above. An important thing to pay attention to is the [inputs for an LSTM](http://pytorch.org/docs/master/nn.html#torch.nn.LSTM).

In [22]:

def train(decoder,decoder_optimizer,criterion,embeddings): 
    
    loss = 0
    decoder_optimizer.zero_grad()
    decoder_hidden, decoder_state = decoder.initHidden()
    
    #use embeddings as target variable 
    target_variable = embeddings
    nWords, VocSize = target_variable.shape
    
    decoder_input = torch.FloatTensor(target_variable[1]) 
    decoder_input = decoder_input.unsqueeze(0).unsqueeze(0)  
    decoder_input = Variable(decoder_input).cuda()
    
    #Without teacher forcing #ignore <SOS> #teaching it to break at <EOS>
    for di in range(2,nWords):
        
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,decoder_state)
        
        temp_target = torch.FloatTensor(target_variable[di]) 
        temp_target = temp_target.unsqueeze(0).unsqueeze(0)  
        decoder_target = Variable(temp_target).cuda()
        decoder_target = decoder_target.long()
        decoder_target = decoder_target.squeeze(0)
        label = torch.max(decoder_target, 1)[1]
        
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        
        next_input = one_hot_embeddings[ni.cpu().numpy()]
        decoder_input = Variable(torch.FloatTensor(next_input).unsqueeze(0)) 
        decoder_input = decoder_input.cuda() 
        
        loss += criterion(decoder_output.squeeze(0), label)
          

    loss.backward()

    decoder_optimizer.step()

    return loss.data[0] / (nWords - 1)
    

# Train the model and monitor the loss
def trainIters(decoder, epochs, learning_rate):
    
    plot_loss_total = 0  # Reset every plot_every

    criterion = nn.CrossEntropyLoss()
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    
    
    for epoch in range(epochs):
        count = 0
        for sentence in sentences[:200000]:
            
            embeddings = one_hot(sentence)
            loss = train(decoder, decoder_optimizer, criterion, embeddings)
            plot_loss_total += loss
            
            if (count % 5000 == 0):
                print(plot_loss_total/5000)
                plot_loss_total = 0
            
            count = count + 1
         
    
    
epochs = 1
learning_rate = 0.00001

trainIters(decoder,epochs,learning_rate)  

torch.save(decoder.state_dict(), './decoder.pth')
print('training done')

0.0012292527940538196
4.886725752624682
4.336857649431972
4.2257553900376585
4.187116980613085
4.148298711858418
4.1136087421619445
4.206291209293989
4.223092802356398
4.162142173205294
4.1507513252129025
4.11312288692043
4.0729030377499145
4.046462275332244
4.240358991977747
4.116486894174229
4.058497967713027
4.005690454113624
4.034310622477259
3.97756621327979
4.0077196952222724
4.055543205432031
3.987977908147868
3.979404921078065
3.9741019444113546
3.945411133030882
3.9193164693212186
4.044165189349143
3.98813402596566
3.9466712819218293
3.936908598183193
3.9422106237556442
3.8951745515626026
3.931647680826702
4.112219685005499
4.010604763434629
3.9820996354137015
3.9768720861739393
3.9631242376099296
3.9376604224580043
training done


# 3. Building Language Decoder MAP Inference

We now define a method to perform inference with our decoder and test it with a few different starting words. This code will be fairly similar to your training function from part 2.

In [37]:
decoder.load_state_dict(torch.load('./decoder.pth'))
softmax = nn.Softmax()
#decoder.eval()

index2word = {index:word for index,word in enumerate(vocabulary)}

def inference(decoder, init_word, embeddings=one_hot_embeddings, max_length=maxSequenceLength):
    
    decoder_hidden, decoder_state = decoder.initHidden()
    decoded_words = [init_word]
    
    ind = word2index.get(init_word, 0)
    one_hot_vec = embeddings[ind]
    decoder_input = torch.FloatTensor(one_hot_vec)
    decoder_input = decoder_input.unsqueeze(0).unsqueeze(0) 
    decoder_input = Variable(decoder_input).cuda() 
    
    for di in range(max_length):
        
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,decoder_state)
        decoder_output = softmax(decoder_output.squeeze(0))
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        
        #detect <EOS> 
        if (ni == 2):
            break
        else:
            decoded_words.append(index2word.get(ni,0))

        next_input = one_hot_embeddings[ni]
        decoder_input = Variable(torch.FloatTensor(next_input).unsqueeze(0).unsqueeze(0)) 
        decoder_input = decoder_input.cuda() 

    return decoded_words

s = " "
print(s.join(inference(decoder, init_word="the")))
print(s.join(inference(decoder, init_word="man")))
print(s.join(inference(decoder, init_word="woman")))
print(s.join(inference(decoder, init_word="dog")))

the man of a a a a a a .
man man of a a a a a . .
woman man of a a a a a . .
dog man of a a a a a . .


The decoder is trained without teacher forcing and without any sort of assistance and MAP inference we could expect it to get stuck to a particular type of sequence like the one above.    

# 4. Building Language Decoder Sampling Inference

We must now modify the method defined in part 3, to sample from the distribution outputted by the LSTM rather than taking the most probable word.

It might be useful to take a look at the output of your model and (depending on your implementation) modify it so that the outputs sum to 1. 

In [41]:
from bisect import bisect
from random import random
softmax = nn.Softmax()

decoder.load_state_dict(torch.load('./decoder.pth'))

index2word = {index:word for index,word in enumerate(vocabulary)}

def sampling_inference(decoder, init_word, embeddings=one_hot_embeddings, max_length=maxSequenceLength):
    
    decoder_hidden, decoder_state = decoder.initHidden()
    decoded_words = [init_word]

    decoded_words = [init_word]
    
    ind = word2index.get(init_word, 0)
    one_hot_vec = embeddings[ind]
    decoder_input = torch.FloatTensor(one_hot_vec)
    decoder_input = decoder_input.unsqueeze(0).unsqueeze(0) 
    decoder_input = Variable(decoder_input).cuda() 
    
    for di in range(max_length):
        
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,decoder_state)
        decoder_output = softmax(decoder_output.squeeze(0))
        p = decoder_output.data.squeeze().cpu().numpy()
        
        cdf = [p[0]]
        for i in range(1, len(p)):
            cdf.append(cdf[-1] + p[i])

        ni = bisect(cdf,random())
        
            
        #detect <EOS> 
        if (ni == 2):
            #decoded_words.append("<EOS>")
            break
        else:
            decoded_words.append(index2word.get(ni,0))
        
        next_input = one_hot_embeddings[ni]
        decoder_input = Variable(torch.FloatTensor(next_input).unsqueeze(0).unsqueeze(0))
        decoder_input = decoder_input.cuda() 

    return decoded_words

s = " "

for i in range(0,5):
    print(s.join(sampling_inference(decoder, init_word="the")))
    print(s.join(sampling_inference(decoder, init_word="man")))
    print(s.join(sampling_inference(decoder, init_word="woman")))
    print(s.join(sampling_inference(decoder, init_word="dog")))
    print(" ")


the are this the a a boats about at <UNK>
man <UNK> man people while pose black suit dirt 's hole covered distance <UNK>
woman girl on <UNK> truck types event placed tree by grass of beach rain dress boy
dog baby full gray an book dog a tall
 
the car grass white with bedroom hillside a tall umbrella . a
man brown <UNK> <UNK> an fence back pizza of can bridge
woman small walks walking on soccer grass a road to suit
dog brown trunk holding the bunch during pulled a zoo blue looking of
 
the lush girl colored standing a a <UNK> chair area lawn
man bunch dog empty city sand ties next for into pieces <UNK> other on can .
woman black is lying shirt and dressed different ocean mountain background . a
dog sitting are one walk women other water graze nearby station . big by park the picture . a
 
the walks <UNK> its in set to it lamp himself . .
man parked on <UNK> camera top before both grassy zoo other
woman lies white dog on into a suit sand side mother he . <UNK> rocks and . small
dog outd

With sampling, the inference is not repetitive but neither is it sensible. The sampling helps the inference escape a overfitting sequence but the decoder is still not well trained enough.    

# 5.  Building Language Encoder

We now build a language encoder, which will encode an input word by word, and ultimately output a hidden state that we can then be used by our decoder.

In [79]:
hidden_size = 300
input_size = vocabularySize

class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)

    def forward(self, inputs,hidden,state):
        output,(hidden,state) = self.lstm(inputs,(hidden,state))
        return output,hidden,state

    def initHidden(self):
        h = Variable(torch.zeros(1, 1, self.hidden_size))
        c = Variable(torch.zeros(1, 1, self.hidden_size))
        return h.cuda(), c.cuda()
        

encoder = EncoderLSTM(input_size, hidden_size)
encoder.cuda()
encoder.train()
encoder


EncoderLSTM (
  (lstm): LSTM(1000, 300)
)

# 6. Connecting Encoder to Decoder and Training End-to-End

We now connect our newly created encoder with our decoder, to train an end-to-end seq2seq architecture. 

It's likely that you'll be able to re-use most of your code from part 2. For our purposes, the only interaction between the encoder and the decoder is that the *last hidden state of the encoder is used as the initial hidden state of the decoder*. 

In [81]:

def train(embeddings, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion,hidden_states,max_length=maxSequenceLength):
    
    encoder_hidden,encoder_state = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = embeddings
    target_variable = embeddings
    nWords, VocSize = target_variable.shape
    
    encoder_outputs = Variable(torch.zeros(nWords,1, hidden_size))
    encoder_outputs = encoder_outputs.cuda() 

    loss = 0

    for ei in range(1,nWords):
        
        encoder_input = torch.FloatTensor(input_variable[ei]) 
        encoder_input = encoder_input.unsqueeze(0).unsqueeze(0)  
        encoder_input = Variable(encoder_input).cuda()
        
        encoder_output, encoder_hidden,encoder_state = encoder(encoder_input,encoder_hidden,encoder_state)
        encoder_outputs[ei] = encoder_output[0][0]
    
    
    ind = word2index.get("<SOS>", 0)
    one_hot_vec = one_hot_embeddings[ind]
    decoder_input = torch.FloatTensor(one_hot_vec)
    decoder_input = decoder_input.unsqueeze(0).unsqueeze(0) 
    decoder_input = Variable(decoder_input).cuda() 
    
    hidden_states.append(encoder_hidden.squeeze().cpu().data.numpy()) 
    decoder_hidden = encoder_hidden

    #Without teacher forcing
    for di in range(1,nWords):
        
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,
                                                              encoder_outputs[di].unsqueeze(0))
        
        temp_target = torch.FloatTensor(target_variable[di]) 
        temp_target = temp_target.unsqueeze(0).unsqueeze(0)  
        decoder_target = Variable(temp_target).cuda()
        decoder_target = decoder_target.long()
        decoder_target = decoder_target.squeeze(0)
        label = torch.max(decoder_target, 1)[1]
        
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        
        next_input = one_hot_embeddings[ni.cpu().numpy()]
        decoder_input = Variable(torch.FloatTensor(next_input).unsqueeze(0)) 
        decoder_input = decoder_input.cuda() 
        
        loss += criterion(decoder_output.squeeze(0), label)
            
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0] / nWords,hidden_states


def trainIters(encoder, decoder, epochs, learning_rate):
    
    hidden_states = []
    plot_loss_total = 0  
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        count = 0
        for sentence in sentences[:200000]:
            
            embeddings = one_hot(sentence)
            loss,hidden_states = train(embeddings, encoder, decoder, encoder_optimizer,
                                       decoder_optimizer,criterion,hidden_states)
            plot_loss_total += loss
            
            if (count % 5000 == 0):
                print(plot_loss_total/5000)
                plot_loss_total = 0
            
            count = count + 1
            
        
        np.save(open('outputs/hidden_states_train'+str(epoch), 'wb+'), hidden_states)    
           

        
epochs = 1
learning_rate = 0.00001

trainIters(encoder,decoder,epochs,learning_rate)

torch.save(encoder.state_dict(), './encoder.pth')
torch.save(decoder.state_dict(), './decoder2.pth')
print('training done')   

0.0012415499114990234
4.395923924233522
3.8804789565686595
3.8117521413752575
3.7369077205875603
3.6265489139530023
3.504738045248612
3.525242896123824
3.4492659143583793
3.2708842698387404
3.1814707714514996
3.0152420302247838
2.89141593246968
2.752547861714314
2.8693497255925395
2.651424077232
2.5234132797761375
2.385096574447404
2.350763037304252
2.2259060771188692
2.2155633906129037
2.2331759170922063
2.1212107715304764
2.0649700116268157
1.9905722349000823
1.927227783126948
1.8486965851818027
1.9533152385798818
1.8582603858281683
1.7862792856896434
1.7453329136336375
1.688983353553361
1.6122121211462053
1.63759907672229
1.8667013490501907
1.7437227918804892
1.6763909112494053
1.6065134934827499
1.5624669757722371
1.5101744062591578
training done


# 7. Testing 

We must now define a method that allows us to do inference using the seq2seq architecture. We then run the 500 validation captions through this method, and ultimately compare the **reference** and **generated** sentences using our **BLEU** similarity score method defined above, to identify the average BLEU score.

In [98]:
from nltk.translate.bleu_score import SmoothingFunction
cc = SmoothingFunction()

sentences = val_sentences

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

#bleu function with reweighting
def bleu(reference_sentence, predicted_sentence):
    return sentence_bleu([reference_sentence], predicted_sentence,smoothing_function=cc.method4)

print(sentences[1])
print(numberize(sentences[1]))
print(one_hot(sentences[1]))


['<SOS>', 'a', 'man', 'speaking', 'into', 'a', 'microphone', 'on', 'a', 'stage', 'with', 'a', 'bicycle', 'and', 'dressed', 'in', 'cyclist', 'gear', '.', '<EOS>']
[3, 1, 12, 822, 86, 1, 608, 6, 1, 819, 9, 1, 459, 10, 224, 8, 901, 380, 4, 2]
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]]


In [99]:
def seq2seq_inference(sentence,encoder,decoder,hidden_states,max_length=maxSequenceLength):
    
    encoder_hidden,encoder_state = encoder.initHidden()
    
    embeddings = one_hot(sentence)
    input_variable = embeddings
    target_variable = embeddings
    nWords, VocSize = target_variable.shape
    
    encoder_outputs = Variable(torch.zeros(nWords,1, hidden_size))
    encoder_outputs = encoder_outputs.cuda() 
    
    for ei in range(1,nWords):
        
        encoder_input = torch.FloatTensor(input_variable[ei]) 
        encoder_input = encoder_input.unsqueeze(0).unsqueeze(0)  
        encoder_input = Variable(encoder_input).cuda()
        
        encoder_output, encoder_hidden,encoder_state = encoder(encoder_input,encoder_hidden,encoder_state)
        encoder_outputs[ei] = encoder_output[0][0]
        
    
    ind = word2index.get("<SOS>", 0)
    one_hot_vec = one_hot_embeddings[ind]
    decoder_input = torch.FloatTensor(one_hot_vec)
    decoder_input = decoder_input.unsqueeze(0).unsqueeze(0) 
    decoder_input = Variable(decoder_input).cuda() 
    
    hidden_states.append(encoder_hidden.squeeze().cpu().data.numpy()) 
    decoder_hidden = encoder_hidden
    decoded_words = ["<SOS>"]
    
    for di in range(max_length):
        
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,
                                                              encoder_outputs[di].unsqueeze(0))
        decoder_output = softmax(decoder_output.squeeze(0))
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        
        
        #detect <EOS> 
        if (ni == 2):
            decoded_words.append("<EOS>")
            break
        else:
            decoded_words.append(index2word.get(ni,0))

        next_input = one_hot_embeddings[ni]
        decoder_input = Variable(torch.FloatTensor(next_input).unsqueeze(0).unsqueeze(0)) 
        decoder_input = decoder_input.cuda() 
        
    
    predicted = decoded_words
    bleu_score = bleu(sentence,predicted)
    return bleu_score,hidden_states

In [100]:
# Perform inference for all validation sequences and report the average BLEU score
hidden_states = []
softmax = nn.Softmax()

encoder.load_state_dict(torch.load('./encoder.pth'))

decoder.load_state_dict(torch.load('./decoder2.pth'))

Total_bleu = 0 
for sentence in sentences[:500]:
    
    blue_score,hidden_states = seq2seq_inference(sentence,encoder,decoder,hidden_states)
    Total_bleu += blue_score


np.save(open('outputs/hidden_states_val', 'wb+'), hidden_states)     
print("Average bleu score:",Total_bleu/500)    

Average bleu score: 0.2261655668561503


The bleu score is realtively low and this could be because I did not train the model with the entire data but with only half of it. Also, one hot representation might not be the best form of representation for this task.    

# 8. Encoding as Generic Feature Representation

We now use the final hidden state of our encoder, to identify the nearest neighbor amongst the training sentences for each sentence in our validation data.

It would be effective to first define a method that would generate all of the hidden states and store these hidden states **on the CPU**, and then loop over the generated hidden states to identify/output the nearest neighbors.

In [102]:
# Now get nearest neighbors and print
import math

epoch = 0
f_train = np.load(open('outputs/hidden_states_train'+str(epoch), 'rb'))
f_val = np.load(open('outputs/hidden_states_val', 'rb'))


for val_id in range(10):
       
    val_vec = f_val[val_id]
    min_id = 0
    min_dist = math.inf
    for train_id in range(200000):
        train_vec = f_train[train_id]
        
        dist = np.square(np.linalg.norm(val_vec-train_vec))
        if (dist < min_dist):
            min_dist = dist
            min_id = train_id
            
           
    
    print(val_sentences[val_id])
    print(train_sentences[min_id])
    print("")
    


A man and woman at a table with beer and wine
a man is wearing a purple shirt and glasses

A man speaking into a microphone on a stage with a bicycle and dressed in cyclist gear.
A smiling, bespectacled young man leans wearing a tie with a t-shirt and jeans leans against a tree.

Four horses are skattered around a small water hole.
Five horse and buggies race while a crowd watches.

A man and a young girl playing Wii
a woman and a man are shaking hands

A boat home sitting on a river bay.
An acrobat rides a horse while spectators watch.

Several Tim's of mints are stacked up with a bottle that has several  clipped roses inside
Elephant raising it's trunk next to gate with a bench strapped to it's back

Family at a pizza restaurant posing for a picture before meal.
People walk outside with umbrellas, two men do not have umbrellas.

Several mopeds are lined up along the side of a hotel parking lot.
Wedding cake with figure of bride and groom on a silver platter.

A young man appears to b

# 9. Effectiveness of word2vec

We now repeat everything done above using word2vec embeddings in place of one-hot embeddings. This will require re-running steps 1-8.

In [22]:
sentences = train_sentences

sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

def word2vec(sentence):
    numberized = numberize(sentence)
    
    w2v_embedded = w2v_embeddings[numberized]
    
    return w2v_embedded

print(word2vec(sentences[1]))

[[ 1.25918221 -0.83345079  0.55769587 ...,  1.30408192  0.19403522
   1.05969226]
 [ 0.4817569  -0.73988849  0.32601929 ...,  1.22742546  0.57799971
   0.68640846]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 ..., 
 [-0.59881067  0.74625498 -0.05972543 ..., -0.17042953  0.39723638
  -0.35443801]
 [-0.19171366  0.09861971  0.4687027  ...,  0.72048628 -0.25929466
  -0.09999225]
 [ 0.46664095  0.47151706 -0.38249168 ...,  0.88390398 -0.55219585
  -0.35060215]]


In [27]:
wordEncodingSize = 300
hidden_size = 300
input_size = wordEncodingSize
output_size = vocabularySize


class DecoderLSTM(nn.Module):
    def __init__(self, hidden_size, input_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        

    def forward(self, inputs, hidden,state):
        output,(hidden,state) = self.lstm(inputs,(hidden,state))
        output = self.out(output)
        return output,hidden,state

    def initHidden(self):
        h = Variable(torch.zeros(1, 1, self.hidden_size))
        c = Variable(torch.zeros(1, 1, self.hidden_size))
        return h.cuda(), c.cuda()


decoder = DecoderLSTM(hidden_size, input_size, output_size) 
decoder.cuda()
decoder.train()

decoder

DecoderLSTM (
  (lstm): LSTM(300, 300)
  (out): Linear (300 -> 1000)
)

In [9]:

def train(decoder,decoder_optimizer,criterion,input_embeddings,target_embeddings): 
    
    loss = 0
    decoder_optimizer.zero_grad()
    decoder_hidden, decoder_state = decoder.initHidden()
    
    #use embeddings as target variable 
    target_variable = target_embeddings
    input_variable = input_embeddings
    nWords, VocSize = input_variable.shape
    
    decoder_input = torch.FloatTensor(input_variable[1]) 
    decoder_input = decoder_input.unsqueeze(0).unsqueeze(0)  
    decoder_input = Variable(decoder_input).cuda()
    
    #Without teacher forcing #ignore <SOS> #teaching it to break at <EOS>
    for di in range(2,nWords):
        
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,decoder_state)
        
        temp_target = torch.FloatTensor(target_variable[di]) 
        temp_target = temp_target.unsqueeze(0).unsqueeze(0)  
        decoder_target = Variable(temp_target).cuda()
        decoder_target = decoder_target.long()
        decoder_target = decoder_target.squeeze(0)
        label = torch.max(decoder_target, 1)[1]
        
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        
        next_input = w2v_embeddings[ni.cpu().numpy()]
        decoder_input = Variable(torch.FloatTensor(next_input).unsqueeze(0)) 
        decoder_input = decoder_input.cuda() 
        
        loss += criterion(decoder_output.squeeze(0), label)
          

    loss.backward()

    decoder_optimizer.step()

    return loss.data[0] / (nWords - 1)
    

# Train the model and monitor the loss
def trainIters(decoder, epochs, learning_rate):
    
    plot_loss_total = 0  # Reset every plot_every

    criterion = nn.CrossEntropyLoss()
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    
    
    for epoch in range(epochs):
        count = 0
        for sentence in sentences[:100000]:
            
            input_embeddings = word2vec(sentence)
            target_embeddings = one_hot(sentence)
            loss = train(decoder, decoder_optimizer, criterion, input_embeddings,target_embeddings)
            plot_loss_total += loss
            
            if (count % 5000 == 0):
                print(plot_loss_total/5000)
                plot_loss_total = 0
            
            count = count + 1
         
    
    
epochs = 1
learning_rate = 0.00001

trainIters(decoder,epochs,learning_rate)  

torch.save(decoder.state_dict(), './decoderW1.pth')
print('training done')

0.0010238756815592446
4.054862071358708
4.005052740334257
3.9799438948169876
3.9705545279096937
3.9343653607996703
3.8978847113871167
4.037550494438426
4.071012942297543
4.00489829920382
4.00068595684393
3.958445666579537
3.9350900953042203
3.908114615435343
4.094549600129489
3.9515983262681855
3.897029918298033
3.850814563848063
3.8879540896935354
3.8382240496242934
training done


# MAP inference

In [31]:
decoder.load_state_dict(torch.load('./decoderW1.pth'))
softmax = nn.Softmax()

index2word = {index:word for index,word in enumerate(vocabulary)}

def inference(decoder, init_word, embeddings=w2v_embeddings, max_length=maxSequenceLength):
    
    decoder_hidden, decoder_state = decoder.initHidden()
    decoded_words = [init_word]
    
    ind = word2index.get(init_word, 0)
    wordvec = embeddings[ind]
    decoder_input = torch.FloatTensor(wordvec)
    decoder_input = decoder_input.unsqueeze(0).unsqueeze(0) 
    decoder_input = Variable(decoder_input).cuda() 
    
    for di in range(max_length):
        
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,decoder_state)
        decoder_output = softmax(decoder_output.squeeze(0))
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        
        #detect <EOS> 
        if (ni == 2):
            break
        else:
            #print(ni)
            decoded_words.append(index2word.get(ni,0))

        next_input = embeddings[ni]
        decoder_input = Variable(torch.FloatTensor(next_input).unsqueeze(0).unsqueeze(0)) 
        decoder_input = decoder_input.cuda() 

    return decoded_words

s = " "
print(s.join(inference(decoder, init_word="the")))
print(s.join(inference(decoder, init_word="man")))
print(s.join(inference(decoder, init_word="woman")))
print(s.join(inference(decoder, init_word="dog")))

the <UNK> <UNK> <UNK> <UNK> a <UNK> <UNK> <UNK>
man <UNK> <UNK> <UNK> <UNK> a <UNK> <UNK> <UNK>
woman <UNK> <UNK> <UNK> <UNK> a <UNK> <UNK> <UNK>
dog <UNK> <UNK> <UNK> <UNK> a <UNK> <UNK> <UNK>


Overfitting to a sequence without teacher forcing.   

# sampling inference

In [13]:
from bisect import bisect
from random import random
softmax = nn.Softmax()

decoder.load_state_dict(torch.load('./decoderW1.pth'))

index2word = {index:word for index,word in enumerate(vocabulary)}

def sampling_inference(decoder, init_word, embeddings=w2v_embeddings, max_length=maxSequenceLength):
    
    decoder_hidden, decoder_state = decoder.initHidden()
    decoded_words = [init_word]

    decoded_words = [init_word]
    
    ind = word2index.get(init_word, 0)
    one_hot_vec = embeddings[ind]
    decoder_input = torch.FloatTensor(one_hot_vec)
    decoder_input = decoder_input.unsqueeze(0).unsqueeze(0) 
    decoder_input = Variable(decoder_input).cuda() 
    
    for di in range(max_length):
        
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,decoder_state)
        decoder_output = softmax(decoder_output.squeeze(0))
        p = decoder_output.data.squeeze().cpu().numpy()
        
        cdf = [p[0]]
        for i in range(1, len(p)):
            cdf.append(cdf[-1] + p[i])

        ni = bisect(cdf,random())
        
            
        #detect <EOS> 
        if (ni == 2):
            #decoded_words.append("<EOS>")
            break
        else:
            decoded_words.append(index2word.get(ni,0))
        
        next_input = embeddings[ni]
        decoder_input = Variable(torch.FloatTensor(next_input).unsqueeze(0).unsqueeze(0))
        decoder_input = decoder_input.cuda() 

    return decoded_words

s = " "

for i in range(0,5):
    print(s.join(sampling_inference(decoder, init_word="the")))
    print(s.join(sampling_inference(decoder, init_word="man")))
    print(s.join(sampling_inference(decoder, init_word="woman")))
    print(s.join(sampling_inference(decoder, init_word="dog")))
    print(" ")


the bird old a flock on standing with pair . the in on . . by it <UNK> and
man sitting on grass are zoo on the is . . a <UNK> the
woman bus bus pen together in horses a sitting the a . sky giraffe
dog bus holds other . a blue field a <UNK> water long number <UNK> . .
 
the red 's of a in the <UNK> all drinking <UNK> <UNK> on
man red the the sitting bench in <UNK> sky around <UNK> in trees open
woman <UNK> next the on of a <UNK> the <UNK> the
dog shot different setting on walking in a a
 
the
man stopped around his very two with on birds the in street lights a <UNK>
woman <UNK> trucks are a of parked railroad sheep a them hydrant . . . . bird concrete hydrant airport with <UNK>
dog traffic man double a next benches <UNK> that <UNK> and bus bench a street front . . a pole
 
the white <UNK> has of is near the a on other
man driving near outside a around a a the traffic down <UNK>
woman picture <UNK> on other a next giraffes sitting is home with
dog red outside sits grazing a on to a by
 


In [28]:
wordEncodingSize = 300
hidden_size = 300
input_size = wordEncodingSize

class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)

    def forward(self, inputs,hidden,state):
        output,(hidden,state) = self.lstm(inputs,(hidden,state))
        return output,hidden,state

    def initHidden(self):
        h = Variable(torch.zeros(1, 1, self.hidden_size))
        c = Variable(torch.zeros(1, 1, self.hidden_size))
        return h.cuda(), c.cuda()
        

encoder = EncoderLSTM(input_size, hidden_size)
encoder.cuda()
encoder.train()
encoder


EncoderLSTM (
  (lstm): LSTM(300, 300)
)

In [29]:

def train(input_embeddings,target_embeddings, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion,hidden_states,max_length=maxSequenceLength):
    
    encoder_hidden,encoder_state = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = input_embeddings
    target_variable = target_embeddings
    nWords, VocSize = target_variable.shape
    
    encoder_outputs = Variable(torch.zeros(nWords,1, hidden_size))
    encoder_outputs = encoder_outputs.cuda() 

    loss = 0

    for ei in range(1,nWords):
        
        encoder_input = torch.FloatTensor(input_variable[ei]) 
        encoder_input = encoder_input.unsqueeze(0).unsqueeze(0)  
        encoder_input = Variable(encoder_input).cuda()
        
        encoder_output, encoder_hidden,encoder_state = encoder(encoder_input,encoder_hidden,encoder_state)
        encoder_outputs[ei] = encoder_output[0][0]
    
    
    ind = word2index.get("<SOS>", 0)
    wordvec = w2v_embeddings[ind]
    decoder_input = torch.FloatTensor(wordvec)
    decoder_input = decoder_input.unsqueeze(0).unsqueeze(0) 
    decoder_input = Variable(decoder_input).cuda() 
    
    hidden_states.append(encoder_hidden.squeeze().cpu().data.numpy()) 
    decoder_hidden = encoder_hidden

    #Without teacher forcing
    for di in range(1,nWords):
        
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,
                                                              encoder_outputs[di].unsqueeze(0))
        
        temp_target = torch.FloatTensor(target_variable[di])
        temp_target = temp_target.unsqueeze(0).unsqueeze(0)  
        decoder_target = Variable(temp_target).cuda()
        decoder_target = decoder_target.long()
        decoder_target = decoder_target.squeeze(0)
        label = torch.max(decoder_target, 1)[1]
        
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        
               
        next_input = input_variable[di]  #teacher forcing 
        decoder_input = Variable(torch.FloatTensor(next_input).unsqueeze(0).unsqueeze(0)) 
        decoder_input = decoder_input.cuda() 
        
        loss += criterion(decoder_output.squeeze(0), label)
            
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0] / nWords,hidden_states


def trainIters(encoder, decoder, epochs, learning_rate):
    
    hidden_states = []
    plot_loss_total = 0  
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        count = 0
        for sentence in sentences[:400000]:
            
            input_embeddings = word2vec(sentence)
            target_embeddings = one_hot(sentence)
            loss,hidden_states = train(input_embeddings,target_embeddings, encoder, decoder, encoder_optimizer,
                                       decoder_optimizer,criterion,hidden_states)
            plot_loss_total += loss
            
            if (count % 5000 == 0):
                print(plot_loss_total/5000)
                plot_loss_total = 0
            
            count = count + 1
            
        
        np.save(open('outputs/hidden_states_train_W'+str(epoch), 'wb+'), hidden_states)    
           

        
epochs = 1
learning_rate = 0.00001

trainIters(encoder,decoder,epochs,learning_rate)

torch.save(encoder.state_dict(), './encoderW.pth')
torch.save(decoder.state_dict(), './decoderW2.pth')
print('training done')   

0.0012406927490234375
3.9714420115110762
2.646813261747851
2.089078832237761
1.7201897547167064
1.4526864824533725
1.2431494200918727
1.30844163656379
1.2592034651597226
1.0546482471747567
0.9279688186010395
0.7899628455017726
0.7071595312610173
0.6254463153885791
0.7115813979200543
0.5559409819726183
0.4655920049473506
0.3909160639053919
0.3690152903705918
0.3155109133435735
0.3446520580577199
0.3819416409768279
0.3249696907668533
0.2932152822757393
0.2499133995378782
0.22708863555338943
0.2066523039587023
0.27541561878267146
0.2363468923944616
0.20949713341679882
0.18629429317420612
0.1658664881581902
0.1552178224388369
0.1676084328808049
0.20435986670918707
0.19051033126424366
0.1576427346991389
0.14267350991140368
0.13081848422938477
0.12291006579770433
0.20988723894583808
0.20064539913245888
0.16076310725382834
0.13018595665315671
0.11187859212777694
0.11125913882631629
0.13552578711613472
0.30351470212680154
0.21556650928767399
0.1775215910564455
0.1450113635452677
0.123916038719

This time, the training is done with almost the entire data and we can see a substantial decrease in the loss compared to the end-end model using one hot representation.  

In [34]:
from nltk.translate.bleu_score import SmoothingFunction
cc = SmoothingFunction()

sentences = val_sentences

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

#bleu function with reweighting
def bleu(reference_sentence, predicted_sentence):
    return sentence_bleu([reference_sentence], predicted_sentence,smoothing_function=cc.method4)

print(sentences[1])
print(numberize(sentences[1]))
print(word2vec(sentences[1]))


['<SOS>', 'a', 'man', 'speaking', 'into', 'a', 'microphone', 'on', 'a', 'stage', 'with', 'a', 'bicycle', 'and', 'dressed', 'in', 'cyclist', 'gear', '.', '<EOS>']
[2, 1, 12, 0, 152, 1, 0, 5, 1, 0, 9, 1, 352, 10, 371, 8, 0, 630, 4, 3]
[[ 0.52435738 -1.17175257  1.43554449 ..., -0.22541249  0.08525697
   0.51633179]
 [ 0.04788534 -1.0648185   0.52550334 ..., -0.19837394  0.5309515
   0.47923851]
 [-0.6551522   0.16742826  0.79151756 ...,  0.71651602  2.03825712
  -0.65796065]
 ..., 
 [-0.17633598 -0.29771715 -0.1895007  ...,  0.25773174 -0.25257963
  -1.73474598]
 [-0.45129138 -0.11723721 -0.30734822 ...,  0.33252311  0.50422186
   0.48540848]
 [-0.69795328  0.94974667 -0.46290016 ...,  0.17884381  0.57679081
   0.35002416]]


In [81]:
def seq2seq_inference(sentence,encoder,decoder,hidden_states,max_length=maxSequenceLength):
    
    encoder_hidden,encoder_state = encoder.initHidden()
    
    input_variable = word2vec(sentence)
    #target_variable = one_hot(sentence)
    nWords, VocSize = input_variable.shape
    
    encoder_outputs = Variable(torch.zeros(nWords,1, hidden_size))
    encoder_outputs = encoder_outputs.cuda() 
    
    for ei in range(1,nWords):
        
        encoder_input = torch.FloatTensor(input_variable[ei]) 
        encoder_input = encoder_input.unsqueeze(0).unsqueeze(0)  
        encoder_input = Variable(encoder_input).cuda()
        
        encoder_output, encoder_hidden,encoder_state = encoder(encoder_input,encoder_hidden,encoder_state)
        encoder_outputs[ei] = encoder_output[0][0]
        
    
    ind = word2index.get("<SOS>", 0)
    one_hot_vec = w2v_embeddings[ind]
    decoder_input = torch.FloatTensor(one_hot_vec)
    decoder_input = decoder_input.unsqueeze(0).unsqueeze(0) 
    decoder_input = Variable(decoder_input).cuda() 
    
    hidden_states.append(encoder_hidden.squeeze().cpu().data.numpy()) 
    decoder_hidden = encoder_hidden
    decoded_words = ["<SOS>"]
    
    for di in range(max_length):
        
        if (di<nWords):
            decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,
                                                              encoder_outputs[di].unsqueeze(0))
        else: 
            decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,
                                                              encoder_outputs[nWords-1].unsqueeze(0))
            
        decoder_output = softmax(decoder_output.squeeze(0))
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        
        #detect <EOS> 
        if (ni == 3):
            decoded_words.append("<EOS>")
            break
        elif (ni != 0):
            #print(ni)
            decoded_words.append(index2word.get(ni,0))

        next_input = w2v_embeddings[ni]
        decoder_input = Variable(torch.FloatTensor(next_input).unsqueeze(0).unsqueeze(0)) 
        decoder_input = decoder_input.cuda() 
        
    s = " "
    predicted = decoded_words
    #print(s.join(sentence))
    #print(s.join(predicted))
    bleu_score = bleu(sentence,predicted)
    return bleu_score,hidden_states

# Infer on validation sentences

In [65]:
# Perform inference for all validation sequences and report the average BLEU score
hidden_states = []
softmax = nn.Softmax()
index2word = {index:word for index,word in enumerate(vocabulary)}

encoder.load_state_dict(torch.load('./encoderW.pth'))

decoder.load_state_dict(torch.load('./decoderW2.pth'))

Total_bleu = 0 
for sentence in sentences[:25]:
    
    blue_score,hidden_states = seq2seq_inference(sentence,encoder,decoder,hidden_states)
    Total_bleu += blue_score


#np.save(open('outputs/hidden_states_val_W', 'wb+'), hidden_states)     
#print("Average bleu score:",Total_bleu/500)    

<SOS> a man and woman at a table with beer and wine <EOS>
<SOS> a man and woman at a table with beer and wine
<SOS> a man speaking into a microphone on a stage with a bicycle and dressed in cyclist gear . <EOS>
<SOS> a man into a a with a bike and dressed in gear .
<SOS> four horses are skattered around a small water hole . <EOS>
<SOS> three are around a small
<SOS> a man and a young girl playing wii <EOS>
<SOS> a man and a young girl playing wii
<SOS> a boat home sitting on a river bay . <EOS>
<SOS> a boat home sitting a
<SOS> several tim 's of mints are stacked up with a bottle that has several clipped roses inside <EOS>
<SOS> several 's of are stacked up with a bottle that has three inside
<SOS> family at a pizza restaurant posing for a picture before meal . <EOS>
<SOS> family at a pizza restaurant posing for a picture before meal
<SOS> several mopeds are lined up along the side of a hotel parking lot . <EOS>
<SOS> several are lined up along the side of a parking lot
<SOS> a young m

The model is making better predictions on validation data compared to last time.

# Bleu score 

In [63]:
# Perform inference for all validation sequences and report the average BLEU score
hidden_states = []
softmax = nn.Softmax()
index2word = {index:word for index,word in enumerate(vocabulary)}

encoder.load_state_dict(torch.load('./encoderW.pth'))

decoder.load_state_dict(torch.load('./decoderW2.pth'))

Total_bleu = 0 
for sentence in sentences[:500]:
    
    blue_score,hidden_states = seq2seq_inference(sentence,encoder,decoder,hidden_states)
    Total_bleu += blue_score


np.save(open('outputs/hidden_states_val_W', 'wb+'), hidden_states)     
print("Average bleu score:",Total_bleu/500)    

Average bleu score: 0.4768919166049488


Bleu score is higher when compared to the one hot representation.    

# Nearest representation

In [14]:
# Now get nearest neighbors and print
import math

epoch = 0
f_train = np.load(open('outputs/hidden_states_train_W'+str(epoch), 'rb'))
f_val = np.load(open('outputs/hidden_states_val_W', 'rb'))



for val_id in range(10):
       
    val_vec = f_val[val_id]
    
    min_id = 0
    min_dist = math.inf
    for train_id in range(400000):
        train_vec = f_train[train_id]
        
        dist = np.square(np.linalg.norm(val_vec-train_vec))
        
        if (dist < min_dist):
            min_dist = dist
            min_id = train_id
            
      
    #print(min_id)
    print(val_sentences[val_id])
    print(train_sentences[min_id])
    print("")
    

A man and woman at a table with beer and wine
5 adults strangely dressed about to snow ski

A man speaking into a microphone on a stage with a bicycle and dressed in cyclist gear.
5 adults strangely dressed about to snow ski

Four horses are skattered around a small water hole.
5 adults strangely dressed about to snow ski

A man and a young girl playing Wii
5 adults strangely dressed about to snow ski

A boat home sitting on a river bay.
5 adults strangely dressed about to snow ski

Several Tim's of mints are stacked up with a bottle that has several  clipped roses inside
United Postal Service truck drives over very snowy roads

Family at a pizza restaurant posing for a picture before meal.
5 adults strangely dressed about to snow ski

Several mopeds are lined up along the side of a hotel parking lot.
5 adults strangely dressed about to snow ski

A young man appears to be taking a break from the waves.
5 adults strangely dressed about to snow ski

A baseball player standing next to hom

I cannot explain why this is happening. Even though the prediction performance is good, the nearest representation always gives only 2 sentences for all validation sentences (I tried for most). 

# With Mini Batch 

This part is not complete. I have not made changes to the custom RNN model to incorporate pad packed sequences. I have got the tensors ready but it appears that to run pad packed sequences with custom RNN models we have to make changes to the way we describe the model, which is not done here. Hence, the compile error.    

In [3]:
sentences = train_sentences

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

# Create the vocabulary. Note that we add an <UNK> token to represent words not in our vocabulary.
vocabularySize = 1000
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<pad>"] + [e[0] for e in word_counts.most_common(vocabularySize-1)]
word2index = {word:index for index,word in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabularySize)

# Define the max sequence length to be the longest sentence in the training data. 
maxSequenceLength = max([len(sentence) for sentence in sentences])

def numberize(sentence):
    numberized = [word2index.get(word, 0) for word in sentence]
    return numberized

def one_hot(sentence):
    numberized = numberize(sentence)
    # Represent each word as it's one-hot embedding
    one_hot_embedded = one_hot_embeddings[numberized]
    
    return one_hot_embedded



In [19]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import numpy as np
import itertools


def trainIters(encoder, decoder, epochs, learning_rate):
    
    hidden_states = []
    plot_loss_total = 0  
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        count = 0
        for bch in range(2):
            
            vectorized_sen = [[one_hot(sen)]for sen in sentences[(bch*batch_size)+1:(bch+1)*batch_size]]

            seq_lengths = []
            for i in range(len(vectorized_sen)):
                seq_lengths.append(len(vectorized_sen[i][0]))
    

            seq_tensor = Variable(torch.zeros((len(vectorized_sen),maxSequenceLength,1000))).long().cuda()
            for idx, (seq, seqlen) in enumerate(zip(vectorized_sen, seq_lengths)):
                seq_tensor[idx,:seqlen,:1000] = torch.FloatTensor(seq)
    
            seq_lengths = torch.cuda.LongTensor(seq_lengths)
            seq_tensor = seq_tensor.long()
            # SORT YOUR TENSORS BY LENGTH!
            seq_lengths, perm_idx = seq_lengths.sort(0,descending=True)
            seq_tensor = seq_tensor[perm_idx]
            seq_tensor = seq_tensor.transpose(0,1) # (B,L,D) -> (L,B,D)

            #pack them up 
            packed_input = pack_padded_sequence(seq_tensor, seq_lengths.cpu().numpy())

            encoder_hidden,encoder_state = encoder.initHidden()
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            encoder_outputs = Variable(torch.zeros(maxSequenceLength,batch_size, hidden_size))
            encoder_outputs = encoder_outputs.cuda() 

            #ERROR HERE     
            encoder_output, encoder_hidden,encoder_state = encoder(packed_input,encoder_hidden,encoder_state)
            
    
                                 
            decoder_input = packed_input
            hidden_states.append(encoder_hidden.squeeze().cpu().data.numpy()) 
            decoder_hidden = encoder_hidden

                    
            decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,
                                                                               encoder_output)
            
            #change this 
            '''         
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]
            ''' 
                    
            loss += criterion(decoder_output.squeeze(0), label)
            
            loss.backward()

            encoder_optimizer.step()
            decoder_optimizer.step()

                        
            plot_loss_total += loss.data[0] / nWords
            
            if (count % 50 == 0):
                print(plot_loss_total/50)
                plot_loss_total = 0
            
            count = count + 1
            
        
        np.save(open('outputs/hidden_states_train_BW'+str(epoch), 'wb+'), hidden_states)    
           

        
epochs = 1
learning_rate = 0.00001

trainIters(encoder,decoder,epochs,learning_rate)

torch.save(encoder.state_dict(), './encoderBW.pth')
torch.save(decoder.state_dict(), './decoderBW.pth')
print('training done')   


# throw them through your LSTM (remember to give batch_first=True here if you packed with it)
packed_output,ht,ct = decoder(packed_input,ht,ct)



Variable containing:
( 0 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
[torch.cuda.LongTensor of size 1x30x300 (GPU 0)]



TypeError: torch.addmm received an invalid combination of arguments - got (int, torch.cuda.LongTensor, int, torch.cuda.LongTensor, torch.cuda.FloatTensor, out=torch.cuda.LongTensor), but expected one of:
 * (torch.cuda.LongTensor source, torch.cuda.LongTensor mat1, torch.cuda.LongTensor mat2, *, torch.cuda.LongTensor out)
 * (torch.cuda.LongTensor source, torch.cuda.sparse.LongTensor mat1, torch.cuda.LongTensor mat2, *, torch.cuda.LongTensor out)
 * (int beta, torch.cuda.LongTensor source, torch.cuda.LongTensor mat1, torch.cuda.LongTensor mat2, *, torch.cuda.LongTensor out)
 * (torch.cuda.LongTensor source, int alpha, torch.cuda.LongTensor mat1, torch.cuda.LongTensor mat2, *, torch.cuda.LongTensor out)
 * (int beta, torch.cuda.LongTensor source, torch.cuda.sparse.LongTensor mat1, torch.cuda.LongTensor mat2, *, torch.cuda.LongTensor out)
 * (torch.cuda.LongTensor source, int alpha, torch.cuda.sparse.LongTensor mat1, torch.cuda.LongTensor mat2, *, torch.cuda.LongTensor out)
 * (int beta, torch.cuda.LongTensor source, int alpha, torch.cuda.LongTensor mat1, torch.cuda.LongTensor mat2, *, torch.cuda.LongTensor out)
      didn't match because some of the arguments have invalid types: (int, torch.cuda.LongTensor, int, torch.cuda.LongTensor, !torch.cuda.FloatTensor!, out=torch.cuda.LongTensor)
 * (int beta, torch.cuda.LongTensor source, int alpha, torch.cuda.sparse.LongTensor mat1, torch.cuda.LongTensor mat2, *, torch.cuda.LongTensor out)
      didn't match because some of the arguments have invalid types: (int, torch.cuda.LongTensor, int, !torch.cuda.LongTensor!, !torch.cuda.FloatTensor!, out=torch.cuda.LongTensor)
