In [1]:
from collections import Counter, defaultdict
from gensim.models import Word2Vec
from IPython import display
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from PIL import Image
from torch import nn
from torch.autograd import Variable
from torchvision import models, transforms

import json
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn.functional as F

# Data Acquisition

For this assignment, you will reuse the dataset you downloaded in assignment 2. This dataset contains a very large set of images, approximately 80K training images and 100 validation images, with multiple tags for each image. However that data *lacks captions* for the images, which is **vital** for this assignment. To obtain the captions for this assignment, download a few data files as shown below and add them to your `data/annotations` folder from assignment 2.

`wget https://s3-us-west-2.amazonaws.com/cpsc532l-data/a4_data.zip`

Following the data downloading and unzipping, the code below loads in the data into memory accordingly.

In [2]:
# Define a global transformer to appropriately scale images and subsequently convert them to a Tensor.
img_size = 224
loader = transforms.Compose([
  transforms.Scale(img_size),
  transforms.CenterCrop(img_size),
  transforms.ToTensor(),
]) 
def load_image(filename, volatile=False):
    """
    Simple function to load and preprocess the image.

    1. Open the image.
    2. Scale/crop it and convert it to a float tensor.
    3. Convert it to a variable (all inputs to PyTorch models must be variables).
    4. Add another dimension to the start of the Tensor (b/c VGG expects a batch).
    5. Move the variable onto the GPU.
    """
    image = Image.open(filename).convert('RGB')
    image_tensor = loader(image).float()
    image_var = Variable(image_tensor, volatile=volatile).unsqueeze(0)
    return image_var.cuda()

load_image('data/train2014/COCO_train2014_000000000009.jpg')

Variable containing:
( 0 , 0 ,.,.) = 
  0.0039  0.0078  0.0039  ...   0.0471  0.0471  0.0314
  0.0039  0.0039  0.0039  ...   0.0353  0.0353  0.0392
  0.0039  0.0039  0.0039  ...   0.0392  0.0392  0.0510
           ...             ⋱             ...          
  0.7137  0.7294  0.7137  ...   0.1686  0.1843  0.1686
  0.7059  0.6902  0.6863  ...   0.1765  0.1804  0.2039
  0.6784  0.6667  0.6706  ...   0.1922  0.2157  0.2275

( 0 , 1 ,.,.) = 
  0.1490  0.1490  0.1412  ...   0.0039  0.0039  0.0039
  0.1451  0.1412  0.1373  ...   0.0039  0.0039  0.0039
  0.1412  0.1373  0.1373  ...   0.0039  0.0039  0.0039
           ...             ⋱             ...          
  0.4392  0.4667  0.4549  ...   0.2588  0.2745  0.2863
  0.4353  0.4235  0.4196  ...   0.2745  0.2980  0.3137
  0.4118  0.4000  0.4000  ...   0.3020  0.3176  0.3020

( 0 , 2 ,.,.) = 
  0.5294  0.5294  0.5294  ...   0.1451  0.1412  0.1333
  0.5255  0.5333  0.5373  ...   0.1725  0.1451  0.1412
  0.5373  0.5490  0.5451  ...   0.2314  0.1843

In [3]:
# Load annotations file for the training images.
mscoco_train = json.load(open('data/annotations/a4_data/train_captions.json'))
train_ids = [entry['id'] for entry in mscoco_train['images']]
train_id_to_file = {entry['id']: 'data/train2014/' + entry['file_name'] for entry in mscoco_train['images']}

# Extract out the captions for the training images
train_id_set = set(train_ids)
train_id_to_captions = defaultdict(list)
for entry in mscoco_train['annotations']:
    if entry['image_id'] in train_id_set:
        train_id_to_captions[entry['image_id']].append(entry['caption'])

# Load annotations file for the validation images.
mscoco_val = json.load(open('data/annotations/a4_data/val_captions.json'))
val_ids = [entry['id'] for entry in mscoco_val['images']]
val_id_to_file = {entry['id']: 'data/val2014/' + entry['file_name'] for entry in mscoco_val['images']}

# Extract out the captions for the validation images
val_id_set = set(val_ids)
val_id_to_captions = defaultdict(list)
for entry in mscoco_val['annotations']:
    if entry['image_id'] in val_id_set:
        val_id_to_captions[entry['image_id']].append(entry['caption'])

# Load annotations file for the testing images
mscoco_test = json.load(open('data/annotations/a4_data/test_captions.json'))
test_ids = [entry['id'] for entry in mscoco_test['images']]
test_id_to_file = {entry['id']: 'data/val2014/' + entry['file_name'] for entry in mscoco_test['images']}

print("done")

done


# Preprocessing

We do the same preprocessing done in assignment 3. 

In [4]:
sentences = [sentence for caption_set in train_id_to_captions.values() for sentence in caption_set]

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

# Create the vocabulary. Note that we add an <UNK> token to represent words not in our vocabulary.
vocabularySize = 1000
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<UNK>"] + [e[0] for e in word_counts.most_common(vocabularySize-1)]
word2index = {word:index for index,word in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabularySize)

# Build the word2vec embeddings
wordEncodingSize = 300
filtered_sentences = [[word for word in sentence if word in word2index] for sentence in sentences]
w2v = Word2Vec(filtered_sentences, min_count=0, size=wordEncodingSize)
w2v_embeddings = np.concatenate((np.zeros((1, wordEncodingSize)), w2v.wv.syn0))

# Define the max sequence length to be the longest sentence in the training data. 
maxSequenceLength = max([len(sentence) for sentence in sentences])

def preprocess_numberize(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into list of numbers (denoting the index into the vocabulary).
    """
    tokenized = word_tokenize(sentence.lower())
        
    # Add the <SOS>/<EOS> tokens and numberize (all unknown words are represented as <UNK>).
    tokenized = ["<SOS>"] + tokenized + ["<EOS>"]
    numberized = [word2index.get(word, 0) for word in tokenized]
    
    return numberized

def preprocess_one_hot(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of one-hot vectors.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    one_hot_embedded = one_hot_embeddings[numberized]
    
    return one_hot_embedded

def preprocess_word2vec(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of word2vec embeddings.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    w2v_embedded = w2v_embeddings[numberized]
    
    return w2v_embedded

def compute_bleu(reference_sentences, predicted_sentence):
    """
    Given a list of reference sentences, and a predicted sentence, compute the BLEU similary between them.
    """
    reference_tokenized = [word_tokenize(ref_sent.lower()) for ref_sent in reference_sentences]
    predicted_tokenized = word_tokenize(predicted_sentence.lower())
    return sentence_bleu(reference_tokenized, predicted_tokenized)

#print(preprocess_one_hot(sentences[2]))
print("done")

done


In [5]:
from nltk.translate.bleu_score import SmoothingFunction
cc = SmoothingFunction()

def compute_bleu(reference_sentence, predicted_sentence):
    """
    Given a list of reference sentences, and a predicted sentence, compute the BLEU similary between them.
    """
    reference_tokenized = ["<SOS>"] + word_tokenize(reference_sentence.lower()) + ["<EOS>"] 
    predicted_tokenized = word_tokenize(predicted_sentence.lower())
    return sentence_bleu(reference_tokenized, predicted_tokenized,smoothing_function=cc.method4)

# 1. Setup Image Encoder

We load in the pre-trained VGG-16 model, and remove the final layer, as done in assignment 2.

In [6]:
# Your code goes here
encoder = models.vgg16(pretrained=True).cuda()
modified_classifier = nn.Sequential(*list(encoder.classifier.children())[:-1])

encoder.train()
modified_classifier

Sequential (
  (0): Linear (25088 -> 4096)
  (1): ReLU (inplace)
  (2): Dropout (p = 0.5)
  (3): Linear (4096 -> 4096)
  (4): ReLU (inplace)
  (5): Dropout (p = 0.5)
)

In [7]:
# Your code goes here
encoder_hidden_size = 4096
hidden_size = 512

class conv(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(conv, self).__init__()
        self.hidden_size = hidden_size

        self.out = nn.Linear(input_size, hidden_size)

    def forward(self, input):
        output = self.out(input)
        return output

converter = conv(encoder_hidden_size, hidden_size).cuda() 
converter 


conv (
  (out): Linear (4096 -> 512)
)

# 2. Setup a Language Decoder

We're going to reuse our decoder from Assignment 3.

In [8]:
# Your code goes here
encoder_hidden_size = 4096
input_size = wordEncodingSize
hidden_size = 512
output_size = vocabularySize


class DecoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input,hidden,state):
        output = F.relu(input)
        output,(hidden,state) = self.lstm(output,(hidden,state))
        output = self.out(output)
        output = F.log_softmax(output.squeeze())
        return output.unsqueeze(0),hidden,state

decoder = DecoderLSTM(input_size, hidden_size, output_size).cuda() 
decoder 

DecoderLSTM (
  (lstm): LSTM(300, 512)
  (out): Linear (512 -> 1000)
)

# 3. Train encoder-decoder



In [9]:
# Your code goes here

def _sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = Variable(seq_range_expand)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def compute_loss(logits, target, length):
    """
    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.

    Returns:
        loss: An average loss value masked by the length.
    """
    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = F.log_softmax(logits_flat)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = _sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum()
    return loss


In [10]:
def pad_seq(arr, length, pad_token):

    if len(arr) == length:
        return np.array(arr)
    
    return np.concatenate((arr, [pad_token]*(length - len(arr))))
                 

def create_training(start,end,set_batch):
    

    sentence_lens = [len(preprocess_numberize(sentence)) for sentence in set_batch] 

    sorted_indices = sorted(list(range(len(sentence_lens))), key=lambda i: sentence_lens[i], reverse=True)
    set_batch = [set_batch[i] for i in sorted_indices if sentence_lens[i] > 0]
    

    train_id_batch = [train_id for train_id in train_ids[start:end]]
    train_id_batch = [train_id_batch[i] for i in sorted_indices if sentence_lens[i] > 0]
    training_input = [load_image(train_id_to_file[train_id]).squeeze() for train_id in train_id_batch]
    training_input = torch.stack(training_input)
    
    sentence_lens = [sentence_lens[i] for i in sorted_indices if sentence_lens[i] > 0]   
    max_len = max(sentence_lens)                            
                         
    # Preprocess all of the sentences in each batch
    w2v_embedded_list = [preprocess_word2vec(sentence) for sentence in set_batch]
    w2v_embedded_list_padded = [pad_seq(embed, max_len, np.zeros(wordEncodingSize)) 
                                        for embed in w2v_embedded_list]
    numberized_list = [preprocess_numberize(sentence) for sentence in set_batch]
    numberized_list_padded = [pad_seq(numb, max_len, 0).astype(torch.LongTensor) for numb in numberized_list]
    
    one_hot_embedded_list = [preprocess_one_hot(sentence) for sentence in set_batch]
    one_hot_embedded_list_padded = [pad_seq(embed, max_len, np.zeros(vocabularySize)) 
                                        for embed in one_hot_embedded_list]
    
    one_hot_output = Variable(torch.FloatTensor(one_hot_embedded_list_padded)).cuda()
    one_hot_output = one_hot_output.transpose(0, 1)

                
    w2v_input = Variable(torch.FloatTensor(w2v_embedded_list_padded)).cuda()    
    training_output = Variable(torch.LongTensor(numberized_list_padded)).cuda()    
    training_output = training_output.transpose(0, 1)
    w2v_input = w2v_input.transpose(0, 1)

    return training_input,training_output,w2v_input,one_hot_output,sentence_lens



In [13]:

def train(input_variable, 
          target_variable, 
          w2v_input, 
          one_hot_output,
          encoder, 
          decoder, 
          decoder_optimizer, 
          converter_optimizer,
          input_lens,
          criterion, 
          embeddings=w2v_embeddings):
    
    decoder_optimizer.zero_grad()
    converter_optimizer.zero_grad()

    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]
    #print(target_length)

    # Pass through the encoder
    features_output = encoder.features(input_variable)
    classifier_input = features_output.view(batch_size, -1)
    encoder_output = modified_classifier(classifier_input)
    encoder_output = converter(encoder_output).unsqueeze(0)
    
    #print(encoder_output)  
    
    
    # Construct the decoder input (initially <SOS> for every batch)
    decoder_input = Variable(torch.FloatTensor([[embeddings[word2index["<SOS>"]]
                                                for i in range(w2v_input.size(1))]])).cuda()
    #print(decoder_input)

    #print(encoder_output)
    decoder_hidden = encoder_output
    decoder_state = encoder_output

    # Prepare the results tensor
    all_decoder_outputs = Variable(torch.zeros(*one_hot_output.size())).cuda()
    all_decoder_outputs[0] = Variable(torch.FloatTensor([[one_hot_embeddings[word2index["<SOS>"]]
                                                for i in range(w2v_input.size(1))]])).cuda()
        
    # Iterate over the indices after the first.
    for t in range(1,target_length):
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,decoder_state)
    
        
        if random.random() <= 0.9:
            decoder_input = w2v_input[t].unsqueeze(0)
        else:
            topv, topi = decoder_output.data.topk(1)
                       
            #Prepare the inputs
            decoder_input = torch.stack([Variable(torch.FloatTensor(embeddings[ni])).cuda()
                                         for ni in topi.squeeze()]).unsqueeze(0)
        
        #print(decoder_input)
        #print(decoder_hidden)
        #print(decoder_state)
        # Save the decoder output
        all_decoder_outputs[t] = decoder_output
    
        #print(all_decoder_outputs.transpose(0,1).contiguous())
    
    loss = compute_loss(all_decoder_outputs.transpose(0,1).contiguous(),
                        target_variable.transpose(0,1).contiguous(),
                    Variable(torch.LongTensor(input_lens)).cuda())
    
    loss.backward()    
    torch.nn.utils.clip_grad_norm(decoder.parameters(), 10.0) 
    decoder_optimizer.step()
    converter_optimizer.step()

    return loss.data[0]



In [14]:
num_epochs = 1
batch_size = 20

decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0001) 
converter_optimizer = torch.optim.Adam(converter.parameters(), lr=0.0001) 
criterion = nn.CrossEntropyLoss()

#len(train_id_to_file)

for _ in range(num_epochs):
    total_loss = 0
    for i in range(50000//batch_size):
        
        start_idx = i * batch_size % len(train_id_to_file)
        
        batch_sen = [[] for y in range(batch_size*5)] 
    
        count=0
        for train_id in train_ids[start_idx:start_idx + batch_size]:
            for n in range(5):
                batch_sen[count+(n*batch_size)].append(train_id_to_captions[train_id][n])
            count+=1    
        
        for cap in range(5):
            
            set_batch = []
            for batch in batch_sen[(cap*batch_size):(cap+1)*batch_size]: 
                set_batch.append(batch[0])
        
            training_input,training_output,w2v_input,one_hot_output,input_lens = create_training(
                start_idx, start_idx + batch_size,set_batch)
        
            loss = train(training_input,
                     training_output, 
                     w2v_input,
                     one_hot_output,
                     encoder,
                     decoder, 
                     decoder_optimizer,
                     converter_optimizer,
                     input_lens,    
                     criterion)
            
            total_loss+=loss
        
        
        if i % 50 == 0:
            print(i,total_loss/5000)
            total_loss = 0
    

torch.save(encoder.state_dict(), './encoder4.pth')
torch.save(converter.state_dict(), './converter4.pth')
torch.save(decoder.state_dict(), './decoder4.pth')
print("training done") 

0 0.006761196804046631
50 0.23840796203613282
100 0.21213768920898438
150 0.20291189098358153
200 0.19630762882232666
250 0.189369158411026
300 0.1844574998855591
350 0.17976785559654235
400 0.17656983375549318
450 0.17413294992446898
500 0.1699452073097229
550 0.16861181197166442
600 0.1670388701915741
650 0.16565209383964538
700 0.16334830818176269
750 0.16337445936203002
800 0.16284609575271605
850 0.16028103785514833
900 0.15539328904151917
950 0.15874498701095582
1000 0.1547668893814087
1050 0.15507685441970825
1100 0.1545604338645935
1150 0.1552305139541626
1200 0.1535365128517151
1250 0.1534611372947693
1300 0.15185172510147094
1350 0.15140251383781433
1400 0.1508380982875824
1450 0.15354797415733337
1500 0.1496823037147522
1550 0.14983204655647278
1600 0.15068031125068665
1650 0.14989451909065246
1700 0.14834486274719239
1750 0.14702813482284546
1800 0.1477630047798157
1850 0.14922148065567017
1900 0.14582167086601258
1950 0.14751230807304383
2000 0.14695905299186707
2050 0.142

# 4. MAP and Sampling Inference


Infer with batch size of 1

In [27]:
#val_images and ground truth captions 
decoder.load_state_dict(torch.load('./decoder4new.pth'))
converter.load_state_dict(torch.load('./converter4new.pth'))
encoder.load_state_dict(torch.load('./encoder4new.pth'))

#encoder.eval()

In [16]:
# Your code goes here
def map_inference(input_variable, embeddings=w2v_embeddings, max_length=20):
    
    features_output = encoder.features(input_variable)
    classifier_input = features_output.view(1, -1)
    encoder_output = modified_classifier(classifier_input)
    encoder_output = converter(encoder_output).unsqueeze(0)

    #print(encoder_output)

    # Construct the decoder input (initially <SOS> for every batch)
    decoder_input = Variable(torch.FloatTensor([[embeddings[word2index["<SOS>"]]]])).cuda()
    #print(decoder_input)
    decoder_hidden = encoder_output
    decoder_state = encoder_output
    
    # Iterate over the indices after the first.
    decoder_outputs = []
    for t in range(1,max_length):
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,decoder_state)
    
        # Get the top result
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        decoder_outputs.append(ni)

        if vocabulary[ni] == "<EOS>":
            break
        
        #Prepare the inputs
        decoder_input = Variable(torch.FloatTensor([[embeddings[ni]]])).cuda()

    return ' '.join(vocabulary[i] for i in decoder_outputs)

#send val images 
for val_id in val_ids[:5]:    
    img_input = load_image(val_id_to_file[val_id])
    caption = map_inference(img_input)
    print(caption)
    print(val_id_to_captions[val_id][0])
    print(" ")

a zebra standing in a field with a <UNK> . <EOS>
A zebra in captivity grazing in its exhibit.
 
a man is riding a snowboard on a beach . <EOS>
a person and a dog are standing near some cliffs
 
a zebra standing on a dirt <UNK> in a field . <EOS>
A close up of a zebra foraging on some grass
 
a man is a frisbee in a field . <EOS>
A dog sitting on a grassy hillside by a path.
 
a dog is <UNK> on a <UNK> <UNK> . <EOS>
A small white dog sitting on the floor on top of a rug.
 


In [17]:
# Your code goes here
def sample_inference(input_variable, embeddings=w2v_embeddings, max_length=20):
    
    features_output = encoder.features(input_variable)
    classifier_input = features_output.view(1, -1)
    encoder_output = modified_classifier(classifier_input)
    encoder_output = converter(encoder_output).unsqueeze(0)
    

    # Construct the decoder input (initially <SOS> for every batch)
    decoder_input = Variable(torch.FloatTensor([[embeddings[word2index["<SOS>"]]]])).cuda()
    decoder_hidden = encoder_output
    decoder_state = encoder_output
    
    # Iterate over the indices after the first.
    decoder_outputs = []
    for t in range(1,max_length):
        decoder_output,decoder_hidden,decoder_state = decoder(decoder_input,decoder_hidden,decoder_state)
        probs = np.exp(decoder_output.data[0].cpu().numpy())
        sample_sum = probs[0]
        random_sample = random.random()
        ni = 0
        while sample_sum < random_sample:
            ni += 1
            sample_sum += probs[ni]
            
        decoder_outputs.append(ni)

        if vocabulary[ni] == "<EOS>":
            break
        
        #Prepare the inputs
        decoder_input = Variable(torch.FloatTensor([[embeddings[ni]]])).cuda()

    return ' '.join(vocabulary[i] for i in decoder_outputs)

#send val images #send val images 
for val_id in val_ids[:5]:    
    img_input = load_image(val_id_to_file[val_id])
    caption = sample_inference(img_input)
    print(caption)
    print(val_id_to_captions[val_id][0])
    print(" ")

two zebra are standing in a field with one facing . <EOS>
A zebra in captivity grazing in its exhibit.
 
a teddy bear sitting on <UNK> next to a rock <EOS>
a person and a dog are standing near some cliffs
 
a zebra stands inside with <UNK> running in to the ground next to a fence . <EOS>
A close up of a zebra foraging on some grass
 
a cow laying on a park near a green apples <EOS>
A dog sitting on a grassy hillside by a path.
 
black dog and white dog resting in a room with a dog . <EOS>
A small white dog sitting on the floor on top of a rug.
 


# 5. Evaluate performance

For validation images compute the average BLEU score.

In [18]:
# Your code goes here
total_bleu = 0
for val_id in val_ids:
    #load image
    img_input = load_image(val_id_to_file[val_id])
    predicted = "<SOS>" + map_inference(img_input)
    
    #load all captions 
    cap_set = [[] for x in range(5)]
    for n in range(5):
        cap_set[n].append(val_id_to_captions[val_id][n])
    
    temp_bleu = 0
    for n in range(5):
        sentence = cap_set[n][0]
        bleu_score = compute_bleu(sentence, predicted)
        temp_bleu += bleu_score
    
    total_bleu += temp_bleu/5
    
print(total_bleu/len(val_ids))

0.258505524361466


The inference and the bleu score are not satisfactory. The training is done with 90% teacher forcing and only with 50K images in the training data (5 times for each image). Better results could be possible by playing around with the learning rate, teacher forcing ratio and increasing the number of samples trained on. 