In [1]:
from collections import defaultdict
from IPython import display
from PIL import Image
from torch import nn
from torch.autograd import Variable
from torchvision import models, transforms

import json
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import torch

# Data Acquisition

For this assignment, you must download the dataset provided as a separate link on the course webpage and extract it into `data/`. The dataset contains approximately 20K training images and 100 validation images, with multiple captions/tags for each image. For this assignment, we are only concerned with the tags and ignore the captions.

For question two on the assignment, the dataset also contains a JSON file that maps from the ImageNet labels to the category names. 

Following the data downloading and unzipping, the code below loads in the data into memory accordingly.

In [2]:
# Define a global transformer to appropriately scale images and subsequently convert them to a Tensor.
img_size = 224
loader = transforms.Compose([
  transforms.Resize(img_size),
  transforms.CenterCrop(img_size),
  transforms.ToTensor(),
]) 
def load_image(filename):
    """
    Simple function to load and preprocess the image.

    1. Open the image.
    2. Scale/crop it and convert it to a float tensor.
    3. Convert it to a variable (all inputs to PyTorch models must be variables).
    4. Add another dimension to the start of the Tensor (b/c VGG expects a batch).
    5. Move the variable onto the GPU.
    """
    image = Image.open(filename).convert('RGB')
    image_tensor = loader(image).float()
    image_var = Variable(image_tensor).unsqueeze(0)
    return image_var.cuda()

load_image('data/train2014/COCO_train2014_000000000009.jpg')

tensor([[[[0.0039, 0.0078, 0.0039,  ..., 0.0471, 0.0471, 0.0314],
          [0.0039, 0.0039, 0.0039,  ..., 0.0353, 0.0353, 0.0392],
          [0.0039, 0.0039, 0.0039,  ..., 0.0392, 0.0392, 0.0510],
          ...,
          [0.7137, 0.7294, 0.7137,  ..., 0.1686, 0.1843, 0.1686],
          [0.7059, 0.6902, 0.6863,  ..., 0.1765, 0.1804, 0.2039],
          [0.6784, 0.6667, 0.6706,  ..., 0.1922, 0.2157, 0.2275]],

         [[0.1490, 0.1490, 0.1412,  ..., 0.0039, 0.0039, 0.0039],
          [0.1451, 0.1412, 0.1373,  ..., 0.0039, 0.0039, 0.0039],
          [0.1412, 0.1373, 0.1373,  ..., 0.0039, 0.0039, 0.0039],
          ...,
          [0.4392, 0.4667, 0.4549,  ..., 0.2588, 0.2745, 0.2863],
          [0.4353, 0.4235, 0.4196,  ..., 0.2745, 0.2980, 0.3137],
          [0.4118, 0.4000, 0.4000,  ..., 0.3020, 0.3176, 0.3020]],

         [[0.5294, 0.5294, 0.5294,  ..., 0.1451, 0.1412, 0.1333],
          [0.5255, 0.5333, 0.5373,  ..., 0.1725, 0.1451, 0.1412],
          [0.5373, 0.5490, 0.5451,  ..., 0

In [3]:
# Load ImageNet label to category name mapping.
imagenet_categories = [value for key,value in sorted(json.load(open('data/imagenet_categories.json')).items(), key=lambda t: int(t[0]))]

# Load annotations file for the 20K training images.
mscoco_train = json.load(open('data/annotations/train2014.json'))
train_ids = [entry['id'] for entry in mscoco_train['images']]
train_id_to_file = {entry['id']: 'data/train2014/' + entry['file_name'] for entry in mscoco_train['images']}
category_to_name = {entry['id']: entry['name'] for entry in mscoco_train['categories']}
category_idx_to_name = [entry['name'] for entry in mscoco_train['categories']]
category_to_idx = {entry['id']: i for i,entry in enumerate(mscoco_train['categories'])}

# Load annotations file for the 100 validation images.
mscoco_val = json.load(open('data/annotations/val2014.json'))
val_ids = [entry['id'] for entry in mscoco_val['images']]
val_id_to_file = {entry['id']: 'data/val2014/' + entry['file_name'] for entry in mscoco_val['images']}

# We extract out all of the category labels for the images in the training set. We use a set to ignore 
# duplicate labels.
train_id_to_categories = defaultdict(set)
for entry in mscoco_train['annotations']:
    train_id_to_categories[entry['image_id']].add(entry['category_id'])

# We extract out all of the category labels for the images in the validation set. We use a set to ignore 
# duplicate labels.
val_id_to_categories = defaultdict(set)
for entry in mscoco_val['annotations']:
    val_id_to_categories[entry['image_id']].add(entry['category_id'])

Let us take a look at an image and its corresponding category labels. We consider the image with the id 391895 and the corresponding filename, `data/val2014/COCO_val2014_000000391895.jpg`. The image is shown below.

![image](data/val2014/COCO_val2014_000000391895.jpg)

The following code determines the category labels for this image.

In [4]:
for i,category in enumerate(val_id_to_categories[391895]):
    print("%d. %s" % (i, category_to_name[category]))

0. person
1. bicycle
2. motorcycle


# 1. Loading a Pre-trained Convolutional Neural Network (CNN)

We will work with the VGG-16 image classification CNN network first introduced in [Very Deep Convolutional Neural Networks for Large-Scale Image Recognition](https://arxiv.org/pdf/1409.1556.pdf) by K. Simonyan and A. Zisserman.

Fairly straightforwardly, we load the pre-trained VGG model and indicate to PyTorch that we are using the model for inference rather than training.

In [5]:
vgg_model = models.vgg16(pretrained=True).cuda()
vgg_model.eval()

# Let's see what the model looks like.
vgg_model

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (17): Conv2d

# 2. Making Predictions Using VGG-16

Given the pre-trained network, we must now write the code to make predictions on the 10 validation images via a forward pass through the network. Typically the final layer of VGG-16 is a softmax layer, however the pre-trained PyTorch model that we are using does not have softmax built into the final layer (instead opting to incorporate it into the loss function) and therefore we must **manually** apply softmax to the output of the function.

In [6]:
softmax = nn.Softmax()
for image_id in val_ids[:10]:
    # Display the image.
    # -- Your code goes here --
    img_str = val_id_to_file[image_id]
    pil_im = Image.open(img_str, 'r')
    pil_im.show()
    
    # Print all of the category labels for this image.
    # -- Your code goes here --
    print("\nTrue value for id: %d" % image_id)
    for i,category in enumerate(val_id_to_categories[image_id]):
        print("%d. %s" % (i, category_to_name[category]))
  
    # Load/preprocess the image.
    img = load_image(img_str)

    # Run the image through the model and softmax.
    label_likelihoods = softmax(vgg_model(img)).squeeze()

    # Get the top 5 labels, and their corresponding likelihoods.
    probs, indices = label_likelihoods.topk(5)

    # Iterate and print out the predictions.
    # -- Your code goes here --
    print("\nPredictions for id: %d" % image_id)
    img_ids = [index.item() for index in indices]
    prob_vals = [prob.item() for prob in probs]
    for prob, id in zip(prob_vals, img_ids):
        y_pred = imagenet_categories[id]
        print(prob, y_pred)


True value for id: 391895
0. person
1. bicycle
2. motorcycle





Predictions for id: 391895
0.15975737571716309 assault rifle, assault gun
0.11209803819656372 stretcher
0.08013104647397995 rifle
0.05695946887135506 military uniform
0.04560713842511177 jeep, landrover

True value for id: 522418
0. sink
1. person
2. cake
3. knife

Predictions for id: 522418
0.2567004859447479 mosquito net
0.08372808247804642 toilet tissue, toilet paper, bathroom tissue
0.08117162436246872 wardrobe, closet, press
0.04646625369787216 washbasin, handbasin, washbowl, lavabo, wash-hand basin
0.0376296229660511 medicine chest, medicine cabinet

True value for id: 184613
0. person
1. umbrella
2. cow

Predictions for id: 184613
0.1697528064250946 Arabian camel, dromedary, Camelus dromedarius
0.08139925450086594 sandbar, sand bar
0.07311606407165527 Italian greyhound
0.06620091944932938 Weimaraner
0.03160366043448448 swimming trunks, bathing trunks

True value for id: 318219
0. tv
1. person
2. mouse
3. keyboard

Predictions for id: 318219
0.1280694156885147 plastic bag
0.0979

# 3. Computing Generic Visual Features using CNN

Since, rather than the output of VGG, we want a fixed sized vector representation of each image, we remove the last linear layer. The implementation of the forward function for VGG is shown below:

```
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
```
We aim to preserve everything but the final component of the classifier, meaning we must define an alternative equivalent to `self.classifier`.

In [7]:
# Remove the final layer of the classifier, and indicate to PyTorch that the model is being used for inference
# rather than training (most importantly, this disables dropout).

# -- Your code goes here --
short_classifier = vgg_model.classifier[:-1]
print(short_classifier)

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace)
  (2): Dropout(p=0.5)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace)
  (5): Dropout(p=0.5)
)


In [8]:
# First we vectorize all of the features of training images and write the results to a file.

# -- Your code goes here --
n = len(train_ids)
training_vectors = np.empty([n, 4096])
print("\n--------------------- Vectorizing Training Images ------------------------------------")
for i, image_id in enumerate(train_ids):
    # find path
    img_str = train_id_to_file[image_id]

    # load into tensor
    image_vector = load_image(img_str)

    # compute forward pass
    x = vgg_model.features(image_vector)
    x = x.view(x.size(0), -1)
    x = short_classifier(x)
    training_vectors[i] = x.detach().cpu().numpy()
    if i % 100 == 0:
        percent_complete = (i / n) * 100
        print("\n %.2f%% complete!" % percent_complete)
    
np.save(open('outputs/training_vectors', 'wb+'), training_vectors)


--------------------- Vectorizing Training Images ------------------------------------

 0.00% complete!

 0.50% complete!

 1.00% complete!

 1.50% complete!

 2.00% complete!

 2.50% complete!

 3.00% complete!

 3.50% complete!

 4.00% complete!

 4.50% complete!

 5.00% complete!

 5.50% complete!

 6.00% complete!

 6.50% complete!

 7.00% complete!

 7.50% complete!

 8.00% complete!

 8.50% complete!

 9.00% complete!

 9.50% complete!

 10.00% complete!

 10.50% complete!

 11.00% complete!

 11.50% complete!

 12.00% complete!

 12.50% complete!

 13.00% complete!

 13.50% complete!

 14.00% complete!

 14.50% complete!

 15.00% complete!

 15.50% complete!

 16.00% complete!

 16.50% complete!

 17.00% complete!

 17.50% complete!

 18.00% complete!

 18.50% complete!

 19.00% complete!

 19.50% complete!

 20.00% complete!

 20.50% complete!

 21.00% complete!

 21.50% complete!

 22.00% complete!

 22.50% complete!

 23.00% complete!

 23.50% complete!

 24.00% complete!



In [9]:
# Next we vectorize all of the features of validation images and write the results to a file.
    
# -- Your code goes here --
n = len(val_ids)
validation_vectors = np.empty([n, 4096])
print("\n--------------------- Vectorizing Validation Images ------------------------------------")
for i, image_id in enumerate(val_ids):
    # find path
    img_str = val_id_to_file[image_id]

    # load into tensor
    image_vector = load_image(img_str)

    # compute forward pass
    x = vgg_model.features(image_vector)
    x = x.view(x.size(0), -1)
    x = short_classifier(x)
    validation_vectors[i] = x.detach().cpu().numpy()
    percent_complete = (i / n) * 100
    print("\n %.2f%% complete!" % percent_complete)

np.save(open('outputs/validation_vectors', 'wb+'), validation_vectors)


--------------------- Vectorizing Validation Images ------------------------------------

 0.00% complete!

 1.00% complete!

 2.00% complete!

 3.00% complete!

 4.00% complete!

 5.00% complete!

 6.00% complete!

 7.00% complete!

 8.00% complete!

 9.00% complete!

 10.00% complete!

 11.00% complete!

 12.00% complete!

 13.00% complete!

 14.00% complete!

 15.00% complete!

 16.00% complete!

 17.00% complete!

 18.00% complete!

 19.00% complete!

 20.00% complete!

 21.00% complete!

 22.00% complete!

 23.00% complete!

 24.00% complete!

 25.00% complete!

 26.00% complete!

 27.00% complete!

 28.00% complete!

 29.00% complete!

 30.00% complete!

 31.00% complete!

 32.00% complete!

 33.00% complete!

 34.00% complete!

 35.00% complete!

 36.00% complete!

 37.00% complete!

 38.00% complete!

 39.00% complete!

 40.00% complete!

 41.00% complete!

 42.00% complete!

 43.00% complete!

 44.00% complete!

 45.00% complete!

 46.00% complete!

 47.00% complete!

 48.00%

# 4. Visual Similarity

We now use the generated vectors, to find the closest training image for each validation image. This can easily be done by finding the training vector that minimizes the Euclidean distance for every validation image. We repeat this exercise for the first 10 images in the validation set.

In [16]:
# -- Your code goes here --
for val_id, v in zip(val_ids[:10], validation_vectors[:10]):
    # initialize defaults
    best_dist = 99999999
    best_vec = None
    best_id = None

    # find closet vector, store info
    print("\n")
    for train_id, t in zip(train_ids, training_vectors):
        dist = np.linalg.norm(v-t)
        if dist < best_dist:
            best_dist = dist
            best_vect = t
            best_id = train_id

    # print closest images
    val_img_str = val_id_to_file[val_id]
    train_img_str = train_id_to_file[best_id]

    # show images
    pil_im = Image.open(val_img_str, 'r')
    pil_im.show()
    pil_im = Image.open(train_img_str, 'r')
    pil_im.show()























# 5. Training a Multi-Label Classification Network

We now build a two layer classification network, which takes 4096-dimensional vectors as input and outputs the probabilities of the 80 categories present in MSCOCO. 

For this purpose, we utilize two layers (both containing sigmoid activation functions) with the hidden dimension set to 512. 

In [24]:
# First we construct a class for the model
# -- Your code goes here --
class Net(nn.Module):
    def __init__(self, dim_in, dim_hidden, dim_out):
        super(Net, self).__init__()
        self.linear1 = nn.Linear(dim_in, dim_hidden)
        self.linear2 = nn.Linear(dim_hidden, dim_out)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.linear1(x)
        x = self.sigmoid(x)
        x = self.linear2(x)
        x = self.sigmoid(x)
        return x

In [44]:
# The output data is prepared by representing each output as a binary vector of categories
# -- Your code goes here --

def train(model, learning_rate=0.001, batch_size=100, epochs=5):
    """
    Training function which takes as input a model, a learning rate and a batch size.
  
    After completing a full pass over the data, the function exists, and the input model will be trained.
    """
    # -- Your code goes here --
    criterion = nn.MultiLabelSoftMarginLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    num_vectors = len(training_vectors)
    batches = num_vectors // batch_size
    
    for _ in range(epochs):    
        running_loss = 0.0
        
        for i in range(batches):
            batch = training_vectors[100*i:100*i+99]
            torch_batch = torch.from_numpy(batch).float().cuda()
            print(torch_batch.is_cuda)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(torch_batch)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 200 == 199:    # print every 200 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 200))
                running_loss = 0.0


# Finally train the model
model = Net(4096, 512, 80).cuda()
train(model)

True


NameError: name 'labels' is not defined

In [40]:
print(category_to_idx)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 13: 11, 14: 12, 15: 13, 16: 14, 17: 15, 18: 16, 19: 17, 20: 18, 21: 19, 22: 20, 23: 21, 24: 22, 25: 23, 27: 24, 28: 25, 31: 26, 32: 27, 33: 28, 34: 29, 35: 30, 36: 31, 37: 32, 38: 33, 39: 34, 40: 35, 41: 36, 42: 37, 43: 38, 44: 39, 46: 40, 47: 41, 48: 42, 49: 43, 50: 44, 51: 45, 52: 46, 53: 47, 54: 48, 55: 49, 56: 50, 57: 51, 58: 52, 59: 53, 60: 54, 61: 55, 62: 56, 63: 57, 64: 58, 65: 59, 67: 60, 70: 61, 72: 62, 73: 63, 74: 64, 75: 65, 76: 66, 77: 67, 78: 68, 79: 69, 80: 70, 81: 71, 82: 72, 84: 73, 85: 74, 86: 75, 87: 76, 88: 77, 89: 78, 90: 79}


# 6. End-to-End Model Fine-tuning

Instead of training *only* the final two layers, we now create an end-to-end model and train the entire thing. 

In [None]:
# First we construct a class for the model
# -- Your code goes here --

In [None]:
# The output data is prepared by representing each output as a binary vector of categories
# -- Your code goes here --

def train(model, learning_rate=0.001, batch_size=50, epochs=2):
    """
    Training function which takes as input a model, a learning rate and a batch size.
  
    After completing a full pass over the data, the function exists, and the input model will be trained.
    """
    # -- Your code goes here --
    
# Finally train the model
train(model)

In [None]:
# Now repeat step two using the end-to-end classifier.
# -- Your code goes here --

# 7. Hyper-parameter Tuning

Now we do a grid search over the learning rate and batch size.

In [None]:
# -- Your code goes here --