In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as V
from torchvision.models import vgg19
import torchvision.transforms as transforms
import numpy as np
from PIL import Image

#### torchvision also ships a batch-normalized VGG-19 (`vgg19_bn`). The paper does not specify whether batch norm was used, so the plain `vgg19` is used here.

In [5]:
# Make GPU 0 the current CUDA device for everything that follows.
torch.cuda.set_device(0)

# Build VGG-19 (same positional `True` argument as before), switch it to
# evaluation mode, and move it onto the GPU.
vgg = vgg19(True)
vgg.eval()
vgg = vgg.cuda()

In [6]:
# Input pipeline for the pretrained network: resize the shorter side to 224 px,
# convert to a tensor, then normalize each channel with the mean/std that
# torchvision documents for its ImageNet-pretrained models.
preprocess = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])


def predict_logits(image):
    """Run a single PIL image through `vgg` and return the raw class scores.

    Args:
        image: RGB PIL image.

    Returns:
        The output tensor of `vgg` for a batch holding only `image` (on the GPU).
    """
    batch = preprocess(image).unsqueeze(0).cuda()
    # Sanity-check inference only: no autograd graph is needed here.
    with torch.no_grad():
        return vgg(batch)


# Force RGB so the 3-channel normalization also works for grayscale/CMYK JPEGs.
hen = Image.open("hen.jpg").convert("RGB")
out = predict_logits(hen)
print('Prediction for hen.jpg is: %s' % out.argmax().item())

dog = Image.open("dog.jpg").convert("RGB")
out = predict_logits(dog)
print('Prediction for dog.jpg is: %s' % out.argmax().item())

Prediction for hen.jpg is: 8
Prediction for dog.jpg is: 207


#### VGG is verified to be making correct predictions (ImageNet class 8 is "hen", class 207 is "golden retriever")

## Now let's extract the feature pyramid (section 3.1)
#### Check the VGG architecture

In [7]:
# Print the module tree; the numbers in parentheses are the indices into
# `vgg.features` that are used below to pick the layers to hook.
print(vgg)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (17): ReLU(inplace)

#### We want the outputs of the first 5 ReLUs (indices 1, 3, 6, 8 and 11 of `vgg.features`)
#### Use forward hooks to extract them

In [8]:
# Activations captured by the hooks, in the order the hooked layers run.
feature_pyramid = []


def extract_feature(self, input, output):
    """Forward hook: store a layer's output and print its input/output sizes.

    Args:
        self: the module the hook is attached to (PyTorch passes it as the
            first positional argument).
        input: tuple of the module's inputs; only the first one is inspected.
        output: the module's output tensor; appended to `feature_pyramid`.
    """
    feature_pyramid.append(output)
    print('Inside ' + self.__class__.__name__)
    print('input size:', input[0].size())
    print('output size:', output.size())


# Indices of the first five ReLU modules inside `vgg.features`.
layer_inds = [1, 3, 6, 8, 11]

# Re-running this cell must not stack a second set of hooks on the same layers
# (each forward pass would then append and print every activation twice), so
# drop any hooks registered by a previous run of this cell first.
for stale_handle in globals().get('hook_handles', []):
    stale_handle.remove()

# Keep the handles so the hooks can be removed later via `handle.remove()`.
hook_handles = []
for layer_ind in layer_inds:
    hook_handles.append(
        vgg.features[layer_ind].register_forward_hook(extract_feature))

In [13]:
# Start from an empty list so it only holds the activations of this pass.
feature_pyramid = []

# One forward pass over the dog image; the registered hooks fill
# `feature_pyramid` as a side effect.
dog_batch = preprocess(dog).unsqueeze(0)
out = vgg(V(dog_batch).cuda())

Inside ReLU
input size: torch.Size([1, 64, 224, 224])
output size: torch.Size([1, 64, 224, 224])
Inside ReLU
input size: torch.Size([1, 64, 224, 224])
output size: torch.Size([1, 64, 224, 224])
Inside ReLU
input size: torch.Size([1, 128, 112, 112])
output size: torch.Size([1, 128, 112, 112])
Inside ReLU
input size: torch.Size([1, 128, 112, 112])
output size: torch.Size([1, 128, 112, 112])
Inside ReLU
input size: torch.Size([1, 256, 56, 56])
output size: torch.Size([1, 256, 56, 56])


And those are the first 5 ReLU outputs, now stored in `feature_pyramid`.