In [1]:
%load_ext autoreload
%autoreload 2

In [102]:
import matplotlib.pyplot as plt
from IPython.display import Image 
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

import team36
from team36.training import validate, accuracy

DIR = '.'
DATA_DIR = f'{DIR}/data'

test_set = torchvision.datasets.MNIST(root=DATA_DIR, train=False, download=True, 
                                      transform=transforms.ToTensor())
test_loader = torch.utils.data.DataLoader(test_set, batch_size=100, shuffle=False, num_workers=2)

model = team36.mnist.VGG()
state_dict = torch.load(f"{DIR}/checkpoints/mnist-vgg.pth")
model.load_state_dict(state_dict)

criterion = nn.CrossEntropyLoss()

In [103]:
# "fast gradient sign method" from EXPLAINING AND HARNESSING ADVERSARIAL EXAMPLES
# gradient calculation from
#   https://stackoverflow.com/questions/54754153/autograd-grad-for-tensor-in-pytorch

_, (inputs, targets) = next(enumerate(test_loader))
inputs.requires_grad_()

out = model(inputs)
loss = criterion(out, targets)
loss_gradient = torch.autograd.grad(outputs=loss, inputs=inputs)

adversarial_inputs = torch.zeros(inputs.shape)

epsilon = 0.25
for i in range(20):
    eta = epsilon * torch.sign(loss_gradient[0][i])
    input = inputs[i]
    adversarial_input = input + eta
    
    min = torch.min(adversarial_input)
    max = torch.max(adversarial_input)
    adversarial_input = (adversarial_input - min) / (max - min)
    
    # display(transforms.functional.to_pil_image(input))
    # display(transforms.functional.to_pil_image(adversarial_input))
    adversarial_inputs[i] = adversarial_input

out = model(inputs)
loss = criterion(out, targets)
acc = accuracy(out, targets)
print(f"Regular Test Accuracy is {acc}")

out = model(adversarial_inputs)
loss = criterion(out, targets)
acc = accuracy(out, targets)
print(f"Adversarial Test Accuracy is {acc}")

Regular Test Accuracy is 0.8700000047683716
Adversarial Test Accuracy is 0.11999999731779099
