# Boilerplate

Package installation, imports, and dataloaders. A simple model is also defined; you can change it to your favourite architecture if you want.

In [None]:
!pip install tensorboardX



In [None]:


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import time

from torchvision import datasets, transforms
from tensorboardX import SummaryWriter

# Device selection: everything runs on the CPU unless `use_cuda` is switched on.
use_cuda = False
batch_size = 64
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# Fix the PyTorch and NumPy seeds so that repeated runs start from the same state.
torch.manual_seed(42)
np.random.seed(42)


## Dataloaders
# Both MNIST splits are stored under 'mnist_data/' (downloaded when missing) and
# share one preprocessing pipeline that converts each image to a tensor.
to_tensor = transforms.Compose([transforms.ToTensor()])

train_dataset = datasets.MNIST('mnist_data/', train=True, download=True, transform=to_tensor)
test_dataset = datasets.MNIST('mnist_data/', train=False, download=True, transform=to_tensor)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

## Simple NN. You can change this if you want.
class Net(nn.Module):
    """Two-layer fully connected classifier: 784 inputs -> 200 hidden units -> 10 logits."""

    def __init__(self):
        super().__init__()
        # Layers are created in this order so seeded initialisation stays reproducible.
        self.fc = nn.Linear(28*28, 200)
        self.fc2 = nn.Linear(200, 10)

    def forward(self, x):
        """Flatten `x` into rows of 784 values and return the raw class logits."""
        flat = x.view(-1, self.fc.in_features)
        hidden = F.relu(self.fc(flat))
        return self.fc2(hidden)

class Normalize(nn.Module):
    """Parameter-free layer that shifts its input by 0.1307 and scales it by 1/0.3081."""

    def forward(self, x):
        centered = x - 0.1307
        return centered / 0.3081

# Put the data normalization in front of the classifier as its first "layer".
# This lets the attacks search for adversarial examples of the real image,
# rather than of the normalized image.
model = nn.Sequential(Normalize(), Net()).to(device)

# Switch the model to training mode.
model.train()

Sequential(
  (0): Normalize()
  (1): Net(
    (fc): Linear(in_features=784, out_features=200, bias=True)
    (fc2): Linear(in_features=200, out_features=10, bias=True)
  )
)

# Implement the Attacks

Functions are given a simple useful signature that you can start with. Feel free to extend the signature as you see fit.

You may find it useful to create a 'batched' version of PGD that you can use to create the adversarial attack.

In [None]:
# `targeted` is accepted so the signature can later be extended to a targeted attack,
# but it is currently ignored: only the untargeted attack (maximize the loss of the
# correct label) is implemented.
def fgsm_untargeted(model, x, eps, targeted=False): # x is a list of (input, correct output) pairs
  """Run a one-step untargeted FGSM attack on every (input, label) pair in `x`.

  Each input is moved by `eps` along the sign of the loss gradient with respect to
  that input, and the model's prediction on the perturbed input is recorded.

  Args:
    model: callable mapping one input tensor to logits of shape (1, num_classes).
    x: sized iterable of (input tensor, label) pairs; a label may be an int or a
      0-d tensor.
    eps: step size of the perturbation.
    targeted: unused; kept for signature compatibility.

  Returns:
    (outputs, robustness): `outputs` is a list of (label, prediction on the
    perturbed input) pairs, one per example. `robustness` is the percentage of
    examples whose perturbed prediction still equals the label, so examples the
    model already misclassifies count as successful attacks. An empty `x` yields
    ([], 100.0).

  The caller's tensors and the model parameters' `.grad` fields are left untouched.
  """
  total = len(x)
  if total == 0:
    # No examples means no attack was found; avoid dividing by zero below.
    return [], 100.0

  criterion = nn.CrossEntropyLoss()
  outputs = []
  number_of_attacks = 0
  for ip, op in x:
    label = int(op)
    # Work on a detached copy so the caller's tensor never gets requires_grad
    # switched on and never accumulates stale gradients between calls.
    point = ip.detach().requires_grad_(True)
    target = torch.tensor([label], dtype=torch.long, device=point.device)
    loss = criterion(model(point), target)
    # autograd.grad returns only the input gradient and does not write into the
    # model parameters' .grad.
    (grad,) = torch.autograd.grad(loss, point)
    fgsm_ip = point.detach() + eps*torch.sign(grad)
    with torch.no_grad():
      fgsm_op = model(fgsm_ip).argmax(dim=1).item()
    outputs.append((op, fgsm_op))
    if label != fgsm_op:
      number_of_attacks = number_of_attacks + 1
  robustness = (1 - (number_of_attacks/total))*100
  return outputs, robustness

def project(y,x,eps):
  """Clamp `y` element-wise into the interval [x - eps, x + eps]."""
  lower = x - eps
  upper = x + eps
  return torch.clamp(y, lower, upper)

# def pgd_untargeted(model, x, labels, k, eps, eps_step):

# def pgd_untargeted(model, x, k, eps, eps_step):
#   outputs = []
#   number_of_attacks = 0
#   L = nn.CrossEntropyLoss()
#   for ip, op in x:
#     ipk = ip
#     ipk.requires_grad_()
#     for i in range(k):
#       loss = L(model(ipk), torch.tensor([op], dtype=torch.long))
#       ipk.retain_grad()
#       loss.backward(retain_graph=True)
#       nu = eps_step*torch.sign((ipk.grad))
#       ipk = project(ipk+nu, ip, eps)
#       # ipk.retain_grad()
#       fgsm_op = model(ipk).argmax(dim=1).item()

#       outputs.append((op,fgsm_op))
#       if op != fgsm_op:
#         number_of_attacks = number_of_attacks + 1
#         break # found attack
#   robustness = (1 - (number_of_attacks/len(x)))*100
#   return outputs, robustness



def pgd_untargeted(model, x, k, eps, eps_step):
  """Run an untargeted PGD attack with `k` steps on every (input, label) pair in `x`.

  Starting from the clean input, each step moves by `eps_step` along the sign of
  the loss gradient and projects back (via `project`) into the `eps` box around
  the clean input. The iterate with the highest loss, including the clean
  starting point, is kept as the worst case.

  Args:
    model: callable mapping one input tensor to logits of shape (1, num_classes).
    x: sized iterable of (input tensor, label) pairs; a label may be an int or a
      0-d tensor.
    k: number of gradient steps.
    eps: radius of the box the perturbed input must stay in.
    eps_step: size of each gradient step.

  Returns:
    (outputs, robustness): `outputs` holds one (worst-case input, label) pair
    for every example whose worst-case input is NOT classified as the label;
    the worst-case inputs are detached tensors. `robustness` is the percentage
    of examples for which no such input was found. An empty `x` yields
    ([], 100.0).

  The caller's tensors and the model parameters' `.grad` fields are left untouched.
  """
  total = len(x)
  if total == 0:
    # No examples means no attack was found; avoid dividing by zero below.
    return [], 100.0

  criterion = nn.CrossEntropyLoss()

  def loss_and_grad(point, target):
    # Evaluate the loss at `point` and its gradient w.r.t. `point` only, on a
    # fresh leaf so no graph is carried from one PGD step to the next.
    leaf = point.detach().requires_grad_(True)
    loss = criterion(model(leaf), target)
    (grad,) = torch.autograd.grad(loss, leaf)
    return loss.detach(), grad

  outputs = []
  number_of_attacks = 0
  for ip, op in x:
    label = int(op)
    clean = ip.detach()
    target = torch.tensor([label], dtype=torch.long, device=clean.device)

    ipk = clean
    loss, grad = loss_and_grad(ipk, target)
    worstcase, worstloss = ipk, loss
    for _ in range(k):
      ipk = project(ipk + eps_step*torch.sign(grad), clean, eps).detach()
      loss, grad = loss_and_grad(ipk, target)
      if loss > worstloss:
        worstcase, worstloss = ipk, loss

    # Judge success on the same point that is returned (the highest-loss
    # iterate), not on whichever iterate happened to come last.
    with torch.no_grad():
      worstcase_output = model(worstcase).argmax(dim=1).item()
    if label != worstcase_output:
      number_of_attacks = number_of_attacks + 1
      outputs.append((worstcase, op))
  robustness = (1 - (number_of_attacks/total))*100
  return outputs, robustness



Accuracy on test set:

In [None]:
# Testing accuracy of the model

def accuracy(model, test_set=None):
  """Return the percentage of examples in `test_set` that `model` classifies correctly.

  Args:
    model: callable mapping one input tensor to logits of shape (1, num_classes).
    test_set: sized iterable of (input tensor, label) pairs. When omitted, the
      notebook-level `test_dataset` is looked up at call time, so re-running the
      data cell is picked up without redefining this function.

  Returns:
    Accuracy as a float between 0 and 100.

  Raises:
    ValueError: if `test_set` contains no examples.
  """
  if test_set is None:
    test_set = test_dataset
  total = len(test_set)
  if total == 0:
    raise ValueError("accuracy() needs at least one example in test_set")

  correct = 0
  # Pure inference: skip autograd bookkeeping for every forward pass.
  with torch.no_grad():
    for ip,op in test_set:
      if model(ip).argmax(dim=1).item() == op:
        correct = correct + 1
  return (correct/total)*100

# Test-set accuracy of the model before any training.
clean_accuracy = accuracy(model)
print('Accuracy(percentage):', clean_accuracy)

Accuracy(percentage): 10.22


Initial Attacks

In [None]:
# Robustness of the model against both attacks on the test set (eps = 0.01).
fgsm_result = fgsm_untargeted(model, test_dataset, 0.01, False)
print('Percent of examples where FGSM attack was not found:', fgsm_result[1])

pgd_result = pgd_untargeted(model, test_dataset, 3, 0.01, 0.005)
print('Percent of examples where PGD attack was not found:', pgd_result[1])

Percent of examples where FGSM attack was not found: 2.1700000000000053
Percent of examples where PGD attack was not found: 2.1299999999999986


# Implement Adversarial Training

In [None]:
def train_model(model, num_epochs, enable_defense=True, learning_rate=0.0001,
                attack_steps=10, attack_eps=0.01, attack_eps_step=0.005):
    """Train `model` with Adam on `train_loader`, optionally with adversarial training.

    For every training batch, when `enable_defense` is set, PGD is run on the
    batch and the model first takes one optimizer step on the adversarial
    examples that fooled it. It then always takes one step on the clean batch.
    After each epoch the accuracy on `test_loader` and the elapsed time are
    printed.

    Args:
        model: network to train in place.
        num_epochs: number of passes over `train_loader`.
        enable_defense: whether to add the adversarial-example step.
        learning_rate: Adam learning rate.
        attack_steps: number of PGD steps (`k`) used for the defense.
        attack_eps: PGD perturbation radius used for the defense.
        attack_eps_step: PGD step size used for the defense.

    Relies on the notebook-level `train_loader`, `test_loader`, `device` and
    `pgd_untargeted`.
    """
    opt = optim.Adam(params=model.parameters(), lr=learning_rate)
    ce_loss = torch.nn.CrossEntropyLoss()

    for epoch in range(1, num_epochs + 1):
        t1 = time.time()
        for x_batch, y_batch in train_loader:
            # Move the batch first so the attack and the model see the same device.
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            if enable_defense:
                pairs = list(zip(x_batch, y_batch))
                # PGD does not necessarily find the worst loss in the eps-ball
                # around each input, but it does find some bad loss.
                worstcase = pgd_untargeted(model, pairs, attack_steps, attack_eps, attack_eps_step)[0]
                if len(worstcase) != 0:
                    # Split into the adversarial inputs and their true labels.
                    worst_ip, op = [list(i) for i in zip(*worstcase)]
                    # Detach so the step only back-propagates through the model,
                    # not through whatever graph the attack left attached.
                    adv_batch = torch.stack(worst_ip, dim=0).detach()
                    opt.zero_grad()
                    out = model(adv_batch)
                    batch_loss = ce_loss(out, torch.stack(op, dim=0))
                    batch_loss.backward()
                    opt.step()

            # zero_grad also clears any gradients left on the parameters by the attack.
            opt.zero_grad()
            out = model(x_batch)
            batch_loss = ce_loss(out, y_batch)
            batch_loss.backward()
            opt.step()

        tot_test, tot_acc = 0.0, 0.0
        # Evaluation only needs predictions, so skip autograd bookkeeping.
        with torch.no_grad():
            for x_batch, y_batch in test_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                out = model(x_batch)
                pred = torch.max(out, dim=1)[1]
                tot_acc += pred.eq(y_batch).sum().item()
                tot_test += x_batch.size()[0]
        t2 = time.time()

        # An empty test loader reports nan instead of raising ZeroDivisionError.
        epoch_acc = tot_acc/tot_test if tot_test else float('nan')
        print('Epoch %d: Accuracy %.5lf [%.2lf seconds]' % (epoch, epoch_acc, t2-t1))



Training (first run the notebook with enable_defense turned off, then with it turned on)

In [None]:
# Train for 3 epochs with the PGD-based defense enabled; train_model prints the
# test accuracy and the elapsed time after every epoch.
train_model(model, 3, enable_defense=True)

Epoch 1: Accuracy 0.94200 [473.35 seconds]
Epoch 2: Accuracy 0.95480 [472.19 seconds]
Epoch 3: Accuracy 0.96360 [473.92 seconds]


After training

In [None]:
# Repeat the accuracy and robustness measurements on the trained model.
trained_accuracy = accuracy(model)
print('Accuracy(percentage):', trained_accuracy)

trained_fgsm = fgsm_untargeted(model, test_dataset, 0.01, False)
print('Percent of examples where FGSM attack was not found:', trained_fgsm[1])

trained_pgd = pgd_untargeted(model, test_dataset, 3, 0.01, 0.005)
print('Percent of examples where PGD attack was not found:', trained_pgd[1])

Accuracy(percentage): 96.36
Percent of examples where FGSM attack was not found: 93.77
Percent of examples where PGD attack was not found: 93.72


# Study Accuracy, Quality, etc.

Compare the various results and report your observations on the submission.

In [None]:
# Your code here