In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision 
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from torch.autograd import Variable, grad
import time
import torchvision.models as models
import os
from torch.utils.data.sampler import SubsetRandomSampler

## Load dataset

In [2]:
# CIFAR10 Test dataset and dataloader declaration
transform = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor()])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                 download=True, transform=transform)
indices = list(range(100))
testSampler = SubsetRandomSampler(indices)

testloader = torch.utils.data.DataLoader(testset, batch_size=1,
                                         shuffle=False, num_workers=2, sampler=testSampler)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


## Load model

In [3]:
# Define what device we are using
print("CUDA Available: ",torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the network
model = models.resnet50(pretrained=True)
model.fc = nn.Linear(2048, 10)

# freeze front layers
ct = 0
for child in model.children():
    ct += 1
    if ct < 5:
        for param in child.parameters():
            param.requires_grad = False
model = model.to(device)
# model.eval()

CUDA Available:  False


## Train

In [4]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=30)
criterion = nn.CrossEntropyLoss()

In [5]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

In [6]:
def train(train_loader, model, criterion, optimizer, epoch):
    """
        Run one train epoch
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    total = 0
    correct = 0
    
    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):

        # measure data loading time
        data_time.update(time.time() - end)

        target = target.to(device)
        input = input.to(device)
        target_var = target

        # compute output
        output = model(input)
        loss = criterion(output, target)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # measure accuracy and record loss
        _, predicted = output.max(1)
        losses.update(loss.item(), input.size(0))
        total += target.size(0)
        correct += predicted.eq(target).sum().item()


        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 50 == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Accuracy {acc:.3f}'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, acc=correct/total))



In [7]:
def save_checkpoint(state,filename='checkpoint.pth.tar'):
    """
    Save the training model
    """
    torch.save(state, filename)

In [None]:
global best_prec1
best_prec1 = 0

for epoch in range(50):

    # train for one epoch
    print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))
    train(trainloader, model, criterion, optimizer, epoch)
    lr_scheduler.step()

    if epoch > 0 and epoch % 50 == 0:
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict()
        },  filename=os.path.join('./checkpoints/', 'checkpoint.th'))

    save_checkpoint({
        'state_dict': model.state_dict()
    },  filename=os.path.join('./checkpoints/', 'model.th'))

## Attack function

In [8]:
model.load_state_dict(torch.load('./checkpoints/model.th')['state_dict'])
model.eval()
print(type(model))

<class 'torchvision.models.resnet.ResNet'>


In [15]:
def test(model, device, test_loader, criterion, attack_method, epsilon):
    assert attack_method in ['jsma'] 
    
    # Accuracy counter
    correct = 0
    total = 0
    adv_examples = []

    confidence_score_diff = 0.0
    # Loop over all examples in test set
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        
        # Set requires_grad attribute of tensor. Important for Attack
        data.requires_grad = True

        # Forward pass the data through the model
        output = model(data)
        init_pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability

        if init_pred.item() != target.item(): # initially was incorrect --> no need to generate adversary
            continue
        
        if attack_method == 'fgsm':
            loss = criterion(output, target) # loss for ground-truth class
        else:
            ll = output.min(1, keepdim=True)[1][0]
            loss = criterion(output, ll)  # Loss for least-likely class
            
        # Back propogation
        model.zero_grad()
        loss.backward()

        # Collect data_grad
        data_grad = data.grad.data

        # Call Attack
        perturbed_data = smm(model, data, epsilon, None)[0]

        # Re-classify the perturbed image
        output = model(perturbed_data)

        # Check for success
        final_pred = output.max(1, keepdim=True)[1]
        print(output)
        if final_pred.item() == target.item():
            correct += 1 # still correct
        else:
            arr = (output[0]).tolist()
            arr.sort(reverse=True)
            diff = arr[0] - arr[1]
            confidence_score_diff += diff
            # Special case for saving 0 epsilon examples
        adv_ex = perturbed_data.squeeze().detach().cpu().numpy()
        adv_examples.append((init_pred.item(), final_pred.item(), adv_ex))
        total += 1

    # Calculate final accuracy for this epsilon
    final_acc = correct/float(len(test_loader))
    print("Epsilon: {}\tTest Accuracy = {} / {} = {}".format(epsilon, correct, total, final_acc))
    
    # Calculate difference in confidence score for this epsilon
    if total - correct == 0:
        print("Epsilon: {}\tDifference in confidence score = {} / {} = {}".format(epsilon, total - correct, total, 0.0))
    else:
        final_diff = confidence_score_diff / ((total - correct) * 1.0)
        print("Epsilon: {}\tDifference in confidence score = {} / {} = {}".format(epsilon, total - correct, total, final_diff))
    
    # Return the accuracy and an adversarial example
    return final_acc, adv_examples

In [16]:
def smm(net, x, theta=1., y=None, gamma=1., clip_min=0., clip_max=1.):
  """
  The Jacobian-based Saliency Map Method (Papernot et al. 2016)
  Paper Link: https://arxiv.org/pdf/1511.07528.pdf
  Arguments
  ---------
  model : nn.Module
          The model on which the attack needs to be performed.
  x : torch.Tensor
      The input ot the model.
  y : torch.tensor, optional
      Target tensor if the attack is targetted
  theta : float, optional
          Perturbation introduced to modified components
          (can be positive or negative). Defaults to 1.
  gamma : float, optional
          Maximum percentage of perturbed features. Defaults to 1.
  clip_min : float, optional
             Minimum component value for clipping
  clip_max : float, optional
             Maximum component value for clipping
  Returns
  -------
  adv_x : torch.tensor
          The adversarial Example of the input.
  """

  if y is None:
    # TODO torch.autograd.grad doesn't support batches
    # So, revise the implementation when it does in future releases
    def random_targets(gt):
      result = gt.clone()
      classes = gt.shape[1]
      # TODO Remove the blank () after #18315 in pytorch is resolved.
      return torch.roll(result, int(torch.randint(nb_classes, ())))
    
    labels, nb_classes = get_or_guess_labels(net, x, y)
    y = random_targets(labels)
    y = y.view([1, nb_classes])
    #print (torch.argmax(y, dim=1))
  
  x.requires_grad = True
  x_adv = jsma_symbolic(x, y, net, theta, gamma, clip_min, clip_max)
  print(x_adv)
  return x_adv

In [17]:
def get_or_guess_labels(net, x, y=None):
  """
  Get the label to use in generating an adversarial example for x.
  The kwargs are fed directly from the kwargs of the attack.
  If 'y' is in kwargs, then assume its an untargetted attack and use
  that as the label.
  If 'y_target' is in kwargs and is not None, then assume it's a 
  targetted attack and use that as the label.
  Otherwise, use the model's prediction as the label and perform an 
  untargetted attack.
  Returns
  -------
  labels : torch.tensor
           Return a 1-hot vector with 1 in the position of the class predicted.
  nc : int
       # Number of classes
  """

  if y is not None:
    labels = kwargs['y']
  else:
    logits = net(x if len(x.shape) == 4 else x.unsqueeze(0))
    # TODO Remove cls, not needed, just there for debugging purpose
    pred_max, _ = torch.max(logits, dim=1)
    #print (cls)
    labels = (logits == pred_max).float()
    labels.requires_grad = False
  
  return labels, labels.size(1)

In [18]:
def jsma_symbolic(x, y, net, theta, gamma, clip_min, clip_max):
  """
  PyTorch Implementation of the JSMA (see https://arxiv.org/abs/1511.07520
  for the details about the algorithm design choices).
  Arguments
  ---------
  x : torch.tensor
    The input to the model
  y : torch.tensor
       The target tensor
  model : nn.Module
      The pytorch model
  theta : float
      delta for each feature adjustment.
  gamma : float
      a float between 0 and 1 indicating the maximum distortion
      percentage.
  clip_min : float
       minimum value for components of the example returned
  clip_max : float
       maximum value for components of the example returned.
  
  Returns
  -------
  x_adv : torch.tensor
      The adversarial example.
  """

  classes = int(y.shape[1])
  features = int(np.product(x.shape[1:]))
  #print (features)

  max_iters = np.floor(features * gamma / 2)
  increase = bool(theta > 0)

  zdiag = np.ones((features, features), int)
  np.fill_diagonal(zdiag, 0)

  # Compute the initial search domain. We optimize the initial search domain
  # by removing all features that are already at their maximum values
  # (if increasing input features -- otherwise, at their minimum value).
  if increase:
    search_domain = (x < clip_max).view(-1, features)
  else:
    search_domain = (x > clip_min).view(-1, features)
  
  # TODO remove this
  max_iters = 30
  net.eval()
  while max_iters:
    if max_iters != 30:
      x = x[0]
    logits = net(x)
    preds  = torch.argmax(logits, dim=1)
    loss = nn.CrossEntropyLoss()(logits, preds)
    
    '''
    loss.backward()
    grads = x.grad
    print (grads.shape)
    '''

    # Create the Jacobian Graph
    list_deriv = []
    for idx in range(classes):
      #print (x.requires_grad)
      deriv = grad(logits[:, idx], x, retain_graph=True)[0]
      #print (deriv[0].shape, x.shape)
      list_deriv.append(deriv)

    #print (list_deriv[0].shape)
    #grads = (torch.stack(list_deriv, dim=0).view(classes, -1, features))
    grads = torch.stack(list_deriv, dim=0).view(classes, -1, features)
    print (grads.shape)
    #'''
    
    # Compute the Jacobian components
    # To help with the computation later, reshape the target_class
    # and other_class to [nb_classes, -1, 1].
    # The last dimension is added to allow broadcasting later.
    tclass = y.view(classes, -1, 1)
    oclass = (tclass == 0).float()
    print (tclass.shape, oclass.shape)
    
    # TODO Check the dim
    a = grads * tclass
    gtarget = torch.sum(grads * tclass, dim=0)
    gother = torch.sum(grads * oclass, dim=0)
    print (gtarget.shape, gother.shape)
    print (gtarget[:1])
    print (gother[:1])
    
    # Remove the already-used input features from the search space
    # Subtract 2 times the maximum value from those so that they
    # won't be picked later
    increase_coef = (4 * int(increase) - 2) * (search_domain == 0).float()

    target_tmp = gtarget
    target_tmp -= increase_coef * torch.max(torch.abs(gtarget), dim=1)[0]
    target_sum = target_tmp.view(-1, features, 1) + target_tmp.view(-1, 1, features)

    other_tmp = gother
    other_tmp += increase_coef * torch.max(torch.abs(gother), dim=1)[0]
    other_sum = other_tmp.view(-1, features, 1) * other_tmp.view(-1, 1, features)
    print ('BOOM')

    # Create a mask to only keep features that match conditions
    if increase:
      scores_mask = ((target_sum > 0) & (other_sum < 0))
    else:
      scores_mask = ((target_sum < 0) & (other_sum > 0))

    # Create a 2D numpy array of scores for each pair of candidate features
    scores = scores_mask * (-target_sum * other_sum) *  zdiag

    # Extract the best 2 pixels
    best = torch.argmax(scores.view(-1, features*features), dim=1)

    p1 = best % features
    p2 = best // features
    p1_ohot = torch.nn.functional.one_hot(p1, num_classes=features)
    p2_ohot = torch.nn.functional.one_hot(p2, num_classes=features)

    # Check if more modification is needed
    # TODO preds is 1 hot vector in tf implementation
    mod_not_done = torch.sum(y * preds, dim=1) == 0
    cond = mod_not_done & (torch.sum(search_domain, dim=1) >= 2)

    # Update the search domain
    cond_float = cond.view(-1, 1)
    to_mod = (p1_ohot + p2_ohot) * cond_float

    #search_domain = search_domain - to_mod

    # Apply the modifications to the image
    to_mod = to_mod.view(-1, *x.shape)
    if increase:
      x = x + to_mod * theta
    else:
      x = x + to_mod * theta

    max_iters -= 1

  return x

In [19]:
accuracies = []
examples = []
epsilons = [.1, .2, .3, .4, .5]
attack_method = 'jsma'

for eps in epsilons:
    print(eps)
    acc, ex = test(model, device, testloader, criterion, attack_method, eps)
    accuracies.append(acc)
    examples.append(ex)

0.1
torch.Size([10, 1, 3072])
torch.Size([10, 1, 1]) torch.Size([10, 1, 1])
torch.Size([1, 3072]) torch.Size([1, 3072])
tensor([[ 0.3132,  0.0479, -0.1560,  ..., -0.1597,  0.0405, -0.4720]])
tensor([[-0.7688, -0.2237,  0.3955,  ...,  0.3190, -0.2477,  0.4240]])
BOOM
torch.Size([10, 1, 3072])
torch.Size([10, 1, 1]) torch.Size([10, 1, 1])
torch.Size([1, 3072]) torch.Size([1, 3072])
tensor([[ 0.3132,  0.0479, -0.1560,  ..., -0.1597,  0.0405, -0.4720]])
tensor([[-0.7688, -0.2237,  0.3955,  ...,  0.3190, -0.2477,  0.4240]])
BOOM
torch.Size([10, 1, 3072])
torch.Size([10, 1, 1]) torch.Size([10, 1, 1])
torch.Size([1, 3072]) torch.Size([1, 3072])
tensor([[ 0.3132,  0.0479, -0.1560,  ..., -0.1597,  0.0405, -0.4720]])
tensor([[-0.7688, -0.2237,  0.3955,  ...,  0.3190, -0.2477,  0.4240]])
BOOM
torch.Size([10, 1, 3072])
torch.Size([10, 1, 1]) torch.Size([10, 1, 1])
torch.Size([1, 3072]) torch.Size([1, 3072])
tensor([[ 0.3132,  0.0479, -0.1560,  ..., -0.1597,  0.0405, -0.4720]])
tensor([[-0.7688, -0

In [None]:
plt.figure(figsize=(5,5))
plt.plot(epsilons[:len(accuracies)], accuracies, "*-")
# plt.yticks(np.arange(0, 1.1, step=0.1))
# plt.xticks(np.arange(0, .35, step=0.05))
plt.title("Accuracy vs Epsilon")
plt.xlabel("Epsilon")
plt.ylabel("Accuracy")
plt.show()