In this exercise we will focus on some basic adversarial attack methods: the Fast Gradient Sign Method (FGSM), the Targeted Gradient Sign Method (TGSM), the Basic Iterative Method (BIM) and Projected Gradient Descent (PGD). Before starting, make sure that you feel comfortable with the basic concept of adversarial examples for classification models, and that you have a good understanding of the four methods. Your tutor may ask you questions about them.

In [0]:
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import math

# Enable inline plotting
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

--- Create and Train a Simple MNIST CNN Classifier --- For those of you less familiar with TensorFlow and Keras, this is an opportunity to get some practice with them (notice that we are using the Keras API from within TensorFlow).

In [0]:
def build_mnist_model():
    ''' Assemble the small, not-yet-compiled CNN used to classify MNIST digits.

        Layer order: Conv, Conv, Max pooling, Dropout, Flatten, Dense, Dropout,
        Dense, softmax (the final activation layer is named 'y_pred').
        According to the original notes, training takes ~3 minutes on a normal
        laptop and reaches roughly 97% accuracy.

        The width of the last Dense layer comes from the notebook-level
        `num_classes`, which must be defined before this function is called.

        Returns:
            A keras.Sequential model expecting 28x28x1 inputs.
    '''
    # 28x28 images with a single colour channel
    input_shape = (28, 28, 1)
    act = 'relu'

    return keras.Sequential([
        layers.Conv2D(8, kernel_size=(3, 3), input_shape=input_shape, activation=act),
        layers.Conv2D(8, (3, 3), activation=act),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        layers.Flatten(),
        layers.Dense(128, activation=act),
        layers.Dropout(0.5),
        layers.Dense(num_classes),
        layers.Activation('softmax', name='y_pred'),
    ])

In [0]:
def normalize(x_train, x_test):
    ''' Normalize input to the range of [0..1]
        Apart from assisting in the convergence of the training process, this
        will also make our lives easier during the adversarial attack process

        Each array is rescaled independently and IN PLACE: it is shifted by
        its own minimum and then divided by its own (post-shift) maximum.
        The arrays are expected to be floating-point NumPy arrays (the
        calling cell casts to float32 first).

        Edge cases:
          * an array whose values are all equal has a post-shift maximum of
            0; the division is skipped so the result is all zeros instead
            of NaN
          * an empty array is returned untouched

        Args:
            x_train: training images, modified in place.
            x_test: test images, modified in place.

        Returns:
            The tuple (x_train, x_test) - the very same array objects.
    '''
    def _rescale(x):
        # min()/max() raise on zero-size arrays, and there is nothing to scale
        if x.size == 0:
            return x
        x -= x.min()
        peak = x.max()
        # peak == 0 means every value was identical; dividing would give 0/0
        if peak != 0:
            x /= peak
        return x

    return _rescale(x_train), _rescale(x_test)

In [0]:
# Dataset constants: ten digit classes, 28x28 images with one colour channel
num_classes = 10
img_rows, img_cols, img_colors = 28, 28, 1

# Fetch the MNIST train/test split through Keras
(train_images, train_labels), (test_images, test_labels) = keras.datasets.mnist.load_data()

# Cast to float32 and add the trailing channel axis that the model's
# input_shape of (28, 28, 1) requires
train_images = train_images.astype('float32').reshape(train_images.shape[0], img_rows, img_cols, 1)
test_images = test_images.astype('float32').reshape(test_images.shape[0], img_rows, img_cols, 1)

# Bring pixel values into [0..1]
train_images, test_images = normalize(train_images, test_images)

# One-hot encode the integer class labels
train_labels = keras.utils.to_categorical(train_labels, num_classes)
test_labels = keras.utils.to_categorical(test_labels, num_classes)

The classifier might take a few minutes to train, but it should reach quite a high accuracy.

In [0]:
# Train the model
# Hyper-parameters for the training run
batch_size = 128
maxepoches = 12
learning_rate = 0.1
lr_decay = 1e-6  # NOTE(review): not referenced anywhere else in this cell
lr_drop = 20

# Create a session and register it with Keras *before* the model is built,
# so the model's variables are created in this session.
# NOTE(review): tf.Session looks like the TensorFlow 1.x API - confirm the
# TensorFlow version this notebook is meant to run on.
sess = tf.Session()
keras.backend.set_session(sess)

def lr_scheduler(epoch):
    """Return the learning rate for `epoch`: halved once every `lr_drop` epochs.

    With maxepoches = 12 and lr_drop = 20 the integer division is 0 for
    every epoch of this run, so the rate stays at `learning_rate`.
    """
    return learning_rate * (0.5 ** (epoch // lr_drop))
reduce_lr = keras.callbacks.LearningRateScheduler(lr_scheduler)

model = build_mnist_model()

# The second value reported by model.evaluate() is categorical accuracy;
# TestAttack reads it as score[1].
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=[keras.metrics.CategoricalAccuracy()])

# The test split doubles as the validation set here.
history = model.fit(train_images, train_labels,
                    batch_size=batch_size,
                    epochs=maxepoches,
                    verbose=1,
                    validation_data=(test_images, test_labels),
                    callbacks=[reduce_lr])


In [0]:
def TestAttack(model, adv_images, orig_images, true_labels, target_labels=None, targeted=False):
    ''' A simple utility function for evaluating the success of an attack

        Prints the model's loss on the adversarial images, the fraction of
        images that left their source class and (for targeted attacks) the
        fraction that reached the requested target class. It then prints the
        mean perturbation size and displays one adversarial image.

        Assumes model.evaluate returns [loss, accuracy], as it does for the
        model compiled in this notebook.

        Args:
            model: classifier exposing evaluate(x, y, verbose=...).
            adv_images: adversarial images, indexed as (image, row, col, channel).
            orig_images: the unperturbed images, same shape as adv_images.
            true_labels: one-hot ground-truth labels.
            target_labels: one-hot attack targets; required when targeted is True.
            targeted: whether to also report success against target_labels.

        Raises:
            ValueError: if targeted is True but target_labels is None.
    '''
    # Validate up front so nothing is printed for a malformed call
    if targeted and target_labels is None:
        raise ValueError('target_labels must be provided when targeted=True')

    score = model.evaluate(adv_images, true_labels, verbose=0)
    print('Test loss: {:.2f}'.format(score[0]))
    print('Successfully moved out of source class: {:.2f}'.format( 1 - score[1]))

    if targeted:
        # Evaluate against the labels passed in, not a notebook-level global
        score = model.evaluate(adv_images, target_labels, verbose=0)
        print('Test loss: {:.2f}'.format(score[0]))
        print('Successfully perturbed to target class: {:.2f}'.format(score[1]))

    # Root-mean-square perturbation per image, averaged over all images
    dist = np.mean(np.sqrt(np.mean(np.square(adv_images - orig_images), axis=(1,2,3))))
    print('Mean perturbation distance: {:.2f}'.format(dist))

    # Show the image at index 10, or the last one if the batch is smaller
    index = min(10, len(adv_images) - 1)
    img = adv_images[index].reshape(28, 28)
    plt.imshow(img, cmap='gray')
    plt.show()

--- Fast Gradient Sign Method (FGSM) --- Here FGSM is implemented for you as an example. Make sure you understand the code; in particular, look at the functions keras.backend.function and keras.backend.gradients.

** The solution will also include a more efficient pure TensorFlow implementation, but you are welcome to use this approach for your solution.

In [0]:
def FastGradientSignMethod(model, images, labels, epsilon=0.3):
    ''' Fast Gradient Sign Method - move every input feature by a step of
        size epsilon in the direction of the sign of the loss gradient.

        The loss is the categorical cross-entropy between `labels` and the
        model's output, differentiated with respect to the model's input.
        Relies on the notebook-level `num_classes` for the label placeholder.

        Args:
            model: Keras model exposing `.input` and `.output` tensors.
            images: batch of input images to perturb.
            labels: one-hot labels with `num_classes` columns.
            epsilon: size of the step applied to every feature.

        Returns:
            The perturbed images (not clipped to any value range).
    '''
    backend = keras.backend

    # Symbolic graph: labels -> loss -> d(loss)/d(input)
    label_input = backend.placeholder(shape=(None, num_classes))
    loss = keras.losses.categorical_crossentropy(label_input, model.output)
    input_grads = backend.gradients(loss, model.input)
    grad_fn = backend.function([model.input, label_input], input_grads)

    # Evaluate the gradient for the supplied batch; the function returns a list
    grads = grad_fn([images, labels])[0]

    return images + epsilon * np.sign(grads)

Test out the attack: play around with the parameters and see how they influence the result, both visually and in terms of the metrics.

In [0]:
# Untargeted FGSM on the whole test set, stepping away from the true labels
adv_images = FastGradientSignMethod(model, test_images, test_labels, epsilon=0.3)
# Report how many images left their source class and show a sample image
TestAttack(model, adv_images, test_images, test_labels, targeted=False)

--- Targeted Gradient Sign Method (TGSM) --- Implement this method and test out the results.

In [0]:
def TargetedGradientSignMethod(model, images, target, epsilon=0.3):
    ''' Targeted Gradient Sign Method implementation - A targeted variant of the FGSM attack
        here we minimize the loss with respect to the target class, as opposed to maximizing the loss with respect
        to the source class

        EXERCISE STUB - the body is intentionally left for you to write.

        Args:
            model: the classifier under attack.
            images: batch of input images to perturb.
            target: one-hot labels of the classes the attack should produce
                (the calling cell builds them with to_categorical).
            epsilon: presumably the per-feature step size, as in
                FastGradientSignMethod - confirm against the reference solution.

        Returns:
            Once implemented: the adversarial images (`adv_out`).

        Raises:
            NotImplementedError: until the exercise has been completed.
    '''
    # TODO: Your code comes here
    # Fail with an explicit message rather than a NameError on `adv_out`
    raise NotImplementedError('TargetedGradientSignMethod is an exercise stub: implement it first')

In [0]:
# Draw, for every test image, a random target class that differs from its
# true class: the random offset lies in 1..num_classes-1, so adding it
# modulo num_classes can never land back on the original label.
target = (np.argmax(test_labels, axis=1) + np.random.randint(1, num_classes, size=(test_labels.shape[0]))) % num_classes
# One-hot encode the targets, matching the format of test_labels
target = keras.utils.to_categorical(target, num_classes)
# NOTE: this call fails until the TargetedGradientSignMethod exercise is implemented
adv_images = TargetedGradientSignMethod(model, test_images, target, epsilon=0.3)
# `target` is passed positionally as TestAttack's target_labels argument
TestAttack(model, adv_images, test_images, test_labels, target, targeted=True)

--- Basic Iterative Method (BIM) --- Implement this method, which is the iterative upgrade of either of the two previous attacks. As before, test the attack out and play around with the parameters.

In [0]:
def BasicIterativeMethod(model, images, labels, epsilon=0.1, iter_eps = 0.05, iterations=10, min_x=0.0, max_x=1.0, targeted=False):
    ''' Basic Iterative Method - the iterative upgrade of the (targeted or
        untargeted) gradient sign attacks.

        EXERCISE STUB - the body is intentionally left for you to write.

        The parameter roles below are inferred from their names and from the
        calling cell; confirm them against the reference solution.

        Args:
            model: the classifier under attack.
            images: batch of input images to perturb.
            labels: one-hot labels - the calling cell passes the target
                classes together with targeted=True.
            epsilon: presumably the bound on the total perturbation.
            iter_eps: presumably the step size of a single iteration.
            iterations: presumably the number of gradient steps.
            min_x, max_x: presumably the valid range of input values (the
                images in this notebook are normalized to [0..1]).
            targeted: whether `labels` are attack targets rather than true labels.

        Returns:
            Once implemented: the adversarial images (`adv_out`).

        Raises:
            NotImplementedError: until the exercise has been completed.
    '''

    # TODO: Your code comes here

    # Fail with an explicit message rather than a NameError on `adv_out`
    raise NotImplementedError('BasicIterativeMethod is an exercise stub: implement it first')

In [0]:
# Draw a fresh random target class per test image, never equal to the true
# class (the random offset lies in 1..num_classes-1, applied modulo num_classes)
target = (np.argmax(test_labels, axis=1) + np.random.randint(1, num_classes, size=(test_labels.shape[0]))) % num_classes
target = keras.utils.to_categorical(target, num_classes)
# NOTE: this call fails until the BasicIterativeMethod exercise is implemented.
# NOTE(review): epsilon=4.0 is larger than the [0..1] value range of the
# images; if epsilon is the total perturbation bound it does not constrain
# anything here - confirm this is intended.
adv_images = BasicIterativeMethod(model, test_images, target, iterations = 30, epsilon=4.0, iter_eps=0.05, targeted=True)
# `target` is passed positionally as TestAttack's target_labels argument
TestAttack(model, adv_images, test_images, test_labels, target, targeted=True)

--- Projected Gradient Descent (PGD) --- Implement this improvement on the basic BIM variant and, as usual, test it out.

After you finish the exercise, make sure you look at the solutions. They include the TensorFlow-only implementation, which is more computationally efficient (but more complex to write), and the Cleverhans implementation; Cleverhans is a well-known library for adversarial machine learning.