In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#### Let's load our mnist data

In [None]:
# Fetch the MNIST digit dataset through Keras as (images, labels) pairs
# for the training and test splits.
mnist = tf.keras.datasets.mnist
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Rescale pixel intensities by 1/255 (MNIST pixels are stored as 0-255).
x_train = x_train / 255.0
x_test = x_test / 255.0

# Show the shape of the training images as the cell output.
x_train.shape

#### Let's train our simple neural network to predict handwritten digits

In [None]:
# Build the classifier layer by layer: flatten each 28x28 image to a vector,
# one ReLU hidden layer, dropout, and a 10-way softmax over the digit classes.
flatten_pixels = tf.keras.layers.Flatten(input_shape=(28, 28))
hidden_layer = tf.keras.layers.Dense(512, activation=tf.nn.relu)
dropout_layer = tf.keras.layers.Dropout(0.2)
digit_probabilities = tf.keras.layers.Dense(10, activation=tf.nn.softmax)

model = tf.keras.models.Sequential(
    [flatten_pixels, hidden_layer, dropout_layer, digit_probabilities]
)

In [None]:
# Training configuration: Adam optimizer, cross-entropy on integer labels,
# and accuracy reported as the metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

# Train for five epochs on the training split.
model.fit(x=x_train, y=y_train, epochs=5)

#### Let's evaluate our model

In [None]:
# Score the trained model on the held-out test split; the result is the cell output.
model.evaluate(x=x_test, y=y_test)

We can see that our model does pretty well and achieves around **98%** accuracy.

#### Let's now extract all the images of the digit 2 that we have

In [None]:
# Positions of every training example whose label is 2.
idx_train = np.where(y_train == 2)

# Images and labels restricted to those positions.
x_train_two, y_train_two = x_train[idx_train], y_train[idx_train]

# How many images of the digit 2 we have; used as the sampling range when plotting.
num_samples = len(x_train_two)

#### Let's visualize a few random examples of the digit two

In [None]:
# One row of three panels, each showing a randomly chosen image of the digit 2.
fig, ax = plt.subplots(nrows=1, ncols=3)
fig.tight_layout()

# Draw one random index per panel, in panel order.
sample_ids = [np.random.choice(num_samples) for _ in ax]
for panel, ix in zip(ax, sample_ids):
    panel.set_title('Label is {label}'.format(label=y_train_two[ix]))
    panel.imshow(x_train_two[ix], cmap='gray')

We now need to find a way to make our original network predict these images of the digit "2" as "6". To do that we have to somehow tweak the pixels in the images of "2", but only so much that they still look like the digit "2" to the human eye. So we can sum this up with two constraints:

* We want to maximize the probability output of our original neural network that the image is the digit 6
* We want to minimize how much we change the images so that they don't change too much to the human eye. 

We'll use the idea from the following blog post: https://medium.com/@ageitgey/machine-learning-is-fun-part-8-how-to-intentionally-trick-neural-networks-b55da32b7196. We'll pick a sample image of two, pass it through our network and use a cost function to calculate the loss given that the image should be predicted as "6". We'll then calculate the gradients and tweak our image towards "6".

### Let's define a function that can give us the gradient and loss based on an image w.r.t. our preferred prediction

The code in the next two blocks is from this blog here: [link](https://medium.com/@ageitgey/machine-learning-is-fun-part-8-how-to-intentionally-trick-neural-networks-b55da32b7196)

In [None]:
# Class index whose predicted probability we want to maximize
# (the digit we want the network to be fooled into predicting).
correct_label = 6

# Symbolic tensors for the network input and the output of its last layer.
model_input = model.layers[0].input
model_output = model.layers[-1].output

# Output of the last layer at index `correct_label` for the first image in the batch.
prob_func = model_output[0, correct_label]

# Gradient of that value with respect to the input pixels (not the weights of the model).
gradients_func = tf.keras.backend.gradients(prob_func, model_input)[0]

# Callable that takes [image_batch, learning_phase] and returns [probability, gradient]
# after a forward pass through the network.
# NOTE(review): backend.gradients / backend.learning_phase are graph-mode Keras backend
# APIs - confirm that the installed TensorFlow version still supports them.
grab_cost_and_gradients_from_model = tf.keras.backend.function(
    [model_input, tf.keras.backend.learning_phase()],
    [prob_func, gradients_func],
)

In [None]:
prob = 0.0  # model's current probability that the image is a six
learning_rate = 0.1  # step size applied to the input gradient
max_iterations = 1000000  # safety cap so a non-converging attack cannot hang the kernel

# Work on a copy so x_train_two[0] stays untouched, and add a leading batch
# dimension because the model expects a batch of images.
hacked_image = np.expand_dims(np.copy(x_train_two[0]), axis=0)

# Gradient ascent on the input: keep adjusting the hacked image slightly so that it
# tricks the model more and more, until the model is at least 80% confident or the
# iteration cap is reached.
iteration = 0
while prob < 0.8 and iteration < max_iterations:

    # Check how close the image is to our target class and grab the gradients we
    # can use to push it one more step in that direction.
    # The second list entry is the Keras learning-phase flag (0 = inference).
    prob, gradients = grab_cost_and_gradients_from_model([hacked_image, 0])

    # Move the hacked image one step further towards fooling the model
    hacked_image += gradients * learning_rate
    iteration += 1

    #minimize output in notebook
    if iteration % 10000 == 0:
        print("Model's predicted likelihood that the image is a six: {:.8}%".format(prob * 100))

# `prob` is the value measured before the last update step.
if prob < 0.8:
    print("Stopped after {} iterations without reaching 80% confidence".format(iteration))
print("Model's predicted likelihood that the image is a six: {:.8}%".format(prob * 100))

### Let's plot our adversarial image

In [None]:
# Side-by-side comparison: the untouched digit, the absolute per-pixel change,
# and the image produced by the attack.
fig, ax = plt.subplots(nrows=1, ncols=3)
fig.tight_layout()

original = x_train_two[0]
adversarial = hacked_image[0]
images = [original, np.abs(original - adversarial), adversarial]
titles = ["Original image", "Delta", "Adversarial image"]

for panel, heading, picture in zip(ax, titles, images):
    panel.set_title(label=heading)
    panel.imshow(picture, cmap='gray')

Not bad at all; however, we can see that the image looks very noisy. To remedy that, we need to constrain each pixel to not change too much from its original value.

### Let's add a constraint that pixels can't change too much

The pixel constraint idea and code is also from [here](https://medium.com/@ageitgey/machine-learning-is-fun-part-8-how-to-intentionally-trick-neural-networks-b55da32b7196)

In [None]:
prob = 0.0  # model's current probability that the image is a six
learning_rate = 1  # step size applied to the input gradient
# Safety cap: with the perturbation bounded below, 80% confidence may be unreachable,
# and an unbounded loop would then never terminate.
max_iterations = 1000000

#pixel can at most change by 0.1 from its original value
max_change = x_train_two[0] + 0.1
min_change = x_train_two[0] - 0.1

# Work on a copy so x_train_two[0] stays untouched, and add a leading batch
# dimension because the model expects a batch of images.
hacked_image = np.expand_dims(np.copy(x_train_two[0]), axis=0)

# In a loop, keep adjusting the hacked image slightly so that it tricks the model more
# and more, until it gets to at least 80% confidence or the iteration cap is reached.
iteration = 0
while prob < 0.8 and iteration < max_iterations:

    # Check how close the image is to our target class and grab the gradients we
    # can use to push it one more step in that direction.
    # The second list entry is the Keras learning-phase flag (0 = inference).
    prob, gradients = grab_cost_and_gradients_from_model([hacked_image, 0])

    # Move the hacked image one step further towards fooling the model, then pull
    # every pixel back into the allowed band around its original value.
    hacked_image += gradients * learning_rate
    hacked_image = np.clip(hacked_image, min_change, max_change)
    iteration += 1

    #minimize output in notebook
    if iteration % 10000 == 0:
        print("Model's predicted likelihood that the image is a six: {:.8}%".format(prob * 100))

# `prob` is the value measured before the last update step.
if prob < 0.8:
    print("Stopped after {} iterations without reaching 80% confidence".format(iteration))
print("Model's predicted likelihood that the image is a six: {:.8}%".format(prob * 100))

### Let's plot the adversarial image with the pixel constraint

In [None]:
# Side-by-side comparison for the pixel-constrained attack: the untouched digit,
# the absolute per-pixel change, and the resulting adversarial image.
fig, ax = plt.subplots(nrows=1, ncols=3)
fig.tight_layout()

original = x_train_two[0]
adversarial = hacked_image[0]
images = [original, np.abs(original - adversarial), adversarial]
titles = ["Original image", "Delta", "Adversarial image"]

for panel, heading, picture in zip(ax, titles, images):
    panel.set_title(label=heading)
    panel.imshow(picture, cmap='gray')

We can see that this is better than before: our delta is mostly white, so not much has been added to the image. The adversarial image also has a consistently blacker background than the previous one.

#### Let's now do this for 10 images

In [None]:
learning_rate = 0.1  # step size applied to the input gradient
max_iterations = 10000  # per-image cap on the number of gradient steps

#set up original images and copies that we will tweak
num_images = 10
images_idx = np.arange(0, num_images)
original_images = x_train_two[images_idx]
hacked_images = np.copy(original_images)

#let's use a momentum gradient descent since training was pretty slow using the gradient unmodified
# NOTE(review): a momentum coefficient above 1 makes the velocity grow geometrically
# from step to step; conventional values are below 1 - confirm 1.1 is intended.
momentum = 1.1

for i in range(num_images):

    # Fresh optimizer state for every image, so nothing carries over from the previous one.
    prob = 0.0  # probability of the current image being a six
    hacked_image = np.expand_dims(np.copy(hacked_images[i]), axis=0)  # add batch dimension
    velocity = np.zeros(hacked_image.shape, dtype=np.float64)

    # Each pixel may move at most 0.1 away from its original value.
    max_change = original_images[i] + 0.1
    min_change = original_images[i] - 0.1

    # In a loop, keep adjusting the hacked image slightly so that it tricks the model more
    # and more, until it gets to at least 80% confidence or max_iterations steps were taken.
    iteration = 0
    while prob < 0.8 and iteration < max_iterations:

        # Check how close the image is to our target class and grab the gradients we
        # can use to push it one more step in that direction.
        # The second list entry is the Keras learning-phase flag (0 = inference).
        prob, gradients = grab_cost_and_gradients_from_model([hacked_image, 0])

        # Momentum update, then clip every pixel back into its allowed band.
        velocity = momentum * velocity + gradients * learning_rate
        hacked_image += velocity
        hacked_image = np.clip(hacked_image, min_change, max_change)
        iteration += 1

        #minimize output in notebook
        if iteration % 10000 == 0:
            print("Model's predicted likelihood that the image is a six: {:.8}%".format(prob * 100))

    print("Done training with a predicted likelihood that the image is  a six: {:.8}%".format(prob * 100))
    hacked_images[i] = hacked_image[0]

### Let's plot our 10 images

In [None]:
# Grid with one row per attacked image and three columns: original, delta, adversarial.
# squeeze=False keeps `ax` two-dimensional even when num_images is 1.
fig, ax = plt.subplots(nrows=num_images, ncols=3, figsize=(10, 20), squeeze=False)
plt.subplots_adjust(top = 2, bottom=1, hspace=0.5, wspace=0.4)
titles = ["Original image", "Delta", "Adversarial image"]

# Use the absolute per-pixel difference for the delta panel, matching the
# single-image plots earlier in the notebook.
plot_vec = [
    (original_images[i], np.abs(original_images[i] - hacked_images[i]), hacked_images[i])
    for i in range(num_images)
]

for i, row in enumerate(ax):
    for j, col in enumerate(row):
        col.set_title(label=titles[j])
        col.imshow(plot_vec[i][j], cmap='gray', aspect='equal')

# subplots_adjust above places the axes between figure fractions 1 and 2, i.e. outside
# the default canvas; bbox_inches='tight' makes the saved file include them.
plt.savefig('adversarial_examples.png', bbox_inches='tight')