# Testing the 3 Models against Benign and FGSM/PGD Perturbed Images; AutoAttack

## Imports and GPU Check

In [1]:
#imports
#Main Libraries
import tensorflow as tf
from tensorflow import keras
from keras import datasets, layers, models
import torch
from autoattack import utils_tf2
from autoattack import AutoAttack

# #Helper Libraries (in order to interpret and view the data)
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#Prints True when TensorFlow reports at least one physical GPU, otherwise False
gpu_devices = tf.config.list_physical_devices('GPU')
print(len(gpu_devices) > 0)

True


## Importing and Scaling the Data

In [3]:
#importing the MNIST dataset from tensorflow
#load_data() returns the (images, labels) pairs for the train and test splits
#NOTE(review): this import lives outside the notebook's import cell; consider moving it there
from tensorflow.keras.datasets import mnist 
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

In [4]:
#Rescale the pixel values to floats between 0 and 1 by dividing by 255
#NOTE: the names are reassigned, so re-running this cell without reloading the data divides by 255 again
X_train = X_train / 255.0
X_test = X_test / 255.0

## Model Loading

In [5]:
#loading the base model from the saved-model path 'complete_saved_mnist_model'
#(the path is relative, so it is resolved against the current working directory)
model = tf.keras.models.load_model('complete_saved_mnist_model')

In [6]:
#loading the fgsm model
#(presumably the FGSM adversarially trained network, judging by the saved-model name — confirm)
fgsm_model = tf.keras.models.load_model('complete_saved_adv_mnist_model')

In [7]:
#loading the pgd model
#(the saved-model name suggests 40-step PGD adversarial training — confirm)
pgd_model = tf.keras.models.load_model('complete_saved_adv_pgd_40_mnist_model')

## Attack Creation

In [8]:
#Loss used to compute the input gradients for the FGSM/PGD attacks defined below
#from_logits keeps its default, so model outputs are treated as probabilities
#TODO confirm the saved models end in a softmax layer
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

def create_adv(input_image, input_label, model_type = "adv_model"):
  """Return the sign of the loss gradient with respect to the input images.

  Args:
    input_image: Batch of images to attack. It is passed through
      tf.convert_to_tensor, so NumPy arrays are accepted as well as tensors.
    input_label: Labels handed to `loss_object` together with the prediction.
    model_type: "model" selects `model` and "fgsm_model" selects `fgsm_model`;
      every other value (including the default "adv_model") selects
      `pgd_model`.

  Returns:
    tf.sign of the gradient of the loss with respect to `input_image`.
  """
  # GradientTape.watch only accepts tensors/variables, so normalise array-like
  # inputs first. For an input that is already a tensor this is a no-op.
  input_image = tf.convert_to_tensor(input_image)

  with tf.GradientTape() as tape:
    # The input is not a trainable variable, so it must be watched explicitly.
    tape.watch(input_image)
    if model_type == "model":
      prediction = model(input_image)
    elif model_type == "fgsm_model":
      prediction = fgsm_model(input_image)
    else:
      # Fall-through kept on purpose: callers rely on the default "adv_model"
      # (and any other unrecognised string) targeting pgd_model.
      prediction = pgd_model(input_image)
    loss = loss_object(input_label, prediction)

  gradient = tape.gradient(loss, input_image)
  return tf.sign(gradient)

def fgsm(input_image, input_label, eps=0.25, model_type = "adv_model"):
  """Apply a single signed-gradient step of size `eps` to `input_image`.

  The step direction comes from `create_adv` for the network selected by
  `model_type`; the result is clipped to the range [0, 1].
  """
  grad_sign = create_adv(input_image, input_label, model_type)
  stepped = input_image + grad_sign * eps
  # Keep every pixel inside the 0..1 range after the step.
  return tf.clip_by_value(stepped, 0, 1)

def pgd(input_image, input_label, num_steps=100, eps=0.25, alpha=0.01, model_type = "adv_model"):
  """Iterated attack: `num_steps` fgsm steps of size `alpha`, each followed by
  clipping the accumulated perturbation to [-eps, eps] around `input_image`.

  Returns `input_image` unchanged when `num_steps` is 0.
  """
  current = input_image
  for _ in range(num_steps):
    # One small step (fgsm also clips the stepped image to [0, 1]).
    stepped = fgsm(current, input_label, alpha, model_type)
    # Project the total perturbation back into the eps-box around the original.
    delta = tf.clip_by_value(stepped - input_image, -eps, eps)
    current = input_image + delta

  return current

## Testing the 3 models against benign, FGSM Perturbed, and PGD Perturbed Images

### Benign Test

In [9]:
#Base Model on the unperturbed test set
#(the list shown below is what evaluate returns — presumably [loss, accuracy], depending on how the model was compiled)
model.evaluate(X_test, Y_test)



[0.06955765187740326, 0.9876999855041504]

In [10]:
#FGSM Model on the unperturbed test set
fgsm_model.evaluate(X_test, Y_test)



[0.05805297940969467, 0.9876999855041504]

In [11]:
#PGD Model on the unperturbed test set
pgd_model.evaluate(X_test, Y_test)



[0.06305889785289764, 0.9796000123023987]

### FGSM Test

Data prep for an FGSM attack

In [12]:
#Convert the test images to a tensor once and reuse it for all three attacks
#(each set of adversarial images is crafted against the model it will be evaluated on)
X_test_tensor = tf.convert_to_tensor(X_test)
#Base Model
fgsm_x = fgsm(X_test_tensor, Y_test, model_type = "model")
#FGSM Model
fgsm_x_fgsm = fgsm(X_test_tensor, Y_test, model_type = "fgsm_model")
#PGD Model (the default model_type falls through to pgd_model inside create_adv)
fgsm_x_pgd = fgsm(X_test_tensor, Y_test)

Testing

In [13]:
#Base Model on the FGSM images crafted from its own gradients (fgsm_x)
model.evaluate(fgsm_x, Y_test)



[2.346208333969116, 0.6554999947547913]

In [14]:
#FGSM Model on the FGSM images crafted from its own gradients (fgsm_x_fgsm)
fgsm_model.evaluate(fgsm_x_fgsm, Y_test)



[2.894890308380127, 0.57669997215271]

In [15]:
#PGD Model on the FGSM images crafted from its own gradients (fgsm_x_pgd)
pgd_model.evaluate(fgsm_x_pgd, Y_test)



[0.22797919809818268, 0.9277999997138977]

### PGD Test

Data Prep for a PGD attack

In [16]:
#Convert the test images to a tensor once and reuse it for all three attacks
#(pgd runs with its defaults: 100 steps, eps = 0.25, alpha = 0.01)
X_test_tensor = tf.convert_to_tensor(X_test)
#Base Model
pgd_x = pgd(X_test_tensor, Y_test, model_type = "model")
#FGSM Model
pgd_x_fgsm = pgd(X_test_tensor, Y_test, model_type = "fgsm_model")
#PGD Model (the default model_type falls through to pgd_model inside create_adv)
pgd_x_pgd = pgd(X_test_tensor, Y_test)

In [17]:
#Base Model on the PGD images crafted from its own gradients (pgd_x)
model.evaluate(pgd_x, Y_test)



[24.44923973083496, 0.003100000089034438]

In [18]:
#FGSM Model on the PGD images crafted from its own gradients (pgd_x_fgsm)
fgsm_model.evaluate(pgd_x_fgsm, Y_test)



[33.12518310546875, 0.007199999876320362]

In [19]:
#PGD Model on the PGD images crafted from its own gradients (pgd_x_pgd)
pgd_model.evaluate(pgd_x_pgd, Y_test)



[0.40323835611343384, 0.8668000102043152]

## AutoAttack

Data Preparation - converting NumPy arrays to PyTorch tensors

In [None]:
#Add a trailing channel axis: (N, H, W) -> (N, H, W, 1).
#Guarded on ndim so re-running this cell does not keep appending axes
#(a second unguarded run would yield a 5-D array and break the transpose below).
if X_test.ndim == 3:
  X_test = np.expand_dims(X_test, axis = 3)
#Reorder to channels-first (N, C, H, W), cast to float32 and move to the GPU
torch_testX = torch.from_numpy(np.transpose(X_test, (0,3,1,2))).float().cuda()
#Labels as int64 tensors on the GPU
torch_testY = torch.from_numpy( Y_test ).long().cuda()

### Base Model

In [20]:
#Wrap the base model so AutoAttack can query it with the torch tensors built above
model_adapted = utils_tf2.ModelAdapter(model)
#Standard Linf AutoAttack with eps = 0.3
#NOTE(review): the fgsm/pgd helpers above default to eps = 0.25 — confirm the different budget is intended
adversary = AutoAttack(
    model_adapted,
    norm='Linf',
    eps=0.3,
    version='standard',
    is_tf_model=True,
)
#Run the evaluation in batches of 200 (x_adv is reassigned by every AutoAttack cell)
x_adv = adversary.run_standard_evaluation(torch_testX, torch_testY, bs=200)

[INFO] set data_format = 'channels_last'
setting parameters for standard version
using standard version including apgd-ce, apgd-t, fab-t, square.
initial accuracy: 98.77%
apgd-ce - 1/50 - 200 out of 200 successfully perturbed
apgd-ce - 2/50 - 200 out of 200 successfully perturbed
apgd-ce - 3/50 - 200 out of 200 successfully perturbed
apgd-ce - 4/50 - 200 out of 200 successfully perturbed
apgd-ce - 5/50 - 200 out of 200 successfully perturbed
apgd-ce - 6/50 - 200 out of 200 successfully perturbed
apgd-ce - 7/50 - 200 out of 200 successfully perturbed
apgd-ce - 8/50 - 200 out of 200 successfully perturbed
apgd-ce - 9/50 - 200 out of 200 successfully perturbed
apgd-ce - 10/50 - 200 out of 200 successfully perturbed
apgd-ce - 11/50 - 200 out of 200 successfully perturbed
apgd-ce - 12/50 - 200 out of 200 successfully perturbed
apgd-ce - 13/50 - 200 out of 200 successfully perturbed
apgd-ce - 14/50 - 200 out of 200 successfully perturbed
apgd-ce - 15/50 - 200 out of 200 successfully perturbe

### FGSM Model

In [21]:
#Wrap the FGSM model so AutoAttack can query it with the torch tensors built above
model_adapted = utils_tf2.ModelAdapter(fgsm_model)
#Standard Linf AutoAttack with eps = 0.3
#NOTE(review): the fgsm/pgd helpers above default to eps = 0.25 — confirm the different budget is intended
adversary = AutoAttack(
    model_adapted,
    norm='Linf',
    eps=0.3,
    version='standard',
    is_tf_model=True,
)
#Run the evaluation in batches of 200 (x_adv is reassigned by every AutoAttack cell)
x_adv = adversary.run_standard_evaluation(torch_testX, torch_testY, bs=200)

[INFO] set data_format = 'channels_last'
setting parameters for standard version
using standard version including apgd-ce, apgd-t, fab-t, square.
initial accuracy: 98.77%
apgd-ce - 1/50 - 200 out of 200 successfully perturbed
apgd-ce - 2/50 - 200 out of 200 successfully perturbed
apgd-ce - 3/50 - 200 out of 200 successfully perturbed
apgd-ce - 4/50 - 200 out of 200 successfully perturbed
apgd-ce - 5/50 - 200 out of 200 successfully perturbed
apgd-ce - 6/50 - 200 out of 200 successfully perturbed
apgd-ce - 7/50 - 200 out of 200 successfully perturbed
apgd-ce - 8/50 - 200 out of 200 successfully perturbed
apgd-ce - 9/50 - 200 out of 200 successfully perturbed
apgd-ce - 10/50 - 200 out of 200 successfully perturbed
apgd-ce - 11/50 - 200 out of 200 successfully perturbed
apgd-ce - 12/50 - 200 out of 200 successfully perturbed
apgd-ce - 13/50 - 200 out of 200 successfully perturbed
apgd-ce - 14/50 - 200 out of 200 successfully perturbed
apgd-ce - 15/50 - 200 out of 200 successfully perturbe

### PGD Model

In [None]:
#Wrap the PGD model so AutoAttack can query it with the torch tensors built above
model_adapted = utils_tf2.ModelAdapter(pgd_model)
#Standard Linf AutoAttack with eps = 0.3
#NOTE(review): the fgsm/pgd helpers above default to eps = 0.25 — confirm the different budget is intended
adversary = AutoAttack(
    model_adapted,
    norm='Linf',
    eps=0.3,
    version='standard',
    is_tf_model=True,
)
#Run the evaluation in batches of 200 (x_adv is reassigned by every AutoAttack cell)
x_adv = adversary.run_standard_evaluation(torch_testX, torch_testY, bs=200)

using standard version including apgd-ce, apgd-t, fab-t, square.
initial accuracy: 97.96%
apgd-ce - 1/49 - 41 out of 200 successfully perturbed
apgd-ce - 2/49 - 48 out of 200 successfully perturbed
apgd-ce - 3/49 - 50 out of 200 successfully perturbed
apgd-ce - 4/49 - 43 out of 200 successfully perturbed
apgd-ce - 5/49 - 49 out of 200 successfully perturbed
apgd-ce - 6/49 - 50 out of 200 successfully perturbed
apgd-ce - 7/49 - 58 out of 200 successfully perturbed
apgd-ce - 8/49 - 61 out of 200 successfully perturbed
apgd-ce - 9/49 - 50 out of 200 successfully perturbed
apgd-ce - 10/49 - 55 out of 200 successfully perturbed
apgd-ce - 11/49 - 46 out of 200 successfully perturbed
apgd-ce - 12/49 - 55 out of 200 successfully perturbed
apgd-ce - 13/49 - 50 out of 200 successfully perturbed
apgd-ce - 14/49 - 57 out of 200 successfully perturbed
apgd-ce - 15/49 - 47 out of 200 successfully perturbed
apgd-ce - 16/49 - 49 out of 200 successfully perturbed
apgd-ce - 17/49 - 52 out of 200 success