In [1]:
#imports
#Main Library (Actual Neural Network Part)
import tensorflow as tf
from tensorflow import keras
from keras import datasets, layers, models
import neural_structured_learning as nsl
import torch
from autoattack import utils_tf2
from autoattack import AutoAttack

# #Helper Libraries (in order to interpret and view the data)
import matplotlib.pyplot as plt
import numpy as np

In [2]:
model = tf.keras.models.load_model('complete_saved_adv_mnist_model')

In [3]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 4)         40        
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 4)        0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 11, 11, 16)        592       
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 5, 5, 16)         0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 3, 3, 32)          4640      
                                                                 
 flatten (Flatten)           (None, 288)               0

In [15]:
base_model = tf.keras.models.load_model('complete_saved_mnist_model')

In [4]:
#Prints if a GPU is detected by the TensorFlow system
print(len(tf.config.list_physical_devices('GPU')) > 0)

True


In [5]:
#importing the MNIST dataset from tensorflow
from tensorflow.keras.datasets import mnist 
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

In [6]:
#scaling the data from 0 to 1 as float (decimal) numbers
X_train, X_test = X_train / 255.0, X_test / 255.0

In [7]:
#premiliminary testing (accuracy of benign images)
model.evaluate(x = X_test, y = Y_test)



[0.05805297940969467, 0.9876999855041504]

In [8]:
adv_config = nsl.configs.make_adv_reg_config(multiplier = 0.2, adv_step_size = 0.05)
adv_model = nsl.keras.AdversarialRegularization(model, adv_config = adv_config)

In [9]:
#compiler which configures the model
adv_model.compile(optimizer='adam',
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
              metrics = ['accuracy'])

In [10]:
adv_model.evaluate({'feature': X_test, 'label': Y_test})

  return dispatch_target(*args, **kwargs)


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


[0.07364562153816223,
 0.05796034634113312,
 0.9876999855041504,
 0.015685245394706726]

In [16]:
#Creating the adversarial attack for adversarial training
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

def create_adv(input_image, input_label, model_type = "adv_model"):
  with tf.GradientTape() as tape:
    tape.watch(input_image)
    if(model_type == "model"):
      prediction = base_model(input_image)
    else:
      prediction = model(input_image)
    loss = loss_object(input_label, prediction)

  gradient = tape.gradient(loss, input_image)
  signed_grad = tf.sign(gradient)
  return signed_grad

def fgsm(input_image, input_label, eps=0.25, model_type = "adv_model"):
  perturbation = create_adv(input_image, input_label, model_type)
  adv_image = input_image + perturbation * eps
  adv_image = tf.clip_by_value(adv_image, 0, 1)

  return adv_image

def pgd(input_image, input_label, num_steps=100, eps=0.25, alpha=0.01, model_type = "adv_model"):
  adv_image = input_image
  for i in range(num_steps):
    adv_image = fgsm(adv_image, input_label, alpha, model_type)
    perturbation = adv_image - input_image
    perturbation = tf.clip_by_value(perturbation, -eps, eps)
    adv_image = input_image + perturbation
  
  return adv_image

In [17]:
fgsm_x = fgsm(tf.convert_to_tensor(X_test), Y_test)
adv_model.evaluate({'feature': fgsm_x, 'label': Y_test})

  return dispatch_target(*args, **kwargs)




[3.601750373840332, 2.8952455520629883, 0.57669997215271, 0.706505537033081]

In [18]:
pgd_x = pgd(tf.convert_to_tensor(X_test), Y_test)
adv_model.evaluate({'feature': pgd_x, 'label': Y_test})



[39.95924758911133, 33.12449645996094, 0.007199999876320362, 6.834768295288086]

In [19]:
fgsm_x_base = fgsm(tf.convert_to_tensor(X_test), Y_test, model_type = "model")
adv_model.evaluate({'feature': fgsm_x_base, 'label': Y_test})



[0.5362680554389954,
 0.41891056299209595,
 0.90829998254776,
 0.11735693365335464]

In [20]:
pgd_x_base = pgd(tf.convert_to_tensor(X_test), Y_test, model_type = "model")
adv_model.evaluate({'feature': pgd_x_base, 'label': Y_test})



[1.0536247491836548, 0.82384192943573, 0.839900016784668, 0.22978337109088898]

In [11]:
model_adapted = utils_tf2.ModelAdapter(model)
adversary = AutoAttack(model_adapted, norm='Linf', eps= 0.15, version='standard', is_tf_model=True)



[INFO] set data_format = 'channels_last'
setting parameters for standard version


In [12]:
X_test = np.expand_dims(X_test, axis = 3)
torch_testX = torch.from_numpy(np.transpose((X_test), (0,3,1,2))).float().cuda()
torch_testY = torch.from_numpy( Y_test ).long().cuda()

In [13]:
x_adv = adversary.run_standard_evaluation(torch_testX, torch_testY, bs = 200)

using standard version including apgd-ce, apgd-t, fab-t, square.
initial accuracy: 98.77%
apgd-ce - 1/50 - 200 out of 200 successfully perturbed
apgd-ce - 2/50 - 200 out of 200 successfully perturbed
apgd-ce - 3/50 - 200 out of 200 successfully perturbed
apgd-ce - 4/50 - 200 out of 200 successfully perturbed
apgd-ce - 5/50 - 200 out of 200 successfully perturbed
apgd-ce - 6/50 - 200 out of 200 successfully perturbed
apgd-ce - 7/50 - 200 out of 200 successfully perturbed
apgd-ce - 8/50 - 200 out of 200 successfully perturbed
apgd-ce - 9/50 - 200 out of 200 successfully perturbed
apgd-ce - 10/50 - 200 out of 200 successfully perturbed
apgd-ce - 11/50 - 200 out of 200 successfully perturbed
apgd-ce - 12/50 - 200 out of 200 successfully perturbed
apgd-ce - 13/50 - 200 out of 200 successfully perturbed
apgd-ce - 14/50 - 200 out of 200 successfully perturbed
apgd-ce - 15/50 - 200 out of 200 successfully perturbed
apgd-ce - 16/50 - 200 out of 200 successfully perturbed
apgd-ce - 17/50 - 200 o