Implementation of Adversarial Patch

https://arxiv.org/pdf/1712.09665.pdf

https://github.com/jhayes14/adversarial-patch

https://github.com/A-LinCui/Adversarial_Patch_Attack

The original paper tested with InceptionV3, ResNet50, Xception, VGG16, and VGG19

2 whitebox attacks, 1 blackbox attack, and a control patch of a toaster

In [None]:
import tensorflow as tf
import numpy as np
import os

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print('GPU device not found')
else:
    print('Found GPU at: {}'.format(device_name))
print("Tensorflow version: " + str(tf.__version__))

!nvidia-smi

Found GPU at: /device:GPU:0
Tensorflow version: 2.11.0
Tue Feb 28 06:58:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0    28W /  70W |   4147MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+--------------------------------

In [None]:
# Mount Drive
# Mounts Google Drive at /content/drive/ (Colab only) so the dataset below is readable.
from google.colab import drive
drive.mount("/content/drive/")
# Directory handed to flow_from_directory further down; presumably one sub-folder per class — TODO confirm layout.
data_dir = "/content/drive/My Drive/imagenet/val"
# List the parent folder as a quick check that the mount worked.
!ls "/content/drive/My Drive/imagenet"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
final_patch_0.jpeg  patch_0.png   patch_40.png	val
final_patch.jpeg    patch_20.png  patch_60.png


In [None]:
# HYPERPARAMETERS

# --- Data ---
BATCH_SIZE = 1
TARGET_SIZE = (299, 299)  # (height, width) the images are resized to
IMAGE_HEIGHT, IMAGE_WIDTH = TARGET_SIZE
PATCH_PROP = 0.05  # proportion of patch w.r.t image

# --- Attack ---
ATTACK_CLASS = 859  # 859 == toaster
TARGET_CONF_SCORE = 0.8  # optimisation stops once the attack class reaches this confidence
MAX_ITER = 200  # upper bound on optimisation steps per image
LR = 1.0

# --- Axis permutations for tf.transpose ---
CHAN_FIRST = [0, 3, 1, 2]  # channels-last  -> channels-first
CHAN_LAST = [0, 2, 3, 1]   # channels-first -> channels-last

In [None]:
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator
# One generator config shared by both subsets: pixels are multiplied by 1/255,
# images get a random rotation of up to 20 degrees, and half of the files are
# reserved for the 'validation' subset.
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255, rotation_range=20, validation_split=0.5)


def _flow_subset(subset):
    """Return a directory iterator over `data_dir` restricted to `subset`."""
    return img_gen.flow_from_directory(
        data_dir,
        target_size=TARGET_SIZE,
        batch_size=BATCH_SIZE,
        subset=subset)


print("Training data")
train_gen = _flow_subset('training')

print("Validation data")
val_gen = _flow_subset('validation')

print("Batch size       = {}".format(BATCH_SIZE))
print("Image dimensions = {} x {}".format(*TARGET_SIZE))

Training data
Found 25064 images belonging to 1000 classes.
Validation data
Found 25053 images belonging to 1000 classes.
Batch size       = 1
Image dimensions = 299 x 299


In [None]:
# Model to attack
# InceptionV3 with ImageNet weights. classifier_activation=None makes the
# network return raw logits, so callers apply softmax / log_softmax themselves.
_inception_base = tf.keras.applications.inception_v3.InceptionV3(
    include_top=True,
    weights='imagenet',
    input_tensor=None,
    input_shape=None,
    pooling=None,
    classes=1000, # Imagenet has 1000 classes
    classifier_activation=None)
_inception_base.trainable = False

# The ImageNet weights expect pixels in [-1, 1] (what
# tf.keras.applications.inception_v3.preprocess_input produces), but this
# notebook's data pipeline rescales images to [0, 1] and the attack clips to
# [0, 1]. Fold the [0, 1] -> [-1, 1] mapping into the model so every caller can
# keep passing [0, 1] images and gradients still flow back to those pixels.
_unit_range_pixels = tf.keras.Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 3))
_scaled_pixels = tf.keras.layers.Rescaling(scale=2.0, offset=-1.0)(_unit_range_pixels)
inceptionv3 = tf.keras.Model(
    _unit_range_pixels,
    _inception_base(_scaled_pixels, training=False),
    name="inceptionv3_unit_range")

inceptionv3.trainable = False

In [None]:
def initialize_patch():
    """Create a randomly initialised square adversarial patch.

    The side length is chosen so that the patch area is about PATCH_PROP of
    the IMAGE_HEIGHT x IMAGE_WIDTH image area (side is rounded down).

    Returns:
        np.ndarray of shape (1, 3, side, side) with values drawn uniformly
        from [0, 1).
    """
    target_area = IMAGE_HEIGHT * IMAGE_WIDTH * PATCH_PROP
    side = int(target_area ** 0.5)
    random_patch = np.random.rand(1, 3, side, side)
    print("Patch shape = " + str(random_patch.shape))
    return random_patch

# Starting point for the optimisation: a random channels-first patch.
patch = initialize_patch()

Patch shape = (1, 3, 66, 66)


In [None]:
def get_prob_and_class(pred_logits):
    """Turn logits into probabilities and report the top prediction.

    Only the first item of the batch is used to pick the top class.

    Args:
        pred_logits: Logits with classes along axis 1.

    Returns:
        (top_prob, top_class, probs): the probability and index of the most
        likely class for the first batch item, plus the full softmax output.
    """
    probs = tf.nn.softmax(pred_logits, axis=1)
    top_class = tf.argmax(probs, axis=1)[0]
    top_prob = probs[:, top_class][0]
    return top_prob, top_class, probs

In [None]:
# https://stackoverflow.com/questions/39465526/tensorflow-optimize-over-input-with-gradient-descent
# https://www.tensorflow.org/guide/autodiff
# https://stackoverflow.com/questions/37689423/convert-between-nhwc-and-nchw-in-tensorflow
import matplotlib.pyplot as plt

def attack(model, pred_logits, data, patch_dummy, mask):
    """Optimise the patch pixels of one image so `model` predicts ATTACK_CLASS.

    The patch is pasted onto `data` and then updated by gradient ascent on the
    log-probability of ATTACK_CLASS for the first batch item. Only pixels under
    `mask` are changed. Optimisation stops as soon as the ATTACK_CLASS
    confidence of the patched image reaches TARGET_CONF_SCORE, or after
    MAX_ITER update steps.

    Args:
        model: Callable mapping a channels-last image batch to class logits.
        pred_logits: Logits of the clean image. Not used (the stop condition is
            evaluated on the patched image); kept so existing callers work.
        data: Clean image batch, channels-last (batch, height, width, 3).
        patch_dummy: Channels-first array that is zero except for the patch.
        mask: Channels-first array, 1 where the patch sits and 0 elsewhere.

    Returns:
        Channels-last float32 tensor holding the patched image batch. Values
        are clipped to [0, 1] after every update step.
    """
    # Convert everything to float32 channels-last once, instead of transposing
    # back and forth on every iteration.
    data = tf.cast(data, tf.float32)
    mask = tf.transpose(tf.cast(mask, tf.float32), CHAN_LAST)
    patch_dummy = tf.transpose(tf.cast(patch_dummy, tf.float32), CHAN_LAST)

    # Cut a hole where the patch goes, then paste the patch into it.
    adv_img = (1.0 - mask) * data + patch_dummy

    for step in range(MAX_ITER):
        # Record only the forward pass; the gradient is taken outside the tape.
        with tf.GradientTape() as tape:
            tape.watch(adv_img)
            adv_pred_logit = model(adv_img)  # raw logits, not probabilities
            adv_pred_log_prob = tf.nn.log_softmax(adv_pred_logit, axis=1)
            attack_log_prob = adv_pred_log_prob[0, ATTACK_CLASS]

        # Confidence of the *current* patched image, reused from the same
        # forward pass rather than running the model a second time.
        attack_conf = tf.nn.softmax(adv_pred_logit, axis=1)[0, ATTACK_CLASS]

        if step % 25 == 0:
            print("Optimizing input iteration: {} | New attack conf: {}".format(step, attack_conf))

        # Written as `not <` so a NaN confidence also ends the loop.
        if not attack_conf < TARGET_CONF_SCORE:
            break

        # Gradient ascent on the attack-class log-probability, restricted to
        # the patch region and scaled by the LR hyperparameter.
        grad = tape.gradient(attack_log_prob, adv_img)
        adv_img = tf.clip_by_value(adv_img + LR * mask * grad, 0.0, 1.0)

    return adv_img

In [None]:
def get_dummy_image_with_patch(patch, data_shape):
    """Place the patch at a random spot on a blank canvas and build its mask.

    For every batch item the patch is rotated by a random multiple of 90
    degrees and written at a random location that keeps it fully inside an
    IMAGE_HEIGHT x IMAGE_WIDTH canvas. The caller's `patch` is not modified.

    Args:
        patch: Channels-first array (n, 3, patch_h, patch_w); item i is used
            for batch item i.
        data_shape: Shape of the image batch; only data_shape[0] (the batch
            size) is read.

    Returns:
        (dummy, mask, (row, col)) where `dummy` is a
        (batch, 3, IMAGE_HEIGHT, IMAGE_WIDTH) array that is zero except for the
        pasted patch, `mask` has the same shape with 1.0 over the whole patch
        rectangle and 0.0 elsewhere, and (row, col) is the top-left corner of
        the patch for the last batch item.

    Raises:
        ValueError: If the patch cannot fit inside the image in every rotation.
    """
    batch_size = data_shape[0]
    patch_h, patch_w = patch.shape[-2], patch.shape[-1]
    if max(patch_h, patch_w) > min(IMAGE_HEIGHT, IMAGE_WIDTH):
        raise ValueError(
            "Patch of size {}x{} does not fit in a {}x{} image".format(
                patch_h, patch_w, IMAGE_HEIGHT, IMAGE_WIDTH))

    # Get dummy image which we will place attack patch on.
    dummy = np.zeros((batch_size, 3, IMAGE_HEIGHT, IMAGE_WIDTH))
    mask = np.zeros_like(dummy)

    for i in range(batch_size): # for each data in batch (for jhayes14, 1)
        # Perform random # of 90 deg rotations on all channels at once,
        # without writing back into the caller's array.
        num_rots = np.random.choice(4)
        rotated = np.rot90(patch[i], num_rots, axes=(-2, -1))
        rot_h, rot_w = rotated.shape[-2], rotated.shape[-1]

        # Choose a random top-left corner; rows are bounded by the image
        # height and columns by the image width.
        patch_x = np.random.randint(0, IMAGE_HEIGHT - rot_h + 1)
        patch_y = np.random.randint(0, IMAGE_WIDTH - rot_w + 1)

        # Apply patch to dummy image
        dummy[i, :, patch_x:patch_x + rot_h, patch_y:patch_y + rot_w] = rotated
        # Mark the full rectangle, so patch pixels that happen to be exactly
        # 0.0 (e.g. after clipping) still belong to the patch.
        mask[i, :, patch_x:patch_x + rot_h, patch_y:patch_y + rot_w] = 1.0

    return dummy, mask, (patch_x, patch_y)

# Smoke test: run the helper once for a single 299x299 RGB batch shape; all three results are discarded.
_, _, _ = get_dummy_image_with_patch(patch, (1, 299, 299, 3))

In [None]:
def train(epoch, model, train_gen, patch, max_batches=200, save_every=20,
          output_dir="/content/drive/My Drive/imagenet"):
    """Refine the adversarial patch over a stream of images.

    For each image that `model` classifies correctly, the current patch is
    pasted at a random location, optimised with `attack`, and then cropped
    back out of the adversarial image to become the patch for the next image.
    Snapshots of the patch are written to `output_dir` as PNG files.

    Args:
        epoch: Epoch number. Not used by this function.
        model: Callable mapping a channels-last image batch to class logits.
        train_gen: Iterable yielding (images, one-hot labels) batches.
        patch: Channels-first starting patch, shape (1, 3, patch_h, patch_w).
        max_batches: Batch index at which the patch is saved and training
            stops.
        save_every: Save a patch snapshot whenever the batch index is a
            multiple of this value; 0 or None disables periodic snapshots.
        output_dir: Directory that receives patch_<idx>.png and
            adversarial.jpeg.

    Returns:
        None. The patch is only persisted through the files it writes.
    """
    def _save_patch(current_patch, batch_idx):
        # Write the channels-first patch to disk as a channels-last image.
        plt.imsave(
            os.path.join(output_dir, "patch_{}.png".format(batch_idx)),
            np.array(tf.transpose(current_patch, CHAN_LAST))[0])

    for batch_idx, (train_data, train_labels) in enumerate(train_gen):
        print("\nBatch index: {}".format(batch_idx))

        # Decide about stopping / snapshotting before running the model, so
        # the final batch does not pay for a forward pass it never uses.
        if batch_idx == max_batches:
            # Save patch
            print("Saving patch")
            _save_patch(patch, batch_idx)
            return
        if save_every and batch_idx % save_every == 0:
            _save_patch(patch, batch_idx)

        pred_logits = model(train_data)
        pred_prob, pred_class, _ = get_prob_and_class(pred_logits)
        # get_prob_and_class reports on the first batch item, so compare
        # against the first label as well.
        real_class = tf.argmax(train_labels, axis=1)[0]

        # Only create adversarial examples on examples that were originally classified correctly
        if pred_class != real_class:
            print("Original image was NOT classified correctly.")
            continue

        # Get dummy image and mask, both with patch
        patch_dummy, mask, patch_loc = get_dummy_image_with_patch(
            patch, train_data.shape)

        trained_adv_img = attack(model, pred_logits, train_data, patch_dummy, mask)

        # Compare performance of the adversarial image vs. original image
        pred_logits = model(trained_adv_img)
        adv_pred_prob, adv_pred_class, all_probs = get_prob_and_class(pred_logits)
        orig_class_prob = all_probs[:, pred_class][0]
        print("Adversarial image's predicted class: {} vs. Original image's predicted class: {}".
              format(adv_pred_class, pred_class))
        print("Adversarial image predicted class conf: {} vs. Original image predicated class conf: {}".
              format(adv_pred_prob, pred_prob))
        print("Conf of original class on adversarial image: {}".format(orig_class_prob))

        # Save image if the adversarial image was successful
        if adv_pred_class == ATTACK_CLASS:
            tf.keras.utils.save_img(
                os.path.join(output_dir, "adversarial.jpeg"),
                np.array(trained_adv_img)[0],
                data_format="channels_last")

        # Cut the optimised patch back out of the adversarial image (at the
        # location it was pasted) and carry it over to the next batch.
        off_h, off_w = patch_loc
        targ_h = patch.shape[-2]
        targ_w = patch.shape[-1]

        patch = tf.image.crop_to_bounding_box(
            trained_adv_img, off_h, off_w, targ_h, targ_w)
        patch = np.array(tf.transpose(patch, CHAN_FIRST))

In [None]:
# Run one pass of patch training against InceptionV3 on the training subset.
train(0, inceptionv3, train_gen, patch)


Batch index: 0
Optimizing input iteration: 0 | New attack conf: 0.00053295714315027
Optimizing input iteration: 25 | New attack conf: 0.025591399520635605
Adversarial image's predicted class: 859 vs. Original image's predicted class: 698
Adversarial image predicted class conf: 0.8664863109588623 vs. Original image predicated class conf: 0.09126339107751846
Conf of original class on adversarial image: 0.004368039779365063

Batch index: 1
Optimizing input iteration: 0 | New attack conf: 2.294249316037167e-05
Optimizing input iteration: 25 | New attack conf: 0.2929437458515167
Adversarial image's predicted class: 859 vs. Original image's predicted class: 7
Adversarial image predicted class conf: 0.8636278510093689 vs. Original image predicated class conf: 0.9393996596336365
Conf of original class on adversarial image: 0.11946824938058853

Batch index: 2
Optimizing input iteration: 0 | New attack conf: 6.172019493533298e-05
Optimizing input iteration: 25 | New attack conf: 0.0200094394385