# Defensive Distillation Application

This notebook presents an application example of the defensive distillation method (Papernot et al., 2016) for defending against adversarial examples. For further details, see the original paper describing the method: [https://arxiv.org/pdf/1511.04508.pdf](https://arxiv.org/pdf/1511.04508.pdf).

**References**

N. Papernot, P. McDaniel, X. Wu, S. Jha and A. Swami, "Distillation as a Defense to Adversarial Perturbations Against Deep Neural Networks," 2016 IEEE Symposium on Security and Privacy (SP), San Jose, CA, 2016, pp. 582-597.

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mspatches
from mpl_toolkits.mplot3d import Axes3D
import math
import random
import json
%matplotlib inline

import tensorflow as tf #import the tensorflow library
from tensorflow.keras import datasets, layers, models #import the keras library from tensorflow
from tensorflow import keras
import cv2

In [None]:
def convert_to_rgb(img, dim1, dim2):
    """Resize an image and replicate it three times along the depth axis.

    Args:
        img: Image array accepted by ``cv2.resize`` (presumably a 2-D
            grayscale image -- confirm against callers).
        dim1: First element of the target size handed to ``cv2.resize``
            (OpenCV interprets the size tuple as ``(width, height)``).
        dim2: Second element of the target size.

    Returns:
        A ``uint8`` array holding three depth-stacked copies of the resized
        image.
    """
    resized = cv2.resize(img, (dim1, dim2), interpolation=cv2.INTER_AREA)
    # The same resized image is used for every channel.
    stacked = np.dstack([resized] * 3)
    return np.asarray(stacked, dtype=np.uint8)

def convert_img_set_to_rgb(img_set, dim1, dim2):
    """Convert every image of a set with ``convert_to_rgb`` and stack them.

    Args:
        img_set: Sequence (or array) of images; each element is passed to
            ``convert_to_rgb`` unchanged.
        dim1: Forwarded to ``convert_to_rgb`` as its ``dim1`` argument.
        dim2: Forwarded to ``convert_to_rgb`` as its ``dim2`` argument.

    Returns:
        A single array whose first axis indexes the images of ``img_set`` and
        whose remaining axes are those returned by ``convert_to_rgb``.

    Raises:
        ValueError: If ``img_set`` is empty (nothing to stack).
    """
    rgb_images = [convert_to_rgb(img, dim1, dim2) for img in img_set]

    # Stack directly along a new leading axis instead of inserting and then
    # squeezing a dummy trailing axis, which only worked for rank-3 images.
    return np.stack(rgb_images, axis=0)


In [8]:
# Load MNIST through the Keras dataset helper.
(train_images, train_labels), (test_images, test_labels) = datasets.mnist.load_data()

# One-hot encode the integer labels: row k of the 10x10 identity matrix is the
# one-hot vector for class k, so indexing it with the label array yields an
# (N, 10) float64 matrix.
train_labels = np.eye(10)[train_labels]
test_labels = np.eye(10)[test_labels]

# Resize every image to 32x32, replicate it into three channels, and rescale
# the uint8 pixel values by 1/255.
train_images_adjusted = convert_img_set_to_rgb(train_images, 32, 32) / 255
test_images_adjusted = convert_img_set_to_rgb(test_images, 32, 32) / 255

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


## VGG 16

In [None]:
def conv2d(input_x, W, b, strides=1):
    """Apply a 'SAME'-padded 2-D convolution followed by a bias addition.

    Args:
        input_x: Input tensor handed to ``tf.nn.conv2d``.
        W: Convolution kernel tensor.
        b: Bias tensor added with ``tf.nn.bias_add``.
        strides: Stride used for both middle entries of the stride vector
            ``[1, strides, strides, 1]``.

    Returns:
        The biased convolution output tensor (no activation applied).
    """
    stride_spec = [1, strides, strides, 1]
    convolved = tf.nn.conv2d(input_x, W, strides=stride_spec, padding='SAME')
    return tf.nn.bias_add(convolved, b)

def act_relu(x):
    """Return ``tf.nn.relu`` applied to ``x``."""
    activated = tf.nn.relu(x)
    return activated

def maxpool2d(input_x, k=2):
    """Max-pool ``input_x`` with a k-by-k window, stride k and 'SAME' padding.

    Args:
        input_x: Input tensor handed to ``tf.nn.max_pool``.
        k: Used for both middle entries of the window and of the stride
            vector ``[1, k, k, 1]``.

    Returns:
        The pooled tensor.
    """
    window = [1, k, k, 1]
    # Window and stride are identical vectors.
    return tf.nn.max_pool(input_x, ksize=window, strides=list(window), padding='SAME')

def VGG_16_Temperature(input_x, weights, biases, temperature):
    """Build the forward pass of the network and scale its logits by 1/T.

    The graph consists of 13 convolution + ReLU layers (parameters 0..12),
    with a 2x2 max-pooling after the 2nd, 4th, 7th, 10th and 13th of them,
    followed by two fully connected ReLU layers (parameters 13 and 14) and a
    final linear layer (parameters 15).

    Args:
        input_x: Input tensor fed to the first convolution.
        weights: Indexable collection of 16 weight tensors, ordered as above.
        biases: Indexable collection of 16 bias tensors, same order.
        temperature: Divisor applied to the output of the last linear layer.

    Returns:
        The last layer's output divided by ``temperature``. No softmax is
        applied here.
    """
    # Zero-based indices of the convolutions that are followed by pooling.
    pool_after = (1, 3, 6, 9, 12)

    features = input_x
    for layer in range(13):
        features = act_relu(conv2d(features, weights[layer], biases[layer]))
        if layer in pool_after:
            features = maxpool2d(features, k=2)

    # Flatten to the input width expected by the first dense weight matrix.
    flat_width = weights[13].get_shape().as_list()[0]
    hidden = tf.reshape(features, [-1, flat_width])

    for layer in (13, 14):
        hidden = tf.nn.relu(tf.add(tf.matmul(hidden, weights[layer]), biases[layer]))

    logits = tf.add(tf.matmul(hidden, weights[15]), biases[15])

    # Temperature scaling of the logits.
    return tf.divide(logits, temperature)

# Graph inputs: `x` receives batches of 32x32 three-channel images and `y`
# batches of 10-element label vectors. The batch dimension is left open.
x = tf.placeholder(dtype=tf.float32, shape=(None, 32, 32, 3))
y = tf.placeholder(dtype=tf.float32, shape=(None, 10))

In [None]:
# Trainable weights, in the order consumed by VGG_16_Temperature: 13
# convolution kernels followed by 3 dense matrices. Every tensor is drawn
# from a normal distribution with stddev 0.1.
weights = [
    tf.Variable(tf.random_normal(shape, stddev=0.1))
    for shape in (
        # convolutions 1-2
        [3, 3, 3, 64],
        [3, 3, 64, 64],
        # convolutions 3-4
        [3, 3, 64, 128],
        [3, 3, 128, 128],
        # convolutions 5-7 (the last one is 1x1)
        [3, 3, 128, 256],
        [3, 3, 256, 256],
        [1, 1, 256, 256],
        # convolutions 8-10 (the last one is 1x1)
        [3, 3, 256, 512],
        [3, 3, 512, 512],
        [1, 1, 512, 512],
        # convolutions 11-13 (the last one is 1x1)
        [3, 3, 512, 512],
        [3, 3, 512, 512],
        [1, 1, 512, 512],
        # dense layers
        [512, 4096],
        [4096, 4096],
        [4096, 10],
    )
]

# Trainable biases, one per entry of `weights` and in the same order.
biases = [
    tf.Variable(tf.random_normal([width], stddev=0.1))
    for width in (64, 64,
                  128, 128,
                  256, 256, 256,
                  512, 512, 512,
                  512, 512, 512,
                  4096, 4096)
]
# NOTE(review): the output-layer bias is drawn with the default stddev of
# tf.random_normal rather than 0.1 like every other parameter -- confirm this
# is intentional.
biases.append(tf.Variable(tf.random_normal([10])))

## Training procedure

In [None]:
def _to_object_array(arrays):
    """Pack a list of differently shaped arrays into a 1-D object array.

    Building the object array explicitly avoids relying on NumPy's implicit
    ragged-array creation, which newer NumPy versions reject.
    """
    packed = np.empty(len(arrays), dtype=object)
    for index, array in enumerate(arrays):
        packed[index] = array
    return packed


def train_distilled_temperature(name_to_save, x, y, weights, biases, train_images, train_labels, test_images, test_labels, epochs, temperature, batch_size=150, learning_rate=0.000005):
    """Train the temperature-scaled network and save its parameters to disk.

    The network built by ``VGG_16_Temperature`` is trained with Adam on a
    softmax cross-entropy loss computed from the temperature-scaled logits.
    After every epoch the loss and accuracy are evaluated on the training set
    (in 10 chunks, averaged) and on the whole test set, and printed.

    After training, the weights and biases are written to
    ``<name_to_save>_<temperature>_VGG16_Weights.npy`` and
    ``<name_to_save>_<temperature>_VGG16_Bias.npy``, which is the naming
    scheme ``open_professor_weights_temperature`` reads.

    Args:
        name_to_save: Prefix of the two output file names.
        x: Placeholder fed with image batches.
        y: Placeholder fed with label batches.
        weights: List of weight variables for ``VGG_16_Temperature``.
        biases: List of bias variables for ``VGG_16_Temperature``.
        train_images: Training images, sliced along the first axis.
        train_labels: Training labels (hard or soft), aligned with
            ``train_images``.
        test_images: Test images, evaluated in a single feed.
        test_labels: Test labels aligned with ``test_images``.
        epochs: Number of passes over the training set.
        temperature: Divisor applied to the logits; also part of the output
            file names.
        batch_size: Number of samples per optimisation step.
        learning_rate: Learning rate passed to the Adam optimiser.

    Returns:
        A dict with the per-epoch metric lists ``"loss_train"``,
        ``"acc_train"``, ``"loss_test"`` and ``"acc_test"``.

    Raises:
        ValueError: If ``batch_size`` is not positive or if the number of
            training images and labels differ.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if len(train_images) != len(train_labels):
        raise ValueError("train_images and train_labels must have the same length")

    logit = VGG_16_Temperature(x, weights, biases, temperature)

    prediction = tf.nn.softmax(logit)

    loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logit))
    train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss_op)

    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    model_weights_trained = name_to_save + "_" + str(temperature) + "_VGG16_Weights"
    model_bias_trained = name_to_save + "_" + str(temperature) + "_VGG16_Bias"

    num_train = len(train_images)
    # Ceiling division so that a trailing partial batch is trained on as well.
    num_batches = (num_train + batch_size - 1) // batch_size
    eval_chunks = 10

    acc_train_list = list()
    acc_test_list = list()
    loss_train_list = list()
    loss_test_list = list()

    with tf.Session() as sess:

        # Re-initialises every variable of the default graph, so each call
        # starts from freshly drawn parameters.
        sess.run(tf.global_variables_initializer())

        for epoch in range(epochs):

            print("EPOCH: " + str(epoch+1))

            for j in range(num_batches):
                start = j * batch_size
                stop = min(start + batch_size, num_train)
                sess.run(train_op, feed_dict={x: train_images[start:stop], y: train_labels[start:stop]})

            acc_train = 0
            loss_train = 0

            # The training set is evaluated in chunks and the chunk metrics
            # are averaged.
            for j in range(eval_chunks):
                start = j * num_train // eval_chunks
                stop = (j + 1) * num_train // eval_chunks
                feed = {x: train_images[start:stop], y: train_labels[start:stop]}

                # One forward pass yields both metrics.
                chunk_loss, chunk_acc = sess.run([loss_op, accuracy], feed_dict=feed)
                loss_train += chunk_loss
                acc_train += chunk_acc

            loss_test, acc_test = sess.run([loss_op, accuracy], feed_dict={x: test_images, y: test_labels})

            acc_train_list.append(acc_train/eval_chunks)
            acc_test_list.append(acc_test)
            loss_train_list.append(loss_train/eval_chunks)
            loss_test_list.append(loss_test)

            print(" train loss: " + str(loss_train/eval_chunks) + " train acc: " + str(acc_train/eval_chunks) + " test loss: " + str(loss_test) + " test acc: " + str(acc_test))

        weights_after_train = sess.run(weights)
        biases_after_train = sess.run(biases)

    # The temperature is already part of the base names; appending it a second
    # time produced files that the loader could not find.
    np.save(model_weights_trained + ".npy", _to_object_array(weights_after_train))
    np.save(model_bias_trained + ".npy", _to_object_array(biases_after_train))

    return {
        "loss_train": loss_train_list,
        "acc_train": acc_train_list,
        "loss_test": loss_test_list,
        "acc_test": acc_test_list,
    }


In [None]:
def open_professor_weights_temperature(professor_file_name, temperature):
    """Load saved network parameters and wrap each array in a ``tf.Variable``.

    Reads ``<professor_file_name>_<temperature>_VGG16_Weights.npy`` and
    ``<professor_file_name>_<temperature>_VGG16_Bias.npy``.

    Args:
        professor_file_name: Prefix of the two ``.npy`` files.
        temperature: Value whose string form is the second part of the file
            names.

    Returns:
        A tuple ``(weights, biases)`` of lists of ``tf.Variable`` objects,
        one per stored array.
    """
    prefix = professor_file_name + '_' + str(temperature)

    # NOTE(review): allow_pickle=True unpickles arbitrary objects -- only
    # load files that come from a trusted source.
    stored_weights = np.load(prefix + '_VGG16_Weights.npy', allow_pickle=True).tolist()
    weights_trained = [tf.Variable(value) for value in stored_weights]

    stored_biases = np.load(prefix + "_VGG16_Bias.npy", allow_pickle=True).tolist()
    bias_trained = [tf.Variable(value) for value in stored_biases]

    return weights_trained, bias_trained

def get_professor_labels(professor_file_name, input_train_images, temperature):
    """Compute softmax outputs of a saved network for a set of images.

    The parameters are loaded with ``open_professor_weights_temperature``,
    the network is rebuilt on the module-level placeholder ``x``, and the
    images are evaluated in 10 consecutive chunks.

    Args:
        professor_file_name: File-name prefix of the saved parameters.
        input_train_images: Images to label, sliced along the first axis.
        temperature: Temperature used both to locate the files and to scale
            the logits before the softmax.

    Returns:
        An array with one row of softmax probabilities per input image, in
        input order.
    """
    weights_trained, bias_trained = open_professor_weights_temperature(professor_file_name, temperature)

    logit = VGG_16_Temperature(x, weights_trained, bias_trained, temperature)
    prediction = tf.nn.softmax(logit)

    total = len(input_train_images)
    soft_label_chunks = []

    with tf.Session() as sess:
        # The loaded arrays are the variables' initial values, so running the
        # initializer puts the saved parameters in place.
        sess.run(tf.global_variables_initializer())

        for chunk in range(10):
            first = chunk * total // 10
            last = min((chunk + 1) * total // 10, total)
            image_batch = input_train_images[first:last]
            soft_label_chunks.append(sess.run(prediction, feed_dict={x: image_batch}))

    return np.concatenate(soft_label_chunks, axis=0)

In [None]:
# Train the first network on the one-hot labels for 50 epochs at temperature
# 2000; its parameters are saved under the "Original" prefix.
train_distilled_temperature(
    "Original", x, y, weights, biases,
    train_images_adjusted, train_labels,
    test_images_adjusted, test_labels,
    epochs=50, temperature=2000)

In [None]:
# Soft labels produced by the network saved under the "Original" prefix.
# NOTE: despite its name, `test_labels_professor` holds the predictions for
# the *training* images.
test_labels_professor = get_professor_labels(
    professor_file_name="Original",
    input_train_images=train_images_adjusted,
    temperature=2000)

# Train again at the same temperature, this time using the soft labels as
# targets. The same `weights`/`biases` variables are passed in; the training
# function re-initialises them before training.
train_distilled_temperature(
    "Original_Distilled", x, y, weights, biases,
    train_images_adjusted, test_labels_professor,
    test_images_adjusted, test_labels,
    epochs=50, temperature=2000)