In [None]:
# @authors: Raj Vardhan

In [1]:
import tensorflow as tf 
import pickle
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from cleverhans.attacks import SaliencyMapMethod,FastGradientMethod,CarliniWagnerL2,DeepFool
from cleverhans.utils import other_classes, set_log_level
from cleverhans.utils import pair_visual, grid_visual, AccuracyReport
from cleverhans.utils_mnist import data_mnist
from cleverhans.utils_tf import model_train, model_eval, model_argmax
from cleverhans.utils_keras import KerasModelWrapper, cnn_model
from cleverhans.model import *

Using TensorFlow backend.


In [2]:
### Imports used to load the data and the Keras model
import os
import pickle
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import tensorflow as tf
import pickle
import datetime

from keras.models import load_model
from keras.utils import to_categorical

In [3]:
# --- Experiment selection ---------------------------------------------------
role, dataset = 'adversary', 'twitter'
attack_method, target_type = 'jsma', 'target_next'
adversarial_examples_already_created = True

# --- Locations on disk ------------------------------------------------------
directory = "./twitter_data_new/"
model_name = 'model_twitter.h5'
model_dir = '{}model/'.format(directory)

# <directory><role>/<dataset>/<target_type>/<attack_method>/  (trailing slash kept)
attack_directory = directory + '/'.join((role, dataset, target_type, attack_method)) + '/'


In [4]:
n_classes = 2

# Load the trained Keras network from model_dir.
# TODO: store the model as a ckpt file instead.
print('\nLoading DNN Model')
DNNmodel = load_model(model_dir + model_name)

# get_weights() is consumed as [kernel1, bias1, kernel2, bias2, kernel3, bias3];
# the shapes printed by the next cell agree with that layout.
# W = {'weights1': ..., 'weights2': ..., 'weights3': ..., 'biases1': ..., 'biases2': ..., 'biases3': ...}
WD = DNNmodel.get_weights()
W = {'weights{}'.format(layer + 1): WD[2 * layer] for layer in range(3)}
W.update({'biases{}'.format(layer + 1): WD[2 * layer + 1] for layer in range(3)})


Loading DNN Model


In [5]:
# Print the shape of each kernel/bias array in the order they are stored in WD.
for position in range(6):
    prefix = 'weights' if position % 2 == 0 else 'biases'
    print('{}{}: {}'.format(prefix, position // 2 + 1, WD[position].shape))


weights1: (28, 20)
biases1: (20,)
weights2: (20, 20)
biases2: (20,)
weights3: (20, 2)
biases3: (2,)


In [8]:
# Load the stored samples for the selected role and split them by class.
# The adversary works on the test split, the defender on the training split.
print('\nLoading All malicious and benign Samples')

if role == 'adversary':
    split = 'test'
elif role == 'defender':
    split = 'train'
else:
    # Previously an unknown role fell through and failed later with a NameError.
    raise ValueError("role must be 'adversary' or 'defender', got {!r}".format(role))

x_input = np.load(directory + 'x_{}.npy'.format(split))
y_input = np.load(directory + 'y_{}.npy'.format(split))

# y_input is one-hot encoded; recover the integer class of every sample.
y_class = np.argmax(y_input, axis=1)

# Class 1 = malicious.
ind_mal = np.where(y_class == 1)[0]
x_mal = x_input[ind_mal]
y_mal = np.ones(x_mal.shape[0])
y_mal = to_categorical(y_mal, num_classes=n_classes)

# Class 0 = benign. num_classes must be given explicitly: with only zeros in the
# label vector, to_categorical would infer a single class and return shape (N, 1).
ind_ben = np.where(y_class == 0)[0]
x_ben = x_input[ind_ben]
y_ben = np.zeros(x_ben.shape[0])
y_ben = to_categorical(y_ben, num_classes=n_classes)

no_of_col = x_input.shape[1]


Loading All malicious and benign Samples


In [None]:
# Adversarial examples produced earlier for this role/dataset/target/attack combination.
# attack_directory already ends with a separator, so join instead of adding another '/'.
xadv = np.load(os.path.join(attack_directory, 'xadvclev_twitter.npy'))

In [9]:
# Perturbation direction: adversarial generation or detection?
#   generation => features change from 0 to 1; target is BENIGN  (one_hot_target[0, 0] = 1)
#   detection  => features change from 1 to 0; target is MALWARE (one_hot_target[0, 1] = 1)
increase = True
#increase = False

# One-hot target row: column 0 = BENIGN, column 1 = MALWARE.
one_hot_target = np.array([[1, 0]], dtype=np.float32)

In [10]:
# Lower / upper bound used when deciding which features can still be changed.
clip_min, clip_max = 0, 1

In [11]:

# Number of samples evaluated per session run.
batch_size = 10

# Keep only as many benign samples as fill whole batches; the remainder is dropped.
# (The batch size used to be repeated here as a literal 10.)
max_count = x_ben.shape[0] - x_ben.shape[0] % batch_size
x_in = x_ben[:max_count, :]

# To analyse the adversarial samples instead, use:
# max_count = xadv.shape[0]
# x_in = xadv[:max_count,:]

# First batch; the graph input x is created from it.
x_in_batch = x_in[0:batch_size, :]

In [12]:
# Sanity check: (rows kept in x_in, number of feature columns).
x_in.shape

(3850, 28)

In [13]:
print('\nConstruction ATD graph')
sess = tf.Session()

# Graph input. It used to be a tf.Variable built from x_in_batch, which pins the
# input to exactly batch_size rows, so later cells that feed the whole of x_in
# are rejected for a shape mismatch. A placeholder with a free batch dimension
# accepts any number of rows; the default keeps x_in_batch as the value used
# when nothing is fed.
x = tf.placeholder_with_default(
    np.asarray(x_in_batch, dtype=np.float32),
    shape=[None, x_in_batch.shape[1]])

### Construct TensorFlow graph
def _dense_from_weights(inputs, scope, kernel, bias, activation=None, name=None):
    """Add one dense layer inside variable scope `scope`, initialised from `kernel` and `bias`.

    The layer width is taken from the kernel's second dimension so it always
    matches the loaded weights.
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        return tf.layers.dense(inputs,
                               units=kernel.shape[1],
                               activation=activation,
                               name=name,
                               kernel_initializer=tf.constant_initializer(kernel),
                               bias_initializer=tf.constant_initializer(bias))


def model(x):
    """Rebuild the loaded network as a TensorFlow graph and return its softmax output.

    Args:
        x: input tensor with one sample per row; it is cast to float32.

    Returns:
        The tensor of class probabilities (softmax over the logits), named 'ybar'.

    The three dense layers live in the variable scopes 'mlp0', 'mlp1' and 'mlp2'
    (ReLU, ReLU, linear 'logits') and are initialised from the module-level
    dictionary W. The scopes use AUTO_REUSE, so calling model() again reuses the
    same variables. No dropout is applied.
    """
    x = tf.cast(x, tf.float32)
    z = _dense_from_weights(x, 'mlp0', W['weights1'], W['biases1'], activation=tf.nn.relu)
    z = _dense_from_weights(z, 'mlp1', W['weights2'], W['biases2'], activation=tf.nn.relu)
    logits = _dense_from_weights(z, 'mlp2', W['weights3'], W['biases3'], name='logits')

    # Probabilities, not logits, are returned; the gradients taken later are
    # therefore gradients of the softmax output.
    y = tf.nn.softmax(logits, name='ybar')
    return y



Construction ATD graph


In [14]:
# Build the symbolic gradients of every class output with respect to the input,
# then split them into the target-class part and the sum over the other classes.
from cleverhans.compat import reduce_mean, reduce_sum

nb_features = x_in.shape[1]
nb_classes = n_classes   # single source of truth for the class count
tf_dtype = tf.float32

pred = model(x)

# One gradient tensor per class: dF(x)_i / dx_j
list_derivatives = []
for class_ind in range(nb_classes):
    derivatives = tf.gradients(pred[:, class_ind], x)
    list_derivatives.append(derivatives[0])

# shape of grads is (nb_classes, batch_size, nb_features)
grads = tf.reshape(tf.stack(list_derivatives), shape=[nb_classes, -1, nb_features])

y_target = one_hot_target

# target_class is 1 at the target class and 0 elsewhere, laid out as
# (nb_classes, n_targets, 1) so it broadcasts against grads.
target_class = tf.reshape(tf.transpose(y_target, perm=[1, 0]), shape=[nb_classes, -1, 1])
# NOTE: this rebinds other_classes and hides the helper of the same name
# imported from cleverhans.utils at the top of the notebook.
other_classes = tf.cast(tf.not_equal(target_class, 1), dtype=np.float32)

# Both results have shape (batch_size, nb_features).
grads_target = reduce_sum(grads * target_class, axis=0)
grads_other = reduce_sum(grads * other_classes, axis=0)

In [15]:
# Run the initializers of all graph variables (the dense layers are set from W by their constant initializers).
sess.run(tf.global_variables_initializer())

In [16]:
# Number of samples that will be processed in batches below.
x_in.shape[0]

3850

In [18]:
# Evaluate the class gradients for every row of x_in, one fixed-size batch at a time.
startTime = datetime.datetime.now()

max_count = x_in.shape[0]

# (nb_classes, n_samples, nb_features), filled batch by batch.
grads_res = np.zeros((nb_classes, max_count, x_in.shape[1]))

for batch_no, start in enumerate(range(0, max_count, batch_size), 1):
    end = min(start + batch_size, max_count)
    n_valid = end - start
    batch = x_in[start:end, :]
    if n_valid < batch_size:
        # x is built from a batch of batch_size rows, so a short final batch is
        # zero-padded to full size and the padded rows are discarded afterwards.
        # Each sample's gradient depends only on its own row (the model is a
        # stack of dense layers), so padding does not change the kept rows.
        # Previously such a batch was skipped and its rows stayed all-zero.
        filler = np.zeros((batch_size - n_valid, x_in.shape[1]), dtype=batch.dtype)
        batch = np.concatenate([batch, filler], axis=0)
    print('batch {}'.format(batch_no))
    grads_res[:, start:end, :] = sess.run(grads, feed_dict={x: batch})[:, :n_valid, :]

endTime = datetime.datetime.now()
diffTime = endTime - startTime
print('Time taken: {}'.format(diffTime.total_seconds()))
#np.save('data//grads_res.npy', grads_res)

batch 1.0
batch 2.0
batch 3.0
batch 4.0
batch 5.0
batch 6.0
batch 7.0
batch 8.0
batch 9.0
batch 10.0
batch 11.0
batch 12.0
batch 13.0
batch 14.0
batch 15.0
batch 16.0
batch 17.0
batch 18.0
batch 19.0
batch 20.0
batch 21.0
batch 22.0
batch 23.0
batch 24.0
batch 25.0
batch 26.0
batch 27.0
batch 28.0
batch 29.0
batch 30.0
batch 31.0
batch 32.0
batch 33.0
batch 34.0
batch 35.0
batch 36.0
batch 37.0
batch 38.0
batch 39.0
batch 40.0
batch 41.0
batch 42.0
batch 43.0
batch 44.0
batch 45.0
batch 46.0
batch 47.0
batch 48.0
batch 49.0
batch 50.0
batch 51.0
batch 52.0
batch 53.0
batch 54.0
batch 55.0
batch 56.0
batch 57.0
batch 58.0
batch 59.0
batch 60.0
batch 61.0
batch 62.0
batch 63.0
batch 64.0
batch 65.0
batch 66.0
batch 67.0
batch 68.0
batch 69.0
batch 70.0
batch 71.0
batch 72.0
batch 73.0
batch 74.0
batch 75.0
batch 76.0
batch 77.0
batch 78.0
batch 79.0
batch 80.0
batch 81.0
batch 82.0
batch 83.0
batch 84.0
batch 85.0
batch 86.0
batch 87.0
batch 88.0
batch 89.0
batch 90.0
batch 91.0
batch 92

In [19]:
# Gradient-times-input for one class: class 0 is BENIGN in the target encoding.
class_ind = 0
elem_wise_prod = grads_res[class_ind] * x_in

In [20]:
# grads_sorted_ind[j] holds the feature indices k of sample j in decreasing order of
# elem_wise_prod[j, k], i.e. of gradient * input value (not of the raw gradient).
# A stable sort of the negated values gives the same order as
# sorted(..., reverse=True): equal scores stay in ascending feature order.
# 'mergesort' is NumPy's stable sort; astype(int) keeps the original index dtype.
grads_sorted_ind = np.argsort(-elem_wise_prod, axis=1, kind='mergesort').astype(int)

In [60]:
# remove last few values that didn't run in batch
#grads_sorted_ind = grads_sorted_ind[:19270,:]

In [21]:
# Persist the ranking for the benign run. Switch the file name to
# 'grads_sorted_ind_adv.npy' when x_in holds the adversarial samples.
# directory already ends with a separator, so join instead of adding '//'.
np.save(os.path.join(directory, 'grads_sorted_ind_benign.npy'), grads_sorted_ind)

In [21]:
# adv: preview of the ranking (first 20 samples, top 20 ranked features each).
# NOTE(review): labelled "adv", presumably from a run with x_in taken from xadv; confirm
# before relying on the stored output, since this cell is identical to the "ben" one.
grads_sorted_ind[0:20,0:20]

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19],
       [ 6,  2,  3, 13, 25, 11,  5,  9,  0, 15,  8, 21, 10, 17, 18, 19,
        20, 22, 16, 23],
       [ 6,  2,  3,  9, 13, 14, 24,  8, 25, 11,  5,  0, 10, 15, 16, 17,
        18, 19, 20, 21],
       [ 3,  6, 14,  9, 25, 24,  5,  8, 21,  0,  2, 13, 15, 16, 17, 18,
        19, 20, 10, 23],
       [ 3, 14,  9,  6, 24, 23, 25,  0,  2,  5,  7,  8, 13, 15, 16, 17,
        18, 19, 20, 21],
       [ 3, 14,  6, 25,  9,  4,  8, 23,  0, 21,  2, 13, 15, 16, 17, 18,
        19, 20, 26, 10],
       [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19],
       [ 3,  2, 19, 14,  0,  6,  7,  8,  9, 10, 11, 12, 13, 15, 16, 17,
        18, 20, 21, 22],
       [ 3, 14, 15,  6,  9, 23, 24, 25,  8,  0,  2,  5, 13, 17, 18, 19,
        20, 21, 22, 10],
       [ 3, 25,  6, 14, 13, 22, 21,  2,  8, 16, 24, 11, 12, 15, 17, 19,
        20, 10, 23,  1],
       [ 3, 14,  0,  2,  5,  6

In [22]:
# ben: preview of the ranking (first 20 samples, top 20 ranked features each)
# for the run where x_in holds the benign samples.
grads_sorted_ind[0:20,0:20]

array([[ 6, 25,  9, 24,  0,  8, 23,  3,  2, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 27],
       [ 6,  9, 25,  3, 21, 24,  0, 17,  8, 14,  5,  2,  1, 15, 16, 19,
        20, 27, 10, 13],
       [ 6, 14, 15, 17, 25,  3,  9, 24,  0,  8,  2, 21,  5, 19, 20, 22,
        23, 10, 12, 11],
       [25, 14,  6,  9,  0,  8, 23, 17,  3,  2,  5, 15, 16, 19, 20, 21,
        22, 24, 10, 12],
       [ 6, 14, 25,  9,  3,  8,  0, 17,  5,  2, 11, 12, 15, 16, 19, 20,
        21, 22, 24, 10],
       [ 6,  9, 17, 14, 25, 15,  8,  0,  3, 24, 21,  2,  5, 19, 20, 10,
        23, 22, 12, 27],
       [ 6, 25, 14,  9,  8, 24,  0,  3, 17,  5,  2, 15, 16, 19, 20, 21,
        22, 26, 10, 23],
       [ 6, 25, 13,  3,  0,  9, 17,  5, 11,  2,  8, 21, 10, 15, 16, 19,
        20, 22, 24, 23],
       [ 6,  3, 15, 14, 17,  0, 25,  9,  8, 24,  5, 21,  2, 19, 20, 10,
        23, 22, 12, 11],
       [ 6, 21,  9,  3, 25,  0,  5,  2,  7,  8, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20],
       [ 6, 25, 13, 17,  0, 15

In [None]:
from cleverhans.compat import reduce_max

# Initial search domain: 1 for every feature that can still move in the chosen
# direction, 0 for features already at their bound (clip_max when increasing,
# clip_min when decreasing).
if increase:
    print('Features will be increased')
    still_adjustable = x < clip_max
else:
    print('Features will be decreased')
    still_adjustable = x > clip_min
search_domain = tf.reshape(tf.cast(still_adjustable, tf_dtype), [-1, nb_features])
domain_in = search_domain

# Ones everywhere except the diagonal; used to rule out pairing a feature with itself.
off_diagonal_ones = 1 - np.eye(nb_features, dtype=int)
zero_diagonal = tf.constant(off_diagonal_ones, tf_dtype)

# +2 / -2 (increase / decrease) on features outside the search domain, 0 elsewhere.
outside_domain = tf.cast(tf.equal(domain_in, 0), tf_dtype)
increase_coef = (4 * int(increase) - 2) * outside_domain

# Shift the gradients of excluded features by twice the row's largest absolute
# gradient, so any pair containing one of them fails the sign test below.
target_tmp = grads_target - increase_coef * reduce_max(
    tf.abs(grads_target), axis=1, keepdims=True)
other_tmp = grads_other + increase_coef * reduce_max(
    tf.abs(grads_other), axis=1, keepdims=True)

# Pairwise sums over all feature pairs: shape (batch_size, nb_features, nb_features).
target_sum = (tf.reshape(target_tmp, shape=[-1, nb_features, 1])
              + tf.reshape(target_tmp, shape=[-1, 1, nb_features]))
other_sum = (tf.reshape(other_tmp, shape=[-1, nb_features, 1])
             + tf.reshape(other_tmp, shape=[-1, 1, nb_features]))

# Keep only the pairs whose combined gradients have the required signs.
if increase:
    scores_mask = tf.logical_and(target_sum > 0, other_sum < 0)
else:
    scores_mask = tf.logical_and(target_sum < 0, other_sum > 0)

In [None]:
# Score every feature pair: pairs rejected by scores_mask and the diagonal score 0.
pair_is_valid = tf.cast(scores_mask, tf_dtype)
scores = pair_is_valid * (-target_sum * other_sum) * zero_diagonal

# Best pair per sample, as a flat index into the nb_features x nb_features grid.
flat_scores = tf.reshape(scores, shape=[-1, nb_features * nb_features])
best = tf.argmax(flat_scores, axis=1)

# Decode the flat index: p1 is the column, p2 the row; both have shape (batch_size,).
p1 = tf.mod(best, nb_features)
p2 = tf.floordiv(best, nb_features)
p1_2 = tf.stack([p1, p2])

In [None]:
# Re-run the variable initializers; the dense layers are reset to the constants taken from W.
sess.run(tf.global_variables_initializer())

In [None]:
# x was created from a single batch of batch_size rows, so the whole of x_in
# cannot be fed in one call. Evaluate batch by batch and join the results along
# the sample axis. x_in holds a whole number of batches (it was trimmed when it
# was built), so no rows are left over.
batch_starts = range(0, x_in.shape[0] - batch_size + 1, batch_size)
p1_2_res = np.concatenate(
    [sess.run(p1_2, feed_dict={x: x_in[start:start + batch_size, :]})
     for start in batch_starts],
    axis=1)   # shape (2, n_samples): row 0 = p1, row 1 = p2

In [None]:
# Inspect the selected feature pairs: row 0 holds p1, row 1 holds p2 (stacking order of p1_2).
p1_2_res

In [None]:
# p1_res was never defined in the visible cells; the evaluated pair lives in
# p1_2_res (row 0 = p1, row 1 = p2). Unpack it and save both halves, since the
# following cells load a p1 file and a p2 file.
p1_res, p2_res = p1_2_res
np.save('data//p1_res_clean.npy', p1_res)
np.save('data//p2_res_clean.npy', p2_res)

In [None]:
# Reload the previously saved p1 indices from disk.
p1_old = np.load('data//p1_res_clean.npy')

In [None]:
# Reload the previously saved p2 indices from disk; the file must already exist.
p2_old = np.load('data//p2_res_clean.npy')

In [None]:
# Display the reloaded p1 indices.
p1_old

In [None]:
# Display the reloaded p2 indices.
p2_old

In [None]:
# Score every feature pair: pairs rejected by scores_mask and the diagonal score 0.
pair_is_valid = tf.cast(scores_mask, tf_dtype)
scores = pair_is_valid * (-target_sum * other_sum) * zero_diagonal

# Best pair per sample, as a flat index into the nb_features x nb_features grid.
flat_scores = tf.reshape(scores, shape=[-1, nb_features * nb_features])
best = tf.argmax(flat_scores, axis=1)

# Decode the flat index (p1 = column, p2 = row) and one-hot encode each feature.
p1 = tf.mod(best, nb_features)
p2 = tf.floordiv(best, nb_features)
p1_one_hot, p2_one_hot = (tf.one_hot(feature, depth=nb_features) for feature in (p1, p2))

In [None]:
# x was created from a single batch of batch_size rows, so the whole of x_in
# cannot be fed in one call. Evaluate all three tensors together, one batch per
# session run, then join the batches along the sample axis. x_in holds a whole
# number of batches (it was trimmed when it was built).
batch_outputs = [
    sess.run([grads, grads_target, grads_other],
             feed_dict={x: x_in[start:start + batch_size, :]})
    for start in range(0, x_in.shape[0] - batch_size + 1, batch_size)
]
grads_res = np.concatenate([out[0] for out in batch_outputs], axis=1)  # (nb_classes, n_samples, nb_features)
grads_t = np.concatenate([out[1] for out in batch_outputs], axis=0)    # (n_samples, nb_features)
grads_o = np.concatenate([out[2] for out in batch_outputs], axis=0)    # (n_samples, nb_features)