In [None]:
# @authors: Raj Vardhan and Vaisakh S

In [38]:
import tensorflow as tf 
import pickle
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from cleverhans.attacks import SaliencyMapMethod,FastGradientMethod,CarliniWagnerL2,DeepFool
from cleverhans.utils import other_classes, set_log_level
from cleverhans.utils import pair_visual, grid_visual, AccuracyReport
from cleverhans.utils_mnist import data_mnist
from cleverhans.utils_tf import model_train, model_eval, model_argmax
from cleverhans.utils_keras import KerasModelWrapper, cnn_model
from cleverhans.model import *

In [39]:
import pandas
import sklearn
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.layers import Dense, Dropout, Activation
import pickle
import operator
from keras.models import Sequential
from graphviz import Digraph

In [40]:
### Load data and model
import os
import pickle
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import tensorflow as tf
import pickle
import datetime

In [41]:
from keras.models import load_model
from keras.utils import to_categorical

In [42]:
# Experiment selectors. Together they determine the folder in which attack
# artifacts are stored (see attack_directory below).
role, dataset = 'adversary', 'twitter'
attack_method, target_type = 'jsma', 'target_next'

# True  -> reuse the adversarial examples already saved on disk.
# False -> regenerate them with the selected attack.
adversarial_examples_already_created = True

# Locations are built by plain string concatenation, hence the trailing slashes.
directory = "./twitter_data_new/"
model_name = 'model_twitter.h5'
model_dir = "{}model/".format(directory)

attack_directory = directory + '/'.join((role, dataset, target_type, attack_method)) + '/'

## ATTACK METHOD SPECIFIC PARAMETERS
# Carlini-Wagner L2: learning rate and iteration budget handed to the attack.
CW_LEARNING_RATE = .2
ATTACK_ITERATIONS = 100

# JSMA: values passed as `theta` and `gamma` when generating examples.
THETA_JSMA = 0.09
GAMMA_JSMA = 0.6

# Region 1: This code region is to be run for training the model. Move to next segment if model is already trained

In [None]:
# Seed NumPy's global RNG so the shuffle performed further down is repeatable.
seed = 7
np.random.seed(seed)

# Load the raw dataset (no header row in the CSV).
# The file name could be turned into a parameter.
csv_path = directory + "honeypot.csv"
df = pandas.read_csv(csv_path, header=None)

In [None]:
# Total number of CSV columns, i.e. the label column plus every feature column.
no_of_col = len(df.columns)
print('no. of columns is {}'.format(no_of_col))

In [None]:
# Column 0 holds the label; the remaining columns are the numeric features.
ds = df.values
orig_Y, orig_X = ds[:, 0], ds[:, 1:no_of_col].astype(float)

In [None]:
# Shuffle features and labels with one shared permutation so rows stay aligned.
indices = np.random.permutation(len(orig_X))
X_unscaled, Y = orig_X[indices], orig_Y[indices]

In [None]:
# Quick visual check: the first labels should be a shuffled mix of 0s and 1s.
Y[:10]

In [None]:
# Persist the shuffled arrays so later runs can reuse exactly the same ordering.
for fname, arr in (('X_unscaled_shuffled.npy', X_unscaled), ('Y_shuffled.npy', Y)):
    np.save(directory + fname, arr)

In [None]:
"""
Features #0 to #5 : 
0 - age 
1 - NumerOfFollowings 
2 - NumberOfFollowers 
3 - NumberOfTweets      [see description of feature #6 for difference]
4 - LengthOfScreenName 
5 - LengthOfDescriptionInUserProfile

Features #6 to #15: 
6 - num_tws         [less than or equal to feature #3, as it depends on the sampling period]
7 - ratio_question
8 - ratio_exclam
9 - len_tws
10 - speed_tws
11 - num_url
12 - ratio_url
13 - num_at
14 - ratio_at  [What's the ratio of tweets containing @; some samples have > 1; could be corrected]
15 - num_RT

Features #16 to #25: 
16 - ratio_RT
17 - num_uniq_at
18 - ratio_uniq_at
19 - num_reply
20 - ratio_reply
21 - num_hash
22 - ratio_hash
23 - jacc_tw
24 - jacc_url
25 - compress_ratio

Features #26-#27: 
26 - spam word ratio
27 - FollowingChangeRatio

There are total 6+10+10+2=28 features.
"""

In [None]:
# Reuse the shuffled arrays saved earlier instead of reshuffling on every run.
X_unscaled, Y = (np.load(directory + fname)
                 for fname in ('X_unscaled_shuffled.npy', 'Y_shuffled.npy'))

In [None]:
# One-hot encode the labels; later cells recover the class with argmax over
# axis 1 and treat class 1 as the malicious class.
Y_categ = to_categorical(Y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible.
x_train_unscaled, x_test_unscaled, y_train, y_test = train_test_split(
    X_unscaled, Y_categ, test_size=0.20, random_state=42, shuffle=True)

# Kept on disk because the scaler is re-fitted from it later for the
# inverse transform.
np.save(directory + "x_train_unscaled.npy", x_train_unscaled)

# Min-max scaling: fitted on the training split only, then applied to the
# test split.
sc = MinMaxScaler()
x_train = sc.fit_transform(x_train_unscaled)
x_test = sc.transform(x_test_unscaled)

# Save the scaled splits together with their one-hot labels.
for fname, arr in (('x_train.npy', x_train),
                   ('y_train.npy', y_train),
                   ('x_test.npy', x_test),
                   ('y_test.npy', y_test)):
    np.save(directory + fname, arr)

In [None]:
# Build the DNN: two 20-unit ReLU hidden layers (each followed by 50% dropout)
# and a 2-way softmax output.
# The input width is taken from the training matrix itself rather than from
# `no_of_col - 1`: `no_of_col` is reassigned later in the notebook to the
# feature count (without the label column), so a stale value would silently
# build a network with the wrong input size.
DNNmodel = Sequential([
    Dense(20, input_dim=x_train.shape[1], kernel_initializer="uniform", activation="relu"),
    Dropout(0.5),
    Dense(20, kernel_initializer="uniform", activation="relu"),
    Dropout(0.5),
    Dense(2),
    Activation("softmax"),
])

# Compile the model.
DNNmodel.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Train with a single hold-out validation set: the last 20% of the training
# data is used for validation (this is not k-fold cross-validation).
history = DNNmodel.fit(x_train, y_train, validation_split=0.20,
                       epochs=100, batch_size=256)

In [None]:
# Evaluate on the held-out test split. `score` is indexed below so that entry 1
# pairs with DNNmodel.metrics_names[1].
score = DNNmodel.evaluate(x_test, y_test, batch_size=256)
print("Test Accuracy",score)
metric_name, metric_value = DNNmodel.metrics_names[1], score[1]
print("\n{}: {:.2f}%".format(metric_name, metric_value * 100))

In [None]:
# Evaluation through the predict function: derive the confusion-matrix counts
# (class 1 is the positive class) and report accuracy, precision and recall.

y_pred = DNNmodel.predict(x_test)

# Predicted / true class index per test sample.
pred_labels = np.argmax(y_pred, axis=1)
true_labels = np.argmax(y_test, axis=1)

is_correct = pred_labels == true_labels
is_positive = true_labels == 1

# Plain Python ints so the printed output matches the loop-based version.
tp = int(np.sum(is_correct & is_positive))
tn = int(np.sum(is_correct & ~is_positive))
fn = int(np.sum(~is_correct & is_positive))
fp = int(np.sum(~is_correct & ~is_positive))


def _pct(numerator, denominator):
    """Return numerator/denominator as a percentage, or 0.0 for a zero denominator."""
    return numerator * 100 / denominator if denominator else 0.0


# Guarded ratios: with no predicted positives (or no actual positives) the
# unguarded division would raise ZeroDivisionError.
precision = _pct(tp, tp + fp)
recall = _pct(tp, tp + fn)
acc = _pct(tp + tn, tp + tn + fp + fn)

total_actual_positives = tp+fn
total_actual_negatives = fp+tn

print("Accuracy is {}%".format(acc))
print('tp: {} fp: {} tn: {} fn:{} '.format(tp,fp,tn,fn))
print("precision: {}%   recall: {}%".format(precision, recall))

print("Total true +ves: {} Total true -ves: {}".format(total_actual_positives, total_actual_negatives))

In [None]:
# Save the trained model. Create the target folder first: saving into a
# directory that does not exist fails.
import os
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
DNNmodel.save(model_dir + model_name)

## ---x----- Region 1 ends ------x------

# Region 2 begins: Here we do adversarial attack generation

In [43]:
# Load the model (Starting point)
DNNmodel = load_model(model_dir + model_name)

W = DNNmodel.get_weights()
W={'weights1': W[0], 
   'weights2': W[2],
   'weights3': W[4],
   'biases1' : W[1],
   'biases2' : W[3],    
   'biases3' : W[5]    
  }

In [44]:
n_classes = 2

# Load the stored, already-scaled data (saved as .npy arrays).
print('\nLoading All Malicious Samples from pickle file')

# The adversary attacks the test split, the defender works on the training split.
if role == 'adversary':
    x_input = np.load(directory+'x_test.npy')
    y_input = np.load(directory+'y_test.npy')
elif role == 'defender':
    x_input = np.load(directory+'x_train.npy')
    y_input = np.load(directory+'y_train.npy')
else:
    # Without this guard an unknown role either raises an unrelated NameError
    # below or, worse, silently reuses x_input left over from an earlier run.
    raise ValueError("Unknown role {!r}; expected 'adversary' or 'defender'".format(role))

# Keep only the samples whose true class is 1 (malicious).
y_class = np.argmax(y_input, axis=1)
ind_mal = np.where(y_class == 1)[0]

x_mal = x_input[ind_mal]

# One-hot labels for the malicious samples. The class count is passed
# explicitly so the width is always n_classes, even for an empty selection.
y_mal = to_categorical(np.ones(x_mal.shape[0]), num_classes=n_classes)

# From here on no_of_col is the number of features (no label column).
no_of_col = x_input.shape[1]


Loading All Malicious Samples from pickle file


In [45]:
# Sanity check: (number of malicious samples, number of features).
x_mal.shape

(4447, 28)

In [46]:
# JSMA is run ten samples per call; every other attack gets one sample per call.
batch_size = 10 if attack_method == 'jsma' else 1

# Drop the trailing remainder so the malicious set divides evenly into batches.
count = (x_mal.shape[0] // batch_size) * batch_size
x_in = x_mal[:count, :]
nb_features = x_in.shape[1]

# First batch only; used to size the tf.Variable created with the graph.
x_in_batch = x_in[:batch_size, :]

In [47]:
print('\nConstruction of tensorflow graph')
sess = tf.Session()
x = tf.Variable(x_in_batch, dtype=tf.float32)


### Construct Tensorflow graph
def model(x):
    """Rebuild the trained network as a TF graph and return its logits.

    The three dense layers are initialised from the Keras weights stored in
    the global dict `W`. No dropout layers and no softmax are added: the
    returned tensor is the raw output of the final dense layer.

    Args:
        x: input tensor; it is cast to float32 before the first layer.

    Returns:
        The output tensor of the last dense layer (named 'logits').
    """
    # (variable scope, units, activation, kernel key, bias key, layer name)
    layer_specs = (
        ('mlp0', 20, tf.nn.relu, 'weights1', 'biases1', None),
        ('mlp1', 20, tf.nn.relu, 'weights2', 'biases2', None),
        ('mlp2', 2, None, 'weights3', 'biases3', 'logits'),
    )

    net = tf.cast(x, tf.float32)
    for scope, n_units, act_fn, kernel_key, bias_key, layer_name in layer_specs:
        # AUTO_REUSE lets model() be called more than once on the same graph.
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            net = tf.layers.dense(net, units=n_units,
                                  activation=act_fn,
                                  name=layer_name,
                                  kernel_initializer=tf.constant_initializer(W[kernel_key]),
                                  bias_initializer=tf.constant_initializer(W[bias_key]))

    return net



Construction of tensorflow graph


In [48]:
#In this cell, we perform jsma attack on malware samples to convert them into benign (adversarial generation)
# Target label for the targeted attacks: class 0 (benign), one-hot encoded.
# NOTE: this is what asks cleverhans to turn the malicious samples into
# benign-looking ones when generating adversarial samples.
one_hot_target = np.array([[1, 0]], dtype=np.float32)

In [49]:
## NOTE: If the generate_np call two cells below raises an error,
##       run sess.run(tf.global_variables_initializer()) again and retry that cell.

#### Wrap the graph-building function in the cleverhans model abstraction;
#### its output is declared to be 'logits'.
atdModel = CallableModelWrapper(model, 'logits')

attack_obj = None

if attack_method == 'cwl2':
    attack_obj = CarliniWagnerL2(atdModel, back='tf', sess=sess)

    # Keyword arguments forwarded to generate_np for the CW attack.
    cw_par = {
        'binary_search_steps': 1,
        'max_iterations': ATTACK_ITERATIONS,
        'learning_rate': CW_LEARNING_RATE,
        'batch_size': 1,
        'initial_const': 10,
        'y_target': one_hot_target,
    }
elif attack_method == 'jsma':
    attack_obj = SaliencyMapMethod(atdModel, back='tf', sess=sess)
else:
    print('Unknown attack method')




In [50]:
# Run the initializer op for the global variables defined in the graph so far.
sess.run(tf.global_variables_initializer())

In [56]:
# Either generate adversarial examples with the selected attack and save them
# together with the model's predictions on them, or reload the artifacts that
# an earlier run stored under attack_directory.
if not adversarial_examples_already_created:

    # Refuse to continue with an unsupported attack: the loop below would
    # otherwise leave adv_x as all zeros and save that to disk.
    if attack_method not in ('jsma', 'cwl2'):
        raise ValueError('Unknown attack method: {!r}'.format(attack_method))

    startTime = datetime.datetime.now()

    # Filled batch by batch; x_in was trimmed to a multiple of batch_size.
    adv_x = np.zeros((count, x_in.shape[1]))
    start = 0
    end = batch_size
    while end <= count:
        print('batch {}'.format(end/batch_size))

        if attack_method == 'jsma':
            adv_x[start:end, :] = attack_obj.generate_np(
                x_in[start:end, :], theta=THETA_JSMA, gamma=GAMMA_JSMA,
                clip_min=0., clip_max=1., y_target=one_hot_target)  # num_iterations = nb_features*gamma/2
        elif attack_method == 'cwl2':
            adv_x[start:end, :] = attack_obj.generate_np(x_in[start:end, :], **cw_par)

        start = end
        end = end + batch_size

    endTime = datetime.datetime.now()
    diffTime = endTime - startTime
    print('Time taken: {}'.format(diffTime.total_seconds()))

    # Save the adversarial examples
    import os
    if not os.path.exists(attack_directory):
        os.makedirs(attack_directory)

    print('saving adv examples to ', attack_directory)
    np.save(attack_directory + '/xadvclev_twitter.npy', adv_x)

    x_adv = tf.placeholder(dtype=tf.float64, shape=(x_in.shape[0], nb_features))

    # Prediction logits (before softmax) for the adversarial samples
    pred = model(x_adv)
    y_pred = tf.nn.softmax(pred)

    sess.run(tf.global_variables_initializer())

    # Predicted class of every adversarial sample
    y_pred_res_adv = sess.run(y_pred, feed_dict={x_adv: adv_x})
    y_pred_res_sm_adv = np.argmax(y_pred_res_adv, axis=1)

    # Bind the name used below and by the reload branch. Previously this
    # branch referenced y_class_adv without ever assigning it (NameError).
    y_class_adv = y_pred_res_sm_adv

    print('saving predictions on adv examples to ', attack_directory)
    np.save(attack_directory + '/y_class_adv.npy', y_pred_res_sm_adv)

    # An attack succeeds when the adversarial sample is classified as class 0.
    attack_succ_rate = len(np.where(y_class_adv == 0)[0])*100.0/len(y_class_adv)
    print('attack success rate is {}%'.format(attack_succ_rate))

    # Indices of the samples that fooled the classifier, i.e. have predicted class 0
    ind_succ_on_target = np.where(y_class_adv == 0)[0]
    np.save(attack_directory+'/ind_succ_on_target.npy', ind_succ_on_target)

    print('saving attack success rate of adv examples to ', attack_directory)
    np.save(attack_directory + '/attack_succ_rate.npy', attack_succ_rate)

else:
    # Reload the artifacts written by a previous generation run.
    adv_x = np.load(attack_directory + '/xadvclev_twitter.npy')
    print('shape of adv_x is', adv_x.shape)

    y_class_adv = np.load(attack_directory + '/y_class_adv.npy')
    print('shape of y_class_adv is', y_class_adv.shape)

    attack_succ_rate = np.load(attack_directory + '/attack_succ_rate.npy')
    print('attack succ rate is {}%'.format(attack_succ_rate))

shape of adv_x is (4440, 28)
shape of y_class_adv is (4440,)
attack succ rate is 70.11261261261261%


Some of the analysis below may only be valid for JSMA. For instance, CW or FGSM may change all features, whereas JSMA changes only a few.


In [57]:
# Threshold above which a feature counts as changed: the value passed to the
# JSMA attack as `theta`, or 0 for any other attack.
# NOTE(review): a strict `> theta` test would miss a feature whose change was
# cut short of theta (for example by clipping at 1.0) - confirm this is intended.
theta = THETA_JSMA if attack_method == 'jsma' else 0

count = x_in.shape[0]

# diff[i] lists, left-aligned, the indices of the features changed in sample i.
# Unused slots stay -1; a row starting with -2 marks a sample with no change.
step_max = x_in.shape[1]
diff = np.full((count, step_max), -1)
for i in range(count):
    changed = np.flatnonzero(abs(x_in[i] - adv_x[i]) > theta)
    if changed.size:
        diff[i, :changed.size] = changed
    else:
        diff[i, 0] = -2


In [58]:
# Scaled feature vector of the first malicious sample given to the attack.
x_in[0]

array([3.36516583e-01, 1.56795186e-01, 4.13798193e-03, 1.10428503e-02,
       7.33333333e-01, 3.35443038e-01, 1.00000000e+00, 2.00000000e-02,
       1.00000000e-02, 5.47250616e-01, 5.40153090e-05, 2.10810811e-01,
       2.10810811e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.21455850e-02,
       9.69235751e-01, 4.43694007e-02, 4.25000000e-02, 1.20511444e-03])

In [59]:
# Adversarial counterpart of the first malicious sample (same scaled space).
adv_x[0]

array([1.00000000e+00, 1.56795189e-01, 4.13798215e-03, 1.01042852e-01,
       7.33333349e-01, 3.35443050e-01, 1.00000000e+00, 1.99999996e-02,
       9.99999978e-03, 5.47250628e-01, 5.40153087e-05, 2.10810810e-01,
       2.10810810e-01, 9.00000036e-02, 9.00000036e-02, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 9.00000036e-02, 9.00000036e-02, 3.21455859e-02,
       9.69235778e-01, 1.34369403e-01, 4.25000004e-02, 1.20511441e-03])

In [60]:
# Changed-feature indices for the first ten samples (-1 marks an unused slot).
diff[:10, :]

array([[ 0,  3, 13, 14, 21, 22, 25, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 2,  3, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 2,  3, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 3, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 3, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 3, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 0,  3, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
       [ 2,  3, 14, 19, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -

In [61]:
#### Recover the unscaled feature vectors by applying the inverse transform
from sklearn.preprocessing import StandardScaler, MinMaxScaler

x_train_unscaled = np.load(directory+"x_train_unscaled.npy")

# Re-create the scaler by fitting it on the unscaled training split again.
sc = MinMaxScaler().fit(x_train_unscaled)
x_train = sc.transform(x_train_unscaled)

# With the fitted scaler, map both sample sets back to original feature units.
# Unscaled original malicious samples
x_in_inv_scale = sc.inverse_transform(x_in)

# Unscaled adversarial samples
x_adv_inv_scale = sc.inverse_transform(adv_x)

In [62]:
# Print, in original (unscaled) units, how each changed feature differs between
# a malicious sample and its adversarial version.
# Preview at most 20 samples; the bound prevents an IndexError when fewer
# samples are available.
n_preview = min(20, diff.shape[0])
for i in range(n_preview):

    print('----Changes for sample {} are as follows----'.format(i))
    for feat in diff[i]:
        # Negative entries are padding / "no change" markers: nothing more to report.
        if feat < 0:
            break
        print('feature {}'.format(feat))
        print('changed from {} to {}\n'.format(x_in_inv_scale[i, feat], x_adv_inv_scale[i, feat]))
            
        

----Changes for sample 0 are as follows----
feature 0
changed from 38656417.0 to 114872250.0

feature 3
changed from 3870.0000000000005 to 35410.77054385841

feature 13
changed from 0.0 to 125.55000498890877

feature 14
changed from 0.0 to 0.6408725086874608

feature 21
changed from 0.0 to 108.00000429153442

feature 22
changed from 0.0 to 0.7309923954592407

feature 25
changed from 0.2514266036993309 to 0.7614266177018482

----Changes for sample 1 are as follows----
feature 2
changed from 6993.0 to 394834.2440955788

feature 3
changed from 1992.0 to 33532.770738355815

feature 13
changed from 0.0 to 125.55000498890877

feature 14
changed from 0.0 to 0.6408725086874608

----Changes for sample 2 are as follows----
feature 2
changed from 221.0 to 388062.2388363853

feature 3
changed from 427.0 to 31967.771335616708

feature 13
changed from 0.0 to 125.55000498890877

feature 14
changed from 0.0 to 0.6408725086874608

----Changes for sample 3 are as follows----
feature 3
changed from 625.0

In [24]:
# Reload the unscaled training features and the one-hot training labels.
x_train_unscaled, y_train = (np.load(directory + fname)
                             for fname in ("x_train_unscaled.npy", "y_train.npy"))

In [35]:
# First ten one-hot training labels.
y_train[:10]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [71]:
# Second malicious sample after the inverse transform (original feature units).
x_in_inv_scale[1]

array([2.16435760e+07, 7.69200000e+03, 6.99300000e+03, 1.99200000e+03,
       1.50000000e+01, 1.55000000e+02, 2.00000000e+02, 4.00000000e-02,
       6.00000000e-02, 4.59750000e+01, 1.22995802e+01, 1.99000000e+02,
       9.95000000e-01, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       5.00000000e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 5.00000000e-03, 9.67283609e-03,
       9.69853931e-01, 4.17164329e-01, 1.15000000e-01, 2.71579562e+00])

In [None]:
"""
Features #0 to #5 : 
0 - age 
1 - NumerOfFollowings 
2 - NumberOfFollowers 
3 - NumberOfTweets      [see description of feature #6 for difference]
4 - LengthOfScreenName 
5 - LengthOfDescriptionInUserProfile

Features #6 to #15: 
6 - num_tws         [less than or equal to feature #3, as it depends on the sampling period]
7 - ratio_question
8 - ratio_exclam
9 - len_tws
10 - speed_tws
11 - num_url
12 - ratio_url
13 - num_at
14 - ratio_at  [What's the ratio of tweets containing @; some samples have > 1; could be corrected]
15 - num_RT

Features #16 to #25: 
16 - ratio_RT
17 - num_uniq_at
18 - ratio_uniq_at
19 - num_reply
20 - ratio_reply
21 - num_hash
22 - ratio_hash
23 - jacc_tw
24 - jacc_url
25 - compress_ratio

Features #26-#27: 
26 - spam word ratio
27 - FollowingChangeRatio

There are total 6+10+10+2=28 features.
