In [None]:
# @authors: Raj Vardhan

In [None]:
import tensorflow as tf 
import pickle
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import pandas
import sklearn
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.layers import Dense, Dropout, Activation
import pickle
import operator
from keras.models import Sequential

### Load data and model
import os
import pickle
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import tensorflow as tf
import pickle
import datetime

from keras.models import load_model
from keras.utils import to_categorical

In [None]:
# Notebook-wide configuration: which dataset is used and where its files live.
dataset = 'twitter'

# Root folder of the dataset files; the trailing slash is relied on by the
# plain string concatenations used throughout the notebook.
directory = "../twitter_data_new/"

# The trained Keras model is stored in a "model/" sub-folder of the data root.
model_dir = "{}model/".format(directory)
model_name = 'model_twitter.h5'


# Region 1: This code region is to be run for training the model.

## Region 1.1. Run this if you are starting from scratch. Move to 1.2 if data is already shuffled and scaled.

In [None]:
# Seed NumPy's global RNG so the permutation drawn below is reproducible.
seed = 7
np.random.seed(seed)

# Read the raw CSV (it has no header row).
# The file name could be passed in as a parameter.
df = pandas.read_csv(directory + "honeypot.csv", header=None)

no_of_col = df.shape[1]
print('no. of columns is {}'.format(no_of_col))

# Column 0 is taken as the label; all remaining columns are the features,
# converted to float.
ds = df.values
orig_Y = ds[:, 0]
orig_X = ds[:, 1:].astype(float)

# Reorder features and labels with one shared random permutation so that the
# rows stay aligned.
indices = np.random.permutation(orig_X.shape[0])
X_unscaled, Y = orig_X[indices], orig_Y[indices]

# Persist the shuffled arrays so that later runs can skip this cell.
np.save(directory + 'X_unscaled_shuffled.npy', X_unscaled)
np.save(directory + 'Y_shuffled.npy', Y)

In [None]:
"""
Features #0 to #5 : 
0 - age 
1 - NumerOfFollowings 
2 - NumberOfFollowers 
3 - NumberOfTweets      [see description of feature #6 for difference]
4 - LengthOfScreenName 
5 - LengthOfDescriptionInUserProfile

Features #6 to #15: 
6 - num_tws         [less than or equal to feature #3, as it depends on the sampling period]
7 - ratio_question
8 - ratio_exclam
9 - len_tws
10 - speed_tws
11 - num_url
12 - ratio_url
13 - num_at
14 - ratio_at  [What's the ratio of tweets containing @; some samples have > 1; could be corrected]
15 - num_RT

Features #16 to #25: 
16 - ratio_RT
17 - num_uniq_at
18 - ratio_uniq_at
19 - num_reply
20 - ratio_reply
21 - num_hash
22 - ratio_hash
23 - jacc_tw
24 - jacc_url
25 - compress_ratio

Features #26-#27: 
26 - spam word ratio
27 - FollowingChangeRatio

There are a total of 6+10+10+2=28 features.
"""

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Reuse the arrays that were shuffled and saved earlier, so the data is not
# reshuffled on every run.
X_unscaled = np.load(directory + 'X_unscaled_shuffled.npy')
Y = np.load(directory + 'Y_shuffled.npy')

# One-hot encode the labels.
Y_categ = to_categorical(Y)

# 80% train / 20% test split with a fixed random_state.
x_train_unscaled, x_test_unscaled, y_train, y_test = train_test_split(
    X_unscaled,
    Y_categ,
    test_size=0.20,
    random_state=42,
    shuffle=True,
)

# The unscaled training features are kept on disk: they are needed later to
# rebuild the scaler object.
np.save(directory + "x_train_unscaled.npy", x_train_unscaled)

# Feature scaling: the scaler is fitted on the training split only and then
# applied unchanged to the test split. (StandardScaler is the alternative.)
sc = MinMaxScaler()
x_train = sc.fit_transform(x_train_unscaled)
x_test = sc.transform(x_test_unscaled)

# Save the scaled splits next to the raw data.
for _stem, _array in (
    ('x_train', x_train),
    ('y_train', y_train),
    ('x_test', x_test),
    ('y_test', y_test),
):
    np.save(directory + _stem + '.npy', _array)

## Region 1.2: Load previously created train and test data that is shuffled and scaled to be between 0 and 1

In [None]:
# Reload the persisted train/test splits in one pass; the four files are read
# in the order x_train, x_test, y_train, y_test.
x_train, x_test, y_train, y_test = [
    np.load(directory + '/' + _split + '.npy')
    for _split in ('x_train', 'x_test', 'y_train', 'y_test')
]

# build DNNs
# The input width is read from the loaded training matrix instead of from
# `no_of_col`: that variable only exists when the Region 1.1 CSV-loading cell
# has been run in the current kernel, so relying on it breaks the documented
# "start from Region 1.2" workflow with a NameError.
n_features = x_train.shape[1]

# Architecture: two 20-unit ReLU hidden layers, each followed by 50% dropout,
# then a 2-unit softmax output (one unit per class of the one-hot labels).
DNNmodel = Sequential()
DNNmodel.add(Dense(20, input_dim=n_features, kernel_initializer="uniform", activation="relu"))
DNNmodel.add(Dropout(0.5))
DNNmodel.add(Dense(20, kernel_initializer="uniform", activation="relu"))
DNNmodel.add(Dropout(0.5))
DNNmodel.add(Dense(2))
DNNmodel.add(Activation("softmax"))

# compile Model
# NOTE(review): with a 2-unit softmax and one-hot targets,
# 'categorical_crossentropy' is the conventional loss; 'binary_crossentropy'
# is left unchanged here so results stay comparable with earlier runs.
DNNmodel.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Train with a single 20% hold-out taken from the training data for validation
# (this is a validation split, not k-fold cross-validation).
history = DNNmodel.fit(x_train, y_train, validation_split=0.20,
                       epochs=100, batch_size=256)

# Evaluate on the held-out test split.
# `score` is the list returned by evaluate(); the first print shows all of it,
# the second shows the entry at index 1 as a percentage under its metric name.
score = DNNmodel.evaluate(x_test, y_test, batch_size=256)
print("Test Accuracy",score)

_metric_name = DNNmodel.metrics_names[1]
_metric_pct = score[1]*100
print("\n{}: {:.2f}%".format(_metric_name, _metric_pct))

# evaluation through predict function

def _percentage(numerator, denominator):
    """Return 100 * numerator / denominator, or NaN when the denominator is 0.

    The guard keeps the cell from raising ZeroDivisionError when, for example,
    the model predicts no positives at all (tp + fp == 0).
    """
    if denominator == 0:
        return float('nan')
    return numerator * 100 / denominator


y_pred = DNNmodel.predict(x_test)

# Collapse the per-class scores and the one-hot labels to class indices.
# Class index 1 is treated as the positive class.
pred_classes = np.argmax(y_pred, axis=1)
true_classes = np.argmax(y_test, axis=1)

is_correct = pred_classes == true_classes
is_positive = true_classes == 1

# Confusion-matrix counts, converted to plain Python ints so that the printed
# values and the divisions below behave like ordinary integers.
tp = int(np.sum(is_correct & is_positive))
tn = int(np.sum(is_correct & ~is_positive))
fn = int(np.sum(~is_correct & is_positive))
fp = int(np.sum(~is_correct & ~is_positive))

precision = _percentage(tp, tp + fp)
recall = _percentage(tp, tp + fn)
acc = _percentage(tp + tn, tp + tn + fp + fn)

total_actual_positives = tp+fn
total_actual_negatives = fp+tn

print("Accuracy is {}%".format(acc))
print('tp: {} fp: {} tn: {} fn:{} '.format(tp,fp,tn,fn))
print("precision: {}%   recall: {}%".format(precision, recall))

print("Total true +ves: {} Total true -ves: {}".format(total_actual_positives, total_actual_negatives))

# Save the model
# Create the target folder first so that a missing directory cannot make the
# save fail after the (long) training run has already completed.
os.makedirs(model_dir, exist_ok=True)
DNNmodel.save(model_dir + model_name)

## ---x----- Region 1 ends ------x------

# Region 2 begins: Here we will do model explanation

In [None]:
# Load the model (Starting point)
DNNmodel = load_model(model_dir + model_name)

# get_weights() returns a flat list; entries 0, 2, 4 are stored under the
# "weightsN" keys and entries 1, 3, 5 under the "biasesN" keys.
_params = DNNmodel.get_weights()
W = dict(
    weights1=_params[0],
    weights2=_params[2],
    weights3=_params[4],
    biases1=_params[1],
    biases2=_params[3],
    biases3=_params[5],
)