In [1]:
## import libraries
import numpy as np
np.random.seed(123)

import pandas as pd
import subprocess
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import mean_absolute_error,matthews_corrcoef,classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.utils import shuffle
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Activation
from keras.layers.advanced_activations import PReLU
from keras.callbacks import CSVLogger,EarlyStopping, ModelCheckpoint

Using Theano backend.
Using gpu device 0: GeForce 940MX (CNMeM is disabled, cuDNN 5105)


In [2]:
## Batch generators ##################################################################################################################################

def batch_generator(X, y, batch_size, shuffle):
    #chenglong code for fiting from generator (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0


## Read Data

In [3]:
Train = pd.read_csv('Bidirectional_Botnet_Training_Final_Flow_Based_Features.csv',verbose=False)
Test = pd.read_csv('Bidirectional_Botnet_Test_Final_Flow_Based_Features.csv',verbose=False)
print 'Done Reading'

Done Reading


  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
features = ['APL',
 'AvgPktPerSec',
 'IAT',
 'NumForward',
 'Protocol',
 'BytesEx',
 'BitsPerSec',
 'NumPackets',
 'StdDevLen',
 'SameLenPktRatio',
 'FPL',
 'Duration',
 'NPEx']

target = 'isBot'

In [5]:
Train = Train[features+[target]]
Test = Test[features+[target]]
Train.head(3)

Unnamed: 0,APL,AvgPktPerSec,IAT,NumForward,Protocol,BytesEx,BitsPerSec,NumPackets,StdDevLen,SameLenPktRatio,FPL,Duration,NPEx,isBot
0,60.0,0.0,0.0,1,TCP,60,0.0,1,0.0,1.0,60,0.0,1,0
1,60.0,0.144511,0.0,0,TCP,120,69.3651,2,0.0,0.5,60,13.8398,2,0
2,61.0,9.5004,0.0,2,TCP,244,4636.19,4,1.0,0.5,62,0.421035,4,1


In [6]:
ntrain = Train.shape[0]

In [7]:
Tr_Te = pd.concat((Train,Test),axis=0)

In [8]:
num_features = ['APL',
 'AvgPktPerSec',
 'IAT',
 'NumForward',
 'BytesEx',
 'BitsPerSec',
 'NumPackets',
 'StdDevLen',
 'SameLenPktRatio',
 'FPL',
 'Duration',
 'NPEx']
cat_features = ['Protocol']

## Dummify categorical variables and normalize numerical

In [9]:
X = []
##Categorical Varialbes
for x in cat_features:
    temp = pd.get_dummies(Tr_Te[x].astype('category'))
    X.append(temp)

scaler = StandardScaler()
tmp = scaler.fit_transform(Tr_Te[num_features])
X.append(tmp)

## Extract Target labes

In [10]:
Y = Tr_Te['isBot']

### Remove unnecessary variables

In [11]:
del(Tr_Te,Train,Test)

In [12]:
temp = X[0]
for i in range(1,len(X)):
    temp = np.hstack((temp,X[i]))
    
import copy
X = copy.deepcopy(temp)
print X.shape
del(temp)

(859478, 119)


In [13]:
X_train = X[:ntrain,:]
X_test = X[ntrain:,:]
Y_train = Y[:ntrain]
Y_test = Y[ntrain:]

In [14]:
del(X)

In [15]:
X_train

array([[ 0.        ,  0.        ,  0.        , ..., -0.30760879,
        -0.00177444, -0.01383582],
       [ 0.        ,  0.        ,  0.        , ..., -0.30760879,
        -0.00176461, -0.01130548],
       [ 0.        ,  0.        ,  0.        , ..., -0.29266384,
        -0.00177414, -0.0062448 ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.1332672 ,
        -0.00177413, -0.01636615],
       [ 0.        ,  0.        ,  0.        , ...,  0.02865256,
        -0.00177444, -0.01636615],
       [ 0.        ,  0.        ,  0.        , ...,  0.3051341 ,
        -0.00176804, -0.01636615]])

In [16]:
Y_train

0         0
1         0
2         1
3         0
4         1
5         0
6         1
7         1
8         0
9         0
10        0
11        1
12        0
13        0
14        0
15        1
16        1
17        1
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        1
26        1
27        1
28        1
29        1
         ..
504014    0
504015    0
504016    0
504017    1
504018    0
504019    1
504020    0
504021    1
504022    0
504023    1
504024    0
504025    1
504026    0
504027    1
504028    1
504029    0
504030    1
504031    1
504032    0
504033    1
504034    0
504035    0
504036    0
504037    0
504038    0
504039    0
504040    0
504041    0
504042    1
504043    0
Name: isBot, dtype: int64

In [17]:
print len(X_train),len(Y_train)
print len(X_test),len(Y_test)

504044 504044
355434 355434


In [24]:
from keras import backend as K
def custom_obj(y_true, y_pred):
    '''Calculates the Matthews correlation coefficient measure for quality
    of binary classification problems.
    '''
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    return 2.0*fn*fp/(fn+fp)


In [25]:
from keras import backend as K
def matthews_correlation(y_true, y_pred):
    '''Calculates the Matthews correlation coefficient measure for quality
    of binary classification problems.
    '''
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())


In [38]:
def nn_model():
    model = Sequential()
    
    model.add(Dense(100, input_dim = X_train.shape[1], init = 'he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
        
    model.add(Dense(50, init = 'he_normal'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    
    model.add(Dense(50, init = 'he_normal'))
    model.add(Activation('sigmoid'))
    model.add(BatchNormalization())    
    
    model.add(Dense(1, init = 'he_normal'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam',metrics=['accuracy','fbeta_score','matthews_correlation'])
    return(model)

def nn_model_dropout():
    model = Sequential()
    
    model.add(Dense(100, input_dim = X_train.shape[1], init = 'he_normal'))
    model.add(Activation('sigmoid'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
        
    model.add(Dense(50, init = 'he_normal'))
    model.add(Activation('sigmoid'))
    model.add(BatchNormalization())    
    model.add(Dropout(0.2))
    
    model.add(Dense(20, init = 'he_normal'))
    model.add(Activation('sigmoid'))
    model.add(BatchNormalization())    
    model.add(Dropout(0.2))
    
    model.add(Dense(1, init = 'he_normal'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam',metrics=['accuracy','fbeta_score','matthews_correlation'])
    return(model)

In [51]:
model = nn_model_dropout()

In [52]:
csv_logger = CSVLogger('log.txt')
checkpointer = ModelCheckpoint(filepath="Models/Best.hdf5", verbose=1, save_best_only=True)
earlyStopping = EarlyStopping(monitor='val_acc', patience=10, verbose=2, mode='min')

In [None]:
model.fit(X_train,Y_train,nb_epoch=100,batch_size=128,callbacks=[csv_logger,checkpointer],validation_data=(X_test,Y_test),verbose=1)

In [42]:
y_pred = model.predict_classes(X_train)
y_pred = np.reshape(y_pred,(y_pred.shape[0]))



In [43]:
true_pred = np.array(Y_train)

In [44]:
def print_metr(y_pred,y_true):
    print '\n',classification_report(y_pred,y_true)

In [45]:
print_metr(y_pred,true_pred)


             precision    recall  f1-score   support

          0       0.79      0.84      0.81    301696
          1       0.73      0.67      0.70    202348

avg / total       0.77      0.77      0.77    504044



In [46]:
pred_test = model.predict_classes(X_test)
true_test = np.reshape(Y_test,(Y_test.shape[0]))
print_metr(pred_test,true_test)

             precision    recall  f1-score   support

          0       0.75      0.69      0.72    202237
          1       0.63      0.69      0.66    153197

avg / total       0.70      0.69      0.69    355434



  return reshape(newshape, order=order)


In [47]:
confusion_matrix(pred_test,true_test)

array([[138903,  63334],
       [ 46845, 106352]])

In [48]:
sum(Y_test)

169686

In [49]:
len(Y_test)-sum(Y_test)

185748

In [50]:
print sum(Y_train),len(Y_train)-sum(Y_train)

185311 318733
