In [1]:
from keras.models import Sequential
from keras.layers import LSTM,Dropout
from keras.layers import Dense
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import decomposition
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from scipy import interp
from itertools import cycle
import time
from keras import metrics
from keras import optimizers
from keras import initializers
from keras.callbacks import ModelCheckpoint
import pprint
from keras.callbacks import Callback
from keras.layers import Bidirectional
from keras.layers import TimeDistributed


Using TensorFlow backend.


In [2]:
# Seed NumPy's global RNG first so every later step that draws from it is
# repeatable when the notebook is executed top to bottom.
np.random.seed(seed=42)

# Read the pre-processed KDD table: scaled feature columns followed by the
# one-hot `label_*` columns.
kdd_dataset = pd.read_csv(filepath_or_buffer='KDD.preProcessed.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
kdd_dataset.head(n=5)

Unnamed: 0,duration,service,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label_DoS,label_Normal,label_Probe,label_R2L,label_U2R
0,0.0,0.347826,0.030859,0.846454,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.217391,0.072004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.217391,0.143175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.42029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.710145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
# One-hot target columns; every other real attribute is a model input.
label_columns = ['label_DoS', 'label_Normal', 'label_Probe', 'label_R2L', 'label_U2R']

# 'Unnamed: 0' is the row index that was written into the CSV, not a traffic
# attribute. It is unscaled (0, 1, 2, ...) while the real features are in
# [0, 1], so keeping it would dominate the PCA and leak row order into the model.
non_feature_columns = set(label_columns) | {'Unnamed: 0'}

# Fail loudly (as list.remove did) if the file does not have the expected labels.
missing_labels = [name for name in label_columns if name not in kdd_dataset.columns]
if missing_labels:
    raise ValueError("Label columns missing from dataset: {}".format(missing_labels))

# Preserve the original column order of the remaining features.
features = [name for name in kdd_dataset.columns if name not in non_feature_columns]

In [5]:

# Partition the rows by traffic category using the one-hot label columns.
DoS = kdd_dataset[kdd_dataset['label_DoS'].eq(1)]
Normal = kdd_dataset[kdd_dataset['label_Normal'].eq(1)]
Probe = kdd_dataset[kdd_dataset['label_Probe'].eq(1)]
R2L = kdd_dataset[kdd_dataset['label_R2L'].eq(1)]
U2R = kdd_dataset[kdd_dataset['label_U2R'].eq(1)]


In [6]:
# Down-sample the large classes to fixed sizes with a fixed seed.
# (Per the original note the sizes correspond to a ~40x reduction that keeps
# the class distribution — TODO confirm against the raw class counts.)
# U2R is not sampled here; all of its rows are kept.
DoS, Normal, Probe, R2L = (
    class_frame.sample(n=sample_size, random_state=42)
    for class_frame, sample_size in (
        (DoS, 97084),
        (Normal, 24320),
        (Probe, 1027),
        (R2L, 1126),
    )
)



In [6]:
#Optional oversampling of undersampled data.

#U2R = U2R.append([U2R]*500,ignore_index=True)
#R2L = R2L.append([R2L]*100,ignore_index=True)

In [7]:
# Stack the per-class frames back into one table ...
reduced_dataset = pd.concat(objs=[DoS, Normal, Probe, R2L, U2R])

# ... and shuffle every row with a fixed seed (frac=1 draws all rows without
# replacement, i.e. a full permutation).
kdd_dataset = reduced_dataset.sample(frac=1, random_state=42)

In [8]:

# Feature matrix: every column listed in `features`.
x = kdd_dataset[features].values

# Target matrix: select the one-hot label columns by name rather than by a
# hard-coded position, so a changed column count cannot silently shift the
# labels. The order matches the column order in the CSV.
label_columns = ['label_DoS', 'label_Normal', 'label_Probe', 'label_R2L', 'label_U2R']
y = kdd_dataset[label_columns].values

# Use the array shape instead of len(x[0]) so an empty table does not raise IndexError.
print("number of datapoints = {} and number of features = {}".format(x.shape[0], x.shape[1]))

number of datapoints = 123579 and number of features = 53


In [9]:
# Tally how many samples carry each one-hot label vector.
from collections import Counter
label_counts = Counter(map(tuple, y))
print(label_counts)

Counter({(1.0, 0.0, 0.0, 0.0, 0.0): 97084, (0.0, 1.0, 0.0, 0.0, 0.0): 24320, (0.0, 0.0, 0.0, 1.0, 0.0): 1126, (0.0, 0.0, 1.0, 0.0, 0.0): 1027, (0.0, 0.0, 0.0, 0.0, 1.0): 22})


In [10]:
#PCA
# NOTE(review): the projection is fitted on ALL samples before the
# train/validation/test split, so held-out rows influence the components.
# Consider fitting on the training portion only and transforming the rest.

print("Shape before transformation - {}".format(np.asarray(x).shape))

# random_state pins the result whenever scikit-learn selects its randomized
# SVD solver, independent of how much of the global NumPy RNG was consumed.
pca = decomposition.PCA(n_components=10, random_state=42)
x_pca = pca.fit_transform(x)
print("Shape after transformation - {}".format(x_pca.shape))

# x_pca is deliberately kept as an ndarray: train_test_split accepts arrays and
# processTrainTestArrays converts with np.asarray anyway, so turning ~1.2M
# floats into nested Python lists only cost time and memory.



Shape before transformation - (123579, 53)
Shape after transformation - (123579, 10)


In [11]:
#Train:Test:Val split - 60:20:20
# Step 1 of 2: hold out 20% of all samples, stratified by label, as the
# validation set. The remaining 80% is split again in the next cell.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    x_pca, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [12]:
# Step 2 of 2: split the remaining 80% into train (75%) and test (25%),
# i.e. 60% / 20% of the whole dataset, again stratified by label.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_tr, Y_tr,
    test_size=0.25,
    stratify=Y_tr,
    random_state=42,
)

# Number of training samples; ResetStatesCallback reads this to decide when
# to reset the recurrent states.
max_len = len(X_train)

In [23]:
def processTrainTestArrays(x,y):
    """Insert a length-1 middle axis into 2-D feature and label arrays.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples, n_classes)

    Returns
    -------
    tuple of numpy.ndarray
        ``x`` reshaped to (n_samples, 1, n_features) and ``y`` reshaped to
        (n_samples, 1, n_classes).
    """
    feature_array = np.asarray(x)
    target_array = np.asarray(y)

    # Each sample becomes a sequence of exactly one timestep.
    feature_array = feature_array.reshape(
        (feature_array.shape[0], 1, feature_array.shape[1]))
    target_array = target_array.reshape(
        (target_array.shape[0], 1, target_array.shape[1]))
    return feature_array, target_array

In [14]:
def onehotencode(y):
    """One-hot encode a vector of integer class ids.

    Parameters
    ----------
    y : array-like of int
        Class indices.

    Returns
    -------
    numpy.ndarray
        Binary class matrix produced by Keras' ``to_categorical``.
    """
    # Import from the public `keras.utils` namespace: it exposes the same
    # function, whereas the `keras.utils.np_utils` module path is an internal
    # location that newer Keras releases no longer provide.
    from keras.utils import to_categorical
    return to_categorical(y)

In [21]:
# define model

def create_model(X_train,Y_train, batch_size=1, number_of_units=20):
    """Build and compile a stateful bidirectional LSTM classifier.

    Architecture: Bidirectional(LSTM) -> Dropout(0.15) ->
    TimeDistributed(Dense(5, softmax)), compiled with Nadam and categorical
    cross-entropy.

    Parameters
    ----------
    X_train : numpy.ndarray
        3-D training inputs; only ``shape[1]`` (timesteps) and ``shape[2]``
        (features) are read to define the input shape.
    Y_train : numpy.ndarray
        Not used by this function; kept so existing callers keep working.
    batch_size : int, optional
        Fixed batch size baked into the stateful input shape (default 1).
        Training, evaluation and prediction must use the same value.
    number_of_units : int, optional
        LSTM units per direction (default 20).

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    # Seeded initializer so weight initialisation is repeatable.
    randomInit = initializers.RandomUniform(seed=42)

    model = Sequential()
    model.add(Bidirectional(LSTM(number_of_units, stateful=True, return_sequences=True,
                                 kernel_initializer=randomInit, bias_initializer=randomInit),
                            batch_input_shape=(batch_size, X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(0.15))
    model.add(TimeDistributed(Dense(5, activation='softmax')))

    nadam = optimizers.Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)
    model.compile(loss='categorical_crossentropy', optimizer=nadam, metrics=['accuracy'])

    # summary() prints the table itself and returns None; wrapping it in
    # print() would emit a stray "None" line after the table.
    model.summary()
    return model

In [16]:
def compute_metrics(predicted, Y_test):
    """Print the weighted F1 score and confusion matrix, then plot ROC curves.

    Parameters
    ----------
    predicted : array-like, shape (n_samples, 5)
        Per-class scores produced by the model.
    Y_test : array-like, shape (n_samples, 5)
        One-hot encoded true labels.

    Returns
    -------
    None
        Results are printed and the ROC figure is shown.
    """
    from sklearn.metrics import f1_score

    n_classes = 5
    class_ids = list(range(n_classes))

    # Convert once up front; both the class-id and the ROC computations below
    # work on these 2-D arrays.
    Y_test = np.asarray(Y_test)
    predicted = np.asarray(predicted)

    # Row-wise argmax turns one-hot rows / score rows into class ids.
    Y_actual = np.argmax(Y_test, axis=1)
    predictions = np.argmax(predicted, axis=1)

    #F1 score
    print("Average F1 score is {}".format(
        f1_score(Y_actual, predictions, average='weighted', labels=class_ids)))

    # Categoricals carry the full class list so the table is labelled by class id.
    y_act = pd.Categorical(Y_actual, categories=class_ids)
    y_pre = pd.Categorical(predictions, categories=class_ids)
    df_confusion = pd.crosstab(y_act, y_pre, rownames=['Actual'], colnames=['Predicted'])
    print("Confusion matrix: \n", df_confusion)

    # Per-class (one-vs-rest) ROC curves and areas.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in class_ids:
        fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], predicted[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(Y_test.ravel(), predicted.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area:
    # first aggregate all false positive rates ...
    all_fpr = np.unique(np.concatenate([fpr[i] for i in class_ids]))

    # ... then interpolate every class curve at those points. np.interp is the
    # function the deprecated `scipy.interp` alias pointed to.
    mean_tpr = np.zeros_like(all_fpr)
    for i in class_ids:
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # ... and finally average and compute the AUC.
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    lw = 2
    plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    # One distinct colour per class; cycling three colours over five classes
    # made two pairs of curves indistinguishable.
    colors = ['aqua', 'darkorange', 'cornflowerblue', 'green', 'red']
    for i, color in zip(class_ids, colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

In [17]:
#checkpoint save weights
# Keep only the weights of the epoch with the highest validation accuracy.
hdf5FileName = "bestWeightsBiLSTMStateful1L20.hdf5"
checkpoint = ModelCheckpoint(
    filepath=hdf5FileName,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [18]:
class ResetStatesCallback(Callback):
    """Keras callback that clears the model's recurrent states periodically.

    The states are reset at the first training batch and then again every
    ``batches_per_reset`` batches.

    Parameters
    ----------
    batches_per_reset : int, optional
        Number of training batches between resets. When omitted, the
        module-level ``max_len`` is read at training time, which preserves the
        original behaviour of ``ResetStatesCallback()``.
    """

    def __init__(self, batches_per_reset=None):
        # Let the base class initialise the attributes Keras expects on every
        # callback before adding our own.
        super(ResetStatesCallback, self).__init__()
        self.counter = 0
        self.batches_per_reset = batches_per_reset

    def on_batch_begin(self, batch, logs=None):
        """Reset states when ``counter`` reaches a multiple of the reset period."""
        # `logs=None` instead of `logs={}` avoids a shared mutable default.
        if self.batches_per_reset is None:
            period = max_len
        else:
            period = self.batches_per_reset

        if self.counter % period == 0:
            self.model.reset_states()
            print("Model reset. ",self.counter)
            self.counter = 0
        self.counter += 1
        

In [24]:
#Implementing validation
# Train for 30 epochs without shuffling, validating after every epoch,
# saving the best weights via `checkpoint` and resetting the recurrent
# states through ResetStatesCallback.

Batch_size = 1
start_time = time.time()

# Add the singleton middle axis to inputs and targets.
x_train, y_train = processTrainTestArrays(X_train, Y_train)
x_val, y_val = processTrainTestArrays(X_val, Y_val)
model = create_model(x_train, y_train)

training_callbacks = [checkpoint, ResetStatesCallback()]
model.fit(
    x_train, y_train,
    batch_size=Batch_size,
    epochs=30,
    shuffle=False,
    validation_data=(x_val, y_val),
    callbacks=training_callbacks,
    verbose=1,
)

elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)


AssertionError: 

In [None]:
#Test with best model
#Load weights and compile again
print("\n===========================\nTime for testing\n===========================\n")
model.load_weights(hdf5FileName)
nadam = optimizers.Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)
model.compile(loss='categorical_crossentropy', optimizer=nadam, metrics=['accuracy'])
print("Optimal weights loaded from file {}".format(hdf5FileName))
print("Model Successfully compiled with loaded weights\n")

#Do same preprocessing for test data
x_test,y_test = processTrainTestArrays(X_test,Y_test)

# The network is stateful, so its recurrent states persist between calls.
# Start the test pass from clean states instead of whatever training and
# validation left behind.
model.reset_states()
loss,acc = model.evaluate(x_test,y_test,batch_size=Batch_size)
print("Loss for testing = {} and Accuracy for testing = {}".format(loss,acc))

# Reset again so predict() walks the test set from the same initial state as
# evaluate() did; otherwise the metrics below would describe different
# predictions than the loss/accuracy printed above.
model.reset_states()
predicted = model.predict(x_test,batch_size=Batch_size)

# Drop the singleton middle axis: (n, 1, classes) -> (n, classes).
predicted = np.reshape(predicted,(predicted.shape[0],predicted.shape[2]))
y_test = np.reshape(y_test,(y_test.shape[0],y_test.shape[2]))
compute_metrics(predicted, y_test)

