In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.python.client import device_lib 

import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler

In [None]:
print(device_lib.list_local_devices())

In [2]:
rawData = pd.read_csv('Skype.csv')
rawData = rawData[~rawData.isin([np.nan,np.inf,-np.inf]).any(1)] # drop all inf and nan rows
# rawTest = pd.read_csv('facebook.csv')
# rawTest = rawTest[~rawTest.isin([np.nan,np.inf,-np.inf]).any(1)]

In [3]:
selectCols = rawData.drop(['Flow ID','Src IP','Dst IP','Timestamp','Web_service','Label'],axis=1)
X = np.array(selectCols)
#X = np.array(rawData.loc[:,['Tot Fwd Pkts','Tot Bwd Pkts','TotLen Fwd Pkts','TotLen Bwd Pkts','Fwd Pkt Len Max','Bwd Pkt Len Max','FirstNPkt_size','Flow Duration','Flow Byts/s','Flow Pkts/s']])
Y = rawData.loc[:,['Label']]
Y = np.array(pd.get_dummies(Y,columns=['Label']))

In [4]:
Xtrain,XTest,Ytrain,YTest = train_test_split(X,Y,test_size=0.3,random_state=21)

#Ytrain = tf.keras.utils.to_categorical(Ytrain, num_classes=3)
#YTest = tf.keras.utils.to_categorical(YTest, num_classes=3)
print(Ytrain.shape)
print(YTest.shape)
#Ytrain = Ytrain.reshape(Ytrain.shape[0],)
Ytrain = Ytrain.astype(np.float32)

#YTest = YTest.reshape(YTest.shape[0],)
YTest = YTest.astype(np.float32)
print("Training set size:", Xtrain.shape)
print("Test set size:",XTest.shape)

(19387, 4)
(8309, 4)
Training set size: (19387, 83)
Test set size: (8309, 83)


In [None]:
feature_names = list(selectCols.columns)
forest = RandomForestClassifier(random_state=0)
forest.fit(Xtrain, Ytrain)

In [None]:
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)

In [None]:
fig, ax = plt.subplots(figsize=(15,5))
forest_importances = pd.Series(importances, index=feature_names)
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
result = permutation_importance(
    forest, Xtrain, Ytrain, n_repeats=10, random_state=21, n_jobs=2
)

In [None]:
fig, ax = plt.subplots(figsize=(15,5))
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

In [48]:
X = np.array(selectCols)
#X = np.array(selectCols.loc[:,['Idle Max','Idle Mean','Tot Fwd Pkts','Tot Bwd Pkts','TotLen Fwd Pkts','TotLen Bwd Pkts','Fwd Pkt Len Max','Bwd Pkt Len Max','FirstNPkt_size','Flow Duration','Flow Byts/s','Flow Pkts/s','flow_start']])

In [49]:
scalar = MinMaxScaler()
scalar.fit(X)
X = scalar.transform(X)

In [50]:
Xtrain,XTest,Ytrain,YTest = train_test_split(X,Y,test_size=0.3,random_state=42)
Ytrain = Ytrain.astype(np.float32)
YTest = YTest.astype(np.float32)
print("Training set size:", Xtrain.shape)
print("Test set size:",XTest.shape)

Training set size: (19387, 83)
Test set size: (8309, 83)


In [51]:
n_x = X.shape[1]
n_y = Y.shape[1]
# early stop call back function
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)
# save best model during the training process
iteration_checkpoint = keras.callbacks.ModelCheckpoint(
    f'models/graph_model_iteration_10.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True
)
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(120, input_dim=n_x, activation='relu'))
    model.add(Dense(n_y, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [22]:
#model = KerasClassifier(build_fn=baseline_model,epochs=10, batch_size=10, verbose=1,callbacks=[callback])

In [53]:
# Use Kfold cross validation to avoid overfitting
K = [2,4,8,10,20]
result = []
bestModels = {}
for k in K:
    kfold = KFold(n_splits=k, shuffle=True)
    k_result = []
    bestModels[k] = [None,0]
    for train_index, test_index in kfold.split(X):
        x_train,x_test=X[train_index],X[test_index]
        y_train,y_test=Y[train_index],Y[test_index]
        model = baseline_model()
        model.fit(x_train,y_train,epochs=10,batch_size=5, verbose=1)
        test_loss, test_acc = model.evaluate(x_test,y_test)
        k_result.append(test_acc)
        if test_acc > bestModels[k][1]:
            bestModels[k][0] = model
            bestModels[k][1] = test_acc
        print('Test Accuracy:',test_acc)
    result.append(k_result)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.941724419593811
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9500288963317871
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9478625059127808
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.954361617565155
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9533506631851196
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9517619609832764
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9436

In [58]:
R = np.array([np.array(xi) for xi in result])

  result = np.array([np.array(xi) for xi in result])


In [62]:
for i in range(len(K)):
    print("K=",K[i],"Average accuracy and std: %.2f%% (%.2f%%)" % (result[i].mean()*100, result[i].std()*100))

K= 2 Average accuracy and std: 94.59% (0.42%)
K= 4 Average accuracy and std: 95.18% (0.25%)
K= 8 Average accuracy and std: 95.05% (0.41%)
K= 10 Average accuracy and std: 95.14% (0.27%)
K= 20 Average accuracy and std: 95.26% (0.43%)


In [63]:
for k in K:
    test_loss, test_acc = bestModels[k][0].evaluate(XTest,YTest)
    print("K=",k,"Accuracy(Test): %.2f%%" % (test_acc*100))

K= 2 Accuracy(Test): 95.46%
K= 4 Accuracy(Test): 95.70%
K= 8 Accuracy(Test): 95.76%
K= 10 Accuracy(Test): 95.79%
K= 20 Accuracy(Test): 95.85%
