In [2]:
import pandas as pd
import numpy as np

In [3]:
datapath = '/content/drive/MyDrive/SelectedFeatures-10s-TOR-NonTOR.csv'

In [18]:
# Read data from csv
dataframe = pd.read_csv(datapath,low_memory=False)

In [19]:
# Normalise the data
def dfNormalize(df):
    for feature_name in df.columns:
        df.loc[:,feature_name]= pd.to_numeric(df.loc[:,feature_name], errors='coerce').fillna(0)
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()   
        if (max_value - min_value) > 0:
            df.loc[:,feature_name] = (df.loc[:,feature_name] - min_value) / (max_value - min_value)
        else:
            df.loc[:,feature_name] = (df.loc[:,feature_name]- min_value)    
    return df

In [20]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [21]:
# Randomly permute the data
print(dataframe.shape)
dataframe = dataframe.reindex(np.random.permutation(dataframe.index)).copy()
print(dataframe.describe())
print (list(dataframe))

(67834, 29)
        Source Port   Destination Port  ...      Idle Max      Idle Min
count  67834.000000       67834.000000  ...  6.783400e+04  6.783400e+04
mean   37912.753324       11566.395967  ...  3.085054e+05  3.085054e+05
std    20986.077326       18374.765123  ...  1.453953e+06  1.453953e+06
min       21.000000          21.000000  ...  0.000000e+00  0.000000e+00
25%    19305.000000         137.000000  ...  0.000000e+00  0.000000e+00
50%    43677.000000         443.000000  ...  0.000000e+00  0.000000e+00
75%    54685.000000       16311.000000  ...  0.000000e+00  0.000000e+00
max    65534.000000       65514.000000  ...  9.998126e+06  9.998126e+06

[8 rows x 26 columns]
['Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Protocol', ' Flow Duration', ' Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max',

In [23]:
keys = dataframe.keys()
feature_keys = keys[1:2] # adding source port
feature_keys = feature_keys.append(keys[3:4]) # adding destination port
feature_keys = feature_keys.append(keys[5:len(keys)-1]) # adding rest of features without label field 25 features
new_feature_keys = [' Source Port', ' Destination Port', ' Flow Duration',' Flow IAT Std',' Flow IAT Min', 'Fwd IAT Mean', ' Fwd IAT Std', 'Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max',' Bwd IAT Min']
data_to_process = dataframe[feature_keys].copy()
#data_to_process = dataframe[new_feature_keys].copy()
x_normalised = dfNormalize(data_to_process)
x_normalised = clean_dataset(x_normalised).copy()
print(x_normalised.describe())

        Source Port   Destination Port  ...      Idle Max      Idle Min
count  67828.000000       67828.000000  ...  67828.000000  67828.000000
mean       0.578420           0.176270  ...      0.030859      0.030859
std        0.320324           0.280536  ...      0.145429      0.145429
min        0.000000           0.000000  ...      0.000000      0.000000
25%        0.294354           0.001771  ...      0.000000      0.000000
50%        0.666410           0.006443  ...      0.000000      0.000000
75%        0.834399           0.248729  ...      0.000000      0.000000
max        1.000000           1.000000  ...      1.000000      1.000000

[8 rows x 25 columns]


In [24]:
# get the train and test data
x_train = x_normalised.sample(frac=0.8, replace=True)
x_test = x_normalised.drop(x_train.index)

# change the labels and affix them
change_labels = lambda x: 1 if x == 'nonTOR' else 0
y_train = dataframe['label'].apply(change_labels).loc[x_train.index]
y_test = dataframe['label'].apply(change_labels).loc[x_test.index]

In [25]:
# Figure the Feature dimensions so that it can be used in Deep Neural Net later
feature_dim = x_train.shape[1]
print (feature_dim)
print (x_train[y_train==0].shape)

25
(6368, 25)


In [26]:
# Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
lr=LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(x_train, y_train)
y_predict = lr.predict(x_test)
target_names = ['class 0 - NonTor', 'class 1 - Tor']
print(classification_report(y_test, y_predict, target_names=target_names))
print("Accuracy = {:.2f}".format(lr.score(x_test, y_test.values)*100))

                  precision    recall  f1-score   support

class 0 - NonTor       0.80      0.55      0.65      3643
   class 1 - Tor       0.94      0.98      0.96     26746

        accuracy                           0.93     30389
       macro avg       0.87      0.77      0.81     30389
    weighted avg       0.92      0.93      0.92     30389

Accuracy = 92.96


In [27]:
new_feature_keys[np.argsort(lr.coef_[0])[::-1][0]] # which feature has highest impact

'Fwd IAT Mean'

In [28]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
hidden_layers = 10
neurons_num = 200
model = Sequential()
model.add(Dense(feature_dim, input_dim= feature_dim, kernel_initializer='normal', activation='relu'))
for _ in range(0, hidden_layers-1):
    model.add(Dense(neurons_num, kernel_initializer='normal', activation='relu'))
model.add(Dense(1,kernel_initializer='normal', activation='sigmoid'))
print (model.summary())
model.compile(optimizer="adam",loss='binary_crossentropy', metrics=["accuracy"])


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_11 (Dense)            (None, 25)                650       
                                                                 
 dense_12 (Dense)            (None, 200)               5200      
                                                                 
 dense_13 (Dense)            (None, 200)               40200     
                                                                 
 dense_14 (Dense)            (None, 200)               40200     
                                                                 
 dense_15 (Dense)            (None, 200)               40200     
                                                                 
 dense_16 (Dense)            (None, 200)               40200     
                                                                 
 dense_17 (Dense)            (None, 200)              

In [29]:
# Deep Neural Net Implementation using Keras and TensorFlow
# Compute the accuracies and visualise using TensorBoard
from time import time
model.fit(x_train,y_train, epochs=20, batch_size=100, verbose=2,validation_split=0.1)
scores = model.evaluate(x_test, y_test, verbose=2)
#loss, accuracy = model.evaluate(x_test, y_test)
print("\nTest %s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
scores_0 = model.evaluate(x_test[y_test==0], y_test[y_test==0])
print("\nTest %s for class 0: %.2f%%" % (model.metrics_names[1], scores_0[1]*100))
scores_1 = model.evaluate(x_test[y_test==1], y_test[y_test==1])
print("\nTest %s for class 1: %.2f%%" % (model.metrics_names[1], scores_1[1]*100))

Epoch 1/20
489/489 - 5s - loss: 0.1952 - accuracy: 0.9278 - val_loss: 0.1259 - val_accuracy: 0.9490 - 5s/epoch - 10ms/step
Epoch 2/20
489/489 - 4s - loss: 0.1374 - accuracy: 0.9445 - val_loss: 0.1307 - val_accuracy: 0.9504 - 4s/epoch - 8ms/step
Epoch 3/20
489/489 - 4s - loss: 0.1333 - accuracy: 0.9450 - val_loss: 0.1201 - val_accuracy: 0.9491 - 4s/epoch - 8ms/step
Epoch 4/20
489/489 - 4s - loss: 0.1273 - accuracy: 0.9456 - val_loss: 0.1182 - val_accuracy: 0.9530 - 4s/epoch - 8ms/step
Epoch 5/20
489/489 - 4s - loss: 0.1259 - accuracy: 0.9468 - val_loss: 0.1200 - val_accuracy: 0.9440 - 4s/epoch - 8ms/step
Epoch 6/20
489/489 - 4s - loss: 0.1223 - accuracy: 0.9476 - val_loss: 0.1204 - val_accuracy: 0.9508 - 4s/epoch - 9ms/step
Epoch 7/20
489/489 - 4s - loss: 0.1209 - accuracy: 0.9485 - val_loss: 0.1129 - val_accuracy: 0.9512 - 4s/epoch - 8ms/step
Epoch 8/20
489/489 - 4s - loss: 0.1183 - accuracy: 0.9506 - val_loss: 0.1165 - val_accuracy: 0.9445 - 4s/epoch - 8ms/step
Epoch 9/20
489/489 - 4s

In [50]:
nn_y_predict = (model.predict(x_test)>0.5).astype(int)

In [51]:
print(classification_report(y_test, nn_y_predict, target_names=target_names))

                  precision    recall  f1-score   support

class 0 - NonTor       0.82      0.75      0.78      3627
   class 1 - Tor       0.97      0.98      0.97     26878

        accuracy                           0.95     30505
       macro avg       0.89      0.86      0.88     30505
    weighted avg       0.95      0.95      0.95     30505

