In [201]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, ConvLSTM2D, Dropout

In [202]:
# Raw GitHub URL of the TOR / non-TOR selected-features CSV used throughout the notebook.
datapath = 'https://raw.githubusercontent.com/JonathanC1/Datasets/master/TorNonTor/SelectedFeatures-10s-TOR-NonTOR.csv'

# low_memory=False: parse the file in a single pass instead of in chunks,
# which avoids per-chunk (mixed) dtype inference.
dataframe = pd.read_csv(filepath_or_buffer=datapath, low_memory=False)

In [203]:
def dfNormalize(df):
    """Min-max scale every column of ``df`` and return the frame.

    Each column is first converted with ``pd.to_numeric(errors='coerce')``;
    entries that cannot be parsed, and missing values, are replaced by 0.
    A column whose maximum exceeds its minimum is rescaled to
    ``(x - min) / (max - min)``. A constant column is only shifted by its
    minimum, so it becomes all zeros and no division by zero occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should all be scaled. It is modified in place;
        pass a copy if the original values are still needed.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with numeric columns.

    Notes
    -----
    Infinite values are not treated specially: a column containing ``inf``
    has an infinite range, so its scaled values are 0 or NaN.
    """
    for feature_name in df.columns:
        numeric = pd.to_numeric(df[feature_name], errors='coerce').fillna(0)
        min_value = numeric.min()
        value_range = numeric.max() - min_value

        if value_range > 0:
            scaled = (numeric - min_value) / value_range
        else:
            # Constant (or empty) column: shift only, never divide by zero.
            scaled = numeric - min_value

        # Replace the whole column instead of writing through df.loc[:, name]:
        # .loc sets values into the existing column and can keep its original
        # (e.g. object) dtype even though the values are now numeric.
        df[feature_name] = scaled
    return df

In [204]:
print(dataframe.shape)
# Shuffle the rows (original index labels are kept, only their order changes).
# A fixed random_state makes the order identical on every fresh
# Restart & Run All; the previous np.random.permutation call was unseeded.
dataframe = dataframe.sample(frac=1, random_state=42)
print(dataframe.describe())
print(list(dataframe))

(16088, 29)
        Source Port   Destination Port   Protocol   Flow Duration  \
count  16088.000000       16088.000000    16088.0    1.608800e+04   
mean   36296.304388       11777.729426        6.0    4.422426e+06   
std    21217.827170       19907.298714        0.0    4.650899e+06   
min       21.000000          21.000000        6.0    1.000000e+00   
25%    33224.000000         443.000000        6.0    3.710925e+04   
50%    41282.000000         443.000000        6.0    7.780915e+05   
75%    54420.000000        9001.000000        6.0    9.965477e+06   
max    65087.000000       65514.000000        6.0    1.000000e+07   

        Flow IAT Max   Flow IAT Min   Fwd IAT Max   Fwd IAT Min   Bwd IAT Max  \
count   1.608800e+04   1.608800e+04  1.608800e+04  1.608800e+04  1.608800e+04   
mean    8.139237e+05   6.711377e+04  7.562887e+05  8.203885e+04  5.994034e+05   
std     1.775624e+06   4.228532e+05  1.745859e+06  6.422426e+05  1.525304e+06   
min     1.000000e+00  -2.255000e+03  0.000

In [205]:
# Column labels of the loaded frame.
keys = dataframe.columns

# Keep every column from position 4 up to, but excluding, the last one.
# NOTE(review): the skipped columns are presumably identifiers and the label; confirm against the CSV.
data_to_process = dataframe.loc[:, keys[4:-1]].copy()

# Scale the selected columns; a copy is passed because dfNormalize modifies its argument.
x_normalised = dfNormalize(data_to_process)
print(x_normalised.describe())

        Protocol   Flow Duration   Flow Bytes/s   Flow Packets/s  \
count    16088.0    16088.000000   16088.000000     16088.000000   
mean         0.0        0.442243       0.000295         0.000620   
std          0.0        0.465090       0.012537         0.017354   
min          0.0        0.000000       0.000000         0.000000   
25%          0.0        0.003711       0.000000         0.000000   
50%          0.0        0.077809       0.000000         0.000000   
75%          0.0        0.996548       0.000000         0.000000   
max          0.0        1.000000       1.000000         1.000000   

        Flow IAT Mean   Flow IAT Std   Flow IAT Max   Flow IAT Min  \
count    16088.000000        16088.0   16088.000000   16088.000000   
mean         0.014998            0.0       0.081417       0.007002   
std          0.062858            0.0       0.177617       0.042685   
min          0.000000            0.0       0.000000       0.000000   
25%          0.000000            0.0 

In [206]:
def change_labels(x):
    """Encode a raw label value: 1 for 'nonTOR', 0 for anything else."""
    return 1 if x == 'nonTOR' else 0


y_normalised = dataframe['label'].apply(change_labels)

# 80/20 split. random_state pins the shuffle so the same rows land in the
# train and test sets on every run, making the reported metrics comparable.
X_train, X_test, y_train, y_test = train_test_split(
    x_normalised, y_normalised, test_size=0.20, random_state=42
)

In [207]:
# LSTM binary classifier. Every column of x_normalised is fed as one step of
# a sequence carrying a single value per step, hence input_shape=(n_columns, 1).
sequence_length = x_normalised.shape[1]
model = Sequential([
    LSTM(128, activation='relu', input_shape=(sequence_length, 1)),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),  # single sigmoid unit for the 0/1 label
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [208]:
# Add a trailing axis of size 1 so the inputs are 3-D, matching the
# (columns, 1) input_shape the model was built with.
# np.asarray accepts both the DataFrames returned by train_test_split and
# arrays that were already reshaped, so re-running this cell no longer fails
# with "ndarray has no attribute 'values'".
X_train = np.asarray(X_train).reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = np.asarray(X_test).reshape(X_test.shape[0], X_test.shape[1], 1)

In [209]:
# Train for 20 epochs with mini-batches of 32 samples.
# The held-out split is passed as validation data, so it is scored after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_test, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2bb8160d5b0>

In [212]:
# Score the trained model on the held-out split; verbose=0 suppresses the progress bar.
# The model was compiled with a loss plus the 'accuracy' metric, so score[0]
# is the loss and score[1] the accuracy; only the accuracy is reported here.
score = model.evaluate(x=X_test, y=y_test, verbose=0)
test_accuracy = score[1]
print('Test accuracy:', test_accuracy)

Test accuracy: 0.9735860824584961
