In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, ConvLSTM2D

In [54]:
datapath = 'https://raw.githubusercontent.com/JonathanC1/Datasets/master/TorNonTor/TimeBasedFeatures-10s-Layer2.csv'
dataframe = pd.read_csv(datapath,low_memory=False)

In [55]:
def dfNormalize(df):
    for feature_name in df.columns:
        df.loc[:,feature_name]= pd.to_numeric(df.loc[:,feature_name], errors='coerce').fillna(0)
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()   
        if (max_value - min_value) > 0:
            df.loc[:,feature_name] = (df.loc[:,feature_name] - min_value) / (max_value - min_value)
        else:
            df.loc[:,feature_name] = (df.loc[:,feature_name]- min_value)    
    return df

In [56]:
print(dataframe.shape)
dataframe = dataframe.reindex(np.random.permutation(dataframe.index)).copy()
print(dataframe.describe())
print(list(dataframe))

(2256, 29)
        Source Port   Destination Port   Protocol   Flow Duration   
count   2256.000000        2256.000000     2256.0    2.256000e+03  \
mean   37249.008422        8246.570035        6.0    7.717811e+06   
std    19072.507181       16078.566419        0.0    3.539168e+06   
min      110.000000         110.000000        6.0    3.400000e+01   
25%    34328.000000         443.000000        6.0    6.420080e+06   
50%    41071.000000         443.000000        6.0    9.911042e+06   
75%    54453.000000         443.000000        6.0    9.983053e+06   
max    60740.000000       58368.000000        6.0    1.000000e+07   

        Flow IAT Max   Flow IAT Min   Fwd IAT Max   Fwd IAT Min   Bwd IAT Max   
count   2.256000e+03    2256.000000  2.256000e+03  2.256000e+03  2.256000e+03  \
mean    1.497806e+06      63.697695  1.504453e+06  5.715877e+04  1.439050e+06   
std     2.157067e+06    1131.040217  2.154873e+06  5.658512e+05  2.099171e+06   
min     3.400000e+01    -442.000000  0.0000

In [57]:
keys = dataframe.keys()
data_to_process = dataframe[keys[4:len(keys) - 1]].copy()
x_normalised = dfNormalize(data_to_process)
print(x_normalised.describe())

        Protocol   Flow Duration   Flow IAT Max   Flow IAT Min   Fwd IAT Max   
count     2256.0     2256.000000    2256.000000    2256.000000   2256.000000  \
mean         0.0        0.771780       0.150347       0.013083      0.151014   
std          0.0        0.353918       0.216527       0.029262      0.216302   
min          0.0        0.000000       0.000000       0.000000      0.000000   
25%          0.0        0.642007       0.012921       0.011435      0.013672   
50%          0.0        0.991104       0.037548       0.011565      0.038433   
75%          0.0        0.998305       0.217486       0.012082      0.217563   
max          0.0        1.000000       1.000000       1.000000      1.000000   

        Fwd IAT Min   Bwd IAT Max   Bwd IAT Min  Active Mean   Active Std   
count   2256.000000   2256.000000   2256.000000  2256.000000       2256.0  \
mean       0.005754      0.144449      0.007218     0.015409          0.0   
std        0.056798      0.210711      0.063026 

In [58]:
def change_labels(value):
    if value == 'AUDIO':
        return 1
    elif value == 'BROWSING':
        return 2
    elif value == 'CHAT':
        return 3
    elif value == 'FILE-TRANSFER':
        return 4
    elif value == 'MAIL':
        return 5
    elif value == 'P2P':
        return 6
    elif value == 'VIDEO':
        return 7
    else:
        return 8

In [59]:
y_normalised = dataframe['label'].apply(change_labels)

X_train, X_test, y_train, y_test = train_test_split(x_normalised, y_normalised, test_size = 0.20)

In [60]:
# Build the LSTM model
model = Sequential()
model.add(LSTM(128,activation='relu',input_shape=(X_train.shape[1],1)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', metrics=['accuracy'])

In [61]:
X_train = X_train.values.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.values.reshape(X_test.shape[0], X_test.shape[1], 1)

In [63]:
# Fit the model
X_train = np.asarray(X_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)
model.fit(X_train, y_train, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1dd97669cc0>

In [67]:
# Evaluate the model
X_test = np.asarray(X_test).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)
score = model.evaluate(X_test, y_test, verbose=0)
#print('Test loss:', score[0])
print('Test accuracy:', score[1])


Test accuracy: 0.14823009073734283
