In [70]:
import pandas as pd
from pycaret.classification import *
from anonymizeip import anonymize_ip


# Run switches read by later cells.
# sample: when true, the working frame is cut down to a random subset of rows
# instead of the full dataset.
sample = True
# binary: when true, the 'Attack' column is dropped and 'Label' is kept;
# when false, 'Label' is dropped and 'Attack' is kept.
binary = False

In [71]:
# Load the flow records; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv("NF-ToN-IoT.csv")

In [72]:
# Display the column labels of the loaded frame.
df.columns

Index(['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT',
       'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS',
       'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'Label', 'Attack'],
      dtype='object')

In [73]:
# Remove non-ipfix standardized features
# See: https://www.ntop.org/guides/nprobe/cli_options.html#netflow-v9-ipfix-format-t
# Thesis: Refer to table 2 for feature names and descriptions

# Columns kept for the experiments, in the order they appear in the result.
IPFIX_COLUMNS = [
    'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT', 'L4_DST_PORT',
    'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS',
    'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'Label', 'Attack',
]

# Select and cast in a single expression. `astype` returns a new frame, so no
# value is ever written into a column selection of `df`; assigning casted
# columns back one by one would trigger pandas' SettingWithCopyWarning.
df_ipfix = df[IPFIX_COLUMNS].astype({
    'IPV4_SRC_ADDR': 'category',
    'IPV4_DST_ADDR': 'category',
    'L4_SRC_PORT': 'uint16',
    'L4_DST_PORT': 'uint16',
})


In [74]:
# Preview the first rows of the selected columns.
df_ipfix.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,192.168.1.195,52.139.250.253,63318,443,6,91.0,181,165,2,1,24,327,0,Benign
1,192.168.1.79,192.168.1.255,57442,15600,17,0.0,63,0,1,0,0,0,0,Benign
2,192.168.1.79,239.255.255.250,57452,15600,17,0.0,63,0,1,0,0,0,0,Benign
3,192.168.1.193,192.168.1.255,138,138,17,10.16,472,0,2,0,0,0,0,Benign
4,192.168.1.79,192.168.1.255,51989,15600,17,0.0,63,0,1,0,0,0,0,Benign


In [75]:
# Keep exactly one target column: 'Label' when `binary` is true,
# 'Attack' otherwise.
unused_target = "Attack" if binary else "Label"
df_ = df_ipfix.drop(columns=unused_target)

# Samples
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42  # fixed seed so a re-run draws the same rows
if sample:
    # Cap the request at the frame length: asking for more rows than exist
    # (without replacement) raises a ValueError.
    df_ = df_.sample(n=min(SAMPLE_SIZE, len(df_)), random_state=SAMPLE_SEED)


## Feature Anonymization

### IP-address anonymization 

In [76]:
#pd.set_option("display.max_rows", None)

# Address columns that the anonymization steps operate on.
IP_COLUMNS = ['IPV4_DST_ADDR', 'IPV4_SRC_ADDR']

# Anonymization switches. Both are off, so the addresses are left untouched;
# set one to True to run the experiment on anonymized addresses.
TRUNCATE_IP_ADDRESSES = False
BLACK_MARKER_IP_ADDRESSES = False

# /24 network address truncation
if TRUNCATE_IP_ADDRESSES:
    for ip_column in IP_COLUMNS:
        df_[ip_column] = df_[ip_column].apply(anonymize_ip)

# black-marker anonymization
if BLACK_MARKER_IP_ADDRESSES:
    for ip_column in IP_COLUMNS:
        # Every address becomes the same placeholder string.
        df_[ip_column] = "0"

# Source-address frequencies of the working frame. While the column is
# categorical, categories with no rows in the frame are listed with count 0.
df_['IPV4_SRC_ADDR'].value_counts()

192.168.1.30      1949
192.168.1.31      1881
192.168.1.36      1858
192.168.1.33      1398
192.168.1.184      801
                  ... 
173.194.28.105       0
173.245.59.167       0
184.84.165.128       0
184.84.165.131       0
91.189.95.3          0
Name: IPV4_SRC_ADDR, Length: 174, dtype: int64

In [77]:
# The target has to be the label column that is still present in `df_`:
# 'Attack' is dropped when `binary` is true, 'Label' is dropped otherwise.
target_column = 'Label' if binary else 'Attack'

# NOTE: the variable and experiment name say "binary" in both modes; they are
# left unchanged so later cells and logged runs keep working.
session_binary = setup(
    df_,
    target=target_column,
    experiment_name='binary',
    categorical_features=[
        'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT', 'L4_DST_PORT',
        'PROTOCOL', 'L7_PROTO', 'TCP_FLAGS',
    ],
)

Unnamed: 0,Description,Value
0,Session id,904
1,Target,Attack
2,Target type,Multiclass
3,Target mapping,"Benign: 0, backdoor: 1, ddos: 2, dos: 3, injection: 4, mitm: 5, password: 6, ransomware: 7, scanning: 8, xss: 9"
4,Original data shape,"(10000, 13)"
5,Transformed data shape,"(10000, 16)"
6,Transformed train set shape,"(6999, 16)"
7,Transformed test set shape,"(3001, 16)"
8,Numeric features,10
9,Categorical features,7


In [78]:
# Build the estimator registered under the id 'svm' in the session created by
# setup(), then open PyCaret's interactive plot selector for it.
# NOTE(review): the table stored with this cell lists many estimators
# (et, lda, gbc, ...), which does not look like single-model output - it may be
# left over from an earlier version of the cell; re-run to confirm.
svm = create_model('svm')
evaluate_model(svm)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.6935,0.0848,0.6935,0.588,0.6204,0.5693,0.61,0.097
lda,Linear Discriminant Analysis,0.6644,0.0889,0.6644,0.6824,0.6685,0.5673,0.5689,0.046
gbc,Gradient Boosting Classifier,0.6622,0.0866,0.6622,0.5474,0.5895,0.5253,0.5577,1.216
rf,Random Forest Classifier,0.6602,0.0824,0.6602,0.5663,0.5922,0.5231,0.5617,0.097
knn,K Neighbors Classifier,0.66,0.0807,0.66,0.6189,0.6278,0.5393,0.5491,0.051
ridge,Ridge Classifier,0.6395,0.0,0.6395,0.5231,0.5579,0.4905,0.5265,0.04
ada,Ada Boost Classifier,0.5494,0.0809,0.5494,0.3878,0.4244,0.3464,0.4508,0.088
dt,Decision Tree Classifier,0.4876,0.0669,0.4876,0.7122,0.4588,0.3927,0.4903,0.048
lr,Logistic Regression,0.4871,0.0694,0.4871,0.4139,0.3898,0.2617,0.3011,0.847
lightgbm,Light Gradient Boosting Machine,0.4355,0.0814,0.4355,0.3922,0.3689,0.2465,0.2846,0.137


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…