In [53]:
import pandas as pd
from pycaret.classification import *
from anonymizeip import anonymize_ip


# Quick-iteration switch: when True, a later cell replaces df_binary with a
# 50,000-row random sample instead of using every row.
sample = False 

In [3]:
# Load the NF-ToN-IoT-v2 CSV into `df`; the path is relative to the
# directory the notebook is run from.
df = pd.read_csv("NF-ToN-IoT-v2.csv")

In [4]:
# Show the column names of the loaded dataset.
df.columns

Index(['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT',
       'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS',
       'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS',
       'FLOW_DURATION_MILLISECONDS', 'DURATION_IN', 'DURATION_OUT', 'MIN_TTL',
       'MAX_TTL', 'LONGEST_FLOW_PKT', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN',
       'MAX_IP_PKT_LEN', 'SRC_TO_DST_SECOND_BYTES', 'DST_TO_SRC_SECOND_BYTES',
       'RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS',
       'RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS',
       'SRC_TO_DST_AVG_THROUGHPUT', 'DST_TO_SRC_AVG_THROUGHPUT',
       'NUM_PKTS_UP_TO_128_BYTES', 'NUM_PKTS_128_TO_256_BYTES',
       'NUM_PKTS_256_TO_512_BYTES', 'NUM_PKTS_512_TO_1024_BYTES',
       'NUM_PKTS_1024_TO_1514_BYTES', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT',
       'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
       'DNS_TTL_ANSWER', 'FTP_COMMAND_RET_CODE', 'Label', 'Attack'],
      dtype='object')

In [54]:
# Keep only the IPFIX-standardized features (plus the two label columns) and
# discard every other column of the raw dataset.
# See: https://www.ntop.org/guides/nprobe/cli_options.html#netflow-v9-ipfix-format-t
# Thesis: Refer to table 2 for feature names and descriptions

df_ipfix = df[[
    # flow endpoints
    'IPV4_SRC_ADDR', 'IPV4_DST_ADDR',
    'L4_SRC_PORT', 'L4_DST_PORT',
    # protocol identifiers
    'PROTOCOL', 'L7_PROTO',
    # volume counters
    'IN_BYTES', 'OUT_BYTES',
    'IN_PKTS', 'OUT_PKTS',
    # flags, timing and TTL
    'TCP_FLAGS',
    'FLOW_DURATION_MILLISECONDS',
    'MIN_TTL', 'MAX_TTL',
    # ICMP fields
    'ICMP_TYPE', 'ICMP_IPV4_TYPE',
    # label columns
    'Label', 'Attack',
]]

In [55]:
# Preview the first rows of the reduced frame.
df_ipfix.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,MIN_TTL,MAX_TTL,ICMP_TYPE,ICMP_IPV4_TYPE,Label,Attack
0,192.168.1.193,192.168.1.33,49235,4444,6,0.0,155392,34552,202,149,24,4294952,128,128,0,0,1,ransomware
1,192.168.1.193,192.168.1.152,49228,1880,6,0.0,1600,35741,40,65,24,4294952,128,128,0,0,0,Benign
2,192.168.1.152,192.168.1.193,0,0,1,0.0,212,0,2,0,0,0,64,64,771,3,0,Benign
3,192.168.1.169,239.255.255.250,65317,1900,17,0.0,165,0,1,0,0,0,0,0,0,0,0,Benign
4,192.168.1.79,192.168.1.255,60766,15600,17,0.0,63,0,1,0,0,0,0,0,0,0,0,Benign


In [56]:
# Derive one frame per task from the same feature set:
#   df_binary keeps 'Label' (the 'Attack' column is removed),
#   df_multi keeps 'Attack' (the 'Label' column is removed).
df_binary = df_ipfix.drop(columns=["Attack"])
df_multi = df_ipfix.drop(columns=["Label"])

# Samples

# Optional down-sampling for quick experiments: when the `sample` flag is set,
# replace df_binary with 50,000 randomly drawn rows.
# random_state is fixed so the same rows are drawn on every run; without it
# each execution of this cell would yield a different subset.
if sample:
    df_binary = df_binary.sample(n=50000, random_state=42)

## Feature Anonymization

### IP-address anonymization 

In [57]:
# IP-address anonymization strategy applied to both address columns of
# df_binary. One of:
#   None            - keep the original addresses (what the recorded run used)
#   "truncate"      - /24 network address truncation via anonymize_ip()
#   "black_marker"  - overwrite every address with the constant string "0"
ip_anonymization = None

ip_columns = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR']

if ip_anonymization == "truncate":
    # anonymize_ip() is called with its default arguments on every address.
    df_binary = df_binary.assign(
        **{col: df_binary[col].map(anonymize_ip) for col in ip_columns}
    )
elif ip_anonymization == "black_marker":
    # Every address becomes the same string, so the columns stay string-typed
    # for the categorical encoding done in setup().
    df_binary = df_binary.assign(**{col: "0" for col in ip_columns})
elif ip_anonymization is not None:
    # Fail loudly on a typo instead of silently skipping anonymization.
    raise ValueError(f"Unknown ip_anonymization strategy: {ip_anonymization!r}")

df_binary.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,MIN_TTL,MAX_TTL,ICMP_TYPE,ICMP_IPV4_TYPE,Label
0,192.168.1.193,192.168.1.33,49235,4444,6,0.0,155392,34552,202,149,24,4294952,128,128,0,0,1
1,192.168.1.193,192.168.1.152,49228,1880,6,0.0,1600,35741,40,65,24,4294952,128,128,0,0,0
2,192.168.1.152,192.168.1.193,0,0,1,0.0,212,0,2,0,0,0,64,64,771,3,0
3,192.168.1.169,239.255.255.250,65317,1900,17,0.0,165,0,1,0,0,0,0,0,0,0,0
4,192.168.1.79,192.168.1.255,60766,15600,17,0.0,63,0,1,0,0,0,0,0,0,0,0


In [58]:
# Configure the PyCaret binary-classification experiment on df_binary with
# 'Label' as the target. Addresses, ports, protocol identifiers, TCP flags and
# ICMP fields are declared categorical so they are not treated as numeric.
#
# session_id pins PyCaret's random seed so the experiment is repeatable; when
# it is omitted PyCaret draws a new seed on every run. 7490 is the id that was
# drawn for the recorded run whose setup summary is shown below this cell.
session_binary = setup(
    df_binary,
    target='Label',
    experiment_name='binary',
    categorical_features=[
        'IPV4_SRC_ADDR', 'IPV4_DST_ADDR',
        'L4_SRC_PORT', 'L4_DST_PORT',
        'PROTOCOL', 'L7_PROTO',
        'TCP_FLAGS',
        'ICMP_TYPE', 'ICMP_IPV4_TYPE',
    ],
    session_id=7490,
)

Unnamed: 0,Description,Value
0,Session id,7490
1,Target,Label
2,Target type,Binary
3,Original data shape,"(16940496, 17)"
4,Transformed data shape,"(16940496, 17)"
5,Transformed train set shape,"(11858347, 17)"
6,Transformed test set shape,"(5082149, 17)"
7,Numeric features,14
8,Categorical features,9
9,Preprocess,1


In [59]:
# Train a decision-tree classifier ('dt') in the active PyCaret session, then
# open PyCaret's evaluation view for the fitted model.
# NOTE(review): the recorded run of this cell was interrupted on the full
# dataset; setting the `sample` flag keeps iterations short.
dt = create_model(estimator='dt')
evaluate_model(estimator=dt)


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 