In [1]:
import pandas as pd
from enum import Enum     # for enum34, or the stdlib version
from pycaret.classification import *
from anonymizeip import anonymize_ip

class IP_techniques(Enum):
    bm = 1 #  black-marker
    tr = 2 # truncation
    no = 3 # Not anonymized
    
sample = 1  
binary = 1 

ip_technique = IP_techniques.no

In [2]:
from typing import List

def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    floats = df.select_dtypes(include=['float64']).columns.tolist()
    df[floats] = df[floats].apply(pd.to_numeric, downcast='float')
    return df


def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    ints = df.select_dtypes(include=['int64']).columns.tolist()
    df[ints] = df[ints].apply(pd.to_numeric, downcast='integer')
    return df


def optimize_objects(df: pd.DataFrame, datetime_features: List[str]) -> pd.DataFrame:
    for col in df.select_dtypes(include=['object']):
        if col not in datetime_features:
            if not (type(df[col][0])==list):
                num_unique_values = len(df[col].unique())
                num_total_values = len(df[col])
                if float(num_unique_values) / num_total_values < 0.5:
                    df[col] = df[col].astype('category')
        else:
            df[col] = pd.to_datetime(df[col])
    return df



def optimize(df: pd.DataFrame, datetime_features: List[str] = []):
    return optimize_floats(optimize_ints(optimize_objects(df, datetime_features)))

In [3]:
df = pd.read_csv("NF-ToN-IoT-v2.csv")

## Checks

In [4]:
df.columns

Index(['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT',
       'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS',
       'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS',
       'FLOW_DURATION_MILLISECONDS', 'DURATION_IN', 'DURATION_OUT', 'MIN_TTL',
       'MAX_TTL', 'LONGEST_FLOW_PKT', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN',
       'MAX_IP_PKT_LEN', 'SRC_TO_DST_SECOND_BYTES', 'DST_TO_SRC_SECOND_BYTES',
       'RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS',
       'RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS',
       'SRC_TO_DST_AVG_THROUGHPUT', 'DST_TO_SRC_AVG_THROUGHPUT',
       'NUM_PKTS_UP_TO_128_BYTES', 'NUM_PKTS_128_TO_256_BYTES',
       'NUM_PKTS_256_TO_512_BYTES', 'NUM_PKTS_512_TO_1024_BYTES',
       'NUM_PKTS_1024_TO_1514_BYTES', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT',
       'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
       'DNS_TTL_ANSWER', 'FTP_COMMAND_RET_CODE', 'Label', 'Attack'],
      dtype='object')

In [5]:
df['IPV4_SRC_ADDR'].value_counts()

192.168.1.31      4308656
192.168.1.32      3614669
192.168.1.30      3462837
192.168.1.35      1254234
192.168.1.39       983745
                   ...   
27.150.244.184          1
163.172.235.3           1
97.86.192.166           1
248.42.224.200          1
184.25.57.139           1
Name: IPV4_SRC_ADDR, Length: 23079, dtype: int64

In [6]:
df['IPV4_DST_ADDR'].value_counts()

192.168.1.195      2118567
192.168.1.1        2012390
192.168.1.190      1805130
192.168.1.184      1653333
192.168.1.180      1288737
                    ...   
189.209.90.89            1
150.238.203.200          1
23.196.161.55            1
111.31.243.236           1
205.129.11.118           1
Name: IPV4_DST_ADDR, Length: 6868, dtype: int64

## =================================================================

In [7]:
# Remove non-ipfix standardized features
# See: https://www.ntop.org/guides/nprobe/cli_options.html#netflow-v9-ipfix-format-t
# Thesis: Refer to table 2 for feature names and descriptions

df_ipfix = df[['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'MIN_TTL', 'MAX_TTL', 'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'Label', 'Attack']]
df_ipfix.dtypes
#df_ipfix['IPV4_SRC_ADDR'] = df_ipfix['IPV4_SRC_ADDR'].astype('category')
#df_ipfix['IPV4_DST_ADDR'] = df_ipfix['IPV4_DST_ADDR'].astype('category')

df_ipfix = optimize(df_ipfix, [])
df_ipfix.memory_usage()

Index                              128
L4_SRC_PORT                   67761984
L4_DST_PORT                   67761984
PROTOCOL                      16940496
IN_BYTES                      67761984
OUT_BYTES                     67761984
IN_PKTS                       67761984
OUT_PKTS                      67761984
TCP_FLAGS                     33880992
FLOW_DURATION_MILLISECONDS    67761984
MIN_TTL                       33880992
MAX_TTL                       33880992
ICMP_TYPE                     67761984
ICMP_IPV4_TYPE                33880992
Label                         16940496
Attack                        16940876
dtype: int64

In [8]:
df_ipfix.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,MIN_TTL,MAX_TTL,ICMP_TYPE,ICMP_IPV4_TYPE,Label,Attack
0,49235,4444,6,155392,34552,202,149,24,4294952,128,128,0,0,1,ransomware
1,49228,1880,6,1600,35741,40,65,24,4294952,128,128,0,0,0,Benign
2,0,0,1,212,0,2,0,0,0,64,64,771,3,0,Benign
3,65317,1900,17,165,0,1,0,0,0,0,0,0,0,0,Benign
4,60766,15600,17,63,0,1,0,0,0,0,0,0,0,0,Benign


In [18]:
if(binary):
    df_ = df_ipfix.drop("Attack", axis=1)
else:
    df_ = df_ipfix.drop("Label", axis=1)

# Samples
if(sample):
    df_ = df_.sample(n=1000)

## Feature Anonymization

### IP-address anonymization 

In [19]:
#pd.set_option("display.max_rows", None)

# /24 network address truncation

'''
if(ip_technique == IP_techniques.tr):
    df_['IPV4_DST_ADDR'] = df_['IPV4_DST_ADDR'].apply(lambda x: anonymize_ip(x))
    df_['IPV4_SRC_ADDR'] = df_['IPV4_SRC_ADDR'].apply(lambda x: anonymize_ip(x))
elif(ip_technique == IP_techniques.bm):
    df_['IPV4_SRC_ADDR'] = 0
    df_['IPV4_DST_ADDR'] = 0
    df_['IPV4_SRC_ADDR'] = df_['IPV4_SRC_ADDR'].astype('str') # fix fill_value'=constant is invalid error  
    df_['IPV4_DST_ADDR'] = df_['IPV4_DST_ADDR'].astype('str')
df_.head()
df_['IPV4_SRC_ADDR'].value_counts()
'''

"\nif(ip_technique == IP_techniques.tr):\n    df_['IPV4_DST_ADDR'] = df_['IPV4_DST_ADDR'].apply(lambda x: anonymize_ip(x))\n    df_['IPV4_SRC_ADDR'] = df_['IPV4_SRC_ADDR'].apply(lambda x: anonymize_ip(x))\nelif(ip_technique == IP_techniques.bm):\n    df_['IPV4_SRC_ADDR'] = 0\n    df_['IPV4_DST_ADDR'] = 0\n    df_['IPV4_SRC_ADDR'] = df_['IPV4_SRC_ADDR'].astype('str') # fix fill_value'=constant is invalid error  \n    df_['IPV4_DST_ADDR'] = df_['IPV4_DST_ADDR'].astype('str')\ndf_.head()\ndf_['IPV4_SRC_ADDR'].value_counts()\n"

In [20]:
df_['ICMP_TYPE'] = df_['ICMP_TYPE'].astype(str)
df_['ICMP_IPV4_TYPE'] = df_['ICMP_IPV4_TYPE'].astype(str)
if(binary):
    session_binary = setup(df_, target = 'Label', experiment_name='binary', categorical_features = ['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'TCP_FLAGS', 'ICMP_TYPE', 'ICMP_IPV4_TYPE'])
else:
    session_multi = setup(df_, target = 'Attack', experiment_name='multiclass', categorical_features = ['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'TCP_FLAGS',  'ICMP_TYPE', 'ICMP_IPV4_TYPE'])


Unnamed: 0,Description,Value
0,Session id,8282
1,Target,Label
2,Target type,Binary
3,Original data shape,"(1000, 14)"
4,Transformed data shape,"(1000, 15)"
5,Transformed train set shape,"(699, 15)"
6,Transformed test set shape,"(301, 15)"
7,Numeric features,11
8,Categorical features,6
9,Preprocess,1


In [31]:
dt = create_model('dt')
evaluate_model(dt)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6429,0.5,1.0,0.6429,0.7826,0.0,0.0
1,0.6429,0.5,1.0,0.6429,0.7826,0.0,0.0
2,0.6429,0.5,1.0,0.6429,0.7826,0.0,0.0
3,0.6429,0.5,1.0,0.6429,0.7826,0.0,0.0
4,0.6429,0.5,1.0,0.6429,0.7826,0.0,0.0
5,0.6571,0.5,1.0,0.6571,0.7931,0.0,0.0
6,0.6571,0.5,1.0,0.6571,0.7931,0.0,0.0
7,0.6571,0.5,1.0,0.6571,0.7931,0.0,0.0
8,0.6571,0.5,1.0,0.6571,0.7931,0.0,0.0
9,0.6522,0.5,1.0,0.6522,0.7895,0.0,0.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [32]:
results = pull()
print("--------------------------------")
print("Decision Tree - Results:")
print(results)
print("--------------------------------")

Decision Tree - Results:
      Accuracy  AUC  Recall   Prec.      F1  Kappa  MCC
Fold                                                   
0       0.6429  0.5     1.0  0.6429  0.7826    0.0  0.0
1       0.6429  0.5     1.0  0.6429  0.7826    0.0  0.0
2       0.6429  0.5     1.0  0.6429  0.7826    0.0  0.0
3       0.6429  0.5     1.0  0.6429  0.7826    0.0  0.0
4       0.6429  0.5     1.0  0.6429  0.7826    0.0  0.0
5       0.6571  0.5     1.0  0.6571  0.7931    0.0  0.0
6       0.6571  0.5     1.0  0.6571  0.7931    0.0  0.0
7       0.6571  0.5     1.0  0.6571  0.7931    0.0  0.0
8       0.6571  0.5     1.0  0.6571  0.7931    0.0  0.0
9       0.6522  0.5     1.0  0.6522  0.7895    0.0  0.0
Mean    0.6495  0.5     1.0  0.6495  0.7875    0.0  0.0
Std     0.0068  0.0     0.0  0.0068  0.0050    0.0  0.0


In [33]:

rf = create_model('rf')
evaluate_model(rf)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6429,0.8813,1.0,0.6429,0.7826,0.0,0.0
1,0.6429,0.8524,1.0,0.6429,0.7826,0.0,0.0
2,0.6429,0.8982,1.0,0.6429,0.7826,0.0,0.0
3,0.6429,0.8573,1.0,0.6429,0.7826,0.0,0.0
4,0.6429,0.7244,1.0,0.6429,0.7826,0.0,0.0
5,0.6571,0.8632,1.0,0.6571,0.7931,0.0,0.0
6,0.6571,0.7591,1.0,0.6571,0.7931,0.0,0.0
7,0.6571,0.933,1.0,0.6571,0.7931,0.0,0.0
8,0.6571,0.8986,1.0,0.6571,0.7931,0.0,0.0
9,0.6522,0.8532,1.0,0.6522,0.7895,0.0,0.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [36]:
print("--------------------------------")
results = pull()
print("Random Forest - Results:")
print(results)
print("--------------------------------")


--------------------------------
Random Forest - Results:
      Accuracy     AUC  Recall   Prec.      F1  Kappa  MCC
Fold                                                      
0       0.6429  0.8813     1.0  0.6429  0.7826    0.0  0.0
1       0.6429  0.8524     1.0  0.6429  0.7826    0.0  0.0
2       0.6429  0.8982     1.0  0.6429  0.7826    0.0  0.0
3       0.6429  0.8573     1.0  0.6429  0.7826    0.0  0.0
4       0.6429  0.7244     1.0  0.6429  0.7826    0.0  0.0
5       0.6571  0.8632     1.0  0.6571  0.7931    0.0  0.0
6       0.6571  0.7591     1.0  0.6571  0.7931    0.0  0.0
7       0.6571  0.9330     1.0  0.6571  0.7931    0.0  0.0
8       0.6571  0.8986     1.0  0.6571  0.7931    0.0  0.0
9       0.6522  0.8532     1.0  0.6522  0.7895    0.0  0.0
Mean    0.6495  0.8521     1.0  0.6495  0.7875    0.0  0.0
Std     0.0068  0.0607     0.0  0.0068  0.0050    0.0  0.0
--------------------------------
