### Declaration of Functions

The section below presents all the functions used in the notebook.

In [1]:
# Function: Read the dataset and return a dataframe
import pandas as pd

def read_data(dataset, nrows=62500, label='MALICIOUS'):
    """Read a CSV dataset into a dataframe and log the import.

    Parameters
    ----------
    dataset : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    nrows : int or None, optional
        Maximum number of rows to read. Defaults to 62500, the value that
        was previously hard-coded; ``None`` reads the whole file.
    label : str, optional
        Tag printed between brackets in the log line. Defaults to
        ``'MALICIOUS'`` so existing calls print the same message as before.

    Returns
    -------
    pandas.DataFrame
        The rows read from ``dataset``.
    """
    df = pd.read_csv(dataset, nrows=nrows)
    print(f'[{label}] Dataset Imported With Success')
    return df

In [2]:
# Function: Fill empty cells with N value
def fill_empty_cells(df, fields, n, avg=False):
    """Fill the missing values of the given columns, modifying ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose columns are updated.
    fields : iterable of str
        Names of the columns to fill.
    n : scalar
        Replacement value used when ``avg`` is false; ignored otherwise.
    avg : bool, optional
        When true, each column is filled with its own mean instead of ``n``.

    Returns
    -------
    None
        The columns are reassigned on ``df``; one log line is printed per
        column with the value that was used.
    """
    for column in fields:
        # Assign the filled column back instead of calling
        # ``df[column].fillna(..., inplace=True)``: the chained in-place form
        # acts on a temporary object, is deprecated by pandas and does not
        # update ``df`` when Copy-on-Write is enabled.
        fill_value = df[column].mean() if avg else n
        df[column] = df[column].fillna(fill_value)
        print(f'[DONE] Empty Cells Filling ({column}): {fill_value}')

In [3]:
# Function: Delete unnecessary fields
def delete_fields(df, fields):
    """Drop the listed columns from ``df`` in place, logging each removal.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe that is modified in place.
    fields : iterable of str
        Names of the columns to drop, processed one at a time in order.

    Returns
    -------
    None
    """
    print('[INFO] Deleting unnecessary fields')
    for column in fields:
        # One drop per column so the log line follows each successful removal.
        df.drop(columns=column, inplace=True)
        print('[REMOVED] ' + column)
    print('[INFO] Unnecessary fields deleted with success')

### Merging Datasets

This section merges all the datasets found in a folder into a single dataset.

In [None]:
import pandas as pd
import glob

# Malicious datasets -- read_data loads at most 62,500 rows from each file.
# glob.glob returns its matches in arbitrary order, so the list is sorted to
# keep the row order of the merged dataset reproducible between runs.
malicious_datasets = sorted(glob.glob('../../../Datasets/IP-Based/Final Samples/*.csv'))

dataframes = []

for dataset in malicious_datasets:
    df = read_data(dataset)
    dataframes.append(df)

# Normal dataset -- at most 500,000 rows
df = pd.read_csv('../../../Datasets/IP-Based/Final Samples/Normal/Normal.csv', nrows=500000)
print('[NORMAL] Dataset Imported With Success')

dataframes.append(df)

# Stack every sample into one dataframe with a fresh 0..n-1 index
df = pd.concat(dataframes, ignore_index=True)

print(df.shape)

# Persist the merged dataset so the following cells can load it from disk
df.to_csv('../../../Datasets/IP-Based/IP-Based Dataset.csv', index=False)
print('[MERGED] Dataset Saved With Success')

### Loading Dataset

The upcoming step will load the desired dataset and read all the data from it, storing it in the variable `df`.

In [4]:
%%time

# Location of the merged dataset
DATASET = '../../../Datasets/IP-Based/IP-Based Dataset.csv'

# Load the whole CSV into the working dataframe `df`
df = pd.read_csv(filepath_or_buffer=DATASET)
print('[DONE] Dataset Imported With Success')

[DONE] Dataset Imported With Success
CPU times: user 4.6 s, sys: 392 ms, total: 4.99 s
Wall time: 4.99 s


In [5]:
# Array: Fields whose missing values are replaced by the column average
average = [
    'http.content_length', 'http.response.code', 'http.time', 'tcp.analysis.ack_rtt', 'tcp.analysis.bytes_in_flight',
    'tcp.analysis.initial_rtt', 'tcp.analysis.push_bytes_sent', 'tcp.flags', 'tcp.hdr_len', 'tcp.option_kind',
    'tcp.option_len', 'coap.mid', 'coap.payload_length', 'data.len', 'tcp.len', 'tcp.pdu.size', 'ip.ttl'
]

# Array: Fields whose missing values are replaced by -1
negative_one = [
    'http.chat', 'http.request', 'http.response', 'http.response_number', 'tcp.connection.fin', 'tcp.connection.syn',
    'tcp.connection.synack', 'tcp.flags.cwr', 'tcp.flags.ecn', 'tcp.flags.ns', 'tcp.flags.res', 'tcp.flags.urg',
    'ip.flags.mf', 'ip.flags.rb', 'ip.frag_offset', 'coap.opt.end_marker', 'tcp.connection.rst', 'mqtt.dupflag',
    'mqtt.retain', 'mqtt.unknown_version', 'coap.code_3.0', 'coap.code_68.0', 'coap.code_0', 'coap.opt.delta_3.0',
    'coap.opt.delta_12.0', 'coap.opt.delta_0', 'coap.opt.length_0.0', 'coap.opt.length_12.0', 'coap.opt.length_1',
    'coap.type_0.0', 'coap.type_2.0', 'coap.type_1', 'ip.flags_0', 'ip.flags_64', 'eth.dst.ig', 'eth.src.ig',
    'eth.src.lg', 'eth.src_not_group', 'arp.isannouncement'
]

# Array: Fields whose missing values are replaced by +2
positive_two = [
    'tcp.flags.ack', 'tcp.flags.fin', 'tcp.flags.push', 'tcp.flags.reset', 'tcp.flags.syn', 'tcp.urgent_pointer',
    'ip.flags.df', 'mqtt.qos'
]

# The sentinels are passed as numbers rather than the strings '-1' / '2':
# a string fill value turns a numeric column into a mixed-type object column
# that has to be cast back to float before any arithmetic can be done on it.
fill_empty_cells(df, average, None, True)   # Fill: Average
fill_empty_cells(df, negative_one, -1)      # Fill: -1
fill_empty_cells(df, positive_two, 2)       # Fill: +2

[DONE] Empty Cells Filling (http.content_length): 0.05757309064384904
[DONE] Empty Cells Filling (http.response.code): 0.031394018226409796
[DONE] Empty Cells Filling (http.time): 0.009820563285673285
[DONE] Empty Cells Filling (tcp.analysis.ack_rtt): 0.0010524898079999998
[DONE] Empty Cells Filling (tcp.analysis.bytes_in_flight): 0.07665542563914451
[DONE] Empty Cells Filling (tcp.analysis.initial_rtt): 0.0005186277131428573
[DONE] Empty Cells Filling (tcp.analysis.push_bytes_sent): 0.0733303429345606
[DONE] Empty Cells Filling (tcp.flags): -0.009500820646586599
[DONE] Empty Cells Filling (tcp.hdr_len): 0.01946208281654316
[DONE] Empty Cells Filling (tcp.option_kind): 0.8966217142857142
[DONE] Empty Cells Filling (tcp.option_len): 4.952188571428572
[DONE] Empty Cells Filling (coap.mid): -0.0010966947198726743
[DONE] Empty Cells Filling (coap.payload_length): -0.0010748528121440292
[DONE] Empty Cells Filling (data.len): 0.03238968212580538
[DONE] Empty Cells Filling (tcp.len): -0.03293

In [6]:
# Z-Score normalisation
from scipy.stats import zscore
# Cast each selected column to float, then replace it with its z-scores
for zscore_column in ('tcp.urgent_pointer', 'coap.opt.end_marker'):
    df[zscore_column] = zscore(df[zscore_column].astype(float))

print('[INFO] Z-Score normalisation')

[INFO] Z-Score normalisation


In [7]:
# Columns removed from the dataframe to ensure a more reliable result.
# They are dropped one by one, in the order listed here.
fdelete = [
    'ip.flags_64', 'eth.lg', 'arp.isgratuitous',
    'arp.opcode', 'coap.opt.delta_12.0', 'coap.payload_length',
    'coap.code_0', 'coap.type_1', 'tcp.option_len',
    'coap.opt.length_0.0', 'coap.opt.length_12.0', 'coap.opt.length_1',
    'mqtt.retain', 'coap.opt.delta_0', 'coap.mid',
    'data.len', 'ip.flags_0', 'tcp.hdr_len',
    'tcp.flags.push', 'mqtt.dupflag', 'frame.len',
    'ip.flags.df', 'ip.flags.mf', 'ip.ttl',
    'tcp.flags', 'mqtt.qos', 'tcp.option_kind',
    'coap.opt.delta_3.0', 'coap.code_68.0', 'coap.type_0.0',
    'coap.code_3.0', 'coap.type_2.0', 'tcp.connection.rst',
    'coap.opt.end_marker', 'mqtt.unknown_version', 'http.chat',
    'http.response', 'tcp.len', 'tcp.pdu.size',
    'ip.flags.rb', 'tcp.flags.ack', 'tcp.flags.reset',
    'frame.cap_len', 'tcp.analysis.bytes_in_flight', 'tcp.analysis.push_bytes_sent',
    'tcp.analysis.ack_rtt',
]

# Note: 'tcp.analysis.initial_rtt' is not in the list above, so it is kept.

delete_fields(df, fdelete)

[INFO] Deleting unnecessary fields
[REMOVED] ip.flags_64
[REMOVED] eth.lg
[REMOVED] arp.isgratuitous
[REMOVED] arp.opcode
[REMOVED] coap.opt.delta_12.0
[REMOVED] coap.payload_length
[REMOVED] coap.code_0
[REMOVED] coap.type_1
[REMOVED] tcp.option_len
[REMOVED] coap.opt.length_0.0
[REMOVED] coap.opt.length_12.0
[REMOVED] coap.opt.length_1
[REMOVED] mqtt.retain
[REMOVED] coap.opt.delta_0
[REMOVED] coap.mid
[REMOVED] data.len
[REMOVED] ip.flags_0
[REMOVED] tcp.hdr_len
[REMOVED] tcp.flags.push
[REMOVED] mqtt.dupflag
[REMOVED] frame.len
[REMOVED] ip.flags.df
[REMOVED] ip.flags.mf
[REMOVED] ip.ttl
[REMOVED] tcp.flags
[REMOVED] mqtt.qos
[REMOVED] tcp.option_kind
[REMOVED] coap.opt.delta_3.0
[REMOVED] coap.code_68.0
[REMOVED] coap.type_0.0
[REMOVED] coap.code_3.0
[REMOVED] coap.type_2.0
[REMOVED] tcp.connection.rst
[REMOVED] coap.opt.end_marker
[REMOVED] mqtt.unknown_version
[REMOVED] http.chat
[REMOVED] http.response
[REMOVED] tcp.len
[REMOVED] tcp.pdu.size
[REMOVED] ip.flags.rb
[REMOVED] tcp

In [8]:
# Remove, in place, every column that still holds at least one NaN value
df.dropna(axis='columns', how='any', inplace=True)
print('[INFO] Drop NaN')

[INFO] Drop NaN


In [9]:
# Cast every remaining column to float, one column at a time
for column_name in df.columns:
    df[column_name] = df[column_name].astype(float)
print('[DONE] Convertion to float done with success')

[DONE] Convertion to float done with success


In [10]:
%%time

# Destination of the dataset once pre-processing and data encoding are done
DATASET_ALTER = '../../../Datasets/IP-Based/IP-Based Alter Dataset.csv'

# Write the dataframe without its index column
df.to_csv(path_or_buf=DATASET_ALTER, index=False)
print('[DONE] Dataset with pre-processing saved with success')

[DONE] Dataset with pre-processing saved with success
CPU times: user 13.4 s, sys: 233 ms, total: 13.7 s
Wall time: 13.9 s


### Dataset Classification & Test/Train Splitting

The sections below cover the following themes: dataset classification, splitting into training, validation and test fractions, and saving each fraction into a separate file.

In [14]:
import numpy as np

# Build the numpy feature matrix `x` and the one-hot encoded target `y`

# CLASSIFICATION: 0 - Binary / 1 - Multiclass
CLASSIFICATION = 1

# Binary classification uses 'is_malicious' as target column,
# multiclass classification uses 'attack_type'.
_target_column = 'attack_type' if CLASSIFICATION else 'is_malicious'

# Both label columns are excluded from the features, whatever the mode
x_columns = df.columns.drop(['attack_type', 'is_malicious'])
dummies = pd.get_dummies(df[_target_column])

x = df[x_columns].values
attack = dummies.columns   # class labels, one per column of `y`
y = dummies.values

print('[DONE] Numpy Classification')

[DONE] Numpy Classification


In [15]:
from sklearn.model_selection import train_test_split

# First split: hold out 25% of all samples as the test fraction
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)

# Second split: take 25% of the remaining samples as the validation fraction.
# Overall proportions: 56.25% training, 18.75% validation, 25% testing.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=10)
print('[DONE] Training validation splitting')

[DONE] Training validation splitting


In [16]:
import os
import pickle as pkl

# Folder that receives the pickled splits. It is created when missing so the
# open(..., 'wb') calls below do not fail on a fresh checkout.
_output_dir = 'Saved Datasets'
os.makedirs(_output_dir, exist_ok=True)

# The file suffix is the only difference between the two classification modes
_suffix = 'Multiclass' if CLASSIFICATION else 'Binary'

# (log tag, file name stem, pickled payload), saved in this order
_splits = [
    ('VALIDATION', 'Validation', [x_val, y_val]),
    ('TRAINING', 'Training', [x_train, y_train]),
    ('TESTING', 'Testing', [x_test, y_test]),
    ('GENERAL', 'Dataset', [x, y]),
]

for _tag, _stem, _payload in _splits:
    # Each file holds a two-element list: [features, one-hot targets]
    with open(f'{_output_dir}/{_stem}_{_suffix}.pkl', 'wb') as f:
        pkl.dump(_payload, f)
    print(f'[{_tag}] Dataset Saved With Success')

[VALIDATION] Dataset Saved With Success
[TRAINING] Dataset Saved With Success
[TESTING] Dataset Saved With Success
[GENERAL] Dataset Saved With Success
