### Declaration of Functions

The section below presents all the functions used in the notebook.

In [1]:
# Function: Read the dataset and return a dataframe
import pandas as pd

def read_data(dataset, chunksize = 500000):
    """Read a CSV dataset chunk by chunk and return it as one dataframe.

    Parameters
    ----------
    dataset : str, path object or file-like object
        Source handed to ``pandas.read_csv``.
    chunksize : int, optional
        Number of rows read per chunk. Defaults to 500000, the value that
        used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in the order they were read.
    """
    # Materialise every chunk first, then concatenate once
    list_of_dataframes = list(pd.read_csv(dataset, chunksize = chunksize))

    df = pd.concat(list_of_dataframes)
    print('[DONE] Dataset imported with success')
    return df

In [12]:
# Function: Delete unnecessary fields
def delete_fields(df, fields):
    """Drop the given columns from ``df`` in place, logging each removal.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to modify; it is mutated in place.
    fields : iterable
        Column labels to drop, processed one at a time in the given order.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        Raised by ``DataFrame.drop`` when a label is not a column of ``df``.
        Columns dropped before the failing label stay removed.
    """
    print('[INFO] Deleting unnecessary fields \n')
    for field in fields:
        df.drop(field, axis = 1, inplace = True)
        # Format the label instead of concatenating it: concatenation raised
        # TypeError for non-string labels after the column was already dropped.
        print(f'[REMOVED] {field}')
    print('\n[INFO] Unnecessary fields deleted with success')

In [3]:
# Function: Z-Score normalization
from scipy.stats import zscore

def zscore_normalization(df, fields):
    """Replace each listed column of ``df`` with its z-score, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose columns are overwritten.
    fields : iterable
        Labels of the columns to normalize.

    Returns
    -------
    None
    """
    for column in fields:
        # scipy.stats.zscore is called with its default arguments
        normalized = zscore(df[column])
        df[column] = normalized
    print('[DONE] Z-Score Normalization')

In [4]:
# Function: Fill empty cells with N value
def fill_empty_cells(df, fields, n):
    """Fill the missing values of the listed columns with ``n``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose columns are overwritten.
    fields : iterable
        Labels of the columns to fill.
    n : scalar
        Replacement for missing values. Any value accepted by
        ``Series.fillna`` works, including numbers.

    Returns
    -------
    None
    """
    for field in fields:
        df[field] = df[field].fillna(n)
    # Format the value instead of concatenating it: concatenation raised
    # TypeError for non-string fill values after the columns were modified.
    print(f'[DONE] Empty Cells Filling: {n}')

In [5]:
# Function: Replace old value with a new one
def value_replacing(df, field, old_value, new_value):
    """Replace ``old_value`` with ``new_value`` in one column of ``df``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose column is overwritten.
    field : label
        Column to operate on.
    old_value : scalar
        Value to look for, handed to ``Series.replace``.
    new_value : scalar
        Value written in place of ``old_value``.

    Returns
    -------
    None
    """
    df[field] = df[field].replace(old_value, new_value)
    # Format the values instead of concatenating them: concatenation raised
    # TypeError for non-string values after the column was already replaced.
    print(f'[DONE] Value Replacing: {old_value} -- {new_value}')

In [6]:
# Function: Convert fields to dummy variables
def dummy_encode(df, fields):
    """Return a dataframe where each listed column is replaced by dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe.
    fields : iterable
        Labels of the columns to encode. Each one is removed and its
        indicator columns (prefixed with the column label, float dtype)
        are appended at the end.

    Returns
    -------
    pandas.DataFrame
        The encoded dataframe.
    """
    for column in fields:
        indicators = pd.get_dummies(df[column], prefix = column, dtype = float)
        remaining = df.drop(column, axis = 1)
        df = pd.concat([remaining, indicators], axis = 1)
    print('[DONE] Categorical Values (Dummies)')
    return df

In [7]:
# Function: Convert hexadecimal to int
def hexadecimal_convert(df, fields):
    """Convert the listed columns from hexadecimal text to integers, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose columns are overwritten.
    fields : iterable
        Labels of the columns to convert. Each value is cast to ``str`` and
        then parsed with ``int(value, 16)``.

    Returns
    -------
    None
    """
    for column in fields:
        # Step 1: make sure every cell is text
        df[column] = df[column].astype(str)
        # Step 2: parse the text as a base-16 number
        df[column] = df[column].apply(lambda text: int(text, base = 16))
    print('[DONE] Integer Convert (Hex - Int)')

### Declaration of Global Variables

The section below presents all the global variables used in the notebook.

In [8]:
# Relative path of the raw CSV dataset that is loaded into `df`
DATASET = '../../../Datasets/IP-Based/IPBased_Benign_Dataset.csv'
# Relative path of the CSV file the pre-processed dataframe is saved to
DATASET_PRE_PROCESSING = '../../../Datasets/IP-Based/IPBased_Benign_Dataset_PreProcessing.csv'

### Loading Dataset
The upcoming step will load the desired dataset and read all the data from it, storing it in the variable `df`.

The code below produces the following warning: **"DtypeWarning: Columns (68, 79, 83) have mixed types"**. Columns 68, 79 and 83 correspond to the fields <mark>tcp.options.mss</mark>, <mark>tcp.reassembled.data</mark>, and <mark>tcp.segment_data</mark> respectively. Since these columns are deleted later in this notebook, the warning can be safely ignored.

In [9]:
# Load the raw dataset located at DATASET into the working dataframe `df`
df = read_data(dataset = DATASET)

  for df in pd.read_csv(dataset, chunksize = chunksize):
  for df in pd.read_csv(dataset, chunksize = chunksize):
  for df in pd.read_csv(dataset, chunksize = chunksize):


[DONE] Dataset imported with success


### Data Cleansing

This section covers all the data cleaning done in the dataset.

In [10]:
# Collect the columns in which every single value is null
empty_columns = [name for name in df.columns if df[name].isnull().all()]

# Drop each completely empty column and report it
print('-- Completely Empty Columns --')
for column in empty_columns:
    df.drop(column, axis = 1, inplace = True)
    print('[REMOVED] Collum: ' + column)

-- Completely Empty Columns --


In [11]:
# Array: Fields to delete (IDs, Timestamps, Checksums, etc...)
# The columns are dropped from `df` in place, one at a time, in the order listed here.
# Every name must still be a column of `df`: DataFrame.drop raises KeyError for a missing
# label, so re-running this cell on the same `df` after it has succeeded fails.
fieldsToDelete = [
    'coap.opt.ctype', 'coap.opt.desc', 'coap.opt.name', 'coap.opt.uri_host', 'coap.opt.uri_path',
    'coap.opt.uri_path_recon', 'coap.payload', 'coap.payload_desc', 'coap.token_len', 'data.data', 'udp.checksum',
    'udp.checksum.status', 'udp.dstport', 'udp.length', 'udp.payload', 'udp.port', 'udp.possible_traceroute',
    'udp.srcport', 'udp.time_delta', 'udp.stream', 'frame.encap_type', 'frame.ignored', 'frame.interface_id',
    'frame.interface_name', 'frame.marked', 'frame.number', 'frame.offset_shift', 'frame.protocols', 'frame.time',
    'frame.time_delta', 'frame.time_delta_displayed', 'frame.time_epoch', 'frame.time_relative', 'mqtt.hdr_reserved',
    'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg', 'mqtt.msgtype', 'mqtt.topic', 'mqtt.topic_len', 'ip.addr',
    'ip.checksum', 'ip.checksum.status', 'ip.dsfield', 'ip.dsfield.dscp', 'ip.dsfield.ecn', 'ip.dst', 'ip.dst_host',
    'ip.hdr_len', 'ip.host', 'ip.id', 'ip.len', 'ip.proto', 'ip.src', 'ip.src_host', 'ip.ttl.lncb',
    'tcp.window_size', 'tcp.time_relative', 'tcp.time_delta', 'tcp.stream', 'tcp.srcport', 'tcp.seq_raw', 'tcp.seq',
    'tcp.segments', 'tcp.segment_data', 'tcp.segment.count', 'tcp.segment', 'tcp.reassembled.length',
    'tcp.reassembled.data', 'tcp.port', 'tcp.payload', 'tcp.options.wscale.shift', 'tcp.options.wscale.multiplier',
    'tcp.options.wscale', 'tcp.options.timestamp.tsval', 'tcp.options.timestamp.tsecr', 'tcp.options.sack_perm',
    'tcp.options.mss_val', 'tcp.options.mss', 'tcp.options', 'tcp.nxtseq', 'tcp.dstport', 'tcp.checksum',
    'tcp.checksum.status', 'tcp.flags.str', 'tcp.analysis.window_full', 'tcp.analysis.reused_ports',
    'tcp.analysis.lost_segment', 'tcp.analysis.keep_alive_ack', 'tcp.analysis.keep_alive', 'tcp.analysis.flags',
    'tcp.analysis.acks_frame', 'tcp.analysis.ack_lost_segment', 'tcp.analysis', 'tcp.ack_raw', 'tcp.ack'
]

delete_fields(df, fieldsToDelete)

[INFO] Deleting unnecessary fields 

[REMOVED] coap.opt.ctype
[REMOVED] coap.opt.desc
[REMOVED] coap.opt.name
[REMOVED] coap.opt.uri_host
[REMOVED] coap.opt.uri_path
[REMOVED] coap.opt.uri_path_recon
[REMOVED] coap.payload
[REMOVED] coap.payload_desc
[REMOVED] coap.token_len
[REMOVED] data.data
[REMOVED] udp.checksum
[REMOVED] udp.checksum.status
[REMOVED] udp.dstport
[REMOVED] udp.length
[REMOVED] udp.payload
[REMOVED] udp.port
[REMOVED] udp.possible_traceroute
[REMOVED] udp.srcport
[REMOVED] udp.time_delta
[REMOVED] udp.stream
[REMOVED] frame.encap_type
[REMOVED] frame.ignored
[REMOVED] frame.interface_id
[REMOVED] frame.interface_name
[REMOVED] frame.marked
[REMOVED] frame.number
[REMOVED] frame.offset_shift
[REMOVED] frame.protocols
[REMOVED] frame.time
[REMOVED] frame.time_delta
[REMOVED] frame.time_delta_displayed
[REMOVED] frame.time_epoch
[REMOVED] frame.time_relative
[REMOVED] mqtt.hdr_reserved
[REMOVED] mqtt.hdrflags
[REMOVED] mqtt.len
[REMOVED] mqtt.msg
[REMOVED] mqtt.msgtyp

### Pre-Processing & Data Encoding

This section covers all the pre-processing and data encoding done in the dataset.

In [13]:
# Hexadecimal flag fields whose empty cells are filled with 0x00000000,
# so that every cell holds a hexadecimal string before the integer conversion
hexadecimalFill = ['ip.flags', 'tcp.flags']

fill_empty_cells(df, hexadecimalFill, n = '0x00000000')

[DONE] Empty Cells Filling: 0x00000000


In [14]:
# Flag fields stored as hexadecimal text that are parsed into integers
hexadecimalFields = ['ip.flags', 'tcp.flags']

hexadecimal_convert(df, fields = hexadecimalFields)

[DONE] Integer Convert (Hex - Int)


In [15]:
# Array: Fields to fill with 0
fill_0 = [
    'coap.code', 'coap.mid', 'coap.opt.delta', 'coap.opt.end_marker', 'coap.payload_length', 'data.len',
    'mqtt.unknown_version', 'tcp.pdu.size', 'tcp.option_len', 'tcp.option_kind', 'tcp.hdr_len', 'tcp.connection.rst',
    'tcp.connection.syn', 'tcp.connection.synack', 'tcp.connection.fin', 'tcp.analysis.bytes_in_flight',
    'tcp.analysis.push_bytes_sent', 'tcp.analysis.ack_rtt', 'tcp.analysis.initial_rtt'
]

# Array: Fields to fill with 1
fill_positive_1 = [
    'coap.opt.length', 'coap.type', 'mqtt.dupflag', 'mqtt.retain', 'ip.flags.mf', 'ip.flags.rb', 'ip.frag_offset',
    'tcp.urgent_pointer', 'tcp.flags.urg', 'tcp.flags.ns', 'tcp.flags.ecn', 'tcp.flags.cwr'
]

# Array: Fields to fill with -1
fill_negative_1 = [
    'mqtt.qos', 'ip.flags.df', 'ip.ttl', 'tcp.flags.syn', 'tcp.flags.reset', 'tcp.flags.push', 'tcp.flags.fin',
    'tcp.flags.ack', 
]

# Only the missing cells of each listed column are filled; existing values are kept.
# NOTE(review): the fill values are passed as strings ('0', '1', '-1'), not numbers.
# If read_csv parsed these columns as numeric (their dtypes are not visible here), the
# filled cells are strings next to numeric values in the same column — TODO confirm
# this is intended, especially for the columns that are dummy-encoded later.
fill_empty_cells(df, fill_0, '0')            # Fill: 0
fill_empty_cells(df, fill_positive_1, '1')   # Fill: 1
fill_empty_cells(df, fill_negative_1, '-1')  # Fill: -1

[DONE] Empty Cells Filling: 0
[DONE] Empty Cells Filling: 1
[DONE] Empty Cells Filling: -1


In [16]:
# Value replacing
# Each call rewrites one value of one column; calls on the same column run in order,
# so 'tcp.pdu.size' maps 2 -> 1 and then 6 -> 2, and 'tcp.option_len' maps 4 -> 1 and then 10 -> 2.
# NOTE(review): the old values are passed as strings. If these columns hold numeric
# values (dtypes are not visible here), the string '255' does not equal the number 255
# and nothing is replaced — TODO verify the replacements actually take effect.
value_replacing(df, 'coap.opt.end_marker', '255', '0')
value_replacing(df, 'tcp.pdu.size', '2', '1')
value_replacing(df, 'tcp.pdu.size', '6', '2')
value_replacing(df, 'tcp.option_len', '4', '1')
value_replacing(df, 'tcp.option_len', '10', '2')

[DONE] Value Replacing: 255 -- 0
[DONE] Value Replacing: 2 -- 1
[DONE] Value Replacing: 6 -- 2
[DONE] Value Replacing: 4 -- 1
[DONE] Value Replacing: 10 -- 2


In [17]:
# Fields that are standardised with z-score normalization
# NOTE(review): scipy.stats.zscore propagates NaN by default, so a single missing value
# turns the whole column into NaN. 'frame.cap_len', 'frame.len' and 'tcp.len' are not in
# any of the fill lists used in this notebook — verify they contain no missing values.
zScoreNormalization = [
    'coap.mid', 'coap.payload_length', 'data.len', 'frame.cap_len', 'frame.len', 'ip.ttl', 'tcp.len', 'tcp.hdr_len',
    'tcp.flags', 'tcp.analysis.push_bytes_sent', 'tcp.analysis.bytes_in_flight'
]

# Cast all selected columns to float before normalizing them
df[zScoreNormalization] = df[zScoreNormalization].astype(float)

zscore_normalization(df, fields = zScoreNormalization)

[DONE] Z-Score Normalization


In [18]:
# Categorical fields that are replaced by one indicator (dummy) column per distinct value
dummyEncoding = ['coap.code', 'coap.opt.delta', 'coap.opt.length', 'coap.type', 'ip.flags']

# dummy_encode returns a new dataframe, so the result is bound back to `df`
df = dummy_encode(df, fields = dummyEncoding)

[DONE] Categorical Values (Dummies)


In [24]:
# Labelling the dataset
# Both label columns are set to the constant 0 for every row of `df`.
# NOTE(review): presumably 0 means "not malicious" / "no attack" because the input is
# the benign dataset — confirm against the label encoding used for the attack datasets.
df['is_malicious'] = 0
df['attack_type'] = 0
print(f'[DONE] Dataset labelling')

[DONE] Dataset labelling


In [23]:
# Persist the fully pre-processed dataframe as CSV, leaving out the index column
df.to_csv(path_or_buf = DATASET_PRE_PROCESSING, index = False)
print('[DONE] Dataset with pre-processing saved with success')

[DONE] Dataset with pre-processing saved with success
