### Declaration of Functions

The section below presents all the functions used in the notebook.

In [1]:
# Function: Read the dataset and return a dataframe
import pandas as pd

def read_data(dataset, nrows=380000):
    """Read a CSV file into a DataFrame.

    Args:
        dataset: Path (or any source accepted by ``pandas.read_csv``) of the
            CSV file to load.
        nrows: Maximum number of data rows to read. Defaults to 380000, the
            cap this notebook has always used; pass ``None`` to read the
            whole file.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    df = pd.read_csv(dataset, nrows=nrows)
    print('[DONE] Dataset imported successfully')
    return df

In [2]:
# Function: Delete unnecessary fields
def delete_fields(df, fields):
    """Drop the given columns from ``df`` in place, logging each removal.

    Columns are removed one at a time in the order given, so if a name is
    not present the columns listed before it have already been dropped
    when the error is raised.

    Args:
        df: DataFrame to modify.
        fields: Iterable of column names (strings) to remove.
    """
    print('[INFO] Deleting unnecessary fields')
    for column in fields:
        df.drop(columns=column, inplace=True)
        print('[REMOVED] ' + column)
    print('[INFO] Unnecessary fields deleted with success')

In [3]:
# Function: Z-Score normalization
from scipy.stats import zscore

def zscore_normalization(df, fields):
    """Standardise the listed columns of ``df`` in place.

    Each column named in ``fields`` is overwritten with the result of
    ``scipy.stats.zscore`` applied to it. Nothing is returned.

    Args:
        df: DataFrame to modify.
        fields: Iterable of column names to normalise.
    """
    for column in fields:
        standardised = zscore(df[column])
        df[column] = standardised
    print('[DONE] Z-Score Normalization')

In [4]:
# Function: Fill empty cells with N value
def fill_empty_cells(df, fields, n):
    """Replace missing values in the listed columns of ``df`` with ``n``.

    The frame is modified in place; nothing is returned.

    Args:
        df: DataFrame to modify.
        fields: Iterable of column names whose missing cells are filled.
        n: Fill value. Any value accepted by ``Series.fillna`` works; it is
            formatted into the log line rather than concatenated, so numeric
            values such as ``0`` no longer raise ``TypeError``.
    """
    for column in fields:
        df[column] = df[column].fillna(n)
    # Format instead of '+': string concatenation only works when n is a str.
    print(f'[DONE] Empty Cells Filling: {n}')

In [5]:
# Function: Convert fields to dummy variables
def dummy_encode(df, fields):
    """Replace categorical columns with float indicator (dummy) columns.

    For every column in ``fields`` the indicator columns produced by
    ``pd.get_dummies`` (prefixed with the column name) are appended on the
    right and the source column is removed.

    Args:
        df: Source DataFrame. It is not modified; a new frame is built.
        fields: Iterable of column names to encode.

    Returns:
        The encoded DataFrame (the same object as ``df`` when ``fields`` is
        empty).
    """
    for column in fields:
        indicators = pd.get_dummies(df[column], prefix=column, dtype=float)
        df = pd.concat([df, indicators], axis=1).drop(columns=column)
    print('[DONE] Categorical Values (Dummies)')
    return df

In [6]:
# Function: Convert hexadecimal to int
def hexadecimal_convert(df, fields):
    """Parse the listed columns of ``df`` as base-16 integers, in place.

    Every value is first converted to text and then parsed with
    ``int(text, 16)``, so a value whose text form is not valid hexadecimal
    makes ``int`` raise ``ValueError``.

    Args:
        df: DataFrame to modify.
        fields: Iterable of column names holding hexadecimal values.
    """
    for column in fields:
        as_text = df[column].astype(str)
        df[column] = as_text.apply(lambda text: int(text, 16))
    print('[DONE] Integer Convert (Hex - Int)')

### Declaration of Global Variables

The section below presents all the global variables used in the notebook.

In [7]:
# Input: CSV file that the loading cell passes to read_data().
DATASET = '../../../Datasets/IP-Based/Samples/CAM.csv'
# Output: path the final cell writes the pre-processed dataframe to.
DATASET_PRE_PROCESSING = '../../../Datasets/IP-Based/CAM.csv'

### Loading Dataset
The upcoming step will load the desired dataset and read all the data from it, storing it in the variable `df`.

In [8]:
# Load the CSV at DATASET into `df`; read_data() prints a confirmation line
# once the file has been read.
df = read_data(DATASET)

[DONE] Dataset imported successfully


### Labelling the Dataset

This next section will cover the labelling process of the current dataset.

In [9]:
# Build a mask selecting the packets whose protocol stack is exactly
# 'eth:ethertype:ip' (the filter is on frame.protocols, not on an IP address).
mask = (df['frame.protocols'] == 'eth:ethertype:ip')

# Keep only the matching rows. An explicit copy is taken so that the label
# columns assigned in the next cell are written to an independent dataframe
# rather than to a slice of the loaded one (avoids SettingWithCopyWarning).
df = df[mask].copy()

print('[DONE] Protocols mask applied on the dataframe')

[DONE] Protocols mask applied on the dataframe


In [10]:
# Labelling the malicious packets
import numpy as np

# Every row still in df receives the same two constant labels.
df['is_malicious'] = 1
# NOTE(review): 6 is presumably the numeric code of this capture's attack
# class; confirm against the project's label mapping.
df['attack_type'] = 6
print(f'[DONE] Dataset malicious labeling done with success')

[DONE] Dataset malicious labeling done with success


### Data Cleansing

This section covers all the data cleaning done in the dataset.

In [11]:
# Array: Fields to delete (IDs, Timestamps, Checksums, etc...)
# Every name listed here must exist in df: delete_fields() drops the columns
# one by one, in this order, and prints a '[REMOVED]' line for each.
fieldsToDelete = [
    'frame.encap_type', 'frame.ignored', 'frame.interface_id', 'frame.interface_name', 'frame.marked', 
    'frame.number', 'frame.offset_shift', 'frame.protocols', 'frame.time', 'frame.time_delta',
    'frame.time_delta_displayed', 'frame.time_epoch', 'frame.time_relative', 'ip.addr', 'ip.checksum',
    'ip.checksum.status', 'ip.dsfield', 'ip.dsfield.dscp', 'ip.dsfield.ecn', 'ip.dst', 'ip.dst_host', 'ip.hdr_len',
    'ip.host', 'ip.id', 'ip.len', 'ip.proto', 'ip.src', 'ip.src_host', 'ip.ttl.lncb', 'eth.padding', 'ip.flags',
    'ip.ttl', 'eth.addr', 'eth.addr.oui', 'eth.addr.oui_resolved', 'eth.addr_resolved', 'eth.dst', 'eth.dst.lg',
    'eth.dst.oui', 'eth.dst.oui_resolved', 'eth.src', 'eth.src.oui', 'eth.src.oui_resolved', 'eth.src_resolved',
    'eth.trailer', 'eth.type', 'ip.flags.df', 'ip.flags.mf', 'ip.flags.rb', 'ip.frag_offset', 'eth.dst_resolved'
]

# Removes the columns from df in place (delete_fields returns nothing).
delete_fields(df, fieldsToDelete)

[INFO] Deleting unnecessary fields
[REMOVED] frame.encap_type
[REMOVED] frame.ignored
[REMOVED] frame.interface_id
[REMOVED] frame.interface_name
[REMOVED] frame.marked
[REMOVED] frame.number
[REMOVED] frame.offset_shift
[REMOVED] frame.protocols
[REMOVED] frame.time
[REMOVED] frame.time_delta
[REMOVED] frame.time_delta_displayed
[REMOVED] frame.time_epoch
[REMOVED] frame.time_relative
[REMOVED] ip.addr
[REMOVED] ip.checksum
[REMOVED] ip.checksum.status
[REMOVED] ip.dsfield
[REMOVED] ip.dsfield.dscp
[REMOVED] ip.dsfield.ecn
[REMOVED] ip.dst
[REMOVED] ip.dst_host
[REMOVED] ip.hdr_len
[REMOVED] ip.host
[REMOVED] ip.id
[REMOVED] ip.len
[REMOVED] ip.proto
[REMOVED] ip.src
[REMOVED] ip.src_host
[REMOVED] ip.ttl.lncb
[REMOVED] eth.padding
[REMOVED] ip.flags
[REMOVED] ip.ttl
[REMOVED] eth.addr
[REMOVED] eth.addr.oui
[REMOVED] eth.addr.oui_resolved
[REMOVED] eth.addr_resolved
[REMOVED] eth.dst
[REMOVED] eth.dst.lg
[REMOVED] eth.dst.oui
[REMOVED] eth.dst.oui_resolved
[REMOVED] eth.src
[REMOVED]

In [12]:
# Collect every column whose values are all missing
empty_columns = [name for name in df.columns if df[name].isnull().all()]
empty_columns

['eth.fcs',
 'eth.fcs.status',
 'eth.invalid_lentype',
 'eth.invalid_lentype.expert',
 'eth.len',
 'eth.len.past_end',
 'eth.padding_bad']

In [13]:
# Drop each fully-empty column found above and log its name
print('[INFO] Deleting empty columns')
for empty_name in empty_columns:
    df.drop(columns=empty_name, inplace=True)
    print('[REMOVED] ' + empty_name)
print('[INFO] Empty columns deleted with success')

[INFO] Deleting empty columns
[REMOVED] eth.fcs
[REMOVED] eth.fcs.status
[REMOVED] eth.invalid_lentype
[REMOVED] eth.invalid_lentype.expert
[REMOVED] eth.len
[REMOVED] eth.len.past_end
[REMOVED] eth.padding_bad
[INFO] Empty columns deleted with success


### Pre-Processing & Data Encoding

This section covers all the pre-processing and data encoding done in the dataset.

In [14]:
# Array: Fields to fill with 0
fill_0 = [
    'eth.src_not_group'
]

# The fill value is passed as the string '0'; the float-conversion cell
# further down turns it into 0.0 together with the rest of the column.
fill_empty_cells(df, fill_0, '0') # Fill: 0

[DONE] Empty Cells Filling: 0


In [15]:
# Value replacing: every 54 in the two frame-length columns becomes 0.
# NOTE(review): 54 is presumably the size of a header-only frame in this
# capture; confirm the intent before reusing this step on other datasets.
df['frame.cap_len'] = df['frame.cap_len'].replace(54, 0)
df['frame.len'] = df['frame.len'].replace(54, 0)
# The message now names the columns and values actually touched (it used to
# report tcp.option_len, which this cell never modifies).
print('[DONE] frame.cap_len, frame.len: value 54 was replaced with 0.')

[DONE] tcp.option_len: values 10 and 4 were replaced.


In [16]:
# Cast every remaining column of the dataframe to float
for column in df.columns:
    df[column] = df[column].astype(float)
print('[DONE] Convertion to float done with success')

[DONE] Convertion to float done with success


In [17]:
# Save the dataset with all the pre-processing and data encoding done.
# index=False keeps the dataframe's row index out of the CSV, so the file
# holds only the feature and label columns.
df.to_csv(DATASET_PRE_PROCESSING, index=False)
print(f'[DONE] Dataset with pre-processing saved with success')

[DONE] Dataset with pre-processing saved with success
