### Declaration of Functions

The section below presents all the functions used in the notebook.

In [1]:
# Function: Read the dataset and return a dataframe
import pandas as pd

def read_data(dataset, nrows=380000):
    """Read a CSV dataset into a DataFrame.

    Parameters
    ----------
    dataset : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    nrows : int or None, default 380000
        Maximum number of data rows to read. The default keeps the
        previous hard-coded limit; pass ``None`` to read the whole file.

    Returns
    -------
    pandas.DataFrame
        The rows that were read.
    """
    df = pd.read_csv(dataset, nrows=nrows)
    print('[DONE] Dataset imported successfully')
    return df

In [2]:
# Function: Delete unnecessary fields
def delete_fields(df, fields):
    """Drop each column named in ``fields`` from ``df`` in place, logging every removal.

    The frame is mutated directly and nothing is returned.
    """
    print('[INFO] Deleting unnecessary fields')
    for field_name in fields:
        # One column per call so the log line follows the actual removal
        df.drop(columns=field_name, inplace=True)
        print('[REMOVED] ' + field_name)
    print('[INFO] Unnecessary fields deleted with success')

In [3]:
# Function: Z-Score normalization
from scipy.stats import zscore

def zscore_normalization(df, fields):
    """Replace each column listed in ``fields`` with its z-score, in place.

    Uses ``scipy.stats.zscore`` with its default settings; nothing is returned.
    """
    for column_name in fields:
        standardized = zscore(df[column_name])
        df[column_name] = standardized
    print('[DONE] Z-Score Normalization')

In [4]:
# Function: Fill empty cells with N value
def fill_empty_cells(df, fields, n):
    """Fill missing values in the given columns of ``df`` with ``n``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are reassigned on it.
    fields : iterable
        Column labels whose missing cells are filled.
    n : scalar
        Replacement value. Any type accepted by ``Series.fillna`` works,
        including numbers.
    """
    for i in fields:
        df[i] = df[i].fillna(n)
    # Format n into the message instead of concatenating: '+' raised
    # TypeError for non-string fill values such as the number 0.
    print(f'[DONE] Empty Cells Filling: {n}')

In [5]:
# Function: Convert fields to dummy variables
def dummy_encode(df, fields):
    """Return a frame in which each column in ``fields`` is replaced by float dummy columns.

    The dummy columns are prefixed with the source column's name and appended
    after the remaining columns. The result is a new frame, so callers must
    use the return value.
    """
    for column_name in fields:
        indicators = pd.get_dummies(df[column_name], prefix=column_name, dtype=float)
        remaining = df.drop(columns=column_name)
        df = pd.concat([remaining, indicators], axis=1)
    print('[DONE] Categorical Values (Dummies)')
    return df

In [6]:
# Function: Convert hexadecimal to int
def hexadecimal_convert(df, fields):
    """Parse each column listed in ``fields`` as base-16 text and store the integers in place.

    Every value is first converted to ``str`` and then passed to ``int(..., 16)``.
    """
    for column_name in fields:
        as_text = df[column_name].astype(str)
        df[column_name] = as_text
        df[column_name] = df[column_name].apply(lambda text: int(text, 16))
    print('[DONE] Integer Convert (Hex - Int)')

### Declaration of Global Variables

The section below presents all the global variables used in the notebook.

In [7]:
# Input CSV read by read_data() in the loading cell
DATASET = '../../../Datasets/IP-Based/Samples/ARP.csv'
# Output CSV written by the final cell once pre-processing is complete
DATASET_PRE_PROCESSING = '../../../Datasets/IP-Based/ARP.csv'

### Loading Dataset
The upcoming step will load the desired dataset and read all the data from it, storing it in the variable `df`.

In [8]:
# Import the dataset at DATASET and read it into the dataframe `df`
# (read_data applies its own row limit and prints a confirmation message)
df = read_data(DATASET)

[DONE] Dataset imported successfully


### Labelling the Dataset

This next section will cover the labelling process of the current dataset.

In [9]:
# Build a boolean mask selecting the rows whose protocol stack is exactly
# 'sll:ethertype:arp' (the filter is on 'frame.protocols', not on an IP address)
mask = (df['frame.protocols'] == 'sll:ethertype:arp')

# Keep only the matching rows. The explicit copy makes `df` an independent
# frame, so the column assignments and in-place drops in the following cells
# do not act on a slice of the loaded data (avoids SettingWithCopyWarning).
df = df[mask].copy()

print(f'[DONE] Protocols mask applied on the dataframe')

[DONE] Protocols mask applied on the dataframe


In [10]:
# Labelling the malicious packets: every row kept by the protocol mask gets the same labels
# NOTE: numpy is not used in this cell; `np` is first used by the row-replication cell (np.random.seed)
import numpy as np

df['is_malicious'] = 1  # constant flag written to every remaining row
df['attack_type'] = 5  # constant code written to every row — NOTE(review): presumably the class id for this attack; confirm against the labelling scheme
print(f'[DONE] Dataset malicious labeling done with success')

[DONE] Dataset malicious labeling done with success


### Data Cleansing

This section covers all the data cleaning done in the dataset.

In [11]:
# Array: Fields to delete (IDs, Timestamps, Checksums, etc...)
# Includes 'frame.protocols', which the earlier mask cell filtered on, so that
# cell cannot be re-run after this one without reloading the dataset.
fieldsToDelete = [
    'frame.encap_type', 'frame.ignored', 'frame.interface_id', 'frame.interface_name', 'frame.marked', 
    'frame.number', 'frame.offset_shift', 'frame.protocols', 'frame.time', 'frame.time_delta',
    'frame.time_delta_displayed', 'frame.time_epoch', 'frame.time_relative', 'arp.dst.hw_mac', 'arp.dst.proto_ipv4',
    'arp.hw.size', 'arp.hw.type', 'arp.proto.size', 'arp.proto.type', 'arp.src.hw_mac', 'arp.src.proto_ipv4'
]

# Drops the columns from `df` in place and prints one log line per removed field
delete_fields(df, fieldsToDelete)

[INFO] Deleting unnecessary fields
[REMOVED] frame.encap_type
[REMOVED] frame.ignored
[REMOVED] frame.interface_id
[REMOVED] frame.interface_name
[REMOVED] frame.marked
[REMOVED] frame.number
[REMOVED] frame.offset_shift
[REMOVED] frame.protocols
[REMOVED] frame.time
[REMOVED] frame.time_delta
[REMOVED] frame.time_delta_displayed
[REMOVED] frame.time_epoch
[REMOVED] frame.time_relative
[REMOVED] arp.dst.hw_mac
[REMOVED] arp.dst.proto_ipv4
[REMOVED] arp.hw.size
[REMOVED] arp.hw.type
[REMOVED] arp.proto.size
[REMOVED] arp.proto.type
[REMOVED] arp.src.hw_mac
[REMOVED] arp.src.proto_ipv4
[INFO] Unnecessary fields deleted with success


In [12]:
# Collect every column whose cells are all null
empty_columns = [name for name in df.columns if df[name].isnull().all()]
empty_columns

['arp.dst.atm_num_e164',
 'arp.dst.atm_num_nsap',
 'arp.dst.atm_subaddr',
 'arp.dst.drarp_error_status',
 'arp.dst.hlen',
 'arp.dst.htype',
 'arp.dst.hw',
 'arp.dst.hw_ax25',
 'arp.dst.pln',
 'arp.dst.proto',
 'arp.dst.slen',
 'arp.dst.stype',
 'arp.duplicate-address-detected',
 'arp.duplicate-address-frame',
 'arp.isprobe',
 'arp.packet-storm-detected',
 'arp.seconds-since-duplicate-address-frame',
 'arp.src.atm_afi',
 'arp.src.atm_afi.unknown',
 'arp.src.atm_data_country_code',
 'arp.src.atm_data_country_code_group',
 'arp.src.atm_e.164_isdn',
 'arp.src.atm_e.164_isdn_group',
 'arp.src.atm_end_system_identifier',
 'arp.src.atm_high_order_dsp',
 'arp.src.atm_international_code_designator',
 'arp.src.atm_international_code_designator_group',
 'arp.src.atm_num_e164',
 'arp.src.atm_num_nsap',
 'arp.src.atm_rest_of_address',
 'arp.src.atm_selector',
 'arp.src.atm_subaddr',
 'arp.src.hlen',
 'arp.src.htype',
 'arp.src.hw',
 'arp.src.hw_ax25',
 'arp.src.pln',
 'arp.src.proto',
 'arp.src.sle

In [13]:
# Remove the all-null columns in one call, then log each removed name
print('[INFO] Deleting empty columns')
df.drop(columns=empty_columns, inplace=True)
for column in empty_columns:
    print('[REMOVED] ' + column)
print('[INFO] Empty columns deleted with success')

[INFO] Deleting empty columns
[REMOVED] arp.dst.atm_num_e164
[REMOVED] arp.dst.atm_num_nsap
[REMOVED] arp.dst.atm_subaddr
[REMOVED] arp.dst.drarp_error_status
[REMOVED] arp.dst.hlen
[REMOVED] arp.dst.htype
[REMOVED] arp.dst.hw
[REMOVED] arp.dst.hw_ax25
[REMOVED] arp.dst.pln
[REMOVED] arp.dst.proto
[REMOVED] arp.dst.slen
[REMOVED] arp.dst.stype
[REMOVED] arp.duplicate-address-detected
[REMOVED] arp.duplicate-address-frame
[REMOVED] arp.isprobe
[REMOVED] arp.packet-storm-detected
[REMOVED] arp.seconds-since-duplicate-address-frame
[REMOVED] arp.src.atm_afi
[REMOVED] arp.src.atm_afi.unknown
[REMOVED] arp.src.atm_data_country_code
[REMOVED] arp.src.atm_data_country_code_group
[REMOVED] arp.src.atm_e.164_isdn
[REMOVED] arp.src.atm_e.164_isdn_group
[REMOVED] arp.src.atm_end_system_identifier
[REMOVED] arp.src.atm_high_order_dsp
[REMOVED] arp.src.atm_international_code_designator
[REMOVED] arp.src.atm_international_code_designator_group
[REMOVED] arp.src.atm_num_e164
[REMOVED] arp.src.atm_num

### Pre-Processing & Data Encoding

This section covers all the pre-processing and data encoding done in the dataset.

In [14]:
# Array: Fields to fill with 0
fill_0 = [
    'arp.isannouncement', 'arp.isgratuitous'
]

# The fill value is the string '0' (not the number 0) because fill_empty_cells
# concatenates it into its log message; the cells become numeric only when the
# later float-conversion cell casts every column.
fill_empty_cells(df, fill_0, '0') # Fill: 0

[DONE] Empty Cells Filling: 0


In [16]:
# Columns to standardise with the z-score
zScoreNormalization = ['frame.cap_len', 'frame.len']

# Cast the selected columns to float before normalising them
df[zScoreNormalization] = df[zScoreNormalization].astype(float)

zscore_normalization(df, zScoreNormalization)

[DONE] Z-Score Normalization


In [17]:
# Cast every column of the dataframe to float in a single call
df = df.astype(float)
print('[DONE] Convertion to float done with success')

[DONE] Convertion to float done with success


In [18]:
# Replication of rows (random oversampling with replacement)

# Set the seed of NumPy's global RNG for reproducibility
# NOTE(review): df.sample below is called without random_state; presumably it draws from this global RNG — confirm
np.random.seed(42)

# Number of extra rows to draw (a row count, not a per-row replication factor)
replications = 21700

# Randomly select `replications` rows, with replacement, from the current DataFrame
replicated_rows = df.sample(n=replications, replace=True)

# Concatenate the original DataFrame with the sampled rows and rebuild the index
# NOTE: `df` is reassigned here, so re-running this cell appends another batch of rows
df = pd.concat([df, replicated_rows], ignore_index=True)

# Verify the resulting DataFrame (shape is displayed as the cell output)
df.shape

(375055, 7)

In [19]:
# Save the dataset with all the pre-processing and data encoding done
# Written to DATASET_PRE_PROCESSING without the row index (index=False)
df.to_csv(DATASET_PRE_PROCESSING, index=False)
print(f'[DONE] Dataset with pre-processing saved with success')

[DONE] Dataset with pre-processing saved with success
