### Declaration of Functions

The section below presents all the functions used in the notebook.

In [None]:
# Function: Read the dataset and return a dataframe
import pandas as pd

def read_data(dataset, nrows=380000):
    """Read a CSV dataset into a dataframe.

    Args:
        dataset: Path (or any other source accepted by ``pandas.read_csv``)
            of the CSV file to load.
        nrows: Maximum number of data rows to read. Defaults to 380000, the
            cap that used to be hard-coded here; pass ``None`` to read the
            whole file.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    df = pd.read_csv(dataset, nrows=nrows)
    print('[DONE] Dataset imported successfully')
    return df

In [None]:
# Function: Delete unnecessary fields
def delete_fields(df, fields):
    """Drop every column named in ``fields`` from ``df`` in place.

    Columns are removed one at a time, in the order given, and each removal
    is logged. Nothing is returned; ``df`` itself is modified.
    """
    print('[INFO] Deleting unnecessary fields')
    for column_name in fields:
        df.drop(columns=column_name, inplace=True)
        print('[REMOVED] ' + column_name)
    print('[INFO] Unnecessary fields deleted with success')

In [None]:
# Function: Z-Score normalization
from scipy.stats import zscore

def zscore_normalization(df, fields):
    """Overwrite each column listed in ``fields`` with its z-score, in place.

    Columns not listed in ``fields`` are left untouched. Returns nothing.
    """
    for column_name in fields:
        standardized = zscore(df[column_name])
        df[column_name] = standardized
    print('[DONE] Z-Score Normalization')

In [None]:
# Function: Fill empty cells with N value
def fill_empty_cells(df, fields, n):
    """Replace missing values in the given columns with ``n``, in place.

    Args:
        df: Dataframe whose columns are updated.
        fields: Names of the columns to fill.
        n: Fill value. Any type accepted by ``Series.fillna`` works (numbers
            as well as strings).

    Returns:
        None. ``df`` is modified directly.
    """
    for i in fields:
        df[i] = df[i].fillna(n)
    # Interpolate rather than concatenate: `'...' + n` raised TypeError for
    # every non-string fill value, after the columns had already been filled.
    print(f'[DONE] Empty Cells Filling: {n}')

In [None]:
# Function: Convert fields to dummy variables
def dummy_encode(df, fields):
    """Return a dataframe where each column in ``fields`` is one-hot encoded.

    For every listed column, float-valued dummy columns prefixed with the
    column name are appended on the right and the source column is removed.
    The dataframe passed in by the caller is not modified.
    """
    for column_name in fields:
        indicators = pd.get_dummies(df[column_name], prefix=column_name, dtype=float)
        remaining = df.drop(columns=column_name)
        df = pd.concat([remaining, indicators], axis=1)
    print('[DONE] Categorical Values (Dummies)')
    return df

In [None]:
# Function: Convert hexadecimal to int
def hexadecimal_convert(df, fields):
    """Parse each column in ``fields`` as base-16 text and store the integers in place.

    Values are first turned into strings, then each string is parsed with
    ``int(..., 16)``. Returns nothing; ``df`` is modified directly.
    """
    for column_name in fields:
        as_text = df[column_name].astype(str)
        df[column_name] = as_text.map(lambda text: int(text, 16))
    print('[DONE] Integer Convert (Hex - Int)')

### Declaration of Global Variables

The section below presents all the global variables used in the notebook.

In [None]:
# Input CSV that is loaded into the dataframe by read_data
DATASET = '../../../Datasets/IP-Based/Samples/SlowLoris.csv'
# Output CSV where the pre-processed dataframe is written at the end of the notebook
DATASET_PRE_PROCESSING = '../../../Datasets/IP-Based/SlowLoris.csv'

### Loading Dataset
The upcoming step will load the desired dataset and read all the data from it, storing it in the variable `df`.

In [None]:
# Import dataset and read the data into a dataframe
# (read_data reads at most 380000 rows unless told otherwise)
df = read_data(DATASET)

### Labelling the Dataset

This next section will cover the labelling process of the current dataset.

In [None]:
# Keep only the packets whose dissected protocol stack ('frame.protocols')
# is plain TCP or TCP carrying HTTP
allowed_protocols = [
    'sll:ethertype:ip:tcp',
    'sll:ethertype:ip:tcp:http:data-text-lines',
    'sll:ethertype:ip:tcp:http',
]
mask = df['frame.protocols'].isin(allowed_protocols)

# Filter the DataFrame to the matching rows. The explicit copy makes the result
# an independent DataFrame, so the label columns assigned in the following cell
# are not written into a filtered slice (which triggers SettingWithCopyWarning).
df = df[mask].copy()

print(f'[DONE] Protocols mask applied on the dataframe')

In [None]:
# Labelling the malicious packets
# (numpy is imported here; it is also used further down to seed the row replication)
import numpy as np

# Disabled alternative: label each row by comparing 'ip.addr' with a MALICIOUS_IP
# constant. MALICIOUS_IP is not defined in the visible part of the notebook.
# df['ip.addr'] = df['ip.addr'].fillna('')
# df['is_malicious'] = np.where(df['ip.addr'] == MALICIOUS_IP, 1, 0)

# Every row kept by the protocol filter is flagged as malicious
df['is_malicious'] = 1
# NOTE(review): 4 is presumably the class id used for this attack (SlowLoris) — confirm against the label mapping
df['attack_type'] = 4
print(f'[DONE] Dataset malicious labeling done with success')

### Data Cleansing

This section covers all the data cleaning done in the dataset.

In [None]:
# Array: Fields to delete (IDs, Timestamps, Checksums, etc...)
# Every name listed here must exist as a column: delete_fields drops them one by one
# with DataFrame.drop, which raises on a missing column.
fieldsToDelete = [
    'frame.encap_type', 'frame.ignored', 'frame.interface_id', 'frame.interface_name', 'frame.marked', 
    'frame.number', 'frame.offset_shift', 'frame.protocols', 'frame.time', 'frame.time_delta',
    'frame.time_delta_displayed', 'frame.time_epoch', 'frame.time_relative', 'ip.addr', 'ip.checksum',
    'ip.checksum.status', 'ip.dsfield', 'ip.dsfield.dscp', 'ip.dsfield.ecn', 'ip.dst', 'ip.dst_host', 'ip.hdr_len',
    'ip.host', 'ip.id', 'ip.len', 'ip.proto', 'ip.src', 'ip.src_host', 'ip.ttl.lncb', 'tcp.window_size',
    'tcp.time_relative', 'tcp.time_delta', 'tcp.stream', 'tcp.srcport', 'tcp.seq_raw', 'tcp.seq',
    'tcp.segments', 'tcp.segment_data', 'tcp.segment.count', 'tcp.segment', 'tcp.reassembled.length',
    'tcp.reassembled.data', 'tcp.port', 'tcp.payload', 'tcp.options.wscale.shift', 'tcp.options.wscale.multiplier',
    'tcp.options.wscale', 'tcp.options.timestamp.tsval', 'tcp.options.timestamp.tsecr', 'tcp.options.sack_perm',
    'tcp.options.mss_val', 'tcp.options.mss', 'tcp.options', 'tcp.nxtseq', 'tcp.dstport', 'tcp.checksum',
    'tcp.checksum.status', 'tcp.flags.str', 'tcp.analysis.window_full', 'tcp.analysis.reused_ports',
    'tcp.analysis.lost_segment', 'tcp.analysis.keep_alive_ack', 'tcp.analysis.keep_alive', 'tcp.analysis.flags',
    'tcp.analysis.acks_frame', 'tcp.analysis.ack_lost_segment', 'tcp.analysis', 'tcp.ack_raw', 'tcp.ack',
    'eth.padding', 'http.accept_encoding', 'http.connection', 'http.content_length_header', 'http.content_type',
    'http.date', 'http.file_data', 'http.last_modified', 'http.referer', 'http.request.full_uri',
    'http.request.line', 'http.request.method', 'http.request.uri', 'http.request.version', 'http.request_in',
    'http.request_number', 'http.response.code.desc', 'http.response.line', 'http.response.phrase',
    'http.response.version', 'http.response_for.uri', 'http.server', 'http.user_agent', 'http.host', 'tcp.len',
    'ip.flags', 'ip.ttl', 'tcp.connection.rst'
]

# Drops the listed columns from df in place (the protocol column used for filtering is among them)
delete_fields(df, fieldsToDelete)

In [None]:
# Collect the name of every column whose cells are all null, then display the list
empty_columns = [name for name in df.columns if df[name].isnull().all()]
empty_columns

In [None]:
# Remove each fully-empty column from df in place, logging its name as it goes
print('[INFO] Deleting empty columns')
for empty_name in empty_columns:
    df.drop(columns=empty_name, inplace=True)
    print('[REMOVED] ' + empty_name)
print('[INFO] Empty columns deleted with success')

### Pre-Processing & Data Encoding

This section covers all the pre-processing and data encoding done in the dataset.

In [None]:
# Array: Hexadecimal fields to fill with 0x00000000
# Missing values get a hex literal so that the later hexadecimal_convert step,
# which parses every cell with int(..., 16), receives a parsable string.
hexadecimalFill = [
    'tcp.flags'
]

fill_empty_cells(df, hexadecimalFill, '0x00000000') # Fill: 0x00000000

In [None]:
# Array: Fields to convert from Hexadecimal to Integer
hexadecimalFields = [
    'tcp.flags'
]

# Rewrites each listed column in place: values are cast to str and parsed as base-16 integers
hexadecimal_convert(df, hexadecimalFields)

In [None]:
# Array: Fields to fill with 0
fill_0 = [
    'tcp.option_len', 'tcp.option_kind', 'tcp.hdr_len', 'tcp.connection.syn', 'tcp.connection.fin',
    'tcp.analysis.bytes_in_flight', 'tcp.analysis.push_bytes_sent', 'tcp.analysis.ack_rtt',
    'tcp.analysis.initial_rtt', 'http.chat', 'tcp.connection.synack', 'http.content_length',
    'http.response', 'http.response.code', 'http.response_number', 'http.time', 'http.request'
]

# Array: Fields to fill with 1
fill_positive_1 = [
    'ip.flags.mf', 'ip.flags.rb', 'ip.frag_offset', 'tcp.urgent_pointer', 'tcp.flags.urg', 'tcp.flags.ns',
    'tcp.flags.ecn', 'tcp.flags.cwr'
]

# Array: Fields to fill with -1
fill_negative_1 = [
    'ip.flags.df', 'tcp.flags.syn', 'tcp.flags.reset', 'tcp.flags.push', 'tcp.flags.fin', 'tcp.flags.ack', 
]

# The fill values are passed as strings because fill_empty_cells concatenates the
# value onto its log message; the filled cells are therefore strings until the
# columns are cast to float further down in the notebook.
fill_empty_cells(df, fill_0, '0')            # Fill: 0
fill_empty_cells(df, fill_positive_1, '1')   # Fill: 1
fill_empty_cells(df, fill_negative_1, '-1')  # Fill: -1

In [None]:
# Value replacing: recode tcp.option_len, first 4 -> 1 and then 10 -> 2
df['tcp.option_len'] = df['tcp.option_len'].replace(4, 1).replace(10, 2)
print('[DONE] tcp.option_len: values 10 and 4 were replaced.')

# Disabled recodings, kept for reference:
#df['http.content_length'] = df['http.content_length'].replace(233, 1)
#df['http.content_length'] = df['http.content_length'].replace(635, 2)
#print(f'[DONE] http.content_length: values 635 and 233 were replaced.')

#df['http.response.code'] = df['http.response.code'].replace(400, 1)
#print(f'[DONE] http.response.code: value 400 were replaced.')

In [None]:
# Array: Fields to apply z-score normalization
zScoreNormalization = [
    'frame.cap_len', 'frame.len', 'tcp.hdr_len', 'tcp.flags', 'tcp.analysis.push_bytes_sent',
    'tcp.analysis.bytes_in_flight', 'http.time', 'http.content_length', 'http.response.code'
]

# Cast the selected columns to float in one pass before standardizing them
df = df.astype(dict.fromkeys(zScoreNormalization, float))

zscore_normalization(df, zScoreNormalization)

In [None]:
# Cast every column of the dataframe to float
df = df.astype(float)
print('[DONE] Convertion to float done with success')

In [None]:
# Replication of rows: enlarge the dataset by appending rows drawn at random
# (with replacement) from the dataframe itself

# Set the seed for reproducibility
# NOTE(review): this seeds NumPy's global RNG; sample() below passes no random_state,
# so it is expected to draw from that global state — confirm for the pandas version in use
np.random.seed(42)

# Number of extra rows to draw and append (not a per-row repetition count)
replications = 243170

# Randomly select rows from the original DataFrame; replace=True allows the same row
# to be picked more than once
replicated_rows = df.sample(n=replications, replace=True)

# Concatenate the original DataFrame with the replicated rows, renumbering the index
df = pd.concat([df, replicated_rows], ignore_index=True)

# Verify the resulting DataFrame
df.shape

In [None]:
# Save the dataset with all the pre-processing and data encoding done
# (index=False keeps the row index out of the written CSV)
df.to_csv(DATASET_PRE_PROCESSING, index=False)
print(f'[DONE] Dataset with pre-processing saved with success')