# Packages Installs, imports, and presets

In [80]:
from scapy.all import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from collections import defaultdict
import os

In [81]:
INPUT_SIZE = (256,256,3)

In [82]:
df = pd.read_parquet('data/iec104_15.parquet')

## Removing missing features

In [83]:
df.shape

(518377, 1347)

In [84]:
df['tv_sec']=df['tv_sec'].astype(int)
df['tv_usec']=df['tv_usec'].astype(int)

# Model training

## Dataset preparation

In [85]:
df[['label']].value_counts()

label        
normal           400317
c_rd_na_1_DoS     20167
c_rp_na_1_DoS     19949
c_sc_na_1_DoS     16615
c_rp_na_1         11517
c_se_na_1_DoS     10854
c_rd_na_1         10821
c_ci_na_1_DoS      9146
c_se_na_1          6960
c_sc_na_1          6767
c_ci_na_1          5264
Name: count, dtype: int64

In [86]:
df[['label']]

Unnamed: 0_level_0,label
flow,Unnamed: 1_level_1
1_192.168.1.13_192.168.1.20_2404_49259_TCP_c_rp_na_1_normal,normal
2_192.168.1.13_192.168.1.20_2404_49259_TCP_c_rp_na_1_normal,normal
2_192.168.1.13_192.168.1.20_2404_49259_TCP_c_rp_na_1_normal,normal
2_192.168.1.13_192.168.1.20_2404_49259_TCP_c_rp_na_1_normal,normal
2_192.168.1.13_192.168.1.20_2404_49259_TCP_c_rp_na_1_normal,normal
...,...
1_192.168.1.29_192.168.1.22_46861_2404_TCP_c_sc_na_1_DoS_c_sc_na_1_DoS,c_sc_na_1_DoS
1_192.168.1.29_192.168.1.22_46861_2404_TCP_c_sc_na_1_DoS_c_sc_na_1_DoS,c_sc_na_1_DoS
1_192.168.1.29_192.168.1.22_46871_2404_TCP_c_sc_na_1_c_sc_na_1,c_sc_na_1
1_192.168.1.29_192.168.1.22_46871_2404_TCP_c_sc_na_1_c_sc_na_1,c_sc_na_1


In [87]:
from feature_engine.encoding import OrdinalEncoder

In [91]:
od = OrdinalEncoder(encoding_method='arbitrary')
od.fit(df[['label']])
df[['label']]= od.transform(df[['label']])

In [92]:
TRAIN_RATE= 0.8
import random
indexes = list(set(df.index))
random.Random(1).shuffle(indexes)
TRAIN_SIZE = int(len(indexes)*TRAIN_RATE)
TEST_SIZE = len(indexes)-TRAIN_SIZE
TRAIN_SAMPLES = indexes[:TRAIN_SIZE]
TEST_SAMPLES = indexes[:TEST_SIZE]
df_train_initial = df.loc[TRAIN_SAMPLES]
df_test = df.loc[TEST_SAMPLES]
# df_test = df.iloc[TRAIN_SAMPLES:]
# del df

In [93]:
indexes = list(set(df_train_initial.index))
random.Random(1).shuffle(indexes)
TRAIN_SIZE = int(len(indexes)*TRAIN_RATE)
TEST_SIZE = len(indexes)-TRAIN_SIZE
TRAIN_SAMPLES = indexes[:TRAIN_SIZE]
TEST_SAMPLES = indexes[:TEST_SIZE]
df_train = df_train_initial.loc[TRAIN_SAMPLES]
df_validation = df_train_initial.loc[TEST_SAMPLES]

In [94]:
df_train.shape, df_validation.shape, df_test.shape

((296634, 1347), (76659, 1347), (84064, 1347))

In [95]:
indexes[:3]

['548_192.168.1.20_192.168.1.22_49754_2404_TCP_c_rd_na_1_normal',
 '163_192.168.1.13_192.168.1.20_2404_49604_TCP_c_ci_na_1_normal',
 '75_192.168.1.20_192.168.1.13_49403_2404_TCP_c_ci_na_1_DoS_normal']

In [13]:
len(set(df_train.index))

63161

In [14]:
del df_train_initial
del df

In [15]:
assert df_train.groupby(df_train.index)['label'].nunique().max()==1

In [16]:
len(set(df_train.index))

63161

In [18]:
df_train

Unnamed: 0_level_0,tv_sec,tv_usec,ipv4_ver_0,ipv4_ver_1,ipv4_ver_2,ipv4_ver_3,ipv4_hl_0,ipv4_hl_1,ipv4_hl_2,ipv4_hl_3,...,payload_bit_375,payload_bit_376,payload_bit_377,payload_bit_378,payload_bit_379,payload_bit_380,payload_bit_381,payload_bit_382,payload_bit_383,label
flow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
548_192.168.1.20_192.168.1.22_49754_2404_TCP_c_rd_na_1_normal,1591370094,233353,0,1,0,0,0,1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
548_192.168.1.20_192.168.1.22_49754_2404_TCP_c_rd_na_1_normal,1591370100,234932,0,1,0,0,0,1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
163_192.168.1.13_192.168.1.20_2404_49604_TCP_c_ci_na_1_normal,1587901978,438435,0,1,0,0,0,1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
75_192.168.1.20_192.168.1.13_49403_2404_TCP_c_ci_na_1_DoS_normal,1587932657,398630,0,1,0,0,0,1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
306_192.168.1.20_192.168.1.26_49728_2404_TCP_c_sc_na_1_normal,1588107515,409181,0,1,0,0,0,1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38_192.168.1.26_192.168.1.20_2404_49751_TCP_c_rd_na_1_normal,1591354242,516989,0,1,0,0,0,1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,0
1_192.168.1.29_192.168.1.19_45301_2404_TCP_c_rp_na_1_c_rp_na_1,1591444852,738307,0,1,0,0,0,1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,2
1_192.168.1.29_192.168.1.19_45301_2404_TCP_c_rp_na_1_c_rp_na_1,1591444862,751322,0,1,0,0,0,1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,2
1_192.168.1.29_192.168.1.19_45301_2404_TCP_c_rp_na_1_c_rp_na_1,1591444873,174295,0,1,0,0,0,1,0,1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,2


In [42]:
import numpy as np
from scapy.all import IP, send
def create_tcp_packet_from_binary(binary_sequence):
    """
    Creates a TCP packet from a given binary sequence with -1 indicating "don't care" bits.
    
    Args:
        binary_sequence (numpy.ndarray): The binary sequence with 0, 1, and -1 values.
    
    Returns:
        TCP: The constructed TCP packet.
    """
    # Ensure the binary_sequence is a NumPy array
    binary_sequence = np.array(binary_sequence)

    # Replace -1 with 0 (default value) in the binary sequence
    cleaned_sequence = np.where(binary_sequence == -1, 0, binary_sequence)

    # Convert the cleaned binary sequence to a binary string
    binary_string = ''.join(map(str, cleaned_sequence))

    # Group binary string into bytes (8 bits each)
    bytes_sequence = [binary_string[i:i+8] for i in range(0, len(binary_string), 8)]

    # Convert each byte from binary string to integer
    byte_values = [int(byte, 2) for byte in bytes_sequence]

    # Construct the TCP packet
    tcp_packet = TCP()

    # Set the fields of the TCP packet based on the byte values
    tcp_packet.sport = (byte_values[0] << 8) | byte_values[1]  # Source port
    tcp_packet.dport = (byte_values[2] << 8) | byte_values[3]  # Destination port
    tcp_packet.seq = (byte_values[4] << 24) | (byte_values[5] << 16) | (byte_values[6] << 8) | byte_values[7]  # Sequence number
    tcp_packet.ack = (byte_values[8] << 24) | (byte_values[9] << 16) | (byte_values[10] << 8) | byte_values[11]  # Acknowledgment number
    tcp_packet.dataofs = byte_values[12] >> 4  # Data offset (4 bits)
    tcp_packet.reserved = (byte_values[12] & 0x0F)  # Reserved (4 bits)
    tcp_packet.flags = byte_values[13]  # Flags
    tcp_packet.window = (byte_values[14] << 8) | byte_values[15]  # Window size
    tcp_packet.chksum = (byte_values[16] << 8) | byte_values[17]  # Checksum
    tcp_packet.urgptr = (byte_values[18] << 8) | byte_values[19]  # Urgent pointer

    return tcp_packet

def create_ipv4_packet_from_binary(binary_sequence):
    """
    Creates an IPv4 packet from a given binary sequence with -1 indicating "don't care" bits.
    
    Args:
        binary_sequence (numpy.ndarray): The binary sequence with 0, 1, and -1 values.
    
    Returns:
        IP: The constructed IPv4 packet.
    """
    # Ensure the binary_sequence is a NumPy array
    binary_sequence = np.array(binary_sequence)

    # Replace -1 with 0 (default value) in the binary sequence
    cleaned_sequence = np.where(binary_sequence == -1, 0, binary_sequence)

    # Convert the cleaned binary sequence to a binary string
    binary_string = ''.join(map(str, cleaned_sequence))

    # Group binary string into bytes (8 bits each)
    bytes_sequence = [binary_string[i:i+8] for i in range(0, len(binary_string), 8)]

    # Convert each byte from binary string to integer
    byte_values = [int(byte, 2) for byte in bytes_sequence]

    # Construct the IPv4 packet
    ip_packet = IP()

    # Set the fields of the IP packet based on the byte values
    ip_packet.version = byte_values[0] >> 4
    ip_packet.ihl = byte_values[0] & 0x0F
    ip_packet.tos = byte_values[1]
    ip_packet.len = (byte_values[2] << 8) | byte_values[3]
    ip_packet.id = (byte_values[4] << 8) | byte_values[5]
    ip_packet.flags = byte_values[6] >> 5
    ip_packet.frag = ((byte_values[6] & 0x1F) << 8) | byte_values[7]
    ip_packet.ttl = byte_values[8]
    ip_packet.proto = byte_values[9]
    ip_packet.chksum = (byte_values[10] << 8) | byte_values[11]
    ip_packet.src = '.'.join(map(str, byte_values[12:16]))
    ip_packet.dst = '.'.join(map(str, byte_values[16:20]))

    return ip_packet


In [55]:
import importlib
import flowmeter #import the module here, so that it can be reloaded.
importlib.reload(flowmeter)

<module 'flowmeter' from '/home/hwerneck/fast-ids/flowmeter.py'>

In [67]:
def create_payload_from_binary(binary_sequence):
    """
    Creates a payload from a given binary sequence with -1 indicating "don't care" bits.
    
    Args:
        binary_sequence (numpy.ndarray): The binary sequence with 0, 1, and -1 values.
    
    Returns:
        bytes: The constructed payload.
    """
    # Ensure the binary_sequence is a NumPy array
    binary_sequence = np.array(binary_sequence)

    # Replace -1 with 0 (default value) in the binary sequence
    cleaned_sequence = np.where(binary_sequence == -1, 0, binary_sequence)

    # Convert the cleaned binary sequence to a binary string
    binary_string = ''.join(map(str, cleaned_sequence))

    # Group binary string into bytes (8 bits each)
    bytes_sequence = [binary_string[i:i+8] for i in range(0, len(binary_string), 8)]

    # Convert each byte from binary string to integer
    byte_values = [int(byte, 2) for byte in bytes_sequence]

    # Convert byte values to bytes
    payload = bytes(byte_values)

    return payload

In [59]:
from tqdm import tqdm

In [79]:
def get_flows_statistics(df_input):
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        fm=flowmeter.Flowmeter()
        flows_statistics = []
        fgb = df_input.groupby('flow')
        for i in tqdm(fgb):
            
            column, rows = i
            # print(len(rows.filter(regex='ipv4_*').columns))
            # print(len(rows.filter(regex='payload_*').columns))
            # print(len(rows.filter(regex='tcp_*').columns))
            packets = []
            for index, value in rows.iterrows():
                packet_binary=  value.filter(regex='ipv4_*').values
                ipv4 = create_ipv4_packet_from_binary(packet_binary)
                packet_binary=  value.filter(regex='tcp_*').values
                tcp = create_tcp_packet_from_binary(packet_binary)
                packet_binary=  value.filter(regex='payload_*').values
                payload = create_payload_from_binary(packet_binary)
                packet= ipv4/tcp/raw(payload)
                packets.append(packet)
                # packet.show()
            packet_list = PacketList(packets)
            df=fm.build_feature_from_packet_list(packet_list)
            
            label = rows['label'].iloc[0]
            df['label'] = label
            df['flow'] = column
            flows_statistics.append(df)
            display(df.head())
            
        return pd.concat(flows_statistics)

df_train_statistics=get_flows_statistics(df_train)
df_train_statistics.to_parquet('data/df_train_statistics.parquet')
df_validation_statistics.to_parquet('data/df_validation_statistics.parquet')
df_test_statistics.to_parquet('data/df_test_statistics.parquet')

  0%|                                                                | 0/63161 [00:00<?, ?it/s]

Unnamed: 0,flow,src,src_port,dst,dst_port,feduration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,...,avgPacketSize,fAvgSegmentSize,fAvgBytesPerBulk,fAvgPacketsPerBulk,fAvgBulkRate,bAvgSegmentSize,bAvgBytesPerBulk,bAvgPacketsPerBulk,bAvgBulkRate,label
0,100_192.168.1.13_192.168.1.20_2404_49368_TCP_c...,192.168.1.13,2404,192.168.1.20,49368,0.017559,3,0,264,0,...,88.0,48.0,0,0,0,,0,0,0,0


  0%|                                                                | 0/63161 [00:05<?, ?it/s]
