# Packages Installs, imports, and presets

In [17]:
from scapy.all import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from collections import defaultdict
import os

In [18]:
TIMEOUT= 30
df_train= pd.read_parquet(f'data/iec104_train_{TIMEOUT}_v2.parquet')
df_validation = pd.read_parquet(f'data/iec104_validation_{TIMEOUT}_v2.parquet')
df_test=pd.read_parquet(f'data/iec104_test_{TIMEOUT}_v2.parquet')

In [19]:
assert df_train.groupby(df_train.index)['label'].nunique().max()==1

In [20]:
len(set(df_train.index))

8673

In [21]:
df_train

Unnamed: 0_level_0,packet_number,ip_version,ip_ihl,ip_tos,ip_len,ip_id,ip_flags,ip_frag,ip_ttl,ip_proto,...,tcp_reserved,tcp_flags,tcp_window,tcp_chksum,tcp_urgptr,tcp_options,size,payload,payload_raw,label
flow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"[1, '192.168.1.13', '192.168.1.29', 2404, 37139, 'TCP', 'c_rp_na_1']",0,4,5,0,58,22697,DF,0,64,6,...,0,PA,502,33703,0,3,72,6,b'h\x04\x07\x00\x00\x00',6
"[1, '192.168.1.13', '192.168.1.29', 2404, 37139, 'TCP', 'c_rp_na_1']",1,4,5,0,58,24218,DF,0,128,6,...,0,PA,260,23602,0,3,72,6,b'h\x04\x0b\x00\x00\x00',6
"[1, '192.168.1.13', '192.168.1.29', 2404, 37139, 'TCP', 'c_rp_na_1']",2,4,5,0,68,22699,DF,0,64,6,...,0,PA,502,33713,0,3,82,16,b'h\x0e\x00\x00\x00\x00i\x01\x06\x00\x01\x00\x...,6
"[1, '192.168.1.13', '192.168.1.29', 2404, 37139, 'TCP', 'c_rp_na_1']",3,4,5,0,67,24244,DF,0,128,6,...,0,PA,260,35058,0,3,81,15,b'h\r\x00\x00\x02\x00i\x01G\x00\x01\x00\x00\x0...,6
"[1, '192.168.1.13', '192.168.1.29', 2404, 37139, 'TCP', 'c_rp_na_1']",4,4,5,0,58,22701,DF,0,64,6,...,0,PA,502,33703,0,3,72,6,b'h\x04\x13\x00\x00\x00',6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[1, '192.168.1.24', '192.168.1.28', 2404, 40103, 'TCP', 'c_rd_na_1']",3,4,5,0,67,11080,DF,0,128,6,...,0,PA,260,17584,0,3,81,15,b'h\r\x00\x00\x02\x00f\x01G\x00\x01\x00\x00\x0...,5
"[1, '192.168.1.24', '192.168.1.28', 2404, 40103, 'TCP', 'c_rd_na_1']",4,4,5,0,58,24188,DF,0,64,6,...,0,PA,502,33713,0,3,72,6,b'h\x04\x13\x00\x00\x00',5
"[1, '192.168.1.24', '192.168.1.28', 2404, 40103, 'TCP', 'c_rd_na_1']",5,4,5,0,58,11098,DF,0,128,6,...,0,PA,260,42652,0,3,72,6,b'h\x04#\x00\x00\x00',5
"[1, '192.168.1.24', '192.168.1.28', 2404, 40103, 'TCP', 'c_rd_na_1']",6,4,5,0,58,11101,DF,0,128,6,...,0,PA,260,50739,0,3,72,6,b'h\x04\x01\x00\x02\x00',5


In [22]:
df_train.columns

Index(['packet_number', 'ip_version', 'ip_ihl', 'ip_tos', 'ip_len', 'ip_id',
       'ip_flags', 'ip_frag', 'ip_ttl', 'ip_proto', 'ip_chksum', 'ip_src',
       'ip_dst', 'ip_options', 'time', 'tcp_sport', 'tcp_dport', 'tcp_seq',
       'tcp_ack', 'tcp_dataofs', 'tcp_reserved', 'tcp_flags', 'tcp_window',
       'tcp_chksum', 'tcp_urgptr', 'tcp_options', 'size', 'payload',
       'payload_raw', 'label'],
      dtype='object')

In [23]:
df_train.drop('tcp_flags',axis=1,inplace=True)
df_test.drop('tcp_flags',axis=1,inplace=True)
df_validation.drop('tcp_flags',axis=1,inplace=True)

In [24]:
def rename_columns(col):
    return re.sub(r'^(ip_|tcp_)', '', col)

# Rename columns
df_train.rename(columns=lambda x: rename_columns(x), inplace=True)
df_test.rename(columns=lambda x: rename_columns(x), inplace=True)
df_validation.rename(columns=lambda x: rename_columns(x), inplace=True)

In [25]:
# df_train.reset_index('packet_number',inplace=True)

In [26]:
df_train.columns

Index(['packet_number', 'version', 'ihl', 'tos', 'len', 'id', 'flags', 'frag',
       'ttl', 'proto', 'chksum', 'src', 'dst', 'options', 'time', 'sport',
       'dport', 'seq', 'ack', 'dataofs', 'reserved', 'window', 'chksum',
       'urgptr', 'options', 'size', 'payload', 'payload_raw', 'label'],
      dtype='object')

In [27]:
df_train.sort_values('time',inplace=True)
df_test.sort_values('time',inplace=True)
df_validation.sort_values('time',inplace=True)

In [28]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime
from scapy.all import IP, TCP, Raw, PacketList
import importlib
import my_flowmeter  # import the module here, so that it can be reloaded.
import warnings
from tqdm import tqdm

def process_flow(group):
    column, rows = group
    fm = my_flowmeter.Flowmeter()

    # Record the start time
    start_time = datetime.now()

    df = fm.build_feature_from_packet_list(rows)

    # Record the end time
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()

    label = rows['label'].iloc[0]
    df['label'] = label
    df['flow'] = column
    df['computation_time'] = duration
    return df

def get_flows_statistics(df_input):
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        fgb = df_input.groupby('flow')
        flows_statistics = []

        with ProcessPoolExecutor() as executor:
            futures = {executor.submit(process_flow, group): group for group in fgb}
            for future in tqdm(as_completed(futures), total=len(futures)):
                result = future.result()
                flows_statistics.append(result)

        return pd.concat(flows_statistics)

df_train_statistics = get_flows_statistics(df_train)
df_train_statistics.to_parquet(f'data/df_train_v2_statistics_{TIMEOUT}.parquet')

df_validation_statistics = get_flows_statistics(df_validation)
df_validation_statistics.to_parquet(f'data/df_validation_v2_statistics_{TIMEOUT}.parquet')

df_test_statistics = get_flows_statistics(df_test)
df_test_statistics.to_parquet(f'data/df_test_v2_statistics_{TIMEOUT}.parquet')

100%|██████████| 8673/8673 [05:31<00:00, 26.17it/s]
100%|██████████| 2169/2169 [01:19<00:00, 27.39it/s]
100%|██████████| 2711/2711 [02:04<00:00, 21.76it/s]


In [29]:
df_train_statistics

Unnamed: 0,flow,src,src_port,dst,dst_port,feduration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,...,fAvgSegmentSize,fAvgBytesPerBulk,fAvgPacketsPerBulk,fAvgBulkRate,bAvgSegmentSize,bAvgBytesPerBulk,bAvgPacketsPerBulk,bAvgBulkRate,label,computation_time
0,"['192.168.1.13', '192.168.1.29', 2, 2404, 4424...",192.168.1.29,44243,192.168.1.13,2404,21.597911,3,6,226,462,...,9.333333,226.0,1.0,0.642857,11.000000,0,1.166667,0.750000,11,1.543233
0,"['192.168.1.19', '192.168.1.29', 2, 2404, 3622...",192.168.1.29,36229,192.168.1.19,2404,20.984048,3,4,226,297,...,9.333333,226.0,1.0,0.700000,8.250000,0,1.000000,0.700000,8,1.174849
0,"['192.168.1.13', '192.168.1.29', 2, 2404, 3408...",192.168.1.29,34087,192.168.1.13,2404,21.012660,3,5,226,379,...,9.333333,226.0,1.0,0.727273,9.800000,0,1.000000,0.727273,8,1.145502
0,"['192.168.1.13', '192.168.1.29', 2, 2404, 4481...",192.168.1.29,44811,192.168.1.13,2404,25.610905,3,8,226,625,...,9.333333,226.0,1.0,0.478261,12.125000,0,1.750000,0.836957,9,0.907562
0,"['192.168.1.19', '192.168.1.29', 2, 2404, 4031...",192.168.1.29,40319,192.168.1.19,2404,30.995201,2,5,154,382,...,11.000000,154.0,1.0,0.777778,10.400000,0,1.000000,0.777778,10,0.442967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,"[1, '192.168.1.26', '192.168.1.27', 2404, 4682...",192.168.1.27,46827,192.168.1.26,2404,41.011422,3,6,226,454,...,9.333333,226.0,1.0,0.750000,9.666667,0,1.000000,0.750000,10,1.050429
0,"[1, '192.168.1.26', '192.168.1.27', 2404, 4676...",192.168.1.27,46769,192.168.1.26,2404,59.072910,3,10,226,796,...,9.333333,226.0,1.0,0.684211,13.600000,0,1.200000,0.821053,11,1.249609
0,"[1, '192.168.1.26', '192.168.1.27', 2404, 4684...",192.168.1.27,46849,192.168.1.26,2404,29.990441,3,5,226,379,...,9.333333,226.0,1.0,0.727273,9.800000,0,1.000000,0.727273,8,0.750040
0,"[1, '192.168.1.26', '192.168.1.27', 2404, 4683...",192.168.1.27,46839,192.168.1.26,2404,30.155121,3,5,226,379,...,9.333333,226.0,1.0,0.727273,9.800000,0,1.000000,0.727273,9,1.028203


In [30]:
df_test_statistics

Unnamed: 0,flow,src,src_port,dst,dst_port,feduration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,...,fAvgSegmentSize,fAvgBytesPerBulk,fAvgPacketsPerBulk,fAvgBulkRate,bAvgSegmentSize,bAvgBytesPerBulk,bAvgPacketsPerBulk,bAvgBulkRate,label,computation_time
0,"['192.168.1.13', '192.168.1.29', 2, 2404, 4390...",192.168.1.29,43903,192.168.1.13,2404,20.993662,3,5,226,384,...,9.333333,226.0,1.0,0.615385,10.800000,0,1.200000,0.738462,7,0.866473
0,"['192.168.1.19', '192.168.1.29', 2, 2404, 3622...",192.168.1.29,36229,192.168.1.19,2404,20.984048,3,4,226,297,...,9.333333,226.0,1.0,0.700000,8.250000,0,1.000000,0.700000,8,0.977323
0,"['192.168.1.19', '192.168.1.29', 2, 2404, 3462...",192.168.1.29,34627,192.168.1.19,2404,41.008667,3,9,226,697,...,9.333333,226.0,1.0,0.666667,11.444444,0,1.333333,0.888889,6,1.077671
0,"['192.168.1.19', '192.168.1.29', 2, 2404, 3943...",192.168.1.29,39439,192.168.1.19,2404,44.419011,3,10,226,780,...,9.333333,226.0,1.0,0.619048,12.000000,0,1.400000,0.866667,11,1.211560
0,"['192.168.1.13', '192.168.1.29', 2, 2404, 4588...",192.168.1.29,45887,192.168.1.13,2404,30.994677,2,8,154,625,...,11.000000,154.0,1.0,0.666667,12.125000,0,1.375000,0.916667,8,1.264353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,"[1, '192.168.1.26', '192.168.1.27', 2404, 4666...",192.168.1.27,46669,192.168.1.26,2404,41.004516,3,9,226,714,...,9.333333,226.0,1.0,0.666667,13.333333,0,1.222222,0.814815,11,1.521307
0,"[1, '192.168.1.26', '192.168.1.27', 2404, 4675...",192.168.1.27,46753,192.168.1.26,2404,30.145040,3,7,226,559,...,9.333333,226.0,1.0,0.714286,13.857143,0,1.142857,0.816327,8,1.396594
0,"[1, '192.168.1.26', '192.168.1.27', 2404, 4671...",192.168.1.27,46719,192.168.1.26,2404,21.506563,3,6,226,466,...,9.333333,226.0,1.0,0.642857,11.666667,0,1.166667,0.750000,2,1.408157
0,"[1, '192.168.1.26', '192.168.1.27', 2404, 4676...",192.168.1.27,46761,192.168.1.26,2404,41.037891,3,7,226,534,...,9.333333,226.0,1.0,0.666667,10.285714,0,1.142857,0.761905,11,1.378557


In [31]:
# from datetime import datetime
# from scapy.all import IP, TCP, Raw, PacketList
# import importlib
# import my_flowmeter  # import the module here, so that it can be reloaded.
# import warnings
# from tqdm import tqdm

# importlib.reload(my_flowmeter)
# def create_scapy_packet(row):
#     ip_layer = IP(
#         version=row['ip_version'],
#         ihl=row['ip_ihl'],
#         tos=row['ip_tos'],
#         len=row['ip_len'],
#         id=row['ip_id'],
#         flags=row['ip_flags'],
#         frag=row['ip_frag'],
#         ttl=row['ip_ttl'],
#         proto=row['ip_proto'],
#         chksum=row['ip_chksum'],
#         src=row['ip_src'],
#         dst=row['ip_dst'],
#         # options=row['ip_options']
#     )

#     tcp_layer = TCP(
#         sport=row['tcp_sport'],
#         dport=row['tcp_dport'],
#         seq=row['tcp_seq'],
#         ack=row['tcp_ack'],
#         dataofs=row['tcp_dataofs'],
#         reserved=row['tcp_reserved'],
#         flags=row['tcp_flags'],
#         window=row['tcp_window'],
#         chksum=row['tcp_chksum'],
#         urgptr=row['tcp_urgptr'],
#         # options=row['tcp_options']
#     )

#     # Payload can be either a string or raw bytes
#     if row['payload_raw']:
#         payload_layer = Raw(load=row['payload_raw'])


#     # Combine the layers to form the packet
#     packet = ip_layer / tcp_layer / payload_layer
#     return packet

# # def binary_vector_to_bytes(binary_vector):
# #     """Convert a binary numpy vector to a bytes variable."""
# #     return bytes(np.packbits(binary_vector))

# def get_flows_statistics(df_input):
#     with warnings.catch_warnings():
#         warnings.filterwarnings("ignore", category=FutureWarning)
#         fm = my_flowmeter.Flowmeter()
#         flows_statistics = []
#         fgb = df_input.groupby('flow')
#         a = 0
#         for i in tqdm(fgb):
#             a += 1
#             column, rows = i
#             packets = []

#             # for index, row in rows.iterrows():
#             #     packets.append(create_scapy_packet(row))

#             # packet_list = PacketList(rows)
#             # Record the start time
#             start_time = datetime.now()
#             # print(rows)
#             df = fm.build_feature_from_packet_list(rows)#.copy()
#             # Record the end time
#             end_time = datetime.now()
#             # Calculate the duration
#             duration = (end_time - start_time).total_seconds()

#             label = rows['label'].iloc[0]
#             df['label'] = label
#             df['flow'] = column
#             df['computation_time'] = duration
#             flows_statistics.append(df)
#             # display(df.head())
#             # if a == 4:
#             #     break
#         return pd.concat(flows_statistics)

# df_train_statistics = get_flows_statistics(df_train)
# df_train_statistics.to_parquet('data/df_train_v2_statistics.parquet')

# df_validation_statistics = get_flows_statistics(df_validation)
# df_validation_statistics.to_parquet('data/df_validation_v2_statistics.parquet')

# df_test_statistics = get_flows_statistics(df_test)
# df_test_statistics.to_parquet('data/df_test_v2_statistics.parquet')