In [349]:
from scapy.all import PcapReader
import re
from typing import List, Any
from scapy.plist import PacketList
import glob
from tqdm import tqdm   
import csv
import numpy as np
import scipy.stats as stats

In [350]:
# read the pcap file and return a list of packets
def read_pcap(filename: str) -> PacketList:
    """Load every packet of a capture file into memory.

    Args:
        filename: path of the pcap file to read.

    Returns:
        Whatever ``PcapReader.read_all()`` yields for the file (annotated as
        a ``PacketList``).
    """
    # the context manager closes the reader's file handle as soon as the
    # packets are loaded, even if parsing raises part-way through
    with PcapReader(filename) as reader:
        return reader.read_all()

In [351]:
# get the number of packets of each packet size occurring in the dataset
PATH = "data/*.pcap"
files = sorted(glob.glob(PATH))  # sorted so the processing order is deterministic
packet_sizes_dict = {}  # len(packet) -> number of packets with that length
for file in tqdm(files):
    plist = read_pcap(file)
    for packet in plist:
        # measure each packet once and reuse the value for lookup and update
        size = len(packet)
        packet_sizes_dict[size] = packet_sizes_dict.get(size, 0) + 1

100%|██████████| 3010/3010 [03:02<00:00, 16.50it/s]


In [352]:
# get the top K packet sizes to later use them as features
K = 2
# (size, count) pairs ordered from most to least frequent; despite the name this is a list
sorted_dict = sorted(packet_sizes_dict.items(), key=lambda item: item[1], reverse=True)
# keep only the sizes; this also yields a (shorter or empty) tuple when fewer
# than K distinct sizes were counted, where zip(*...) unpacking would raise
TOP_K_INDICES = tuple(size for size, _count in sorted_dict[:K])

In [353]:
""" Feature extraction helper functions """
# get label name
def extract_label_from_filename(filename: str) -> int:
    """Return the numeric label embedded in a capture file name.

    The label is the run of digits that follows ``grid_``; for example
    ``data/grid_1_20230527_095444.pcap`` yields ``1``.

    Args:
        filename: file name or path containing a ``grid_<digits>`` part.

    Returns:
        The digits after ``grid_`` as an int.

    Raises:
        ValueError: if ``filename`` contains no ``grid_<digits>`` part.
    """
    # raw string: '\d' inside a normal literal is an invalid escape sequence.
    # No trailing '.+' either: it forces the digit group to give up its last
    # digit whenever nothing follows the number.
    match = re.search(r'grid_(\d+)', filename)
    if match is None:
        raise ValueError(f"cannot extract label from filename: {filename!r}")
    return int(match.group(1))

# get number of packets
def number_of_packets(plist: PacketList) -> int:
    """Return how many packets the capture holds."""
    packet_total = len(plist)
    return packet_total

# get time duration
def time_duration(plist: PacketList) -> float:
    """Return the span between the earliest and the latest packet time.

    The result is ``max(packet.time) - min(packet.time)`` converted to float,
    so it does not depend on the packets being in chronological order.

    Args:
        plist: packets exposing a ``time`` attribute.

    Returns:
        The time span as a float; ``0.0`` for an empty capture.
    """
    # collect the timestamps once instead of walking the capture twice
    timestamps = [packet.time for packet in plist]
    if not timestamps:
        return 0.0
    return float(max(timestamps) - min(timestamps))

# get total traffic volume
def total_traffic_volume(plist: PacketList) -> int:
    """Add up ``len(packet)`` over every packet in the capture."""
    volume = 0
    for packet in plist:
        volume += len(packet)
    return volume

# get number of packets sent by client
def count_packets_sent_by_client(plist: PacketList, client_prefix: str = '172.') -> int:
    """Count the packets whose source address belongs to the client.

    A packet counts when the ``src`` field of its second layer
    (``packet[0][1]``) starts with ``client_prefix``.

    Args:
        plist: packets to inspect.
        client_prefix: textual address prefix identifying the client;
            defaults to ``'172.'``.

    Returns:
        The number of matching packets (an int, not a PacketList).
    """
    # NOTE(review): assumes every packet has a second layer exposing `src`
    # (presumably IP) — confirm for captures that contain other traffic
    return sum(1 for packet in plist if packet[0][1].src.startswith(client_prefix))

# get number of packets sent to the client
def count_packets_sent_to_client(plist: PacketList, client_prefix: str = '172.') -> int:
    """Count the packets whose destination address belongs to the client.

    A packet counts when the ``dst`` field of its second layer
    (``packet[0][1]``) starts with ``client_prefix``.

    Args:
        plist: packets to inspect.
        client_prefix: textual address prefix identifying the client;
            defaults to ``'172.'``.

    Returns:
        The number of matching packets (an int, not a PacketList).
    """
    # NOTE(review): assumes every packet has a second layer exposing `dst`
    # (presumably IP) — confirm for captures that contain other traffic
    return sum(1 for packet in plist if packet[0][1].dst.startswith(client_prefix))

# count the number of packets with length in top k
def count_packets_with_length_in_top_k(plist: PacketList, top_k_indices) -> dict:
    """Count, per tracked length, how many packets have exactly that length.

    Args:
        plist: packets to inspect.
        top_k_indices: iterable of packet lengths to track.

    Returns:
        A dict mapping every length in ``top_k_indices`` (in the given order)
        to the number of packets of that length; lengths that never occur
        map to 0. Packets with other lengths are ignored.
    """
    packet_count = {index: 0 for index in top_k_indices}
    for packet in plist:
        # measure each packet once and reuse the value for lookup and update
        size = len(packet)
        if size in packet_count:
            packet_count[size] += 1
    return packet_count

# gaps between consecutive packets, shared by all inter-arrival statistics below
def _inter_arrival_times(plist: PacketList) -> list:
    """Return each packet's time minus the time of the packet before it.

    Packets are taken in the order they appear in ``plist``. The result has
    one entry fewer than ``plist`` and is empty when ``plist`` holds fewer
    than two packets.
    """
    timestamps = [float(packet.time) for packet in plist]
    return [later - earlier for earlier, later in zip(timestamps, timestamps[1:])]

# get maximum inter-arrival time
def max_inter_arrival_time(plist: PacketList) -> float:
    """Return the largest gap between consecutive packets (nan if there is none)."""
    gaps = _inter_arrival_times(plist)
    return max(gaps) if gaps else float('nan')

# get minimum inter-arrival time
def min_inter_arrival_time(plist: PacketList) -> float:
    """Return the smallest gap between consecutive packets (nan if there is none)."""
    gaps = _inter_arrival_times(plist)
    return min(gaps) if gaps else float('nan')

# get average inter-arrival time
def avg_inter_arrival_time(plist: PacketList) -> float:
    """Return the arithmetic mean of the gaps between consecutive packets (nan if there is none)."""
    gaps = _inter_arrival_times(plist)
    return sum(gaps) / len(gaps) if gaps else float('nan')

# get standard deviation of inter-arrival time
def std_inter_arrival_time(plist: PacketList) -> float:
    """Return ``np.std`` of the gaps between consecutive packets (nan if there is none)."""
    gaps = _inter_arrival_times(plist)
    # answer nan directly instead of letting numpy warn about an empty input
    return np.std(gaps) if gaps else float('nan')

# skewness of inter-arrival time
def skew_inter_arrival_time(plist: PacketList) -> float:
    """Return ``stats.skew`` of the gaps between consecutive packets (nan if there is none)."""
    gaps = _inter_arrival_times(plist)
    return stats.skew(gaps) if gaps else float('nan')

# kurtosis of inter-arrival time
def kurt_inter_arrival_time(plist: PacketList) -> float:
    """Return ``stats.kurtosis`` of the gaps between consecutive packets (nan if there is none)."""
    gaps = _inter_arrival_times(plist)
    return stats.kurtosis(gaps) if gaps else float('nan')


In [354]:
# K passed to count_K_packets when building the per-capture features
SIZED_PACKETS_NUM = 3
def count_K_packets(plist, K: int):
    """Count the packets at the K largest and the K smallest packet lengths.

    Args:
        plist: packets to inspect; only ``len(packet)`` is used.
        K: how many distinct lengths to report from each end.

    Returns:
        A pair ``(biggest_list, smallest_list)``. ``biggest_list[i]`` is the
        number of packets having the i-th largest distinct length and
        ``smallest_list[i]`` the number having the i-th smallest. Both lists
        have exactly K entries (none for K <= 0) and are padded with zeros
        when the capture has fewer than K distinct lengths.
    """
    # tally how many packets share each length, measuring each packet once
    size_counts = {}
    for packet in plist:
        size = len(packet)
        size_counts[size] = size_counts.get(size, 0) + 1

    # clamp so a negative K gives empty lists rather than a negative slice
    wanted = max(K, 0)

    def counts_for(ordered_sizes):
        picked = [size_counts[size] for size in ordered_sizes[:wanted]]
        # pad with zeros when there are fewer than K distinct lengths
        return picked + [0] * (wanted - len(picked))

    ascending_sizes = sorted(size_counts)
    biggest_list = counts_for(ascending_sizes[::-1])
    smallest_list = counts_for(ascending_sizes)

    return biggest_list, smallest_list

def count_avg_packet_size(plist):
    """Return ``np.mean`` of ``len(packet)`` over all packets in the capture."""
    packet_lengths = list(map(len, plist))
    return np.mean(packet_lengths)

In [355]:
# extract features
def extract_features(filename: str) -> dict:
    """Read one capture file and compute its feature dictionary.

    Args:
        filename: path of the pcap file; its label is parsed from the name.

    Returns:
        A dict mapping feature name to value, in insertion order: label,
        packet count, duration (truncated to int), total volume, packets
        sent by / to the client, one count per length in ``TOP_K_INDICES``,
        the packet counts at the ``SIZED_PACKETS_NUM`` largest lengths, and
        the average packet size and inter-arrival statistics rounded to two
        decimals.
    """
    # read pcap file
    plist = read_pcap(filename)
    # extract features
    features = {}
    features['label'] = extract_label_from_filename(filename)
    features['number_of_packets'] = number_of_packets(plist)
    features['time_duration'] = int(time_duration(plist))
    features['total_traffic_volume'] = total_traffic_volume(plist)
    features['count_packets_sent_by_client'] = count_packets_sent_by_client(plist)
    features['count_packets_sent_to_client'] = count_packets_sent_to_client(plist)
    for index, count in count_packets_with_length_in_top_k(plist, TOP_K_INDICES).items():
        features[f'count_packets_with_length_{index}'] = count
    # only the counts for the largest lengths become features; the counts for
    # the smallest lengths are returned as well but are not used here
    biggest_packets_list, _ = count_K_packets(plist, SIZED_PACKETS_NUM)
    for index in range(SIZED_PACKETS_NUM):
        features[f'biggest_packets_{index}'] = biggest_packets_list[index]
    features['avg_packet_size'] = round(count_avg_packet_size(plist), 2)
    features['max_inter_arrival_time'] = round(max_inter_arrival_time(plist), 2)
    features['avg_inter_arrival_time'] = round(avg_inter_arrival_time(plist), 2)
    features['std_inter_arrival_time'] = round(std_inter_arrival_time(plist), 2)
    features['skew_inter_arrival_time'] = round(skew_inter_arrival_time(plist), 2)
    features['kurt_inter_arrival_time'] = round(kurt_inter_arrival_time(plist), 2)
    return features

In [356]:
# spot check: run the extractor on a single capture and display the resulting dict
extract_features("data/grid_1_20230527_095444.pcap")

{'label': 1,
 'number_of_packets': 437,
 'time_duration': 18,
 'total_traffic_volume': 461872,
 'count_packets_sent_by_client': 181,
 'count_packets_sent_to_client': 256,
 'count_packets_with_length_54': 183,
 'count_packets_with_length_590': 79,
 'biggest_packets_0': 1,
 'biggest_packets_1': 1,
 'biggest_packets_2': 1,
 'avg_packet_size': 1056.92,
 'max_inter_arrival_time': 0.75,
 'avg_inter_arrival_time': 0.04,
 'std_inter_arrival_time': 0.13,
 'skew_inter_arrival_time': 3.88,
 'kurt_inter_arrival_time': 14.03}

In [357]:
# run the extractor over every capture: one feature dict per file, in `files` order
feature_list = []
for filename in tqdm(files):
    feature_list += [extract_features(filename)]

100%|██████████| 3010/3010 [03:06<00:00, 16.12it/s]


In [359]:
# write features to csv file
# the first record's keys define the column names and their order
keys = feature_list[0].keys()
with open('features.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    # write the column names first; without a header row the values in the
    # file cannot be mapped back to their features
    dict_writer.writeheader()
    dict_writer.writerows(feature_list)