# apply masks & generate pickle

In [14]:
import os
from scapy.all import rdpcap, IP, TCP, UDP, DNS
import pickle
from utils import mask_ip_header, mask_tcpudp_header

def preprocess_packet(packet, exclude_payload=False, payload_maxlen=1500):
    """Turn one packet into lists of integer byte values, or '' if it is skipped.

    Args:
        packet: a scapy packet (anything supporting ``LAYER in packet`` and
            ``packet[LAYER].payload``).
        exclude_payload: selects which packets are kept.
            * falsy  -> keep only packets WITH transport payload; the result is
              ``[ip_header, tcpudp_header, payload]``.
            * truthy -> keep only packets with an EMPTY transport payload; the
              result is ``[ip_header, tcpudp_header]``.
            NOTE(review): in the truthy mode packets that carry a payload are
            dropped rather than stripped -- confirm this is intended.
        payload_maxlen: the payload is cut to at most this many bytes.

    Returns:
        A list of lists of ints as described above, or the empty string ``''``
        when the packet has neither TCP nor UDP, when either masked header is
        empty, or when the payload rule above rejects it. Callers detect the
        skip with ``len(result) == 0``.
    """
    skipped = ''

    # TCP takes precedence over UDP when both layers are present.
    transport = next((layer for layer in (TCP, UDP) if layer in packet), None)
    if transport is None:
        return skipped
    raw_payload = bytes(packet[transport].payload)

    # Header masking is delegated to utils; an empty result means "unusable".
    masked_ip = mask_ip_header(packet)
    masked_l4 = mask_tcpudp_header(packet)
    if not (len(masked_ip) and len(masked_l4)):
        return skipped

    fields = [list(masked_ip), list(masked_l4)]
    has_payload = len(raw_payload) > 0

    if exclude_payload:
        return skipped if has_payload else fields
    if not has_payload:
        return skipped

    limit = min(payload_maxlen, len(raw_payload))
    fields.append(list(raw_payload[:limit]))
    return fields
        

In [15]:
def process_dataset(dataset_dir, out_dir, exclude_payload=False):
    """Preprocess every ``.pcap`` under ``dataset_dir`` into per-label pickle files.

    The label of a capture is the name of the directory that directly contains
    it; all captures with the same label are appended to
    ``<out_dir>/<label>.pkl``. Each kept packet is written as its own pickle
    record, so readers must call ``pickle.load`` repeatedly until ``EOFError``.

    Args:
        dataset_dir: root directory that is walked recursively for ``.pcap`` files.
        out_dir: directory receiving the ``.pkl`` files (created if missing).
        exclude_payload: forwarded to ``preprocess_packet``.

    Side effects:
        * Replaces ``DNS.dissect`` with a no-op for the rest of the process
          (it is not restored). Presumably this skips DNS parsing while
          reading captures -- TODO confirm the reason.
        * Output files are opened in append mode, so calling this twice with
          the same ``out_dir`` duplicates records; clear ``out_dir`` first when
          regenerating a dataset.
        * Prints each capture path and an extraction summary.
    """
    os.makedirs(out_dir, exist_ok=True)
    DNS.dissect = lambda self, s: None
    for root, dirs, files in os.walk(dataset_dir):  # subdir level
        # Name of the directory holding the captures. Derived with os.path so
        # it does not depend on '/' being the path separator.
        label = os.path.basename(os.path.normpath(root))
        out_file = os.path.join(out_dir, label + '.pkl')

        for file in files:
            if not file.endswith('.pcap'):
                continue
            file_path = os.path.join(root, file)

            print(file_path)
            packets = rdpcap(file_path)

            records = []
            for packet in packets:
                record = preprocess_packet(packet, exclude_payload=exclude_payload)
                if len(record) == 0:  # '' means the packet was skipped
                    continue

                # Sanity-check the shape promised by preprocess_packet.
                if exclude_payload:
                    assert len(record) == 2
                else:
                    assert len(record[2]) > 0
                records.append(record)

            # Open the output once per capture instead of once per packet, and
            # only when there is something to write (no empty files).
            if records:
                with open(out_file, "ab") as f:
                    for record in records:
                        pickle.dump(record, f)
            print(f'Extracted {len(records)} out of {len(packets)} packets')


In [16]:
# Input: raw captures of dataset D3; output: per-label pickles (paths are
# relative to the notebook's working directory).
dataset_dir = '../data/raw/D3'
out_dir = '../data/processed/D3'

# exclude_payload=False -> keep only packets that carry payload bytes.
# process_dataset appends to existing .pkl files, so re-running this cell
# without clearing out_dir duplicates records.
process_dataset(dataset_dir, out_dir, exclude_payload=False)


../data/raw/D3/appleStocks/appleStocks.pcap
Extracted 14910 out of 19167 packets
../data/raw/D3/steam/steam_update_full.pcap
Extracted 143676 out of 234558 packets
../data/raw/D3/steam/steam_browsing_download.pcap
Extracted 2198 out of 3689 packets
../data/raw/D3/minecraft/Minecraft1.21_local_server.pcap
Extracted 25225 out of 40798 packets
../data/raw/D3/minecraft/Minecraft1.21_hypixel.pcap
Extracted 62559 out of 76313 packets


# subsetting


In [4]:
# Source of the per-label pickles to sample from, and destination of the subset.
# NOTE: this rebinds the notebook-level dataset_dir / out_dir used by earlier cells.
dataset_dir = '../data/processed/D1g'
out_dir = '../data/processed/D1g_half'

import pickle, os
os.makedirs(out_dir, exist_ok=True)

In [5]:
import random
subset_pct = 0.5   # fraction of the packets of every label file to keep
subset_seed = 0    # fixed seed so a re-run selects exactly the same packets
rng = random.Random(subset_seed)  # private generator; global random state untouched
num_packets = 0    # running total of packets written across all label files
# Sorted so the order in which files consume the generator (and therefore the
# sample drawn from each file) does not depend on the directory listing order.
for file in sorted(os.listdir(dataset_dir)):  # subdir level
    if file.endswith('.pkl'):
        lines = []
        # Each file holds one pickle record per packet, so read until EOF.
        # NOTE: pickle.load can execute arbitrary code -- only read files that
        # were produced by this project.
        with open(os.path.join(dataset_dir, file), 'rb') as f:
            try:
                while True:
                    lines.append(pickle.load(f))
            except EOFError:
                print("Finished reading " + file)

        # Sample without replacement; int() floors, so tiny files may yield 0.
        subset = rng.sample(lines, int(len(lines)*subset_pct))
        num_packets += len(subset)
        print(f"Extracting {len(subset)} packets from " + file)
        # "wb", not "ab": every label file is written exactly once per run, and
        # appending would stack a second sample on top of the previous one
        # whenever this cell is re-run.
        with open(os.path.join(out_dir, file), "wb") as f:
            for packet in subset:
                pickle.dump(packet, f)

Finished reading kugou.pkl
Extracting 85083 packets from kugou.pkl
Finished reading tudou.pkl
Extracting 44860 packets from tudou.pkl
Finished reading spotify.pkl
Extracting 4911 packets from spotify.pkl
Finished reading vimeo.pkl
Extracting 17612 packets from vimeo.pkl
Finished reading aimchat.pkl
Extracting 9312 packets from aimchat.pkl
Finished reading thunder.pkl
Extracting 32162 packets from thunder.pkl
Finished reading weibo.pkl
Extracting 24898 packets from weibo.pkl
Finished reading sohu.pkl
Extracting 19507 packets from sohu.pkl
Finished reading steam.pkl
Extracting 72937 packets from steam.pkl
Finished reading minecraft.pkl
Extracting 43892 packets from minecraft.pkl
Finished reading voipbuster.pkl
Extracting 179895 packets from voipbuster.pkl
Finished reading gmail.pkl
Extracting 5501 packets from gmail.pkl
Finished reading jd.pkl
Extracting 9743 packets from jd.pkl
Finished reading skype.pkl
Extracting 294419 packets from skype.pkl
Finished reading baidu.pkl
Extracting 1818

# extract flow

In [7]:
import os, pickle
from scapy.all import rdpcap, IP, TCP, UDP, DNS
from collections import defaultdict
from utils import mask_ip_header, mask_tcpudp_header

def extract_flows(packets, threshold=50, exclude_payload=False):
    """Group payload-carrying IP packets into directional TCP and UDP flows.

    A flow is identified by ``(src ip, dst ip, src port, dst port, layer name)``,
    so the two directions of a connection end up in different flows. Packets
    without an IP layer, without TCP/UDP, or with an empty transport payload
    are ignored. A packet containing a TCP layer is never considered for UDP.

    Args:
        packets: iterable of scapy packets.
        threshold: each flow is sorted by ``pkt.time`` and cut to its first
            ``threshold`` packets.
        exclude_payload: accepted for signature compatibility; not used.

    Returns:
        ``(tcp_flows, udp_flows)``, two ``defaultdict(list)`` objects mapping
        flow key -> list of packets in time order.
    """
    tcp_flows, udp_flows = defaultdict(list), defaultdict(list)

    for pkt in packets:
        if IP not in pkt:
            continue
        if TCP in pkt:
            layer, table = TCP, tcp_flows
        elif UDP in pkt:
            layer, table = UDP, udp_flows
        else:
            continue

        l4 = pkt[layer]
        if len(bytes(l4.payload)) == 0:
            continue

        ip = pkt[IP]
        table[(ip.src, ip.dst, l4.sport, l4.dport, l4.name)].append(pkt)

    # Order every flow chronologically, then keep only its leading packets.
    for table in (tcp_flows, udp_flows):
        for key in table:
            table[key] = sorted(table[key], key=lambda p: p.time)[:threshold]

    return tcp_flows, udp_flows

def process_flow(flow, payload_maxlen=1500, exclude_payload=False):
    """Encode every packet of a flow as ``[ip_header, tcpudp_header, payload]``.

    Each of the three parts is a list of ints; the headers come from the
    masking helpers in ``utils`` and the payload is cut to ``payload_maxlen``
    bytes.

    Args:
        flow: iterable of scapy packets; every packet must carry a non-empty
            TCP or UDP payload (``extract_flows`` only yields such packets).
        payload_maxlen: maximum number of payload bytes kept per packet.
        exclude_payload: accepted for signature compatibility; not used.

    Returns:
        List with one three-element entry per packet, in input order.

    Raises:
        ValueError: a packet has neither a TCP nor a UDP layer.
        AssertionError: a packet has an empty transport payload.
    """
    encoded = []
    for pkt in flow:
        masked_ip = mask_ip_header(pkt)
        masked_l4 = mask_tcpudp_header(pkt)

        # TCP is preferred over UDP when a packet contains both layers.
        transport = next((layer for layer in (TCP, UDP) if layer in pkt), None)
        if transport is None:
            raise ValueError

        raw = bytes(pkt[transport].payload)
        assert len(raw) > 0

        limit = min(payload_maxlen, len(raw))
        encoded.append([list(masked_ip), list(masked_l4), list(raw[:limit])])

    return encoded

In [8]:
def process_flow_dataset(dataset_dir, out_dir, exclude_payload=False, flow_threshold=10):
    """Extract flows from every ``.pcap`` under ``dataset_dir`` into per-label pickles.

    The label of a capture is the name of the directory that directly contains
    it. For every capture ONE pickle record -- the list of its encoded flows --
    is appended to ``<out_dir>/<label>.pkl``. A label with several captures
    therefore yields a file with several records; read it with repeated
    ``pickle.load`` calls until ``EOFError``.

    Args:
        dataset_dir: root directory that is walked recursively for ``.pcap`` files.
        out_dir: directory receiving the ``.pkl`` files (created if missing).
        exclude_payload: accepted for signature compatibility; not used.
        flow_threshold: only flows with MORE than this many packets are kept.
            Flows are first truncated by ``extract_flows`` using its own
            default ``threshold``.

    Side effects:
        * Replaces ``DNS.dissect`` with a no-op for the rest of the process
          (it is not restored). Presumably this skips DNS parsing while
          reading captures -- TODO confirm the reason.
        * Output files are opened in append mode, so calling this twice with
          the same ``out_dir`` duplicates records.
        * Prints each capture path and an extraction summary.
    """
    os.makedirs(out_dir, exist_ok=True)
    DNS.dissect = lambda self, s: None
    for root, dirs, files in os.walk(dataset_dir):  # subdir level
        # Name of the directory holding the captures. Derived with os.path so
        # it does not depend on '/' being the path separator.
        label = os.path.basename(os.path.normpath(root))
        out_file = os.path.join(out_dir, label + '.pkl')

        for file in files:
            if not file.endswith('.pcap'):
                continue
            file_path = os.path.join(root, file)

            print(file_path)
            packets = rdpcap(file_path)
            tcp_flows, udp_flows = extract_flows(packets)  # flow key -> packets

            # TCP flows first, then UDP flows; short flows are discarded.
            flows = [
                process_flow(flow)
                for flow_table in (tcp_flows, udp_flows)
                for flow in flow_table.values()
                if len(flow) > flow_threshold
            ]

            # A record is written even when no flow qualified (empty list).
            with open(out_file, "ab") as f:
                pickle.dump(flows, f)
            print(f'Extracted {len(flows)} flows out of {len(packets)} packets')


In [10]:
# Input captures (D2) and destination of the per-label flow pickles.
# NOTE: this rebinds the notebook-level dataset_dir / out_dir used by earlier cells.
dataset_dir = '../data/raw/D2'
out_dir = '../data/processed/D2_flow'

In [None]:
# Uses the defaults (exclude_payload=False, flow_threshold=10). Output files
# are appended to, so clear out_dir before re-running this cell.
process_flow_dataset(dataset_dir, out_dir)

../data/raw/D2/twitter/twitter_1.pcap
Extracted 137 flows out of 356192 packets
../data/raw/D2/ted/TED_2.pcap
Extracted 65 flows out of 78244 packets
../data/raw/D2/ted/TED_1.pcap
Extracted 44 flows out of 353894 packets
../data/raw/D2/amazon/amazon_1.pcap
Extracted 277 flows out of 250460 packets
../data/raw/D2/baidu/baidu_2.pcap
Extracted 123 flows out of 23688 packets
../data/raw/D2/baidu/baidu_1.pcap
Extracted 164 flows out of 294552 packets
../data/raw/D2/youku/youku_1.pcap
Extracted 435 flows out of 221172 packets
../data/raw/D2/douban/douban_1.pcap
Extracted 223 flows out of 215897 packets
../data/raw/D2/google/google_2.pcap
Extracted 38 flows out of 20460 packets
../data/raw/D2/google/google_1.pcap
Extracted 189 flows out of 112995 packets
../data/raw/D2/bing/bing_2.pcap
Extracted 24 flows out of 55395 packets
../data/raw/D2/bing/bing_1.pcap
Extracted 88 flows out of 200663 packets
../data/raw/D2/youtube/youtube_1.pcap
Extracted 279 flows out of 519091 packets
../data/raw/D2/fa

In [45]:

# Spot-check one generated flow file.
# pickle.load returns a single record; if this file was written in append mode
# with one record per capture (as process_flow_dataset does), only the first
# capture's flows are loaded here -- NOTE(review): confirm how this file was produced.
with open('../data/processed/D1_flow/tudou_tcp.pkl', "rb") as f: 
    data = pickle.load(f)

In [47]:
# Number of entries in the record loaded above.
len(data)

452

# compress

In [1]:
# Directory to archive, and archive path WITHOUT extension (make_archive
# appends the format-specific suffix).
compress_dir = '../data/processed/D1_flow'
out_name = '../data/processed/D1_flow'

In [2]:
import shutil
# Create a gzip-compressed tar archive of compress_dir; the call returns the
# path of the archive it created.
shutil.make_archive(out_name, 'gztar', compress_dir)


'/Users/tianh/Desktop/EBSNN/data/processed/D1_flow.tar.gz'