# apply masks & generate pickle

In [None]:
import os
from scapy.all import rdpcap, IP, TCP, UDP, DNS
import pickle
from utils import mask_ip_header, mask_tcpudp_header

def preprocess_packet(packet, exclude_payload=False, payload_maxlen=1500):
    """Convert one packet into lists of header (and optionally payload) bytes.

    Args:
        packet: A scapy packet.
        exclude_payload: Selects which packets are kept. When true, only
            packets whose TCP/UDP payload is empty are kept and the result
            has two entries. When false, only packets with a non-empty
            payload are kept and the result has three entries.
        payload_maxlen: Upper bound on the number of payload bytes kept.

    Returns:
        ``[ip_header, tcpudp_header]`` or ``[ip_header, tcpudp_header,
        payload]`` (each a list), or the empty string ``''`` when the packet
        is skipped: it has neither TCP nor UDP, one of the masked headers is
        empty, or its payload does not match the ``exclude_payload`` mode.
    """
    if TCP in packet:
        transport = TCP
    elif UDP in packet:
        transport = UDP
    else:
        return ''
    raw_payload = bytes(packet[transport].payload)

    # Header masking is delegated to the project's utils helpers.
    masked_ip = mask_ip_header(packet)
    masked_l4 = mask_tcpudp_header(packet)
    if len(masked_ip) == 0 or len(masked_l4) == 0:
        return ''

    record = [list(masked_ip), list(masked_l4)]

    if not raw_payload:
        # Payload-less packets are only wanted in header-only mode.
        return record if exclude_payload else ''

    if exclude_payload:
        # Header-only mode drops every packet that carries a payload.
        return ''

    payload_bytes = list(raw_payload)
    keep = min(payload_maxlen, len(payload_bytes))
    record.append(payload_bytes[:keep])
    return record
        

In [None]:
def process_dataset(dataset_dir, out_dir, exclude_payload=False):
    """Convert every ``.pcap`` under ``dataset_dir`` into per-label pickles.

    The label of a capture is the name of the directory that contains it.
    Each kept packet is appended to ``<out_dir>/<label>.pkl`` as its own
    pickled object, so one output file is a stream of back-to-back pickles
    and several captures with the same label share a file. Because the
    files are opened in append mode, running this twice on the same
    ``out_dir`` duplicates the data.

    Args:
        dataset_dir: Root directory that is walked recursively.
        out_dir: Directory for the ``.pkl`` files; created if missing.
        exclude_payload: Passed to ``preprocess_packet``; selects
            header-only records (2 entries) versus header+payload records
            (3 entries).

    Side effects:
        Replaces ``DNS.dissect`` with a no-op for the rest of the process.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Disable scapy's DNS dissection globally (presumably to avoid slow or
    # failing DNS parsing while reading captures -- TODO confirm).
    DNS.dissect = lambda self, s: None
    for root, dirs, files in os.walk(dataset_dir):  # subdir level
        for file in files:
            if not file.endswith('.pcap'):
                continue

            file_path = os.path.join(root, file)
            # Parent directory name, independent of the path separator.
            label = os.path.basename(os.path.dirname(file_path))
            out_file = os.path.join(out_dir, label + '.pkl')

            print(file_path)
            packets = rdpcap(file_path)
            extracted = 0
            # Opened on the first kept packet so that a capture yielding
            # nothing does not create an empty file, and kept open for the
            # whole capture instead of being reopened per packet.
            out_handle = None
            try:
                for raw_packet in packets:
                    record = preprocess_packet(
                        raw_packet, exclude_payload=exclude_payload
                    )
                    if len(record) == 0:
                        continue

                    # Sanity checks on the record shape for the chosen mode.
                    if exclude_payload:
                        assert len(record) == 2
                    else:
                        assert len(record[2]) > 0

                    if out_handle is None:
                        out_handle = open(out_file, "ab")
                    pickle.dump(record, out_handle)
                    extracted += 1
            finally:
                if out_handle is not None:
                    out_handle.close()
            print(f'Extracted {extracted} out of {len(packets)} packets')


In [None]:
# Header-only export of the raw D2 captures. With exclude_payload=True,
# preprocess_packet keeps only packets that have no TCP/UDP payload.
dataset_dir = '../data/raw/D2'
out_dir = '../data/processed/D2_nopayload'

process_dataset(dataset_dir, out_dir, exclude_payload=True)


# more processing

In [None]:
# Input directory of per-label pickles and output directory for the
# subsampled copies; the output directory is created up front.
dataset_dir = '../data/processed/D2'
out_dir = '../data/processed/D2_half'

import pickle, os
os.makedirs(out_dir, exist_ok=True)

In [None]:
import random

# Fraction of each file's records to keep.
subset_pct = 0.5
# Running total of records written across all files.
num_packets = 0
for file in os.listdir(dataset_dir):  # subdir level
    if not file.endswith('.pkl'):
        continue

    # Each file is a stream of back-to-back pickled objects: load until EOF.
    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    lines = []
    with open(os.path.join(dataset_dir, file), 'rb') as f:
        try:
            while True:
                lines.append(pickle.load(f))
        except EOFError:
            print("Finished reading " + file)

    # Unseeded sampling without replacement; the subset differs per run.
    subset = random.sample(lines, int(len(lines)*subset_pct))
    num_packets += len(subset)
    print(f"Extracting {len(subset)} packets from " + file)
    # Write mode, not append: each input file maps to exactly one output
    # file, so appending would stack a second random subset onto the first
    # whenever this cell is re-run.
    with open(os.path.join(out_dir, file), "wb") as f:
        for packet in subset:
            pickle.dump(packet, f)

In [None]:
# Display the total number of records kept across all subsampled files.
num_packets

In [None]:
import os
import pickle

# For every label pickle under D1, print the largest segment_array(...)
# length multiplied by 8.
# NOTE(review): segment_array is not defined in the visible part of this
# notebook -- presumably it comes from an earlier cell or import; confirm.
for root, dirs, files in os.walk('../data/processed/D1/'):  # subdir level
    for file in files:
        X, Y = [], []
        if file.endswith('.pkl'):
            label = file[:-4]  # file name without the '.pkl' suffix
            file_name = os.path.join(root, file)
            # The file is a stream of pickled records; read until EOF.
            # NOTE: pickle.load can execute arbitrary code; trusted files only.
            with open(file_name, 'rb') as f:
                try:
                    while True:
                        packet = pickle.load(f)
                        # Expect [ip_header, tcpudp_header, payload] records.
                        assert(len(packet)==3)
                        X.append(packet)
                        Y.append(label)
                except EOFError:
                    print("Finished reading " + file_name)

            lens = [len(segment_array(x)) for x in X]
            # default=0 keeps a file without records from raising ValueError.
            print(max(lens, default=0)*8)

# extract flow

In [1]:
import os, pickle
from scapy.all import rdpcap, IP, TCP, UDP, DNS
from collections import defaultdict
from utils import mask_ip_header, mask_tcpudp_header

def extract_flows(packets, threshold=500, exclude_payload=False):
    """Group payload-carrying packets into directional TCP and UDP flows.

    A packet is considered only if it has an IP layer and a TCP or UDP layer
    with a non-empty payload (TCP is checked first). The flow key is
    ``(src ip, dst ip, src port, dst port, layer name)``, so the two
    directions of a conversation end up in different flows.

    Args:
        packets: Iterable of scapy packets.
        threshold: Maximum number of packets kept per flow, taken after
            sorting the flow by packet time.
        exclude_payload: Accepted but not used by this function.

    Returns:
        ``(tcp_flows, udp_flows)``: two ``defaultdict(list)`` objects mapping
        a flow key to its time-ordered, truncated packet list.
    """
    tcp_flows = defaultdict(list)
    udp_flows = defaultdict(list)

    for pkt in packets:
        if IP not in pkt:
            continue

        if TCP in pkt:
            layer, table = pkt[TCP], tcp_flows
        elif UDP in pkt:
            layer, table = pkt[UDP], udp_flows
        else:
            continue

        if len(bytes(layer.payload)) == 0:
            continue

        ip_layer = pkt[IP]
        flow_key = (ip_layer.src, ip_layer.dst, layer.sport, layer.dport, layer.name)
        table[flow_key].append(pkt)

    # Order each flow chronologically and cap it at `threshold` packets.
    for table in (tcp_flows, udp_flows):
        for flow_key, members in table.items():
            ordered = sorted(members, key=lambda p: p.time)
            table[flow_key] = ordered[:threshold]

    return tcp_flows, udp_flows

def process_flow(flow, payload_maxlen=1500, exclude_payload=False):
    """Convert every packet of a flow into ``[ip_header, tcpudp_header, payload]``.

    Args:
        flow: Iterable of scapy packets, each expected to carry a non-empty
            TCP or UDP payload.
        payload_maxlen: Upper bound on the number of payload bytes kept per
            packet.
        exclude_payload: Accepted but not used by this function.

    Returns:
        A list with one three-element record (lists of header and payload
        byte values) per packet, in the order of ``flow``.

    Raises:
        ValueError: If a packet has neither a TCP nor a UDP layer.
        AssertionError: If a packet's payload is empty.
    """
    records = []
    for pkt in flow:
        # Header masking is delegated to the project's utils helpers.
        masked_ip = mask_ip_header(pkt)
        masked_l4 = mask_tcpudp_header(pkt)

        if TCP in pkt:
            transport = TCP
        elif UDP in pkt:
            transport = UDP
        else:
            raise ValueError
        raw_payload = bytes(pkt[transport].payload)
        assert len(raw_payload) > 0

        payload_bytes = list(raw_payload)
        keep = min(payload_maxlen, len(payload_bytes))
        records.append([list(masked_ip), list(masked_l4), payload_bytes[:keep]])

    return records

  cipher=algorithms.TripleDES,
  cipher=algorithms.TripleDES,


In [2]:
def process_flow_dataset(dataset_dir, out_dir, exclude_payload=False):
    """Extract TCP/UDP flows from every ``.pcap`` under ``dataset_dir``.

    The label of a capture is the name of the directory that contains it.
    For each capture, the list of its processed TCP flows is appended as one
    pickled object to ``<out_dir>/<label>_tcp.pkl`` and the UDP flows to
    ``<out_dir>/<label>_udp.pkl``. A label file therefore holds one pickled
    list per capture, and a single ``pickle.load`` returns only the first.
    Files are opened in append mode, so re-running on the same ``out_dir``
    duplicates the data.

    Args:
        dataset_dir: Root directory that is walked recursively.
        out_dir: Directory for the ``.pkl`` files; created if missing.
        exclude_payload: Forwarded to ``extract_flows`` and ``process_flow``
            (both currently accept but ignore it).

    Side effects:
        Replaces ``DNS.dissect`` with a no-op for the rest of the process.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Disable scapy's DNS dissection globally (presumably to avoid slow or
    # failing DNS parsing while reading captures -- TODO confirm).
    DNS.dissect = lambda self, s: None
    for root, dirs, files in os.walk(dataset_dir):  # subdir level
        for file in files:
            if not file.endswith('.pcap'):
                continue

            file_path = os.path.join(root, file)
            # Parent directory name, independent of the path separator.
            label = os.path.basename(os.path.dirname(file_path))

            print(file_path)
            packets = rdpcap(file_path)
            tcp_flows, udp_flows = extract_flows(
                packets, exclude_payload=exclude_payload
            )

            # Same handling for both protocols; only the file suffix differs.
            for proto, flow_table in (('tcp', tcp_flows), ('udp', udp_flows)):
                if len(flow_table) == 0:
                    continue
                out_file = os.path.join(out_dir, label + '_' + proto + '.pkl')
                flows = [
                    process_flow(flow, exclude_payload=exclude_payload)
                    for flow in flow_table.values()
                ]
                with open(out_file, "ab") as f:
                    pickle.dump(flows, f)
                print(f'Extracted {len(flow_table)} {proto} flows out of {len(packets)} packets')


In [3]:
# Raw D1 captures in, per-label flow pickles out.
dataset_dir = '../data/raw/D1'
out_dir = '../data/processed/D1_flow'

In [4]:
# Run the flow extraction; prints each capture path and its flow counts.
process_flow_dataset(dataset_dir, out_dir)

../data/raw/D1/tudou/TudouVa__download.pcap
Extracted 452 tcp flows out of 38270 packets
Extracted 292 udp flows out of 38270 packets
../data/raw/D1/tudou/TudouVa__overall.pcap
Extracted 875 tcp flows out of 96262 packets
Extracted 438 udp flows out of 96262 packets
../data/raw/D1/twitter/torTwitter.pcap
Extracted 3 tcp flows out of 14654 packets
Extracted 3 udp flows out of 14654 packets
../data/raw/D1/amazon/yamaxun__browse.pcap
Extracted 205 tcp flows out of 7793 packets
../data/raw/D1/amazon/amazon.pcap
Extracted 218 tcp flows out of 7598 packets
../data/raw/D1/amazon/yamaxun__start.pcap
Extracted 86 tcp flows out of 6277 packets
../data/raw/D1/amazon/yamaxun__search.pcap
Extracted 64 tcp flows out of 2170 packets
../data/raw/D1/baidu/baidu__overall.pcap
Extracted 899 tcp flows out of 34479 packets
../data/raw/D1/baidu/extra_baidu__search.pcap
Extracted 180 tcp flows out of 6422 packets
../data/raw/D1/baidu/extra_baidu__start.pcap
Extracted 12 tcp flows out of 456 packets
../data/r

In [45]:

# Load the first pickled object from the tudou TCP flow file. The file is
# written in append mode with one pickle.dump per capture, so a single
# pickle.load returns only the flows of the first capture written.
with open('../data/processed/D1_flow/tudou_tcp.pkl', "rb") as f: 
    data = pickle.load(f)

In [47]:
# Number of flows in the object loaded above.
len(data)

452

# compress

In [None]:
# Directory to archive and the archive base name (the archive format's
# extension is appended by shutil.make_archive).
compress_dir = '../data/processed/D1_nopayload'
out_name = '../data/processed/D1_nopayload'

In [None]:
import shutil
# Pack the contents of compress_dir into a gzip-compressed tar archive
# named '<out_name>.tar.gz'.
shutil.make_archive(out_name, 'gztar', compress_dir)
