# apply masks & generate pickle

In [8]:
import os
from scapy.all import rdpcap, IP, TCP, UDP, PcapReader, DNS
import pickle
def bytes2bits(x):
    """Render a bytes-like object as a string of '0'/'1' characters.

    Every byte contributes eight characters, most significant bit first,
    so a ``bytes`` input of length n yields a string of length 8 * n.
    """
    return ''.join(format(octet, '08b') for octet in x)
def bits2bytes(x):
    """Pack a '0'/'1' string into ``bytes``, eight characters per byte.

    A trailing group shorter than eight characters is parsed as it stands
    (it is not right-padded), so the one-character string '1' becomes the
    single byte 0x01.
    """
    octets = []
    for offset in range(0, len(x), 8):
        group = x[offset:offset + 8]
        octets.append(int(group, 2))
    return bytes(octets)
def mask(bits, start, end):
    """Return *bits* with the slice ``[start, end)`` replaced by '0' characters.

    Exactly ``end - start`` zeros are inserted, so if ``end`` lies past the
    end of *bits* the result is longer than the input.
    """
    prefix = bits[:start]
    suffix = bits[end:]
    zeros = '0' * (end - start)
    return prefix + zeros + suffix
def bits2ints(b):
    """Convert a '0'/'1' string into a list of byte values (0-255).

    The string is left-padded with zeros up to the next multiple of eight
    characters before being cut into 8-character groups.
    """
    width = (len(b) + 7) // 8 * 8
    padded = b.zfill(width)
    values = []
    for pos in range(0, len(padded), 8):
        values.append(int(padded[pos:pos + 8], 2))
    return values

def mask_ip_header(packet):
    """Return the packet's IP header with selected fields zeroed out.

    The header length is read from bits 4-7 of the first header byte
    (counted in 32-bit words) and everything past the header is dropped.
    The identification, checksum, source-address and destination-address
    fields are then overwritten with zero bits.

    Returns:
        The masked header as ``bytes``, or the empty string ``''`` when the
        packet has no IP layer (callers test the result with ``len``).
    """
    if IP not in packet:
        return ''

    all_bits = bytes2bits(bytes(packet[IP]))
    # Header length in 32-bit words lives in the low nibble of byte 0.
    header_words = int(all_bits[4:8], 2)
    header_bits = all_bits[:header_words * 32]

    # (first bit, one-past-last bit) of every field to blank.
    masked_fields = (
        (32, 48),    # identification
        (80, 96),    # checksum
        (96, 128),   # src ip
        (128, 160),  # dst ip
    )
    for first_bit, stop_bit in masked_fields:
        header_bits = mask(header_bits, first_bit, stop_bit)
    return bits2bytes(header_bits)

def mask_tcpudp_header(packet):
    """Return the TCP or UDP header of *packet* with both ports zeroed.

    For TCP the header length is ``dataofs`` 32-bit words; for UDP it is the
    first 8 bytes. TCP is checked first, so a packet containing both layers
    is treated as TCP.

    Returns:
        The masked header as ``bytes``, or the empty string ``''`` when the
        packet has neither a TCP nor a UDP layer.
    """
    if TCP in packet:
        segment = packet[TCP]
        header_len = segment.dataofs * 4
        raw_header = bytes(segment)[:header_len]
    elif UDP in packet:
        raw_header = bytes(packet[UDP])[:8]
    else:
        return ''

    bits = bytes2bits(raw_header)
    port_fields = (
        (0, 16),   # src port
        (16, 32),  # dst port
    )
    for first_bit, stop_bit in port_fields:
        bits = mask(bits, first_bit, stop_bit)
    return bits2bytes(bits)


def preprocess_packet(packet, exclude_payload=False, payload_maxlen=1500):
    """Turn one packet into lists of byte values, or reject it.

    Args:
        packet: a packet object supporting ``LAYER in packet`` and
            ``packet[LAYER]`` for the IP/TCP/UDP layers.
        exclude_payload: when true, keep only packets whose transport
            payload is empty and return just the two masked headers; when
            false, keep only packets that do carry a payload and return the
            headers plus the payload.
        payload_maxlen: maximum number of payload bytes kept.

    Returns:
        ``[ip_header, transport_header]`` or
        ``[ip_header, transport_header, payload]`` (each a list of ints),
        or the empty string ``''`` when the packet is rejected.
    """
    if TCP in packet:
        transport = packet[TCP]
    elif UDP in packet:
        transport = packet[UDP]
    else:
        return ''
    payload = bytes(transport.payload)

    ip_header = mask_ip_header(packet)
    tcpudp_header = mask_tcpudp_header(packet)
    if not ip_header or not tcpudp_header:
        return ''

    headers = [list(ip_header), list(tcpudp_header)]
    has_payload = len(payload) != 0

    if exclude_payload:
        # Header-only mode: packets that carry data are dropped.
        return '' if has_payload else headers

    if not has_payload:
        # Payload mode: packets without data are dropped.
        return ''

    limit = min(payload_maxlen, len(payload))
    headers.append(list(payload)[:limit])
    return headers
        

  cipher=algorithms.TripleDES,
  cipher=algorithms.TripleDES,


In [17]:
def process_dataset(dataset_dir, out_dir, exclude_payload=False):
    """Convert every ``.pcap`` under *dataset_dir* into per-label pickle streams.

    The label of a capture is the name of the directory that directly
    contains it. Each accepted packet is written with its own
    ``pickle.dump`` call, appended to ``<out_dir>/<label>.pkl``; several
    captures with the same label therefore end up in the same file, and the
    file has to be read back with repeated ``pickle.load`` calls until
    ``EOFError``.

    Args:
        dataset_dir: root directory that is walked recursively.
        out_dir: directory receiving the ``.pkl`` files (created if missing).
        exclude_payload: forwarded to ``preprocess_packet``.

    Side effects:
        Replaces ``DNS.dissect`` with a no-op for the rest of the process,
        prints each capture path and an extraction summary, and appends to
        files in *out_dir* (existing files are never truncated).
    """
    os.makedirs(out_dir, exist_ok=True)
    # NOTE(review): this turns off scapy's DNS dissection globally; the reason
    # is not visible in this file - confirm before removing.
    DNS.dissect = lambda self, s: None
    for root, _dirs, files in os.walk(dataset_dir):  # subdir level
        for file in files:
            if not file.endswith('.pcap'):
                continue

            file_path = os.path.join(root, file)
            # Parent directory name, derived with os.path so it does not
            # depend on '/' being the path separator.
            label = os.path.basename(os.path.dirname(file_path))
            out_file = os.path.join(out_dir, label + '.pkl')

            print(file_path)
            extracted = 0
            total = 0
            # Opened lazily on the first accepted packet so that a capture
            # with nothing to extract does not create an empty .pkl file,
            # and kept open so the file is not reopened for every packet.
            out_f = None
            try:
                # Stream the capture instead of loading it all into memory.
                with PcapReader(file_path) as reader:
                    for packet in reader:
                        total += 1
                        out = preprocess_packet(packet, exclude_payload=exclude_payload)
                        if len(out) == 0:
                            continue

                        # Internal sanity checks on the record shape.
                        if exclude_payload:
                            assert len(out) == 2
                        else:
                            assert len(out[2]) > 0

                        if out_f is None:
                            out_f = open(out_file, "ab")
                        pickle.dump(out, out_f)
                        extracted += 1
            finally:
                if out_f is not None:
                    out_f.close()
            print(f'Extracted {extracted} out of {total} packets')


In [18]:
# Input: raw D2 captures; the printed paths show one sub-directory per label.
dataset_dir = '../datasets/raw/D2'
# Output: one <label>.pkl per sub-directory, holding header-only records.
out_dir = '../datasets/processed/D2_nopayload'

# exclude_payload=True keeps only packets whose TCP/UDP payload is empty.
process_dataset(dataset_dir, out_dir, exclude_payload=True)


../datasets/raw/D2/twitter/twitter_1.pcap
Extracted 125186 out of 356192 packets
../datasets/raw/D2/ted/TED_2.pcap
Extracted 35422 out of 78244 packets
../datasets/raw/D2/ted/TED_1.pcap
Extracted 133641 out of 353894 packets
../datasets/raw/D2/amazon/amazon_1.pcap
Extracted 104148 out of 250460 packets
../datasets/raw/D2/baidu/baidu_2.pcap
Extracted 12116 out of 23688 packets
../datasets/raw/D2/baidu/baidu_1.pcap
Extracted 167282 out of 294552 packets
../datasets/raw/D2/youku/youku_1.pcap
Extracted 82031 out of 221172 packets
../datasets/raw/D2/douban/douban_1.pcap
Extracted 76464 out of 215897 packets
../datasets/raw/D2/google/google_2.pcap
Extracted 9017 out of 20460 packets
../datasets/raw/D2/google/google_1.pcap
Extracted 52421 out of 112995 packets
../datasets/raw/D2/bing/bing_2.pcap
Extracted 20467 out of 55395 packets
../datasets/raw/D2/bing/bing_1.pcap
Extracted 73301 out of 200663 packets
../datasets/raw/D2/youtube/youtube_1.pcap
Extracted 203127 out of 519091 packets
../datas

# more processing

In [6]:
# Source of the per-label pickle streams to down-sample.
dataset_dir = '../datasets/processed/D2'
# Destination for the down-sampled copies (same file names).
out_dir = '../datasets/processed/D2_half'

import pickle, os
# Make sure the destination exists before the sampling cell writes into it.
os.makedirs(out_dir, exist_ok=True)

In [7]:
import random
# Fraction of each label's records kept in the down-sampled copy.
subset_pct = 0.5
# Running total of records written across all labels.
num_packets = 0
for file in os.listdir(dataset_dir):  # subdir level
    if file.endswith('.pkl'):
        # Each .pkl is a sequence of independent pickle.dump records, so it
        # is read with repeated pickle.load calls until EOFError.
        lines = []
        with open(os.path.join(dataset_dir, file), 'rb') as f:
            try:
                while True:
                    lines.append(pickle.load(f))
            except EOFError:
                print("Finished reading " + file)

        # Unseeded sampling without replacement: a different subset each run.
        subset = random.sample(lines, int(len(lines)*subset_pct))
        num_packets += len(subset)
        print(f"Extracting {len(subset)} packets from " + file)
        # "wb", not "ab": the whole subset for this label is written here in
        # one go, so appending would stack a second random sample on top of
        # the first whenever the cell is re-run.
        with open(os.path.join(out_dir, file), "wb") as f: 
            for packet in subset:
                pickle.dump(packet, f)

Finished reading tieba.pkl
Extracting 62876 packets from tieba.pkl
Finished reading instagram.pkl
Extracting 62568 packets from instagram.pkl
Finished reading weibo.pkl
Extracting 161742 packets from weibo.pkl
Finished reading imdb.pkl
Extracting 126816 packets from imdb.pkl
Finished reading jd.pkl
Extracting 75494 packets from jd.pkl
Finished reading baidu.pkl
Extracting 69077 packets from baidu.pkl
Finished reading taobao.pkl
Extracting 113999 packets from taobao.pkl
Finished reading reddit.pkl
Extracting 116287 packets from reddit.pkl
Finished reading netease.pkl
Extracting 78369 packets from netease.pkl
Finished reading youku.pkl
Extracting 69522 packets from youku.pkl
Finished reading qq.pkl
Extracting 49862 packets from qq.pkl
Finished reading twitter.pkl
Extracting 115497 packets from twitter.pkl
Finished reading google.pkl
Extracting 35861 packets from google.pkl
Finished reading ted.pkl
Extracting 131454 packets from ted.pkl
Finished reading iqiyi.pkl
Extracting 115766 packets

In [5]:
num_packets

1255178

In [None]:
import os
import pickle

# For every label file under the processed D1 tree, load all records and
# print eight times the largest segment_array() length found in that file.
# NOTE(review): segment_array is not defined in this part of the notebook.
for root, dirs, files in os.walk('../datasets/processed/D1/'):  # subdir level
    for file in files:
        X, Y = [], []
        if not file.endswith('.pkl'):
            continue

        label = file[:-4]  # file name without the ".pkl" suffix
        file_name = os.path.join(root, file)
        with open(file_name, 'rb') as f:
            # The file is a stream of individually pickled records.
            while True:
                try:
                    packet = pickle.load(f)
                except EOFError:
                    print("Finished reading " + file_name)
                    break
                # Records here are expected to be [ip, transport, payload].
                assert len(packet) == 3
                X.append(packet)
                Y.append(label)

        lens = [len(segments) for segments in map(segment_array, X)]
        print(8 * max(lens))

# compress

In [25]:
# Directory whose contents go into the archive.
compress_dir = '../datasets/processed/D1_nopayload'
# Archive path without extension; make_archive appends the format's suffix.
out_name = '../datasets/processed/D1_nopayload'

In [26]:
import shutil

# Pack everything under compress_dir into a gzip-compressed tarball named
# after out_name; the call evaluates to the path of the archive it created.
shutil.make_archive(base_name=out_name, format='gztar', root_dir=compress_dir)


'/Users/tianh/Desktop/EBSNN/datasets/processed/D1_nopayload.tar.gz'