In [26]:
import os
from scapy.all import rdpcap, IP, TCP, UDP, PcapReader, DNS
import pickle
def bytes2bits(x):
    """Render an iterable of byte values as a single string of '0'/'1' characters.

    Every element of ``x`` is formatted as a zero-padded, 8-digit binary
    number and the pieces are concatenated in iteration order.

    Args:
        x: iterable of integers (e.g. a ``bytes`` object).

    Returns:
        str: the concatenated bit string; '' for an empty input.
    """
    return ''.join(map('{:08b}'.format, x))
def bits2bytes(x):
    """Pack a string of '0'/'1' characters into a ``bytes`` object.

    The string is consumed left to right in groups of 8 characters, each
    group being parsed as a base-2 integer.

    Note: a trailing group shorter than 8 characters is parsed as-is (for
    example '101' becomes the single byte 5); it is not padded on the right.

    Args:
        x: bit string.

    Returns:
        bytes: one byte per group; b'' for an empty string.
    """
    packed = bytearray()
    for pos in range(0, len(x), 8):
        group = x[pos:pos + 8]
        packed.append(int(group, 2))
    return bytes(packed)
def mask(bits, start, end):
    """Zero the half-open range [start, end) of a bit string, keeping its length.

    Bounds are clamped to the string, so masking a region that lies partly or
    wholly beyond the end never lengthens the result, and an inverted range
    (end < start) leaves the string untouched. For in-range arguments the
    result is the same as plain slicing and concatenation.

    Args:
        bits: string of '0'/'1' characters.
        start: index of the first character to zero.
        end: index one past the last character to zero.

    Returns:
        str: a new string with the same length as ``bits``.
    """
    size = len(bits)
    # Clamp both bounds into [0, size] and make sure hi is never below lo, so
    # the '0' run replaces exactly the characters that actually exist.
    lo = max(0, min(start, size))
    hi = max(lo, min(end, size))
    return bits[:lo] + '0' * (hi - lo) + bits[hi:]
def bits2ints(b):
    """Split a bit string into a list of integers, 8 bits per integer.

    The string is first left-padded with '0' (via ``str.zfill``) up to the
    next multiple of 8, so a short leading group keeps its numeric value.

    Args:
        b: string of '0'/'1' characters.

    Returns:
        list[int]: one integer per 8-character group; [] for an empty string.
    """
    width = 8
    padded = b.zfill(-(-len(b) // width) * width)  # ceil to a multiple of 8
    values = []
    for offset in range(0, len(padded), width):
        values.append(int(padded[offset:offset + width], 2))
    return values

def mask_ip_header(packet):
    """Return the packet's IP header with host-identifying fields zeroed.

    The header length is taken from the IHL field (low nibble of the first
    header byte, counted in 32-bit words). The identification, header
    checksum, source address and destination address fields are set to zero.

    Only the header bytes are converted to a bit string; the rest of the
    datagram is never expanded. Headers that cannot be valid (IHL below 5
    words, or fewer captured bytes than IHL announces) are rejected instead
    of being padded with made-up zero bytes.

    Args:
        packet: a scapy packet.

    Returns:
        bytes: the masked header (IHL * 4 bytes), or '' (empty ``str``, kept
        for backward compatibility) when the packet has no IP layer or the
        header is malformed/truncated. Callers should test ``len(...) == 0``.
    """
    if IP not in packet:
        return ''

    raw = bytes(packet[IP])
    if not raw:
        return ''

    header_len = (raw[0] & 0x0F) * 4  # IHL is expressed in 32-bit words
    if header_len < 20 or len(raw) < header_len:
        # The fixed part of the header is 20 bytes; anything shorter would
        # make the masks below extend the string rather than overwrite it.
        return ''

    ip_header_bits = bytes2bits(raw[:header_len])

    ip_header_bits = mask(ip_header_bits, 32, 48) # identification
    ip_header_bits = mask(ip_header_bits, 80, 96) # checksum
    ip_header_bits = mask(ip_header_bits, 96, 128) # src ip
    ip_header_bits = mask(ip_header_bits, 128, 160) # dst ip
    return bits2bytes(ip_header_bits)

def mask_tcpudp_header(packet):
    """Return the packet's TCP or UDP header with both port fields zeroed.

    For TCP the header is the first ``dataofs * 4`` bytes of the TCP layer;
    for UDP it is the first 8 bytes of the UDP layer. TCP is checked first.

    Args:
        packet: a scapy packet.

    Returns:
        bytes: the masked transport header, or '' (empty ``str``) when the
        packet carries neither TCP nor UDP.
    """
    if TCP in packet:
        header_len = packet[TCP].dataofs * 4
        raw_header = bytes(packet[TCP])[:header_len]
    elif UDP in packet:
        raw_header = bytes(packet[UDP])[:8]
    else:
        return ''

    bits = bytes2bits(raw_header)
    # First 16 bits: source port; next 16 bits: destination port.
    for first_bit, last_bit in ((0, 16), (16, 32)):
        bits = mask(bits, first_bit, last_bit)
    return bits2bytes(bits)


def preprocess_packet(packet, exclude_nopayload=True, payload_maxlen=1500):
    """Turn one packet into ``[ip_header, transport_header, payload]`` byte lists.

    Args:
        packet: a scapy packet.
        exclude_nopayload: when true, packets whose TCP/UDP payload is empty
            are skipped.
        payload_maxlen: maximum number of payload bytes to keep.

    Returns:
        list: three lists of ints (masked IP header, masked TCP/UDP header,
        payload truncated to ``payload_maxlen``), or '' (empty ``str``) when
        the packet is skipped: no TCP/UDP layer, empty payload while
        ``exclude_nopayload`` is set, or either masked header is empty.
    """
    # TCP takes precedence over UDP, mirroring mask_tcpudp_header.
    transport = TCP if TCP in packet else (UDP if UDP in packet else None)
    if transport is None:
        return ''

    payload = bytes(packet[transport].payload)
    if exclude_nopayload and not payload:
        return ''

    ip_header = mask_ip_header(packet)
    tcpudp_header = mask_tcpudp_header(packet)
    if not ip_header or not tcpudp_header:
        return ''

    if payload:
        body = list(payload[:min(payload_maxlen, len(payload))])
    else:
        body = []
    return [list(ip_header), list(tcpudp_header), body]

  cipher=algorithms.TripleDES,
  cipher=algorithms.TripleDES,


In [29]:
def process_dataset(dataset_dir, out_dir, exclude_nopayload=True):
    """Convert every ``.pcap`` under ``dataset_dir`` into per-label pickle streams.

    The label of a capture is the name of the directory that directly
    contains it. Each packet accepted by ``preprocess_packet`` is appended as
    one pickle record to ``<out_dir>/<label>.pkl``, so the file must be read
    back with repeated ``pickle.load`` calls until ``EOFError``.

    Captures are streamed packet by packet with ``PcapReader`` rather than
    loaded whole, and the output file is opened once per capture (on the
    first accepted packet) instead of once per packet.

    Caution: output files are opened in append mode, so running this twice
    over the same ``out_dir`` duplicates records.

    Args:
        dataset_dir: root directory that is walked recursively for pcaps.
        out_dir: directory for the ``.pkl`` files; created if missing.
        exclude_nopayload: forwarded to ``preprocess_packet``.

    Side effects:
        Replaces ``DNS.dissect`` with a no-op for the whole process and prints
        the path of every capture it processes.
    """
    # Disables scapy's DNS dissection globally (not restored afterwards);
    # presumably to avoid slow or failing DNS parsing -- TODO confirm.
    DNS.dissect = lambda self, s: None
    os.makedirs(out_dir, exist_ok=True)

    for root, _dirs, files in os.walk(dataset_dir):  # subdir level
        for file in files:
            if not file.endswith('.pcap'):
                continue

            file_path = os.path.join(root, file)
            # Parent directory name, independent of the path separator.
            label = os.path.basename(os.path.dirname(file_path))
            out_file = os.path.join(out_dir, label + '.pkl')
            print(file_path)

            out_f = None  # opened lazily so empty captures create no file
            try:
                with PcapReader(file_path) as pcap_reader:
                    for packet in pcap_reader:
                        out = preprocess_packet(packet, exclude_nopayload=exclude_nopayload)
                        if len(out) == 0:
                            continue

                        if exclude_nopayload:
                            assert len(out[2]) > 0, "empty payload despite exclude_nopayload"
                        if out_f is None:
                            out_f = open(out_file, "ab")
                        pickle.dump(out, out_f)
            finally:
                if out_f is not None:
                    out_f.close()


In [31]:
# Convert the raw D2 captures into per-label pickle files.
# dataset_dir is walked recursively for .pcap files; results go to out_dir.
# NOTE: process_dataset appends to existing .pkl files, so re-running this
# cell against a non-empty out_dir duplicates records.
dataset_dir = '../datasets/raw/D2'
out_dir = '../datasets/processed/D2'

process_dataset(dataset_dir, out_dir)


../datasets/raw/D2/twitter/twitter_1.pcap
../datasets/raw/D2/ted/TED_2.pcap
../datasets/raw/D2/ted/TED_1.pcap
../datasets/raw/D2/amazon/amazon_1.pcap
../datasets/raw/D2/baidu/baidu_2.pcap
../datasets/raw/D2/baidu/baidu_1.pcap
../datasets/raw/D2/youku/youku_1.pcap
../datasets/raw/D2/douban/douban_1.pcap
../datasets/raw/D2/google/google_2.pcap
../datasets/raw/D2/google/google_1.pcap
../datasets/raw/D2/bing/bing_2.pcap
../datasets/raw/D2/bing/bing_1.pcap
../datasets/raw/D2/youtube/youtube_1.pcap
../datasets/raw/D2/facebook/facebook_1.pcap
../datasets/raw/D2/facebook/facebook_2.pcap
../datasets/raw/D2/weibo/weibo_1.pcap
../datasets/raw/D2/imdb/imdb_2.pcap
../datasets/raw/D2/imdb/imdb_1.pcap
../datasets/raw/D2/tieba/tieba_1.pcap
../datasets/raw/D2/reddit/reddit_1.pcap
../datasets/raw/D2/reddit/reddit_2.pcap
../datasets/raw/D2/taobao/taobao_1.pcap
../datasets/raw/D2/iqiyi/iqiyi_1.pcap
../datasets/raw/D2/jd/JD_1.pcap
../datasets/raw/D2/instagram/instagram_1.pcap
../datasets/raw/D2/instagram/i

In [34]:
# packets = rdpcap('../datasets/raw/D1/thunder/ThunderPlatform__bt.pcap')

packets = rdpcap('../datasets/raw/D1/tudou/TudouVa__download.pcap')
len(packets)

38270

In [41]:
bytes(packets[0][UDP].payload)

b'KU\x00\x01\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'

In [None]:
b  = b'E\x00\x00>\x00\x00\x00\x00@\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'

In [None]:
list(b)

In [None]:
# Scratch check: a single pickle.load returns only the FIRST record of the
# file. Files written by process_dataset hold one pickle record per packet,
# so reading everything requires calling pickle.load until EOFError.
a = []
with open('../datasets/processed/D1/tudou.pkl', "rb") as file: 
    lines = pickle.load(file)

In [None]:
# Read back every record of a per-label pickle stream: the file holds many
# independently pickled objects, so pickle.load is called until it raises
# EOFError. The length of each record is printed as it is read.
lines = []
with open('../datasets/processed/D1/amazon.pkl', 'rb') as f:
    while True:
        try:
            obj = pickle.load(f)
        except EOFError:
            # End of the stream reached.
            print("Finished reading all objects.")
            break
        lines.append(obj)
        print(len(obj))

In [None]:
len(lines[1])

In [24]:
import os, pickle
# For every per-label .pkl file under processed/D1, read all packet records
# and print the largest number of segments any packet produces, times 8.
# NOTE(review): segment_array is defined in a later cell of this notebook;
# that cell must be run before this one.
# X and Y are reset for every file, so after the loop they hold the records
# of the last file visited only.
for root, dirs, files in os.walk('../datasets/processed/D1/'):  # subdir level
    for file in files: 
        X, Y = [], []
        if file.endswith('.pkl'):
            label = file[:-4]  # file name without the '.pkl' suffix
            file_name = os.path.join(root, file)
            with open(file_name, 'rb') as f:
                try:
                    # The file is a stream of pickled records; keep loading
                    # until pickle.load signals the end with EOFError.
                    while True:
                        packet = pickle.load(f)
                        # Each record is [ip_header, transport_header, payload].
                        assert(len(packet)==3)
                        X.append(packet)
                        Y.append(label)
                except EOFError:
                    print("Finished reading " + file_name)
            
            # Number of segments per packet (default segment length is used).
            lens = [len(segment_array(x)) for x in X]
            # max() raises ValueError if the file contained no records.
            print(max(lens)*8)

Finished reading ../datasets/processed/D1/kugou.pkl
1496
Finished reading ../datasets/processed/D1/tudou.pkl
1504
Finished reading ../datasets/processed/D1/spotify.pkl
34848
Finished reading ../datasets/processed/D1/vimeo.pkl
44216
Finished reading ../datasets/processed/D1/aimchat.pkl
3816
Finished reading ../datasets/processed/D1/thunder.pkl
1504
Finished reading ../datasets/processed/D1/weibo.pkl
1512
Finished reading ../datasets/processed/D1/sohu.pkl
1352
Finished reading ../datasets/processed/D1/voipbuster.pkl
1368
Finished reading ../datasets/processed/D1/gmail.pkl
1400
Finished reading ../datasets/processed/D1/jd.pkl
1512
Finished reading ../datasets/processed/D1/skype.pkl
1520
Finished reading ../datasets/processed/D1/baidu.pkl
1512
Finished reading ../datasets/processed/D1/yahoomail.pkl
1496
Finished reading ../datasets/processed/D1/netflix.pkl
48224
Finished reading ../datasets/processed/D1/taobao.pkl
1512
Finished reading ../datasets/processed/D1/itunes.pkl
1496
Finished read

In [33]:
import shutil

# Compress a directory into a tar.gz file
shutil.make_archive('../datasets/processed/D1', 'gztar', '../datasets/processed/D1')


'/Users/tianh/Desktop/EBSNN/datasets/processed/D1.tar.gz'

In [4]:

def segment_array(arr, segment_len=8):
    """Cut each sub-sequence of ``arr`` into fixed-length, zero-padded segments.

    Every sub-sequence is split independently into consecutive chunks of
    ``segment_len`` items; the last chunk of a sub-sequence is right-padded
    with 0 up to ``segment_len``. Chunks from all sub-sequences are returned
    in order in one flat list. Empty sub-sequences contribute nothing.

    Sub-sequences may be any sliceable sequence (list, tuple, bytes, ...):
    each chunk is copied into a fresh list, so the input is never modified.

    Args:
        arr: iterable of sequences.
        segment_len: number of items per segment.

    Returns:
        list[list]: L segments of exactly ``segment_len`` items each (L x N).
    """
    result = []
    for subarray in arr:
        for i in range(0, len(subarray), segment_len):
            # list(...) makes the chunk a mutable copy whatever the input type.
            segment = list(subarray[i:i + segment_len])
            segment.extend([0] * (segment_len - len(segment)))
            result.append(segment)
    return result

In [7]:
a = []
for x in X:
    a.append(segment_array(x))

In [18]:
lens = [len(b) for b in a]

In [20]:
lens

[7,
 7,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 12,
 9,
 105,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 13,
 13,
 13,
 13,
 14,
 13,
 13,
 13,
 13,
 14,
 14,
 138,
 138,
 138,
 138,
 138,
 14,
 137,
 138,
 138,
 138,
 138,
 138,
 137,
 137,
 138,
 137,
 137,
 138,
 137,
 138,
 137,
 137,
 137,
 137,
 138,
 14,
 14,
 13,
 13,
 13,
 138,
 138,
 138,
 137,
 138,
 137,
 138,
 14,
 138,
 137,
 138,
 138,
 137,
 138,
 137,
 137,
 137,
 14,
 14,
 137,
 137,
 137,
 138,
 137,
 138,
 137,
 138,
 137,
 138,
 138,
 137,
 137,
 137,
 137,
 138,
 138,
 138,
 138,
 137,
 138,
 138,
 137,
 138,
 137,
 136,
 136,
 136,
 136,
 136,
 136,
 136,
 136,
 13,
 13,
 13,
 13,
 13,
 13,
 13,
 14,
 137,
 138,
 138,
 137,
 137,

In [21]:
187*8

1496

In [19]:
max(lens)

187

In [None]:
bytes.fromhex(a[0])

In [None]:
packets[0].show()

In [None]:
8*32

In [None]:
len(bytes2bits(a[2]))

In [None]:
len(bytes2bits(a[1]))

In [None]:
256-184

In [None]:
udp = bytes2bits(bytes(packets[0][UDP]))

In [None]:
x = bits2bytes(udp[48:])
int.from_bytes(x,byteorder='big')

In [None]:
x

In [None]:
packets[0].show()

In [None]:
bytes(packets[0][UDP].payload)

In [None]:
bytes(packets[0][UDP])

In [None]:
a[2]

In [None]:
32*5

In [None]:
24*8

In [None]:
int.from_bytes(a[2],byteorder='big')

In [None]:
corpus[0]

# pcap to hdf5

In [None]:
from pcapToHdf5 import pcap_to_hdf5
import os

In [None]:
dataset_dir = '../datasets'
dataset_name = 'D1'
subdir_name = 'amazon'

In [None]:
pcap_file = os.path.join(dataset_dir, dataset_name, subdir_name, 'amazon.pcap')

In [None]:
pcap_to_hdf5(pcap_file, 'test.hdf5')

# how to use hdf5

In [None]:
filename = 'test.hdf5'

In [None]:
import h5py
from mpi4py import MPI
class H5Iter:
    """Callable visitor that collects every object named ``.../X`` it is shown.

    Meant to be handed to a ``visititems``-style traversal, which calls it
    with ``(name, object)`` pairs. For each object that has a ``dtype``
    attribute and whose name ends in '/X', a ``[first_path_component, data]``
    pair is appended to ``self.corpus``, where ``data`` is ``h5obj[:]``.
    """

    def __init__(self, st=0):
        # st is only stored; the active code never reads it.
        self.st = st
        self.corpus = []

    def __call__(self, name, h5obj):
        wanted = hasattr(h5obj, 'dtype') and name.endswith('/X')
        if not wanted:
            return
        top_level = name.partition('/')[0]
        self.corpus.append([top_level, h5obj[:]])


In [None]:
# Walk the whole HDF5 file and collect every '.../X' dataset into `corpus`.
# The file is opened read-only through the 'mpio' driver with the world
# communicator. NOTE(review): this presumably needs an MPI-enabled h5py
# build -- confirm with h5py.get_config().mpi.
# The context manager closes the file even if the traversal raises.
with h5py.File(filename, 'r',
               driver='mpio', comm=MPI.COMM_WORLD) as f_h5:
    h5iter = H5Iter()
    f_h5.visititems(h5iter)
    corpus = h5iter.corpus

In [None]:
import h5py
print(h5py.get_config().mpi)
