In [5]:
import os


In [6]:
dataset_dir = '../datasets'
dataset_name = 'D1'
subdir_name = 'amazon'

In [7]:
pcap_file = os.path.join(dataset_dir, dataset_name, subdir_name, 'amazon.pcap')

with open(pcap_file, 'r', encoding='utf-8',
            errors='ignore') as f:
    corpus = f.readlines()

In [45]:
import scapy
from scapy.all import rdpcap, IP, TCP, UDP
def bytes2bits(x):
    return ''.join(f'{byte:08b}' for byte in x)
def bits2bytes(x):
    return bytes(int(x[i:i+8], 2) for i in range(0, len(x), 8))
def mask(bits, start, end):
    return bits[:start] + '0'*(end-start) + bits[end:]

In [9]:
packets = [p for p in rdpcap(pcap_file)]

In [10]:
packet = packets[0]

In [51]:
def mask_ip_header(packet):
    ip_header = bytes(packet[IP])
    ip_header_bits = bytes2bits(ip_header)
    header_length_bits = int(ip_header_bits[4:8],2)*32
    ip_header_bits = ip_header_bits[:header_length_bits]

    ip_header_bits = mask(ip_header_bits, 32, 48) # identification
    ip_header_bits = mask(ip_header_bits, 80, 96) # checksum
    ip_header_bits = mask(ip_header_bits, 96, 128) # src ip
    ip_header_bits = mask(ip_header_bits, 128, 160) # dst ip
    return ip_header_bits

In [52]:
mask_ip_header(packet)

'0100010100000000000000000011010000000000000000000100000000000000010000000000011000000000000000000000000000000000000000000000000000000000000000000000000000000000'

In [58]:
def mask_tcpudp_header(packet):
    if TCP in packet:
        header = bytes(packet[TCP])
        header_bits = bytes2bits(header)
        header_length_bits = packet[TCP].dataofs * 32
        header_bits = header_bits[:header_length_bits]
    elif UDP in packet:
        header = bytes(packet[UDP])
        header_bits = bytes2bits(header)
    else:
        raise ValueError

    header_bits = mask(header_bits, 0, 16) # src port
    header_bits = mask(header_bits, 16, 32) # dst port
    return header_bits

In [59]:
mask_tcpudp_header(packet)

'0000000000000000000000000000000010110010010100000010011001011010000000000000000000000000000000001000000000000010001000000000000001000000110110000000000000000000000000100000010000000101101101000000000100000011000000110000001000000001000000010000010000000010'

In [55]:
int(tcp_header_bits[0:16],2)

31668

### masking TCP/UDP header

In [33]:
ip_header_bits

'01000101000000000000000000110100011101110000000101000000000000000100000000000110000010111001111111000000101010000000000101100101101011011100001001001000010101000111101110110100000000011011101110110010010100000010011001011010000000000000000000000000000000001000000000000010001000000000000001000000110110000000000000000000000000100000010000000101101101000000000100000011000000110000001000000001000000010000010000000010'

In [34]:
mask_ip_header(ip_header_bits)

'01000101000000000000000000110100000000000000000001000000000000000100000000000110000000000000000000000000000000000000000000000000000000000000000000000000000000000111101110110100000000011011101110110010010100000010011001011010000000000000000000000000000000001000000000000010001000000000000001000000110110000000000000000000000000100000010000000101101101000000000100000011000000110000001000000001000000010000010000000010'

In [37]:
len(ip_header_bits)

416

# pcap to hdf5

In [3]:
from pcapToHdf5 import pcap_to_hdf5
import os

In [9]:
dataset_dir = '../datasets'
dataset_name = 'D1'
subdir_name = 'amazon'

In [10]:
pcap_file = os.path.join(dataset_dir, dataset_name, subdir_name, 'amazon.pcap')

In [11]:
pcap_to_hdf5(pcap_file, 'test.hdf5')

Converted ../datasets/D1/amazon/amazon.pcap to test.hdf5 successfully.


# how to use hdf5

In [1]:
filename = 'test.hdf5'

In [None]:
import h5py
from mpi4py import MPI
class H5Iter:
    def __init__(self, st=0):
        self.corpus = []
        self.st = st
        
    def __call__(self, name, h5obj):
        if hasattr(h5obj, 'dtype') and name.endswith('/X'):
            self.corpus.append([name.split('/')[0], h5obj[:]])
            # if len(self.corpus) % 200000 == 0:
            #     p_log(f'{time() - self.st:.4f}s with {len(self.corpus)} items.')


In [None]:
f_h5 = h5py.File(os.path.join(filename), 'r',
                    driver='mpio', comm=MPI.COMM_WORLD)
h5iter = H5Iter()
f_h5.visititems(h5iter)
corpus = h5iter.corpus
f_h5.close()

ValueError: h5py was built without MPI support, can't use mpio driver

In [1]:
import h5py
print(h5py.get_config().mpi)


ModuleNotFoundError: No module named 'h5py'