In [1]:
import os
from datetime import datetime

import numpy as np
import unbsatsubs
# Directory holding the input XML; the .npy outputs of the last cell are written here too.
DATA_DIR = '../local_data/xml/'
# Full path of the flow export that is parsed in the next cell.
sat_xml = os.path.join(DATA_DIR, 'TestbedSatJun12Flows.xml')

In [2]:
# Parse the flow export; the next cell walks the result with root.iter("TestbedSatJun12").
# NOTE(review): the return type of unbsatsubs.parsexml_ is not visible here — presumably
# an ElementTree-like object, confirm against the unbsatsubs module.
root = unbsatsubs.parsexml_(sat_xml)

In [3]:
# `i` is assigned but not read by any code visible here; kept so the kernel
# namespace stays exactly as before.
i = 0

# One {tag: text} dict per "TestbedSatJun12" element.
flows = []

# Distinct values observed for selected fields across all flows.
appSet = set()
sourceIPSet = set()
destIPSet = set()
sourcePortSet = set()
protocolSet = set()

# Child tag -> set that collects the distinct text values of that tag.
tag_collectors = {
    'appName': appSet,
    'source': sourceIPSet,
    'destination': destIPSet,
    'sourcePort': sourcePortSet,
    'protocolName': protocolSet,
}
timestamp_format = '%Y-%m-%dT%H:%M:%S'

for element in root.iter("TestbedSatJun12"):
    # Copy every child of the record into a plain dict, recording the
    # tracked fields in their value sets along the way.
    flow = {}
    for child in element:
        flow[child.tag] = child.text
        collector = tag_collectors.get(child.tag)
        if collector is not None:
            collector.add(child.text)

    # Derive the flow length in seconds from its start/stop timestamps.
    started_at = datetime.strptime(flow['startDateTime'], timestamp_format)
    stopped_at = datetime.strptime(flow['stopDateTime'], timestamp_format)
    flow['duration'] = (stopped_at - started_at).total_seconds()
    flows.append(flow)

In [4]:
# Spot-check a single parsed record (index 123) to see which fields a flow carries.
flows[123]

{'Tag': 'Normal',
 'appName': 'HTTPImageTransfer',
 'destination': '216.246.64.49',
 'destinationPayloadAsBase64': None,
 'destinationPayloadAsUTF': None,
 'destinationPort': '80',
 'destinationTCPFlagsDescription': 'S,P,A',
 'direction': 'L2R',
 'duration': 3.0,
 'protocolName': 'tcp_ip',
 'source': '192.168.1.104',
 'sourcePayloadAsBase64': None,
 'sourcePayloadAsUTF': None,
 'sourcePort': '22441',
 'sourceTCPFlagsDescription': 'S,P,A',
 'startDateTime': '2010-06-12T23:57:10',
 'stopDateTime': '2010-06-12T23:57:13',
 'totalDestinationBytes': '135295',
 'totalDestinationPackets': '104',
 'totalSourceBytes': '5655',
 'totalSourcePackets': '67'}

In [5]:
def set_to_k_v_pair(set_to_convert):
    """Map each element of a collection to a reproducible integer index.

    Elements are converted with ``str`` and de-duplicated on that string form;
    indices are then assigned in sorted order of the strings. Plain set
    iteration order for strings can differ between interpreter runs (hash
    randomization), which would shuffle any one-hot columns built from the
    result; sorting makes the mapping identical on every run.

    Args:
        set_to_convert: iterable of values; elements may be None or
            non-strings, they are keyed by their ``str`` form.

    Returns:
        dict mapping ``str(element)`` to an index in ``0 .. len(result) - 1``
        with no gaps, so ``len(result)`` is a valid one-hot width.
    """
    unique_names = sorted({str(el) for el in set_to_convert})
    return {name: idx for idx, name in enumerate(unique_names)}

# Protocol name -> column offset used for the one-hot part of each feature row.
protocol_dict = set_to_k_v_pair(protocolSet)

labels = []
feature_matrix = []

for flow in flows:
    # Numeric columns, in order: source packets, source bytes,
    # destination packets, destination bytes, duration in seconds.
    feature = [
        int(flow['totalSourcePackets']),
        int(flow['totalSourceBytes']),
        int(flow['totalDestinationPackets']),
        int(flow['totalDestinationBytes']),
        flow['duration'],
    ]

    # set_to_k_v_pair keys its result by str(element), so the lookup has to
    # stringify as well; otherwise a flow whose protocolName text is None
    # raises KeyError even though 'None' is present in protocol_dict.
    one_hot_protocol = np.zeros(len(protocol_dict))
    one_hot_protocol[protocol_dict[str(flow['protocolName'])]] = 1.0
    feature.extend(one_hot_protocol)

    feature_matrix.append(feature)
    labels.append(flow['Tag'])

In [6]:
# Persist the feature rows and their labels next to the source XML.
# np.matrix is discouraged by NumPy and emits a PendingDeprecationWarning;
# np.atleast_2d on a plain array yields the same 2-D shapes np.matrix produced
# (features: one row per flow, labels: a single 1 x N row), so the saved
# files keep their layout.
np.save(os.path.join(DATA_DIR, "features.npy"), np.atleast_2d(np.array(feature_matrix)))
np.save(os.path.join(DATA_DIR, "labels.npy"), np.atleast_2d(np.array(labels)))