In [1]:
# Imports
import os
from datetime import datetime
import xml.etree.ElementTree as ET

import numpy as np

In [2]:
# Timestamp layout of the start/stop fields in the flow XML.
TIME_FORMAT = '%Y-%m-%dT%H:%M:%S'

# Where the raw XML is read from and where the feature arrays are written.
DATA_DIR = '../local_data/xml/'
OUT_DIR = '../local_data/features'

# Flow fields whose distinct values are collected while parsing.
set_names = ["appName", "source", "destination", "sourcePort", "protocolName"]
# Same names as dict keys (values are unused) so membership tests are hash lookups.
dict_set_names = dict.fromkeys(set_names)

def make_set_dict():
    """Return a new dict holding one fresh, empty set per name in ``set_names``."""
    empty_sets = {}
    for name in set_names:
        empty_sets[name] = set()
    return empty_sets

In [3]:
class day_definition:
    """One day's flow XML file plus the attack label to apply to that day.

    The XML file is parsed into memory on construction; ``parse_day`` then
    converts it into a list of flow dicts.
    """

    def __init__(self, day_xml_file_name, attack_type):
        """Parse the given XML file, resolved relative to ``DATA_DIR``.

        Args:
            day_xml_file_name: name of the XML file inside ``DATA_DIR``.
            attack_type: label substituted for the 'Attack' tag by ``parse_day``.
        """
        print("parsing: " + day_xml_file_name)

        self.day_xml_file_name = day_xml_file_name
        self.attack_type = attack_type
        self.xml_file = os.path.join(DATA_DIR, self.day_xml_file_name)

        tree = ET.parse(self.xml_file)
        self.xml_root = tree.getroot()

    def parse_day(self):
        """Convert the parsed XML into flow dicts and per-field value sets.

        Each element carrying the same tag as the root's first child becomes
        one flow dict mapping child tag -> child text, with an extra
        'duration' entry (seconds between 'startDateTime' and 'stopDateTime').
        A 'Tag' value of 'Attack' is replaced by ``self.attack_type``.

        Returns:
            ``[flows, sets]``: ``flows`` is the list of flow dicts and ``sets``
            maps each name in ``set_names`` to the set of values seen for it.
            Both are empty when the root element has no children.

        Raises:
            KeyError: if a flow lacks 'startDateTime', 'stopDateTime' or 'Tag'.
            ValueError: if a timestamp does not match ``TIME_FORMAT``.
        """
        flows = []
        sets = make_set_dict()

        # Nothing to parse; previously this raised IndexError on the lookup below.
        if len(self.xml_root) == 0:
            return [flows, sets]

        # Element.getchildren() was removed in Python 3.9; indexing the element
        # returns the same first child on every version.
        element_name = self.xml_root[0].tag
        for element in self.xml_root.iter(element_name):
            flow = {}
            for k in element:
                flow[k.tag] = k.text
                if k.tag in dict_set_names:
                    sets[k.tag].add(k.text)
            start = datetime.strptime(flow['startDateTime'], TIME_FORMAT)
            stop = datetime.strptime(flow['stopDateTime'], TIME_FORMAT)
            flow['duration'] = (stop - start).total_seconds()
            if flow['Tag'] == 'Attack':
                flow['Tag'] = self.attack_type
            flows.append(flow)

        return [flows, sets]

In [4]:
%%time

def day_def_generator(xml_and_type):
    """Build a ``day_definition`` from a ``[xml_file_name, attack_type]`` pair."""
    xml_file_name = xml_and_type[0]
    attack_type = xml_and_type[1]
    return day_definition(xml_file_name, attack_type)

# One [xml file name, attack label] pair per capture file.
days_raw = [
    ['TestbedSatJun12Flows.xml', 'brute_force'],
    ['TestbedSunJun13Flows.xml', 'internal'],
    ['TestbedMonJun14Flows.xml', 'ddos'],
    ['TestbedTueJun15-1Flows.xml', 'irc_botnet_ddos'],
    ['TestbedTueJun15-2Flows.xml', 'irc_botnet_ddos'],
    ['TestbedTueJun15-3Flows.xml', 'irc_botnet_ddos'],
    ['TestbedWedJun16-1Flows.xml', 'brute_force'],
    ['TestbedWedJun16-2Flows.xml', 'brute_force'],
    ['TestbedWedJun16-3Flows.xml', 'brute_force'],
    ['TestbedThuJun17-1Flows_cleaned.xml', 'ssh_brute_force'],
    ['TestbedThuJun17-2Flows.xml', 'ssh_brute_force'],
    ['TestbedThuJun17-3Flows.xml', 'ssh_brute_force'],
]

# Accumulators: every flow from every file, and the union of the value sets.
all_sets = make_set_dict()
flows = []
for day in days_raw:
    day_flows, sets = day_def_generator(day).parse_day()
    flows.extend(day_flows)

    # Fold this file's values into the combined per-field sets.
    for k in sets:
        all_sets[k] |= sets[k]

parsing: TestbedSatJun12Flows.xml
parsing: TestbedSunJun13Flows.xml
parsing: TestbedMonJun14Flows.xml
parsing: TestbedTueJun15-1Flows.xml
parsing: TestbedTueJun15-2Flows.xml
parsing: TestbedTueJun15-3Flows.xml
parsing: TestbedWedJun16-1Flows.xml
parsing: TestbedWedJun16-2Flows.xml
parsing: TestbedWedJun16-3Flows.xml
parsing: TestbedThuJun17-1Flows_cleaned.xml
parsing: TestbedThuJun17-2Flows.xml
parsing: TestbedThuJun17-3Flows.xml
CPU times: user 15min 18s, sys: 15.4 s, total: 15min 33s
Wall time: 15min 33s


In [6]:
%%time

def set_to_k_v_pair(set_to_convert):
    """Map each element's string form to a unique index in ``range(n)``.

    Indices are assigned in sorted order of the string forms, so the mapping
    is the same on every run; plain set iteration order can differ between
    interpreter runs for strings. Elements that share a string form share
    one index, so the indices are always contiguous.

    Args:
        set_to_convert: iterable of values (typically a set of strings).

    Returns:
        dict mapping ``str(element)`` to its integer index.
    """
    unique_names = sorted({str(el) for el in set_to_convert})
    return {name: idx for idx, name in enumerate(unique_names)}

# Column index inside the one-hot block for every protocol name seen while parsing.
protocol_dict = set_to_k_v_pair(all_sets['protocolName'])
n_protocols = len(protocol_dict)

labels = []
feature_matrix = []

for flow in flows:
    # Numeric part of the row: packet/byte counters and the flow duration.
    feature = [
        int(flow['totalSourcePackets']),
        int(flow['totalSourceBytes']),
        int(flow['totalDestinationPackets']),
        int(flow['totalDestinationBytes']),
        flow['duration'],
    ]

    one_hot_protocol = np.zeros(n_protocols)
    # set_to_k_v_pair keys its result by str(element), so the lookup must use
    # str() as well; otherwise a non-string value (e.g. None) raises KeyError.
    one_hot_protocol[protocol_dict[str(flow['protocolName'])]] = 1.0
    feature.extend(one_hot_protocol)

    feature_matrix.append(feature)
    labels.append(flow['Tag'])

# Make sure the output directory exists so the save cannot fail after the
# long parsing step just because the folder is missing.
if not os.path.isdir(OUT_DIR):
    os.makedirs(OUT_DIR)

# np.matrix is no longer recommended by NumPy; ndmin=2 reproduces the 2-D
# shapes it produced (rows x columns for features, 1 x N for labels).
np.save(os.path.join(OUT_DIR, "features.npy"), np.array(feature_matrix, ndmin=2))
np.save(os.path.join(OUT_DIR, "labels.npy"), np.array(labels, ndmin=2))

CPU times: user 1min 34s, sys: 2.48 s, total: 1min 36s
Wall time: 1min 36s
