In [1]:
# Imports
import os
from datetime import datetime
import xml.etree.ElementTree as ET

import numpy as np

In [2]:
# Directory the raw flow XML files are read from (relative to the current working directory).
DATA_DIR = '../local_data/xml/'
# Directory the per-day feature and label .npy files are written to.
OUT_DIR = '../local_data/features'

In [3]:
class day_definition:
    """One capture day: an XML flow file together with its attack-type label.

    The XML file is looked up under ``DATA_DIR`` and parsed in the constructor;
    ``parse_day`` then converts the parsed tree into plain Python records.
    """

    def __init__(self, day_xml_file_name, attack_type):
        """Load and parse the XML file for one day.

        Args:
            day_xml_file_name: Name of the XML file, joined onto ``DATA_DIR``.
            attack_type: Attack label for the day; stored on the instance only.

        Raises:
            Whatever ``xml.etree.ElementTree.parse`` raises for a missing or
            malformed file.
        """
        print("parsing: " + day_xml_file_name)

        self.day_xml_file_name = day_xml_file_name
        self.attack_type = attack_type
        self.xml_file = os.path.join(DATA_DIR, self.day_xml_file_name)

        tree = ET.parse(self.xml_file)
        self.xml_root = tree.getroot()

    def parse_day(self):
        """Convert every flow element of the day into a dict.

        The tag of the root's first child is taken as the flow element name,
        and every element with that tag anywhere in the tree is read.

        Returns:
            A two-item list ``[flows, sets]``:

            * ``flows`` - one dict per flow element, mapping each child tag to
              its text, plus a ``'duration'`` key holding the number of seconds
              between ``startDateTime`` and ``stopDateTime`` (float).
            * ``sets`` - dict of sets with the distinct values seen for
              ``'app'``, ``'sourceIP'``, ``'destIP'``, ``'sourcePort'`` and
              ``'protocol'``.

            A root element without children yields empty ``flows`` and sets.

        Raises:
            KeyError: a flow has no ``startDateTime`` / ``stopDateTime`` child.
            ValueError: a timestamp does not match ``%Y-%m-%dT%H:%M:%S``.
        """
        timestamp_format = '%Y-%m-%dT%H:%M:%S'
        # Child tag -> name of the value set that collects it.
        tag_to_set = {
            'appName': 'app',
            'source': 'sourceIP',
            'destination': 'destIP',
            'sourcePort': 'sourcePort',
            'protocolName': 'protocol',
        }

        flows = []
        sets = {set_name: set() for set_name in tag_to_set.values()}

        # Element.getchildren() was removed in Python 3.9; iterating the
        # element is the supported way to reach its direct children.
        first_child = next(iter(self.xml_root), None)
        if first_child is None:
            return [flows, sets]
        element_name = first_child.tag

        for element in self.xml_root.iter(element_name):
            flow = {}
            for child in element:
                flow[child.tag] = child.text
                set_name = tag_to_set.get(child.tag)
                if set_name is not None:
                    sets[set_name].add(child.text)

            start = datetime.strptime(flow['startDateTime'], timestamp_format)
            stop = datetime.strptime(flow['stopDateTime'], timestamp_format)
            flow['duration'] = (stop - start).total_seconds()
            flows.append(flow)

        return [flows, sets]

In [4]:
def day_def_generator(xml_and_type):
    """Build a ``day_definition`` from an indexable ``[xml_file_name, attack_type]`` pair."""
    xml_file_name = xml_and_type[0]
    attack_type = xml_and_type[1]
    return day_definition(xml_file_name, attack_type)

def set_to_k_v_pair(set_to_convert):
    """Map each distinct value (as a string) to a stable integer index.

    Indices are assigned in sorted order of the string forms rather than in
    the iteration order of the input. Set iteration order for strings can
    change from one interpreter run to the next (hash randomization), which
    would otherwise make the resulting index layout non-reproducible.

    Elements whose ``str()`` forms coincide share a single entry, so the
    indices always form the contiguous range ``0 .. len(result) - 1``.

    Args:
        set_to_convert: Iterable of values; each is converted with ``str()``.

    Returns:
        dict mapping ``str(element)`` to its index. Empty input gives ``{}``.
    """
    distinct_keys = sorted({str(el) for el in set_to_convert})
    return {key: idx for idx, key in enumerate(distinct_keys)}

def parse_day(day):
    """Build the feature matrix and labels for one day and save them as .npy files.

    Each flow contributes one row: source packets, source bytes, destination
    packets, destination bytes, duration in seconds, followed by a one-hot
    encoding of the flow's protocol name.

    Args:
        day: Indexable ``[xml_file_name, attack_type]`` pair (see ``days_raw``).

    Side effects:
        Creates ``OUT_DIR`` if it does not exist and writes
        ``features<day_name>.npy`` and ``labels<day_name>.npy`` into it.

    Raises:
        IndexError: the file name does not contain ``Testbed``.
        KeyError / ValueError: a flow lacks one of the expected fields or a
            counter is not an integer.
    """
    day_obj = day_def_generator(day)
    # Cut at "Flows" rather than "Flows." so that file names carrying a suffix
    # (e.g. "...Flows_cleaned.xml") reduce to the same short day stem as the
    # others instead of keeping the "Flows_cleaned.xml" tail.
    day_name = day_obj.day_xml_file_name.split("Flows")[0].split('Testbed')[1]
    flows, sets = day_obj.parse_day()

    # NOTE(review): the protocol vocabulary comes from this day's flows only,
    # so the one-hot width and column meaning can differ between days -
    # confirm downstream code accounts for that.
    protocol_dict = set_to_k_v_pair(sets['protocol'])

    labels = []
    feature_matrix = []

    for flow in flows:
        feature = [
            int(flow['totalSourcePackets']),
            int(flow['totalSourceBytes']),
            int(flow['totalDestinationPackets']),
            int(flow['totalDestinationBytes']),
            flow['duration'],
        ]

        one_hot_protocol = np.zeros(len(protocol_dict))
        one_hot_protocol[protocol_dict[flow['protocolName']]] = 1.0
        feature.extend(one_hot_protocol)

        feature_matrix.append(feature)
        # Flows tagged 'Attack' are labelled 0, every other tag 1.
        labels.append(0 if flow['Tag'] == 'Attack' else 1)

    # np.save does not create missing directories.
    os.makedirs(OUT_DIR, exist_ok=True)

    # np.matrix turns the flat label list into a single 2-D row (1 x N);
    # kept so the saved array shapes stay as they were.
    np.save(os.path.join(OUT_DIR, "features" + day_name + ".npy"), np.matrix(feature_matrix))
    np.save(os.path.join(OUT_DIR, "labels" + day_name + ".npy"), np.matrix(labels))

# One entry per capture file: [XML file name under DATA_DIR, attack-type label].
days_raw = [['TestbedSatJun12Flows.xml', 'brute_force'],
           ['TestbedSunJun13Flows.xml', 'internal'],
           ['TestbedMonJun14Flows.xml', 'ddos'],
           ['TestbedTueJun15-1Flows.xml', 'irc_botnet_ddos'],
           ['TestbedTueJun15-2Flows.xml', 'irc_botnet_ddos'],
           ['TestbedTueJun15-3Flows.xml', 'irc_botnet_ddos'],
           ['TestbedWedJun16-1Flows.xml', 'brute_force'],
           ['TestbedWedJun16-2Flows.xml', 'brute_force'],
           ['TestbedWedJun16-3Flows.xml', 'brute_force'],
           ['TestbedThuJun17-1Flows_cleaned.xml', 'ssh_brute_force'],
           ['TestbedThuJun17-2Flows.xml', 'ssh_brute_force'],
           ['TestbedThuJun17-3Flows.xml', 'ssh_brute_force']]

# Parse every listed file and write its features/labels to OUT_DIR.
for day in days_raw:
    parse_day(day)

parsing: TestbedSatJun12Flows.xml
parsing: TestbedSunJun13Flows.xml
parsing: TestbedMonJun14Flows.xml
parsing: TestbedTueJun15-1Flows.xml
parsing: TestbedTueJun15-2Flows.xml
parsing: TestbedTueJun15-3Flows.xml
parsing: TestbedWedJun16-1Flows.xml
parsing: TestbedWedJun16-2Flows.xml
parsing: TestbedWedJun16-3Flows.xml
parsing: TestbedThuJun17-1Flows_cleaned.xml
parsing: TestbedThuJun17-2Flows.xml
parsing: TestbedThuJun17-3Flows.xml
