In [1]:
# Imports
import os
import pickle
from datetime import datetime
import xml.etree.ElementTree as ET

import numpy as np

In [2]:
# Where the raw XML flow captures live and where generated feature files are written.
DATA_DIR = '../local_data/xml/'
OUT_DIR = '../local_data/features'
# strptime pattern for the startDateTime / stopDateTime flow fields.
TIME_FORMAT = '%Y-%m-%dT%H:%M:%S'

# Flow fields whose distinct values are collected while parsing.
set_names = ["appName", "source", "destination", "sourcePort", "protocolName", "direction"]
# Same names as dict keys (values are unused) so membership tests are hash lookups.
dict_set_names = dict.fromkeys(set_names)

def make_set_dict():
    """Return a new dict mapping every name in set_names to its own empty set."""
    empty_sets = {}
    for name in set_names:
        empty_sets[name] = set()
    return empty_sets

In [3]:
class day_definition:
    """One day's flow capture: an XML file plus the label used for its attack flows.

    Attributes:
        day_xml_file_name: file name of the capture, relative to DATA_DIR.
        attack_type: value written into ``Tag`` for flows tagged 'Attack'.
        xml_file: DATA_DIR joined with day_xml_file_name.
        xml_root: root element of the parsed XML document.
    """

    def __init__(self, day_xml_file_name, attack_type):
        """Record the file/label pair and parse the XML file into memory.

        Args:
            day_xml_file_name: name of the XML file inside DATA_DIR.
            attack_type: label to substitute for the generic 'Attack' tag.

        Raises:
            Whatever ``ET.parse`` raises for a missing or malformed file.
        """
        print("parsing: " + day_xml_file_name)

        self.day_xml_file_name = day_xml_file_name
        self.attack_type = attack_type
        self.xml_file = os.path.join(DATA_DIR, self.day_xml_file_name)

        tree = ET.parse(self.xml_file)
        self.xml_root = tree.getroot()

    def parse_day(self):
        """Convert every flow element of the document into a dict.

        Each flow dict maps child tag -> child text, plus a ``duration`` entry
        (seconds between startDateTime and stopDateTime). Flows tagged
        'Attack' get their ``Tag`` replaced by ``self.attack_type``.

        Returns:
            A two-item list ``[flows, sets]``: the list of flow dicts and a
            dict mapping each tracked field name to the set of distinct values
            seen for it. Both are empty when the root has no children.

        Raises:
            KeyError: if a flow lacks startDateTime, stopDateTime or Tag.
            ValueError: if a timestamp does not match TIME_FORMAT.
        """
        flows = []
        sets = make_set_dict()

        # Element.getchildren() was removed in Python 3.9; elements support
        # len() and indexing directly. An empty document yields empty results.
        if len(self.xml_root) == 0:
            return [flows, sets]

        # The flow tag differs per capture file, so take it from the first child.
        element_name = self.xml_root[0].tag
        for element in self.xml_root.iter(element_name):
            flow = {}
            for k in element:
                flow[k.tag] = k.text
                if k.tag in dict_set_names:
                    sets[k.tag].add(k.text)

            start = datetime.strptime(flow['startDateTime'], TIME_FORMAT)
            stop = datetime.strptime(flow['stopDateTime'], TIME_FORMAT)
            flow['duration'] = (stop - start).total_seconds()

            if flow['Tag'] == 'Attack':
                flow['Tag'] = self.attack_type
            flows.append(flow)

        return [flows, sets]

In [4]:
%%time

def day_def_generator(xml_and_type):
    """Build a day_definition from a (xml file name, attack type) pair."""
    file_name = xml_and_type[0]
    attack_label = xml_and_type[1]
    return day_definition(file_name, attack_label)

# One [capture file name, label for that day's attack flows] entry per day to parse.
days_raw = [['TestbedTueJun15-1Flows.xml', 'irc_botnet_ddos']]
flows = []
all_sets = make_set_dict()

for day in days_raw:
    day_flows, sets = day_def_generator(day).parse_day()
    flows.extend(day_flows)

    # Fold this day's distinct values into the cross-day pool of possible feature values.
    for k, day_values in sets.items():
        all_sets[k] |= day_values

parsing: TestbedTueJun15-1Flows.xml
CPU times: user 14.6 s, sys: 900 ms, total: 15.5 s
Wall time: 16.2 s


In [5]:

def set_to_k_v_pair(set_to_convert):
    """Map the string form of each element to a stable, dense integer index.

    Indices are assigned in sorted order of the elements' string forms, so the
    same collection of values always yields the same mapping. Numbering in raw
    set iteration order is not reproducible: for strings that order can change
    between interpreter runs, which would shuffle one-hot columns between runs.
    Elements sharing a string form collapse to one key, so indices always cover
    ``range(len(result))``.

    Args:
        set_to_convert: iterable of values (typically a set of strings).

    Returns:
        dict mapping ``str(element)`` to its index; empty for empty input.
    """
    unique_keys = sorted({str(el) for el in set_to_convert})
    return {key: idx for idx, key in enumerate(unique_keys)}

# Index lookups used to one-hot encode the categorical flow fields.
protocol_dict = set_to_k_v_pair(all_sets['protocolName'])
dir_dict = set_to_k_v_pair(all_sets['direction'])

labels = []
feature_matrix = []

for flow in flows:
    # Numeric part of the row: packet/byte counters plus the flow duration.
    feature = [
        int(flow['totalSourcePackets']),
        int(flow['totalSourceBytes']),
        int(flow['totalDestinationPackets']),
        int(flow['totalDestinationBytes']),
        flow['duration'],
    ]

    # The lookup dicts are keyed by str(value), so look values up the same way
    # (this also covers fields whose parsed text is None).
    one_hot_protocol = np.zeros(len(protocol_dict))
    one_hot_protocol[protocol_dict[str(flow['protocolName'])]] = 1.0
    feature.extend(one_hot_protocol)

    one_hot_dir = np.zeros(len(dir_dict))
    one_hot_dir[dir_dict[str(flow['direction'])]] = 1.0
    feature.extend(one_hot_dir)

    feature_matrix.append(feature)
    labels.append(flow['Tag'])

# Create the output directory up front so the saves below cannot fail on a fresh checkout.
os.makedirs(OUT_DIR, exist_ok=True)

# NOTE(review): np.matrix is kept so the saved arrays keep the shape existing
# consumers were written against; NumPy no longer recommends np.matrix for new code.
np.save(os.path.join(OUT_DIR, "features-sim.npy"), np.matrix(feature_matrix))
np.save(os.path.join(OUT_DIR, "labels-sim.npy"), np.matrix(labels))

# Flow fields exported per row, in database column order. The fourth entry is
# the *source* UTF payload; it previously repeated destinationPayloadAsUTF.
db_fields = [
    'totalSourceBytes',
    'totalSourcePackets',
    'sourcePayloadAsBase64',
    'sourcePayloadAsUTF',
    'sourceTCPFlagsDescription',
    'source',
    'sourcePort',
    'totalDestinationBytes',
    'totalDestinationPackets',
    'destinationPayloadAsBase64',
    'destinationPayloadAsUTF',
    'destinationTCPFlagsDescription',
    'destination',
    'destinationPort',
    'appName',
    'direction',
    'startDateTime',
    'stopDateTime',
    'Tag',
]
db_list = [[f[field] for field in db_fields] for f in flows]

# Use a context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(OUT_DIR, "flow-for-db.pkl"), 'wb') as db_file:
    pickle.dump(db_list, db_file)

total_src_bytes, total_src_pkts, src_b64_payload, src_utf_payload, src_tcp_flags, src_ip, src_port, total_dst_bytes, total_dst_pkts, dst_b64_payload, dst_utf_payload, dst_tcp_flags, dst_ip, dst_port, app_name, direction, start_time, end_time, tag, classified_tag, created_at, updated_at

      4 <TestbedSatJun12>                                                               
      5     <appName>HTTPWeb</appName>                                                  
      6     <totalSourceBytes>128</totalSourceBytes>                                    
      7     <totalDestinationBytes>64</totalDestinationBytes>                           
      8     <totalDestinationPackets>1</totalDestinationPackets>                        
      9     <totalSourcePackets>2</totalSourcePackets>                                  
     10     <sourcePayloadAsBase64></sourcePayloadAsBase64>                             
     11     <sourcePayloadAsUTF></sourcePayloadAsUTF>                                   
     12     <destinationPayloadAsBase64></destinationPayloadAsBase64>                   
     13     <destinationPayloadAsUTF></destinationPayloadAsUTF>                         
     14     <direction>L2R</direction>                                                  
     15     <sourceTCPFlagsDescription>F,A</sourceTCPFlagsDescription>                  
     16     <destinationTCPFlagsDescription>F,A</destinationTCPFlagsDescription>        
     17     <source>192.168.1.104</source>                                              
     18     <protocolName>tcp_ip</protocolName>                                         
     19     <sourcePort>22441</sourcePort>                                              
     20     <destination>216.246.64.49</destination>                                    
     21     <destinationPort>80</destinationPort>                                       
     22     <startDateTime>2010-06-12T23:58:53</startDateTime>                          
     23     <stopDateTime>2010-06-12T23:58:53</stopDateTime>                            
     24     <Tag>Normal</Tag>                                                           
     25 </TestbedSatJun12>                                                              