# Import Libraries

In [1]:
# Data structs, format and parsing
import json
import pandas
import xml.etree.ElementTree as ET
import xmltodict
import pandas as pd
from flatten_json import flatten_json
from nested_lookup import nested_lookup, nested_delete, get_all_keys, get_occurrence_of_value, get_occurrence_of_key, get_occurrences_and_values

# Visualisation
import networkx as nx
from pyvis.network import Network

# Graph Data
from py2neo import Graph, Node, Relationship, RelationshipMatch

# Others
from tqdm import tqdm
from pprint import pprint
from operator import itemgetter
import statistics

import numpy as np
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Read File

In [2]:
filepath = './sample_dataset/win_events-brawl_public_game_001.json'

with open(filepath) as f:
    readfile = f.readlines()

In [3]:
# Sample an item from readfile
sampledata = json.loads(readfile[0]) 
sampledata

{'@timestamp': '2017-05-01T19:20:56.868Z',
 'host': 'dc.brawlco.com',
 'type': 'win_event',
 'data_model': {'fields': {'log_name': 'Security',
   'log_type': 'Microsoft-Windows-Security-Auditing'}},
 'raw': "<Event xmlns='http://schemas.microsoft.com/win/2004/08/events/event'><System><Provider Name='Microsoft-Windows-Security-Auditing' Guid='{54849625-5478-4994-A5BA-3E3B0328C30D}'/><EventID>4634</EventID><Version>0</Version><Level>0</Level><Task>12545</Task><Opcode>0</Opcode><Keywords>0x8020000000000000</Keywords><TimeCreated SystemTime='2017-05-01T19:20:56.868245700Z'/><EventRecordID>8293558</EventRecordID><Correlation/><Execution ProcessID='504' ThreadID='3600'/><Channel>Security</Channel><Computer>dc.brawlco.com</Computer><Security/></System><EventData><Data Name='TargetUserSid'>BRAWLCO\\GEDDIS-PC$</Data><Data Name='TargetUserName'>GEDDIS-PC$</Data><Data Name='TargetDomainName'>BRAWLCO</Data><Data Name='TargetLogonId'>0x2dff7e41</Data><Data Name='LogonType'>3</Data></EventData></Eve

# Pre-processing

In [4]:
def xmlpreproc(xmlstring):
    '''XML to json converter'''
    xmldata = xmltodict.parse(xmlstring)
    jsondata = json.dumps(xmldata, indent=2)
    jsondata = json.loads(jsondata)
    #jsondat = flatten(xmldata)
    return jsondata

In [5]:
# Test xml converter function
jsonresult = xmlpreproc(sampledata['raw'])
jsonresult

{'Event': {'@xmlns': 'http://schemas.microsoft.com/win/2004/08/events/event',
  'System': {'Provider': {'@Name': 'Microsoft-Windows-Security-Auditing',
    '@Guid': '{54849625-5478-4994-A5BA-3E3B0328C30D}'},
   'EventID': '4634',
   'Version': '0',
   'Level': '0',
   'Task': '12545',
   'Opcode': '0',
   'Keywords': '0x8020000000000000',
   'TimeCreated': {'@SystemTime': '2017-05-01T19:20:56.868245700Z'},
   'EventRecordID': '8293558',
   'Correlation': None,
   'Execution': {'@ProcessID': '504', '@ThreadID': '3600'},
   'Channel': 'Security',
   'Computer': 'dc.brawlco.com',
   'Security': None},
  'EventData': {'Data': [{'@Name': 'TargetUserSid',
     '#text': 'BRAWLCO\\GEDDIS-PC$'},
    {'@Name': 'TargetUserName', '#text': 'GEDDIS-PC$'},
    {'@Name': 'TargetDomainName', '#text': 'BRAWLCO'},
    {'@Name': 'TargetLogonId', '#text': '0x2dff7e41'},
    {'@Name': 'LogonType', '#text': '3'}]}}}

In [6]:
#transform and combine xml data in json
def xmlXform(rawdata):
    for key, val in rawdata.items():
        if "xmlns" in val:
            rawdata[key] = xmlpreproc(val)
    return rawdata

xmlXform(sampledata)

{'@timestamp': '2017-05-01T19:20:56.868Z',
 'host': 'dc.brawlco.com',
 'type': 'win_event',
 'data_model': {'fields': {'log_name': 'Security',
   'log_type': 'Microsoft-Windows-Security-Auditing'}},
 'raw': {'Event': {'@xmlns': 'http://schemas.microsoft.com/win/2004/08/events/event',
   'System': {'Provider': {'@Name': 'Microsoft-Windows-Security-Auditing',
     '@Guid': '{54849625-5478-4994-A5BA-3E3B0328C30D}'},
    'EventID': '4634',
    'Version': '0',
    'Level': '0',
    'Task': '12545',
    'Opcode': '0',
    'Keywords': '0x8020000000000000',
    'TimeCreated': {'@SystemTime': '2017-05-01T19:20:56.868245700Z'},
    'EventRecordID': '8293558',
    'Correlation': None,
    'Execution': {'@ProcessID': '504', '@ThreadID': '3600'},
    'Channel': 'Security',
    'Computer': 'dc.brawlco.com',
    'Security': None},
   'EventData': {'Data': [{'@Name': 'TargetUserSid',
      '#text': 'BRAWLCO\\GEDDIS-PC$'},
     {'@Name': 'TargetUserName', '#text': 'GEDDIS-PC$'},
     {'@Name': 'TargetD

In [7]:
# convert all xml items to flatten json in readfile
loglist = []
for readline in readfile:
    sampledata = json.loads(readline)
    sampledata = xmlXform(sampledata)
    loglist.append(sampledata)

loglist

[{'@timestamp': '2017-05-01T19:20:56.868Z',
  'host': 'dc.brawlco.com',
  'type': 'win_event',
  'data_model': {'fields': {'log_name': 'Security',
    'log_type': 'Microsoft-Windows-Security-Auditing'}},
  'raw': {'Event': {'@xmlns': 'http://schemas.microsoft.com/win/2004/08/events/event',
    'System': {'Provider': {'@Name': 'Microsoft-Windows-Security-Auditing',
      '@Guid': '{54849625-5478-4994-A5BA-3E3B0328C30D}'},
     'EventID': '4634',
     'Version': '0',
     'Level': '0',
     'Task': '12545',
     'Opcode': '0',
     'Keywords': '0x8020000000000000',
     'TimeCreated': {'@SystemTime': '2017-05-01T19:20:56.868245700Z'},
     'EventRecordID': '8293558',
     'Correlation': None,
     'Execution': {'@ProcessID': '504', '@ThreadID': '3600'},
     'Channel': 'Security',
     'Computer': 'dc.brawlco.com',
     'Security': None},
    'EventData': {'Data': [{'@Name': 'TargetUserSid',
       '#text': 'BRAWLCO\\GEDDIS-PC$'},
      {'@Name': 'TargetUserName', '#text': 'GEDDIS-PC$'},

In [8]:
# get all unique keys in log
availablekeys = list(set(get_all_keys(loglist)))
availablekeys

['@ActivityID',
 'Opcode',
 '@ProcessID',
 'Computer',
 '@Qualifiers',
 'EventData',
 'log_name',
 'raw',
 '#text',
 'Execution',
 'log_type',
 '@Name',
 '@uuid',
 'fields',
 'Version',
 '@timestamp',
 '@UserID',
 'Provider',
 'EventRecordID',
 '@EventSourceName',
 '@Guid',
 'data_model',
 'type',
 'Keywords',
 'host',
 'System',
 'Binary',
 '@xmlns',
 'Event',
 'game_id',
 'Correlation',
 '@SystemTime',
 'TimeCreated',
 'Task',
 '@ThreadID',
 'Security',
 'EventID',
 'Level',
 'Channel',
 'Data']

In [9]:
# get all eventid in log
qkey = 'EventID'

def getUnique(dictlist, qkey):
    getquery = nested_lookup(qkey,dictlist)
    result = [i for i in getquery if isinstance(i, str)]
    uniqueResult = list(set(result))
    return uniqueResult

uniqueEventid = getUnique(loglist, qkey)
uniqueEventid

['4634',
 '4770',
 '4624',
 '4672',
 '1500',
 '4648',
 '4662',
 '1501',
 '4768',
 '4769']

# windows event csv lookup

In [12]:
winevntcode = r'C:\Users\A101889\Desktop\Workspace\Dataset\Lookup\WindowsEventCodes.csv'
df = pd.read_csv(winevntcode)
df.groupby("Category").size()

Category
Account Logon                                                         15
Account Management                                                    67
Application                                                           10
Applocker                                                             12
Audit                                                                  1
DS Access                                                             18
Detailed Tracking                                                      8
Logon/Logoff                                                          48
Microsoft-Windows-ADFS/Audit                                           4
Microsoft-Windows-Application-Experience/Program-Inventory             6
Microsoft-Windows-ApplicationExperience-Program-Telemetry              1
Microsoft-Windows-Bits-Client                                          1
Microsoft-Windows-CAPI2/Operational                                    3
Microsoft-Windows-CertificateServicesClien

In [13]:
def lookupevent(alleventcodes):
    rellist = []
    for i in alleventcodes:
        query = df.loc[df['EventCode'] == int(i), 'EventDescription']
        query = query.get_values()
        if query.size > 0:
            rellist.append((str(i), query[0]))
    return rellist

impevtlist = lookupevent(uniqueEventid)
impevtlist

[('4634', 'An account was logged off.'),
 ('4770', 'A Kerberos service ticket was renewed.'),
 ('4624', 'An account was successfully logged on.'),
 ('4672', 'Special privileges assigned to new logon.'),
 ('4648', 'A logon was attempted using explicit credentials.'),
 ('4662', 'An operation was performed on an object.'),
 ('4768', 'A Kerberos authentication ticket (TGT) was requested.'),
 ('4769', 'A Kerberos service ticket was requested.')]

# Important Statistics

In [15]:
def getOccurances(dictlist, valuequery):
    '''Get the occrunces of a value in log evidence in a list of dict'''
    rel = get_occurrences_and_values(dictlist, value=valuequery)[valuequery]['occurrences']
    return rel

In [16]:
# Rank eventid occurances
ranktuplist = []
for i, j in impevtlist:
    getocur = getOccurances(loglist, i)
    ranktuplist.append((i, getocur))

ocranceresult = sorted(ranktuplist, key=itemgetter(1), reverse=True)
ocranceresult

[('4624', 664),
 ('4634', 618),
 ('4672', 279),
 ('4769', 115),
 ('4648', 60),
 ('4768', 45),
 ('4770', 6),
 ('4662', 1)]

In [17]:
def getUnique(dictlist, qkey):
    getquery = nested_lookup(qkey,dictlist)
    result = [i for i in getquery if isinstance(i, str)]
    uniqueResult = list(set(result))
    return uniqueResult

In [23]:
# get all host in log
uniquehostlist = getUnique(loglist, 'host')
uniquehostlist

['sespinosa-pc.brawlco.com',
 'teston-pc.brawlco.com',
 'dc.brawlco.com',
 'sounder-pc.brawlco.com',
 'minahan-pc.brawlco.com',
 'kressierer-pc.brawlco.com',
 'ostermeyer-pc.brawlco.com',
 'harley-pc.brawlco.com',
 'beane-pc.brawlco.com',
 'santilli-pc.brawlco.com',
 'peele-pc.brawlco.com',
 'mims-pc.brawlco.com',
 'platten-pc.brawlco.com',
 'fulco-pc.brawlco.com',
 'zissler-pc.brawlco.com',
 'colgan-pc.brawlco.com',
 'escue-pc.brawlco.com']

In [19]:
#get all user in log
uniqueuserlist = getUnique(loglist, '@UserID')
uniqueuserlist

['S-1-5-21-3274432971-3114609421-3981963398-1117',
 'S-1-5-21-3274432971-3114609421-3981963398-1177',
 'S-1-5-21-3274432971-3114609421-3981963398-1148',
 'S-1-5-21-3274432971-3114609421-3981963398-1159',
 'S-1-5-18',
 'S-1-5-21-3274432971-3114609421-3981963398-1172',
 'S-1-5-21-3274432971-3114609421-3981963398-1748',
 'S-1-5-21-3274432971-3114609421-3981963398-1301',
 'S-1-5-21-3274432971-3114609421-3981963398-1300',
 'S-1-5-21-3274432971-3114609421-3981963398-1373',
 'S-1-5-21-3274432971-3114609421-3981963398-1230',
 'S-1-5-21-3274432971-3114609421-3981963398-1233',
 'S-1-5-21-3274432971-3114609421-3981963398-1122',
 'S-1-5-21-3274432971-3114609421-3981963398-1241',
 'S-1-5-21-3274432971-3114609421-3981963398-1749',
 'S-1-5-21-3274432971-3114609421-3981963398-1145',
 'S-1-5-21-3274432971-3114609421-3981963398-1147']

In [20]:
# get all timestamps
timestamplist = getUnique(loglist, '@timestamp')
timestamplist = sorted(timestamplist)
timestamplist

['2017-05-01T18:57:51.680Z',
 '2017-05-01T18:58:02.462Z',
 '2017-05-01T18:58:04.087Z',
 '2017-05-01T18:58:04.446Z',
 '2017-05-01T18:58:06.946Z',
 '2017-05-01T18:58:12.477Z',
 '2017-05-01T18:58:14.805Z',
 '2017-05-01T18:58:15.149Z',
 '2017-05-01T18:58:17.680Z',
 '2017-05-01T18:58:39.290Z',
 '2017-05-01T18:58:53.000Z',
 '2017-05-01T18:58:54.000Z',
 '2017-05-01T18:58:55.000Z',
 '2017-05-01T18:58:56.000Z',
 '2017-05-01T18:59:04.000Z',
 '2017-05-01T18:59:05.000Z',
 '2017-05-01T18:59:05.633Z',
 '2017-05-01T18:59:06.758Z',
 '2017-05-01T18:59:06.821Z',
 '2017-05-01T18:59:09.000Z',
 '2017-05-01T18:59:09.415Z',
 '2017-05-01T18:59:09.430Z',
 '2017-05-01T18:59:10.000Z',
 '2017-05-01T18:59:11.000Z',
 '2017-05-01T18:59:15.000Z',
 '2017-05-01T18:59:15.028Z',
 '2017-05-01T18:59:15.040Z',
 '2017-05-01T18:59:15.477Z',
 '2017-05-01T18:59:15.915Z',
 '2017-05-01T18:59:15.946Z',
 '2017-05-01T18:59:16.000Z',
 '2017-05-01T18:59:16.368Z',
 '2017-05-01T18:59:17.000Z',
 '2017-05-01T18:59:19.540Z',
 '2017-05-01T1

# Time Series Analysis

In [21]:
def freqdist(hostname, colorname):
    
    timekey = "@timestamp"
    hostkey = "host"

    timelist = [x[timekey] for x in loglist if hostname in x[hostkey]]
    
    timefreq_df = pd.DataFrame({"timestamp":timelist})
    timefreq_df["timestamp"] = pd.to_datetime(timefreq_df["timestamp"])

    timefreq_df["timestamp"] = timefreq_df["timestamp"].apply(lambda x: "%d:%d:%d" % (x.hour, x.minute, x.second))

    timefreq = timefreq_df["timestamp"].value_counts().sort_index()
    timefreq_df = pd.DataFrame({"timestamp":timefreq.index, 'count':timefreq.values}) 

    plotlydata = go.Scatter(x=timefreq_df["timestamp"],
                        y=timefreq_df["count"],
                        name = hostname,
                        line = dict(color = colorname),
                        opacity = 0.4)
    return plotlydata

In [27]:
tracedata = []
colorpal = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
]
for host,colrs in zip(uniquehostlist,colorpal):
    tracedata.append(freqdist(host, colrs))

layout = dict(title="Time distribution for hosts",)

fig = dict(data=tracedata, layout=layout)

iplot(fig)

In [26]:
totalevents = len(timestamplist)
starttime = pd.to_datetime(min(timestamplist))
endtime = pd.to_datetime(max(timestamplist))
duration = endtime - starttime

print("totalevents: ", totalevents, "\n", "starttime: ",starttime ,"\n", "endtime:", endtime, "\n", "duration: ", duration)

totalevents:  1457 
 starttime:  2017-05-01 18:57:51.680000+00:00 
 endtime: 2017-05-01 20:22:04.009000+00:00 
 duration:  0 days 01:24:12.329000


# Some high level visualisation for evidence

In [28]:
def getEvidenceStruct(jsondata):
    '''Get the top most level of a nested evidence to determine available fields'''
    G = nx.DiGraph()
    for key in jsondata.keys():
        G.add_edge('ROOT',key)

    def walk(node):
        for key, item in node.items():
            if isinstance(item, list):
                for i in item:
                    if isinstance(i, dict):
                        for kk, vv in i.items():
                            G.add_edge(str(key),str(kk))
                    else:
                        G.add_edge(str(key),str(i))
            if isinstance(item, dict):
                for j in item.keys():
                    G.add_edge(str(key),str(j))
                walk(item)
                
    walk(jsondata)
    return G

In [29]:
G = getEvidenceStruct(loglist[2])

nt = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", directed="true")
nt.show_buttons()
nt.from_nx(G)
nt.show("nx.html")

In [30]:
def getEvidenceVals(jsondata):
    '''Get the inner level of a nested evidence to determine values of fields'''
    G = nx.DiGraph()
    for key in jsondata.keys():
        G.add_edge('ROOT',key)

    def walk(node):
        for key, item in node.items():
            if isinstance(item, list):
                for i in item:
                    if isinstance(i, dict):
                        for kk, vv in i.items():
                            G.add_edge(key,kk)
                            G.add_edge(kk,vv)
                    else:
                        G.add_edge(key,i)
            if isinstance(item, dict):
                for j in item.keys():
                    G.add_edge(key,j)
                walk(item)
            if isinstance(item,str):
                G.add_edge(key, item)
                
    walk(jsondata)
    return G

In [31]:
G = getEvidenceVals(loglist[2])

nt = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", directed="true")
nt.show_buttons()
nt.from_nx(G)
nt.show("nx.html")

In [25]:
G = nx.DiGraph()

for evid in tqdm(loglist):
    #print(evid)
    if isinstance(evid, dict):
        G = nx.compose(G, getEvidenceStruct(evid))


nt = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", directed="true")
nt.show_buttons()
nt.from_nx(G)
nt.show("nx.html")    

100%|████████████████| 3605/3605 [00:01<00:00, 3152.58it/s]


# Add to neo4j graph DB

In [32]:
# set up authentication parameters
graph = Graph(password="admin")

In [30]:
# Graph dumps
graph.delete_all()

# Non-timebased modelling/Corelation modeling

In [29]:
def remove_dupe_dicts(l):
  return [
    dict(t) 
    for t in {
      tuple(d.items())
      for d in l
    }
  ]

def tuplepair(n1_n2_r, dictlist):
    triplesList = []
    n1 = n1_n2_r[0]
    n2 = n1_n2_r[1]
    r = n1_n2_r[2]
    for ind, records in enumerate(dictlist):
        node1 = nested_lookup(n1,records)
        node2 = nested_lookup(n2,records)
       
        if node1 and node2:
            for subnode2 in node2:
                if isinstance(subnode2, str):
                    triplesList.append({
                        "node1type" : n1,
                        "node1label" : node1[0],
                        "node2type" : n2,
                        "node2label": subnode2,
                        "relationship": n2,
                        "logindex" : ind
                    })
    return triplesList

def addRelationship(tripleList):
    for dic in tqdm(tripleList):
        tx = graph.begin()
        node1 = Node(dic["node1type"], label = dic["node1label"], index = dic["logindex"])
        node2 = Node(dic["node2type"], label = dic["node2label"])
        newRelationship = Relationship(node1, dic["relationship"], node2)
        
        tx.merge(newRelationship, dic["node1type"], "label")
        tx.commit()

In [31]:
# Host-Event graph
n1_n2_r = ("host","EventID", "event_created")
tuplelist = tuplepair(n1_n2_r, loglist)
addRelationship(tuplelist)

100%|█████████████████| 1817/1817 [00:08<00:00, 222.21it/s]


In [32]:
def eventdatatransform(dictlist):
    host_eventlist = []
    for ind, records in enumerate(dictlist):

        lookup = nested_lookup("EventData",records)
        if isinstance(lookup[0], dict):
            data = lookup[0]["Data"]
            if data and len(data)>0:
                for dataitem in data:
                    if isinstance(dataitem, dict):
                        if "@Name" in dataitem.keys() and "#text" in dataitem.keys():

                            host_eventlist.append((
                                records["host"],
                                dataitem["@Name"], dataitem["#text"]
                            ))
                        
    return host_eventlist
                        
    
    
def nestedEventData(dictlist):
    '''Get event data
    Required: Audit log, nested json, EventData, @Name, #text attributes'''
    triplesList = []
    
    for ind, records in enumerate(dictlist):
        triplesList.append({
            "node1type" : "host",
            "node1label" : records[0],
            "node2type" : records[1],
            "node2label": records[2],
            "relationship": records[1],
            "logindex" : ind
        })
    return triplesList

In [33]:
# Event-data graph
tuplistofevents = eventdatatransform(loglist)
neo_hostdata = nestedEventData(tuplistofevents)
addRelationship(neo_hostdata)

100%|███████████████| 20597/20597 [01:36<00:00, 212.92it/s]


# Time base modelling
### Reference: https://neo4j.com/blog/visualize-time-based-graphs-neo4j/

In [99]:
def tuplepair(n1_n2_r, dictlist):
    triplesList = []
    n1 = n1_n2_r[0]
    n2 = n1_n2_r[1]
    r = n1_n2_r[2]
    for ind, records in enumerate(dictlist):
        node1 = nested_lookup(n1,records)
        node2 = nested_lookup(n2,records)
        t = nested_lookup("@timestamp",records)
        eventdata = nested_lookup("raw",records)
        if node1 and node2:
            for subnode2 in node2:
                if isinstance(subnode2, str):
                    triplesList.append({
                        "node1type" : n1,
                        "node1" : node1[0],
                        "node2type" : n2,
                        "node2": subnode2,
                        "relationship": r,
                        "time" : t[0],
                        "logindex" : ind
                    })
    return triplesList

# Running Graph Queries

In [33]:
result = graph.run(
    "MATCH (h)-[r]->(a) \
    RETURN h.label, type(r), a.label"
).to_table()
result

h.label,type(r),a.label
cmd.exe,Create Process,svchost.exe
cmd.exe,Create Process,expand.exe
cmd.exe,Create Process,cliconfg.exe
cmd.exe,Create Process,sc.exe
cmd.exe,Create Process,systeminfo.exe
cmd.exe,Create Process,reg.exe
cmd.exe,Create Process,eventvwr.exe
cmd.exe,Create Process,timeout.exe
cmd.exe,Create Process,tasklist.exe
cmd.exe,Create Process,conhost.exe


In [36]:
# find host which had accessed the ip address 10.3.12.98
result = graph.run(
    "MATCH (h)-[r]->(a) \
    WHERE a.label = '10.3.12.98' \
    RETURN h.label"
).to_table()
print("Host which had accessed the ip address 10.3.12.98 " + str(result))

Host which had accessed the ip address 10.3.12.98 


In [37]:
# find suspected user BRAWLCO\HANAHAN-PC$ who interected with hosts
result = graph.run(
    "MATCH (h)-[r]->(a) \
    WHERE a.label = 'BRAWLCO\\\HANAHAN-PC$' AND type(r)= 'TargetUserSid'\
    RETURN h.label"
).to_table()
result

In [38]:
#Find host which used explicit credential logon details

result = graph.run(
    "MATCH (h)-[r]->(a) \
    WHERE a.label = '4648' AND type(r)= 'EventID'\
    RETURN h.label"
).to_table()
result