# Log parser and preprocessing
Currently accepts only Windows event log (`.evtx`) files as input.


In [2]:
from evtx import PyEvtxParser
import json
from pprint import pprint
import glob

# Visualisation
import networkx as nx
from pyvis.network import Network
from nested_lookup import nested_lookup, get_all_keys

In [3]:
# File extension that identifies a log file
logextension = ".evtx"

# Input and output locations (relative paths, trailing separator included)
inputDir = ".\\sample_dataset\\input\\"
outputDir = ".\\sample_dataset\\output\\"

# Destination file for the combined, preprocessed records
outputfile_json = f"{outputDir}preprocessed.json"

In [4]:
# Collect every file in the input directory that carries the log extension
inputfilelist = glob.glob(f"{inputDir}*{logextension}")
inputfilelist

['.\\sample_dataset\\input\\4794_DSRM_password_change_t1098.evtx',
 '.\\sample_dataset\\input\\ACL_ForcePwd_SPNAdd_User_Computer_Accounts.evtx',
 '.\\sample_dataset\\input\\babyshark_mimikatz_powershell.evtx',
 '.\\sample_dataset\\input\\CA_4624_4625_LogonType2_LogonProc_chrome.evtx',
 '.\\sample_dataset\\input\\CA_chrome_firefox_opera_4663.evtx',
 '.\\sample_dataset\\input\\CA_DCSync_4662.evtx',
 '.\\sample_dataset\\input\\CA_hashdump_4663_4656_lsass_access.evtx',
 '.\\sample_dataset\\input\\CA_keefarce_keepass_credump.evtx',
 '.\\sample_dataset\\input\\CA_keepass_KeeThief_Get-KeePassDatabaseKey.evtx',
 '.\\sample_dataset\\input\\CA_Mimikatz_Memssp_Default_Logs_Sysmon_11.evtx',
 '.\\sample_dataset\\input\\CA_protectedstorage_5145_rpc_masterkey.evtx',
 '.\\sample_dataset\\input\\CA_sysmon_hashdump_cmd_meterpreter.evtx',
 '.\\sample_dataset\\input\\CA_teamviewer-dumper_sysmon_10.evtx',
 '.\\sample_dataset\\input\\dc_applog_ntdsutil_dfir_325_326_327.evtx',
 '.\\sample_dataset\\input\\dis

In [5]:
# Peek at a single record to inspect the structure of the parsed data.
# Only the first record is taken from the record iterator, which avoids
# building a list of every record in the file just to read element 0.
# An input file with no records raises StopIteration here.
first_record = next(iter(PyEvtxParser(inputfilelist[5]).records_json()))
sample = json.loads(first_record["data"])
pprint(sample)

{'Event': {'#attributes': {'xmlns': 'http://schemas.microsoft.com/win/2004/08/events/event'},
           'EventData': {'AccessList': '%%7688\r\n\t\t\t\t',
                         'AccessMask': '0x100',
                         'AdditionalInfo': '-',
                         'AdditionalInfo2': '',
                         'HandleId': '0x0',
                         'ObjectName': '%{c6faf700-bfe4-452a-a766-424f84c29583}',
                         'ObjectServer': 'DS',
                         'ObjectType': '%{19195a5b-6da0-11d0-afd3-00c04fd930c9}',
                         'OperationType': 'Object Access',
                         'Properties': '%%7688\r\n'
                                       '\t\t'
                                       '{1131f6ad-9c07-11d1-f79f-00c04fc2dcd2}\r\n'
                                       '\t'
                                       '{19195a5b-6da0-11d0-afd3-00c04fd930c9}\r\n',
                         'SubjectDomainName': 'insecurebank',
              

In [6]:
# Parse every input log and gather all decoded records into one list
jsonlist = []

for filepath in inputfilelist:
    # TODO: data sanitizer function (to run on each raw record before decoding)
    records = PyEvtxParser(filepath).records_json()
    jsonlist.extend(json.loads(record["data"]) for record in records)

# Write the combined records to the output directory as a single JSON array
with open(outputfile_json, 'w') as filewriter:
    json.dump(jsonlist, filewriter)

In [7]:
"""Extract nested values from a JSON tree."""
def json_extract(obj, key):
    """Collect the values stored under ``key`` anywhere in nested JSON data.

    Dicts and lists are walked depth-first in their natural order. A value
    is collected only when it is a leaf (neither dict nor list); a dict or
    list stored under ``key`` is descended into rather than collected.

    Returns the value itself when exactly one match exists, otherwise the
    list of matches (an empty list when nothing matched).
    """

    def walk(node):
        """Yield the matching leaf values found beneath ``node``."""
        if isinstance(node, dict):
            for name, value in node.items():
                if isinstance(value, (dict, list)):
                    yield from walk(value)
                elif name == key:
                    yield value
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)

    matches = list(walk(obj))
    return matches[0] if len(matches) == 1 else matches

In [8]:
# Load the event transformer lookup file into a dict.
# The context manager closes the file handle once the JSON has been read.
with open('./transformer_lookup.json') as f:
    eventdict = json.load(f)

def logtransform(logdict, **kwargs):
    """Reduce raw event records to the properties listed in ``eventdict``.

    For each record, the property names registered for its event id in the
    module-level ``eventdict`` lookup are extracted with ``json_extract``
    and stored in a flat dict.

    Args:
        logdict: iterable of parsed event records (nested dicts/lists).
        **kwargs: optional ``eventid``. When given, only records whose
            ``EventID`` values contain exactly this value are transformed
            (intended for debugging). The comparison is type-sensitive, so
            pass the id in the same type the records use.

    Returns:
        A list with one dict per transformed record, in input order.
        Without ``eventid``, records whose first ``EventID`` has no entry
        in ``eventdict``, or that have no ``EventID`` at all, are skipped.

    Raises:
        KeyError: if ``eventid`` is given, matches at least one record, and
            has no entry in ``eventdict``.
    """
    filter_by_id = "eventid" in kwargs
    eventidsel = kwargs.get("eventid")
    logXformedlist = []

    for item in logdict:
        found_ids = nested_lookup("EventID", item)

        if filter_by_id:
            # Debug mode: keep only records carrying the selected event id
            if eventidsel not in found_ids:
                continue
            lookup_key = str(eventidsel)
        else:
            # A record without any EventID cannot be matched against the
            # lookup; skip it instead of failing with IndexError
            if not found_ids:
                continue
            lookup_key = str(found_ids[0])  # lookup keys are strings
            if lookup_key not in eventdict:
                continue

        logXformedlist.append(
            {prop: json_extract(item, prop) for prop in eventdict[lookup_key]}
        )

    return logXformedlist

# Replace the raw records in jsonlist with their transformed form,
# then display the result as the cell output
jsonlist = logtransform(jsonlist)
jsonlist

[{'EventID': 5136,
  'SystemTime': '2019-03-25T12:33:44.816657Z',
  'SubjectUserSid': 'S-1-5-21-738609754-2819869699-4189121830-1108',
  'SubjectUserName': 'bob',
  'SubjectDomainName': 'insecurebank',
  'SubjectLogonId': '0x8d7099',
  'OpCorrelationID': '7A0DDD82-407E-41DA-83B1-014A84297AB8',
  'AppCorrelationID': '-',
  'DSName': 'insecurebank.local',
  'DSType': '%%14676',
  'ObjectDN': 'CN={6AC1786C-016F-11D2-945F-00C04FB984F9},CN=POLICIES,CN=SYSTEM,DC=INSECUREBANK,DC=LOCAL',
  'ObjectClass': 'groupPolicyContainer',
  'AttributeLDAPDisplayName': 'versionNumber',
  'AttributeSyntaxOID': '2.5.5.9',
  'AttributeValue': '9',
  'OperationType': '%%14674'},
 {'EventID': 5136,
  'SystemTime': '2019-03-25T12:33:44.816657Z',
  'SubjectUserSid': 'S-1-5-21-738609754-2819869699-4189121830-1108',
  'SubjectUserName': 'bob',
  'SubjectDomainName': 'insecurebank',
  'SubjectLogonId': '0x8d7099',
  'OpCorrelationID': '7A0DDD82-407E-41DA-83B1-014A84297AB8',
  'AppCorrelationID': '-',
  'DSName': 'i

In [9]:
# Load the MITRE / Windows event lookup file (mitre_winevent_lookup.json).
# The context manager closes the file handle once the JSON has been read.
with open('./mitre_winevent_lookup.json') as mitrefile:
    mitrelookup = json.load(mitrefile)
mitrelookup

[{'EventID': [4688, 4663],
  'Tactic': 'Collection',
  'TechniqueName': 'Audio Capture',
  'TechniqueID': 'T1123'},
 {'EventID': [4688, 4663],
  'Tactic': 'Collection',
  'TechniqueName': 'Automated Collection',
  'TechniqueID': 'T1119'},
 {'EventID': [4688, 4688, 200, 4663, 5861, 500, 4100, 4104],
  'Tactic': 'Collection',
  'TechniqueName': 'Data from Local System',
  'TechniqueID': 'T1005'},
 {'EventID': [4688, 4688, 5140, 4663, 5145],
  'Tactic': 'Collection',
  'TechniqueName': 'Data from Network Shared Drive',
  'TechniqueID': 'T1039'},
 {'EventID': [4688, 4688, 4657, 4663, 5140, 5145],
  'Tactic': 'Collection',
  'TechniqueName': 'Data from Removable Media',
  'TechniqueID': 'T1025'},
 {'EventID': [4688, 4688, 4663],
  'Tactic': 'Collection',
  'TechniqueName': 'Data Staged',
  'TechniqueID': 'T1074'},
 {'EventID': [4688, 5156, 4624, 4663],
  'Tactic': 'Collection',
  'TechniqueName': 'Email Collection',
  'TechniqueID': 'T1114'},
 {'EventID': [4624, 4688],
  'Tactic': 'Collecti

In [53]:
from py2neo import Graph, Node, Relationship, NodeMatcher, cypher

# Connect to the graph database; only the password is supplied, every other
# connection setting is left at the library default.
# NOTE(review): the password is hard-coded — consider loading it from configuration.
graph = Graph(password="admin")
# NOTE(review): presumably opens a transaction; it is not used in the code
# visible here — confirm it is needed.
tx = graph.begin()
# Matcher used to look up existing nodes in the graph
nodes = NodeMatcher(graph)

In [56]:
# Link the subject user of each record to the already-stored node of its event id
for item in jsonlist:
    # Records without a subject user have nothing to link
    if 'SubjectUserName' not in item:
        continue

    # A fresh User node object is built for every record.
    # NOTE(review): presumably graph.create then inserts a new User node each
    # time — confirm whether de-duplication (merge) is intended.
    user_node = Node("User", name = item['SubjectUserName'])
    event_node = nodes.match('Event', name=str(item['EventID'])).first()

    # Only link to Event nodes that already exist in the graph
    if not event_node:
        continue

    graph.create(Relationship(user_node, 'created', event_node))
    print("done")

done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
