In [1]:
import json
from nested_lookup import get_all_keys, get_occurrences_and_values
import pandas as pd

from operator import itemgetter

import numpy as np
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
"""Extract nested values from a JSON tree."""
def json_extract(obj, key):
    """Recursively fetch values from nested JSON."""
    arr = []

    def extract(obj, arr, key):
        """Recursively search for values of key in JSON tree."""
        if isinstance(obj, dict):
            for k, v in obj.items():
                if isinstance(v, (dict, list)):
                    extract(v, arr, key)
                elif k == key:
                    arr.append(v)
        elif isinstance(obj, list):
            for item in obj:
                extract(item, arr, key)
        return arr

    values = extract(obj, arr, key)
    return values

In [3]:
outputDir = ".\\sample_dataset\\output\\"
outputfile_json = outputDir + "preprocessed.json"

with open(outputfile_json) as f:
    jsondata = json.load(f)

In [4]:
# get all unique keys in log
availablekeys = list(set(get_all_keys(jsondata)))
sorted(availablekeys)

['#attributes',
 'ActivityID',
 'AuthenticationLevel',
 'AuthenticationService',
 'Channel',
 'Computer',
 'Correlation',
 'Endpoint',
 'Event',
 'EventData',
 'EventID',
 'EventRecordID',
 'Execution',
 'Guid',
 'ImpersonationLevel',
 'InterfaceUuid',
 'KernelTime',
 'Keywords',
 'Level',
 'Name',
 'NetworkAddress',
 'Opcode',
 'Options',
 'ProcNum',
 'ProcessID',
 'ProcessorID',
 'Protocol',
 'Provider',
 'Security',
 'System',
 'SystemTime',
 'Task',
 'ThreadID',
 'TimeCreated',
 'UserTime',
 'Version',
 'xmlns']

In [None]:
'''TODO: Create a transformar'''

In [14]:
[i for i in availablekeys if "time" in i.lower()]

['KernelTime', 'SystemTime', 'TimeCreated', 'UserTime']

In [7]:
# get all eventid in log
qkey = 'EventID'

def getUnique(dictlist, qkey):
    getquery = json_extract(dictlist, qkey)
    result = [i for i in getquery]
    uniqueResult = list(set(result))
    return uniqueResult

uniqueEventid = getUnique(jsondata, qkey)
uniqueEventid

[6]

In [8]:
winevntcode = r'C:\Users\A101889\Desktop\Workspace\Dataset\Lookup\WindowsEventCodes.csv'
df = pd.read_csv(winevntcode)

In [9]:
def lookupevent(alleventcodes):
    rellist = []
    for i in alleventcodes:
        query = df.loc[df['EventCode'] == int(i), 'EventDescription']
        query = query.get_values()
        if query.size > 0:
            rellist.append((str(i), query[0]))
    return rellist

impevtlist = lookupevent(uniqueEventid)
impevtlist

[('6', 'New Kernel Filter Driver or Driver Loaded')]

In [10]:
def getOccurances(dictlist, valuequery):
    '''Get the occrunces of a value in log evidence in a list of dict'''
    rel = get_occurrences_and_values(dictlist, value=valuequery)[valuequery]["occurrences"]
    return rel

In [11]:
# Rank eventid occurances
ranktuplist = []
for i, j in impevtlist:

    getocur = getOccurances(jsondata, int(i))
   
    ranktuplist.append((i, getocur))

ocranceresult = sorted(ranktuplist, key=itemgetter(1), reverse=True)
ocranceresult

[('6', 866)]

In [12]:
# get all host in log
uniquehostlist = getUnique(jsondata, 'Computer' )
uniquehostlist

['LAPTOP-JU4M3I0E']

In [16]:
# get all timestamps
timestamplist = getUnique(jsondata, 'SystemTime')
timestamplist = sorted(timestamplist)
timestamplist

['2020-09-18T14:02:36.219534Z',
 '2020-09-18T14:02:36.585160Z',
 '2020-09-18T14:02:36.842637Z',
 '2020-09-18T14:02:37.004246Z',
 '2020-09-18T14:02:37.004320Z',
 '2020-09-18T14:02:37.004396Z',
 '2020-09-18T14:02:37.004415Z',
 '2020-09-18T14:02:37.004444Z',
 '2020-09-18T14:02:37.176107Z',
 '2020-09-18T14:02:37.177575Z',
 '2020-09-18T14:02:37.177624Z',
 '2020-09-18T14:02:37.178185Z',
 '2020-09-18T14:02:37.232805Z',
 '2020-09-18T14:02:37.233682Z',
 '2020-09-18T14:02:37.234407Z',
 '2020-09-18T14:02:37.234927Z',
 '2020-09-18T14:02:37.236243Z',
 '2020-09-18T14:02:37.239598Z',
 '2020-09-18T14:02:37.395409Z',
 '2020-09-18T14:02:37.396101Z',
 '2020-09-18T14:02:37.397707Z',
 '2020-09-18T14:02:37.397861Z',
 '2020-09-18T14:02:37.398770Z',
 '2020-09-18T14:02:38.006405Z',
 '2020-09-18T14:02:38.006498Z',
 '2020-09-18T14:02:38.006579Z',
 '2020-09-18T14:02:38.006599Z',
 '2020-09-18T14:02:38.006622Z',
 '2020-09-18T14:02:39.160372Z',
 '2020-09-18T14:02:39.629036Z',
 '2020-09-18T14:02:41.441889Z',
 '2020-0

# Time Series Analysis

In [17]:
def freqdist(hostname, colorname):
    
    timekey = "UtcTime"
    hostkey = "UserID"

    timelist = [x[timekey] for x in jsondata if hostname in x.values()]
    
    timefreq_df = pd.DataFrame({"timestamp":timelist})
    timefreq_df["timestamp"] = pd.to_datetime(timefreq_df["timestamp"])

    timefreq_df["timestamp"] = timefreq_df["timestamp"].apply(lambda x: "%d:%d:%d" % (x.hour, x.minute, x.second))

    timefreq = timefreq_df["timestamp"].value_counts().sort_index()
    timefreq_df = pd.DataFrame({"timestamp":timefreq.index, 'count':timefreq.values}) 

    plotlydata = go.Scatter(x=timefreq_df["timestamp"],
                        y=timefreq_df["count"],
                        name = hostname,
                        line = dict(color = colorname),
                        opacity = 0.4)
    return plotlydata

In [18]:
tracedata = []
colorpal = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
]
for host,colrs in zip(uniquehostlist,colorpal):
    tracedata.append(freqdist(host, colrs))

layout = dict(title="Time distribution for hosts",)

fig = dict(data=tracedata, layout=layout)

iplot(fig)

In [19]:
totalevents = len(timestamplist)
starttime = pd.to_datetime(min(timestamplist))
endtime = pd.to_datetime(max(timestamplist))
duration = endtime - starttime

print("totalevents: ", totalevents, "\n", "starttime: ",starttime ,"\n", "endtime:", endtime, "\n", "duration: ", duration)

totalevents:  414 
 starttime:  2020-09-18 14:02:36.219534+00:00 
 endtime: 2020-09-18 14:02:56.237772+00:00 
 duration:  0 days 00:00:20.018238
