Copyright (C) Egon Kidmose 2015-2017

This file is part of lstm-rnn-correlation.

lstm-rnn-correlation is free software: you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.

lstm-rnn-correlation is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with lstm-rnn-correlation. If not, see
<http://www.gnu.org/licenses/>.


In [None]:
from __future__ import division

import pandas as pd
from pandas import DataFrame
import numpy as np
import re
import datetime
import ipaddress

# matplotlib
try: # might, might not have x available
    import os
    os.environ['DISPLAY']
except KeyError:
    import matplotlib
    matplotlib.use('Agg')
try: # might, might not be notebook
    %matplotlib inline
except NameError:
    pass
import matplotlib.pyplot as plt

seed = 1468586473 # Unix time at time of writing
def rndseed():
    """Advance the module-level ``seed`` by one and return the new value.

    Every call therefore hands out a different, but reproducible, seed.
    """
    global seed
    seed = seed + 1
    return seed

In [None]:
from data_cfg import data
# load files
# Parse every alert file named in the `data` configuration table and combine
# the results into a single frame.
alert_frames = []
for _, row in data.iterrows():
    print("Opening: {}".format(row['filename']))
    with open(row['filename']) as f:
        alerts = DataFrame(
            [parse_line(line) for line in f],
            columns=['ts', 'rid', 'msg', 'prio', 'proto', 'srcip', 'srcport', 'dstip', 'dstport']
        )
    alerts['incident'] = row['incident']
    # pd.merge joins on the columns both frames share, which pulls the
    # configuration columns onto every alert row.
    # NOTE(review): if several configuration rows carry the same 'incident'
    # value, each alert would be repeated once per such row -- confirm
    # against data_cfg.
    alert_frames.append(pd.merge(data, alerts))
    print("Loaded: {}".format(row['filename']))
# Concatenate once at the end instead of growing a frame inside the loop,
# which would copy all previously loaded rows on every iteration.
data_tmp = pd.concat(alert_frames) if alert_frames else DataFrame()
data = data_tmp

In [None]:
# Point `data` at the frame of loaded alerts held in `data_tmp`.
data = data_tmp

In [None]:
# Regression checks for inputs the address and alert patterns must handle.
# They rely on IP, IP_PORT and parse_line, which are defined in a later cell
# of this file, so that cell has to be executed first.
difficult_ips = [
    '94.63.149.152',
    '147.32.84.165',
]
for ip in difficult_ips:
    match = re.match('('+IP+')', ip)
    assert match is not None, "IP pattern did not match {}".format(ip)
    # The whole address must be consumed, not merely a prefix of it.
    assert match.group() == ip, "IP pattern only matched part of {}".format(ip)

difficult_ips_port = [
    '94.63.149.152:80',
    '147.32.84.165:1040',
]
for ip in difficult_ips_port:
    match = re.match(IP_PORT, ip)
    # Check the match itself: .groups() never returns None, a failed match
    # shows up as `match` being None.
    assert match is not None, "IP_PORT pattern did not match {}".format(ip)
    res = match.groups()
    assert len(res) == 2
    assert ':'.join(res) == ip, "IP_PORT pattern only matched part of {}".format(ip)

difficult_lines = [
    '08/15-15:53:48.900440  [**] [120:3:1] (http_inspect) NO CONTENT-LENGTH OR TRANSFER-ENCODING IN HTTP RESPONSE [**] [Classification: Unknown Traffic] [Priority: 3] {TCP} 94.63.149.152:80 -> 147.32.84.165:1040\n',
]
for l in difficult_lines:
    res = parse_line(l)
    assert len(res) == 9

In [None]:
# Building blocks for matching addresses in Snort alert lines.
# Raw string literals keep the regex backslashes out of Python's own
# string-escape processing.
IPV4 = r'(?:[0-9]{1,3}(?:\.[0-9]{1,3}){3})'
IPV6 = r'(?:[0-9a-f]|:){1,4}(?::(?:[0-9a-f]{0,4})*){1,7}'
IP = '(?:{}|{})'.format(IPV4, IPV6)
# Group 1 is the address, group 2 the optional port after a ':'.
IP_PORT = '('+IP+')(?::([^ ]+))?'

# One alert per line, terminated by a newline. Nine capture groups:
# timestamp, rule id, message, priority, protocol, then address and optional
# port for the source and for the destination (IP_PORT used twice).
SNORT_REGEX = re.compile(
    r'^(.*)  \[\*\*] \[([^]]*)] (.*) \[Priority: ([0-9])] {([^}]*)} '
    + IP_PORT + ' -> ' + IP_PORT + r'\n'
)
SNORT_TS_FMT = '%m/%d/%y-%H:%M:%S.%f'
SNORT_TS_FMT_NO_YR = '%m/%d-%H:%M:%S.%f'

def strptime(string):
    """Parse a Snort alert timestamp into a ``datetime.datetime``.

    The format with a two-digit year (``SNORT_TS_FMT``) is tried first, then
    the format without a year (``SNORT_TS_FMT_NO_YR``). In the latter case
    the year is whatever ``datetime.datetime.strptime`` fills in by default.

    Raises:
        ValueError: if ``string`` matches neither format (or is not a string).
    """
    for fmt in (SNORT_TS_FMT, SNORT_TS_FMT_NO_YR):
        try:
            return datetime.datetime.strptime(string, fmt)
        except (ValueError, TypeError):
            # Only parse failures are expected here; anything else (e.g.
            # KeyboardInterrupt) must propagate.
            continue
    raise ValueError('Failed to parse {}: {}'.format(type(string), string))

def strftime(ts):
    """Render ``ts`` as text using the ``SNORT_TS_FMT`` layout."""
    formatted = ts.strftime(SNORT_TS_FMT)
    return formatted

def parse_line(line):
    """Split one Snort alert line into its fields.

    Args:
        line: a single alert line, including its trailing newline.

    Returns:
        A tuple of the groups captured by ``SNORT_REGEX``, with the first
        element (the timestamp text) converted by ``strptime``.

    Raises:
        ValueError: if ``line`` does not match ``SNORT_REGEX``.
    """
    match = SNORT_REGEX.match(line)
    if match is None:
        # Report the offending line instead of failing on `None.groups()`.
        raise ValueError('Line is not a recognised Snort alert: {!r}'.format(line))
    fields = match.groups()
    return (strptime(fields[0]),) + fields[1:]


def _join_ip_port(ip, port):
    """Return ``ip:port``, or just ``ip`` when no port was captured."""
    return ip if port is None else '{}:{}'.format(ip, port)


def build_line(tupl):
    """Format alert fields back into a Snort alert line.

    Args:
        tupl: either the nine fields produced by ``parse_line`` (timestamp,
            rule id, message, priority, protocol, source address, source
            port, destination address, destination port), or seven fields
            where source and destination are already single strings.

    Returns:
        The alert line, terminated by a newline. The timestamp is rendered
        by ``strftime``.
    """
    if len(tupl) == 9:
        # The template below has one slot per endpoint, so the separate
        # address/port fields have to be merged first; otherwise the source
        # port would land in the destination slot.
        srcip, srcport, dstip, dstport = tupl[5:]
        tupl = tuple(tupl[:5]) + (
            _join_ip_port(srcip, srcport),
            _join_ip_port(dstip, dstport),
        )
    tupl = tuple([strftime(tupl[0])]) + tuple(tupl[1:])
    return "{}  [**] [{}] {} [Priority: {}] {{{}}} {} -> {}\n".format(*tupl)

# Round-trip check (parse_line followed by build_line) over every input file.
# It is kept as a string and is not executed anywhere in the visible code.
# The `b[:5]+b[8:]` alternative removes characters 5-7 of the rebuilt line,
# i.e. the "/yy" year that strftime always emits, so that input lines written
# without a year can still compare equal.
test_code =\
"""
for fn in data['filename']:
    for l in open(fn).readlines():
        p = parse_line(l)
        b = build_line(p)
        assert l == b or l == (b[:5]+b[8:]), "Mismatch in result"
"""

In [None]:
# data overview
# by incident
def get_data_overview(data):
    """Count alerts per incident and per priority within each incident.

    Args:
        data: frame with at least the columns 'incident' and 'prio'.

    Returns:
        A frame without data columns whose index levels carry the result:
        'incident', 'inc_cnt' (alerts in the incident), 'inc_cnt_pct' (share
        of all alerts), 'prio', 'inc_prio_cnt' (alerts of that priority in
        the incident) and 'inc_prio_cnt_pct' (share within the incident).
        Extra rows with incident 'total' give the same figures over all data.
    """
    # Total number of alerts, taken as the non-null count of the first column.
    total_cnt = data.count().iloc[0]

    df_inc_cnt = DataFrame(data.groupby(['incident']).size().rename('inc_cnt').reset_index())
    df_inc_cnt['inc_cnt_pct'] = df_inc_cnt['inc_cnt']/total_cnt*100

    df_inc_prio_cnt = DataFrame(
        pd.merge(data, df_inc_cnt, on='incident').groupby(['incident', 'prio', 'inc_cnt']
    ).size().rename('inc_prio_cnt').reset_index())
    df_inc_prio_cnt['inc_prio_cnt_pct'] = df_inc_prio_cnt['inc_prio_cnt']/df_inc_prio_cnt['inc_cnt']*100

    df_overview = pd.merge(df_inc_cnt, df_inc_prio_cnt).groupby(['incident', 'prio']).first().reset_index()

    # Same figures computed over the whole data set, labelled 'total'.
    tot_prio = data.groupby(['prio']).size().rename('inc_prio_cnt')
    df_tot = DataFrame(tot_prio)
    df_tot['inc_prio_cnt_pct'] = df_tot['inc_prio_cnt']/total_cnt*100
    df_tot['inc_cnt'] = total_cnt
    df_tot['inc_cnt_pct'] = 100
    df_tot['incident'] = 'total'

    # Grouping by every value column moves them all into the index; the only
    # column left over is the helper 'index' from reset_index, which is
    # dropped (axis given by keyword so it works across pandas versions).
    df_overview = pd.concat([df_overview.reset_index(), df_tot.reset_index()])\
        .groupby(['incident', 'inc_cnt', 'inc_cnt_pct', 'prio', 'inc_prio_cnt', 'inc_prio_cnt_pct', ])\
        .first().drop('index', axis=1)

    return df_overview

# Build the overview for the full data set; the bare name on the last line is
# the value the notebook cell displays.
df_overview = get_data_overview(data)
df_overview

In [None]:
# print in latex friendly
# One output line per table row (header first): cells separated by "\t&" and
# terminated by the LaTeX row break followed by \hline.
overview_flat = df_overview.reset_index()
for cells in np.concatenate([
    np.array([overview_flat.columns]),
    # .values gives the same array as DataFrame.as_matrix() did, and is
    # available in pandas versions where as_matrix no longer exists.
    overview_flat.values,
]):
    print(('\t&'.join(['{}']*len(cells)) + '\\\\\\hline').format(*cells))


In [None]:
# data time overview
# Earliest and latest alert timestamp per incident, plus the span between them.
df_ts = (
    data[['incident', 'ts']]
    .groupby('incident')
    .agg(['min', 'max'])
    ['ts']
)
df_ts.columns = ['start', 'stop']
df_ts['dur'] = df_ts['stop'] - df_ts['start']
df_ts

In [None]:
fig = None
ax = None
def time_span_plot(dataframe):
    """Plot one horizontal time span per row of ``dataframe``.

    Dataframe must have 'start' and 'dur' keys. Each row is drawn as an
    error bar centred on the middle of its span, at a height given by the
    row's index label: 0 for 'benign', ``int(label)`` otherwise.

    The figure and axes that are created are stored in the module-level
    ``fig`` and ``ax``, so later code can add to the plot.
    """
    global fig, ax
    fig, ax = plt.subplots()
    for index, row in dataframe.iterrows():
        half_dur = row['dur']/2
        x = row['start'] + half_dur
        # Compare by value: an identity test against a string literal only
        # succeeds when both happen to be the very same object.
        y = 0 if index == 'benign' else int(index)
        ax.errorbar(x, y, xerr=half_dur)

    # Keep only the bottom axis line and its ticks.
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['top'].set_visible(False)

    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('bottom')

    # Widen the y-range by one unit on each side.
    ax.set_ylim(np.array(ax.get_ylim()) + [-1, 1])

    fig.set_size_inches(14, 2)

# Two separate figures: first only the 'benign' row, then every other row.
time_span_plot(df_ts[(df_ts.index == 'benign')])
time_span_plot(df_ts[(df_ts.index != 'benign')])

In [None]:
# calculate random shift to end up within boundaries of benign
# An incident may start anywhere from the benign start up to the latest start
# that still lets its full duration end by the benign stop.
new_start_min = df_ts.loc['benign']['start']
new_start_max = df_ts.loc['benign']['stop']-df_ts['dur']
window = new_start_max - new_start_min
# The window plus the incident's own duration must equal the benign duration.
assert (np.array(
        window+df_ts['dur']-df_ts.loc['benign']['dur'],
        dtype='object') == 0).all(), "window is wrong"

np.random.seed(rndseed())
# Shift = move the incident start onto the benign start, plus a random
# fraction of the incident's window. A list (one draw per incident, in index
# order) is used because a lazy `map` object cannot be added to a Series.
df_ts['shift'] = new_start_min - df_ts['start'] \
    + [np.random.rand()*delta for delta in window]
df_ts

In [None]:
# apply shift
# Attach each incident's row of df_ts (including 'shift') to its alerts and
# move every timestamp by that incident's shift.
shifted_data = pd.merge(data, df_ts.reset_index(), on='incident')
shifted_data['ts'] = shifted_data['ts'] + shifted_data['shift']
# Sanity check: after shifting, the overall latest/earliest timestamps must
# equal those of the 'benign' incident in `data` as it was before the shift.
assert shifted_data['ts'].max() == data.groupby(['incident']).max().loc['benign']['ts']
assert shifted_data['ts'].min() == data.groupby(['incident']).min().loc['benign']['ts']
data = shifted_data
data = data.sort_values('ts')
data = data[data_tmp.columns] # only original columns

# Recompute the per-incident time spans from the shifted timestamps.
df_ts = data[['incident', 'ts']].groupby('incident').agg(['min', 'max'])['ts']
df_ts.columns = ['start', 'stop', ]
df_ts['dur'] = df_ts['stop']-df_ts['start']
df_ts

In [None]:
# Plot the shifted spans, then mark the allowed start range on the same axes
# (time_span_plot leaves them in the module-level `ax`): a green circle at
# the earliest allowed start on row 0 and red circles at the latest allowed
# starts on rows 1, 2, ...
# NOTE(review): this assumes the last entry of new_start_max belongs to
# 'benign' and the remaining entries are the incidents drawn at y = 1..n-1,
# in that order -- confirm.
time_span_plot(df_ts)
ax.plot(new_start_min, 0, 'og')
ax.plot(new_start_max[:-1], range(1, len(new_start_max)), 'or')

In [None]:
# rewrite victim IPs to match benign
# Distinct (incident, victim_ip) pairs for all rows whose victim_ip is not
# the placeholder value 'benign'.
malicious_ips = data[data['victim_ip'] != 'benign']\
    [['incident', 'victim_ip']].drop_duplicates()

# Every distinct address that occurs as source or destination of a benign alert.
data_benign = data[data['incident'] == 'benign']
benign_ips = pd.Series(pd.concat([data_benign['srcip'],data_benign['dstip']]).unique(), name='benign_ips')
def is_private_ip(string):
    """Tell whether ``string`` is a private IPv4 address.

    Text that ``ipaddress.IPv4Address`` rejects is reported on stdout and
    counted as not private.
    """
    try:
        address = ipaddress.IPv4Address(string)
    except ipaddress.AddressValueError:
        print("ignoring IPv6 addres: {}".format(string))
        return False
    return address.is_private
    
# Build the boolean mask and the row list eagerly: np.array and DataFrame
# need real sequences, not the lazy iterators `map`/`zip` may return.
benign_ips = benign_ips[np.array([is_private_ip(ip) for ip in benign_ips])] # only private
# Draw one distinct benign address per (incident, victim_ip) pair.
benign_ips = benign_ips.sample(malicious_ips.shape[0], random_state=rndseed())
# Replacement table: within `incident`, address `from_ip` becomes `to_ip`.
df_replace = DataFrame(
    list(zip(malicious_ips['incident'], malicious_ips['victim_ip'], benign_ips)),
    columns=['incident', 'from_ip', 'to_ip']
)
df_replace

In [None]:
# perform IP replacement
# Source side: find the rows whose (incident, srcip) appears in df_replace,
# overwrite srcip with the replacement, restore the original row labels
# (kept in the 'index' column by reset_index) and write the changed rows back
# into `data` in place.
src_update = pd.merge(data.reset_index(), df_replace, left_on=['incident', 'srcip'], right_on=['incident', 'from_ip'])
src_update['srcip'] = src_update['to_ip']
src_update = src_update.drop(['from_ip', 'to_ip'], axis=1)
src_update = src_update.set_index('index')
data.update(src_update)

# Destination side: same procedure for dstip.
dst_update = pd.merge(data.reset_index(), df_replace, left_on=['incident', 'dstip'], right_on=['incident', 'from_ip'])
dst_update['dstip'] = dst_update['to_ip']
dst_update = dst_update.drop(['from_ip', 'to_ip'], axis=1)
dst_update = dst_update.set_index('index')
data.update(dst_update)

# NOTE(review): the next two checks look like they can never fail -- groupby
# only yields incidents that are present (size >= 1), and `.all()` of an
# empty array is True. Unless 'incident' is categorical, an incident with no
# updated rows goes unnoticed; confirm and compare against
# df_replace['incident'] if that is the intent.
assert (np.array(src_update.groupby('incident').size()) > 0).all(), "No src updated, highly suspicous"
assert (np.array(dst_update.groupby('incident').size()) > 0).all(), "No dst updated, highly suspicous"
# After the rewrite no row may still carry an address that was to be replaced.
assert pd.merge(data, df_replace, left_on=['incident', 'srcip'], right_on=['incident', 'from_ip']).shape[0] == 0
assert pd.merge(data, df_replace, left_on=['incident', 'dstip'], right_on=['incident', 'from_ip']).shape[0] == 0

In [None]:
# Overview of the first half of the rows of `data` (by position).
get_data_overview(data[:data.shape[0]//2])

In [None]:
# Overview of the second half of the rows of `data` (by position).
get_data_overview(data[data.shape[0]//2:])