In [233]:
%reset -f
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import profiling as prof
from nltk.util import ngrams
from collections import Counter
import itertools
from numpy import dot
from numpy.linalg import norm
from math import sqrt

# Binetflow capture file for each scenario number analysed in this notebook.
capture_paths = {
    9: "data/capture20110817.binetflow",
    10: "data/capture20110818-2.binetflow",
    11: "data/capture20110818.binetflow",
    12: "data/capture20110819.binetflow",
}

# Load all raw captures first, then pre-process each one together with its
# scenario number (same call order as loading/pre-processing them one by one).
data9, data10, data11, data12 = [
    prof.load_data(path) for path in capture_paths.values()
]
pdata9, pdata10, pdata11, pdata12 = [
    prof.pre_process(raw_capture, scenario)
    for scenario, raw_capture in zip(capture_paths, (data9, data10, data11, data12))
]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [323]:
def _ngram_count_vector(symbols, vocabulary_index, n):
    """Count the n-grams of a symbol sequence into a fixed-length vector.

    Parameters
    ----------
    symbols : iterable of str
        Discretized feature values of one host, in their original order.
    vocabulary_index : dict
        Maps every possible n-gram (tuple of str) to its position in the vector.
    n : int
        Length of the n-grams.

    Returns
    -------
    list of int
        Occurrence count per vocabulary n-gram; all zeros when the sequence
        holds fewer than ``n`` symbols.

    Raises
    ------
    KeyError
        If the sequence contains an n-gram that is not in the vocabulary.
    """
    counts = [0] * len(vocabulary_index)
    for gram, occurrences in Counter(ngrams(symbols, n)).items():
        counts[vocabulary_index[gram]] = occurrences
    return counts


def _cosine_distance(reference_counts, other_counts):
    """Return the cosine distance (1 - cosine similarity) of two count vectors.

    A vector without any n-grams has zero norm, which would make the similarity
    undefined (0/0 -> NaN). Such a pair is reported as maximally dissimilar
    among non-negative count vectors (distance 1) instead.
    """
    denominator = norm(reference_counts) * norm(other_counts)
    if denominator == 0:
        return 1.0
    return 1 - dot(reference_counts, other_counts) / denominator


scenarios = [9, 10, 11, 12]
preprocessed_by_scenario = {9: pdata9, 10: pdata10, 11: pdata11, 12: pdata12}

# Feature to use, including its optimal binsize, and discretized to be used for n-grams
feature = "Protocol"
nbins = 3
ngram = 3

# Candidate thresholds from 0 up to (not including) 0.25, in steps of 0.01
thresholds = np.arange(0,0.25,0.01)
# Threshold picked by manual evaluation of the grid search (see the comments at the end of this cell)
chosen_threshold = 0.18

# Every possible n-gram over the bin labels; a host's profile is its count vector over this list.
# Assumes the discretized values stringify as '0' .. str(nbins - 1) -- TODO confirm against prof.discretize_feature
all_n_grams = list(itertools.product([str(bin_label) for bin_label in range(nbins)], repeat=ngram))
n_gram_position = {gram: position for position, gram in enumerate(all_n_grams)}

# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and is quadratic when used in a loop.
grid_search_rows = []

for scenario in scenarios:
    # Work on a copy so the pre-processed frame of the previous cell stays untouched
    pdata = preprocessed_by_scenario[scenario].copy()

    discretized_data = pdata[["StartTime", "SourceAddress", "DestinationAddress"]].copy()
    pdata["Protocol"] = prof.encode_feature(pdata["Protocol"])
    discretized_data[feature], binsedges_infected = prof.discretize_feature(pdata, feature, nbins, "kmeans")

    # Select source with most entries in given dataset to deliver most reliable n-gram profile
    # The n-gram count vector of this host is the profile every other host is compared with
    selected_source = prof.select_infected_host(pdata,scenario)
    filtered_source = discretized_data.loc[discretized_data.SourceAddress == selected_source]
    n_grams_counts = _ngram_count_vector(filtered_source[feature].astype(str), n_gram_position, ngram)

    # Group netflows by source address (host) because we are interested in modeling per-host behavior
    unique_sources = pdata.groupby(["SourceAddress"]).size().reset_index()

    # Calculate the cosine distance for every source compared to the selected source/host
    distances = []
    for source_address in unique_sources.SourceAddress:
        filtered = discretized_data.loc[discretized_data.SourceAddress == source_address]
        n_grams_source_counts = _ngram_count_vector(filtered[feature].astype(str), n_gram_position, ngram)
        distances.append(_cosine_distance(n_grams_counts, n_grams_source_counts))

    # Create an overview of the results including the original label and predicted label
    # Label: 1 = infected, 0 = normal, -1 = neither (unknown)
    result_rows = []
    for source, distance in zip(unique_sources.SourceAddress, distances):
        infected = prof.is_infected(source, scenario)
        normal = prof.is_normal(source)

        # Explicit comparison kept: the return type of the prof helpers is not visible here
        if infected == True:
            label = 1
        elif normal == True:
            label = 0
        else:
            label = -1

        result_rows.append({"SourceAddress": source, "Cosine_Distance": distance, "Label": label, "Prediction": ''})

    results = pd.DataFrame(result_rows, columns=["SourceAddress","Cosine_Distance","Label","Prediction"])

    known_infected = results["Label"] == 1
    known_normal = results["Label"] == 0
    unlabeled = results["Label"] == -1

    # For each candidate threshold we check the results to be able to choose a good threshold.
    # This is only based on 'training' scenario 9, the other sets will use the threshold
    # that has been chosen optimal in scenario 9
    for threshold in thresholds:
        # A host is predicted infected when its profile is close enough to the selected host
        flagged = results["Cosine_Distance"] <= threshold
        results["Prediction"] = flagged.astype(int)

        grid_search_rows.append({
            "Scenario": scenario,
            "Threshold": threshold,
            "Unknown": int(unlabeled.sum()),
            "TP": int((known_infected & flagged).sum()),
            "FP": int((known_normal & flagged).sum()),
            "FN": int((known_infected & ~flagged).sum()),
            "TN": int((known_normal & ~flagged).sum()),
            # Unlabeled hosts that behave like the infected host
            "NEW_INFECTIONS": int((unlabeled & flagged).sum()),
        })

grid_search = pd.DataFrame(grid_search_rows, columns=["Scenario", "Threshold","Unknown","TP","FP","FN","TN","NEW_INFECTIONS"])

# Uncomment the print statement to see the complete grid search table
pd.set_option('display.max_rows', None)
# print(grid_search)

# Manual evaluation of results of grid search delivered an optimal threshold of 0.18
# For scenario 9 this threshold delivers 12 new infections at the cost of a couple false positives
# For the other scenarios it didn't really matter which threshold to choose
# np.isclose instead of ==: the thresholds are floats generated by np.arange
filter_threshold = grid_search.loc[np.isclose(grid_search.Threshold, chosen_threshold)]
filter_threshold.head()

Unnamed: 0,Scenario,Threshold,Unknown,TP,FP,FN,TN,NEW_INFECTIONS
18,9.0,0.18,19.0,9.0,5.0,1.0,1.0,12.0
43,10.0,0.18,9.0,2.0,1.0,1.0,5.0,0.0
68,11.0,0.18,19.0,3.0,1.0,0.0,5.0,7.0
93,12.0,0.18,14.0,3.0,5.0,0.0,1.0,6.0
