In [9]:
%reset -f
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import fingerprinting as finger
from nltk.util import ngrams
from collections import Counter
import itertools
from numpy import dot
from numpy.linalg import norm
from math import sqrt

from sklearn.model_selection import train_test_split

data9 = finger.load_data("data/capture20110817.binetflow.txt")
data10 = finger.load_data("data/capture20110818.binetflow.txt")
data11 = finger.load_data("data/capture20110818-2.binetflow.txt")
data12 = finger.load_data("data/capture20110819.binetflow.txt")
pdata9 = finger.pre_process(data9,9)
pdata10 = finger.pre_process(data10,10)
pdata11 = finger.pre_process(data11,11)
pdata12 = finger.pre_process(data12,12)

scenarios = [9, 10, 11, 12]
results = pd.DataFrame(columns=["Scenario","Fingerprint", "TP","FP","FN","TN","Accuracy","NEW_INFECTIONS"])

for scenario in scenarios:
    if scenario == 9:
        pdata = pdata9.copy()
    elif scenario == 10:
        pdata = pdata10.copy()
    elif scenario == 11:
        pdata = pdata11.copy()
    elif scenario == 12:
        pdata = pdata12.copy() 

    # Feature to use, including its optimal binsize, and discretized to be used for n-grams
    feature = "Protocol"
    nbins = 3

    discretized_data = pd.DataFrame()
    discretized_data["StartTime"] = pdata["StartTime"].copy()
    discretized_data["SourceAddress"] = pdata["SourceAddress"].copy()
    discretized_data["DestinationAddress"] = pdata["DestinationAddress"].copy()
    pdata["Protocol"] = finger.encode_feature(pdata["Protocol"])
    discretized_data[feature], binsedges_infected = finger.discretize_feature(pdata, feature, nbins, "kmeans")

    # Set ngram size and compute all possible 3-grams
    ngram = 3
    all_n_grams = list(itertools.product(*[['0','1','2'],['0','1','2'],['0','1','2']]))
    
    # Filter sources, label them, exclude the unknown hosts and copy to a set for validation purposes
    filtered_sources = discretized_data.copy()
    filtered_sources["Infected"] = filtered_sources.apply(lambda row : finger.label(row["SourceAddress"], scenario),axis=1)
    unknown_sources = filtered_sources.loc[filtered_sources.Infected == -1]
    filtered_sources = filtered_sources.loc[filtered_sources.Infected != -1]
    filtered_sources_unique = filtered_sources.groupby(['SourceAddress','Infected']).size().reset_index().rename(columns={0:'count'})
      
    # Split the set in a train and test part
    X_train, X_test, y_train, y_test = train_test_split(filtered_sources_unique.SourceAddress, filtered_sources_unique.Infected, test_size=0.3, random_state=41)

    # Create n-gram occurence count vectors
    ngram_counts_infected = [0]*len(all_n_grams)
    ngram_counts_benign = [0]*len(all_n_grams)

    # For each unique train source we count the occurences of the 3-grams. We add them to the corresponding 
    # count set depending on its label
    for i in range(len(X_train)):
        source = filtered_sources.loc[filtered_sources.SourceAddress == X_train.iloc[i]]

        n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
        n_grams_source_counts = [0]*len(all_n_grams)

        for j in range(len(n_grams_source.value_counts())):
            n_gram = n_grams_source.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[j])
            n_grams_source_counts[index_all_n_grams] = n_gram[j]

        if y_train.iloc[i] == 1:
            ngram_counts_infected = np.add(ngram_counts_infected, n_grams_source_counts)
        else:
            ngram_counts_benign = np.add(ngram_counts_benign, n_grams_source_counts)

    # From the train set we create a fingerprint that includes 3-grams that are not counted for benign hosts
    # and counted for infected host
    fingerprint = list()
    for i in range(len(all_n_grams)):
        if ngram_counts_infected[i] > 0 and ngram_counts_benign[i] == 0:
            fingerprint.append(i)

    # For each source in the test set we count the 3-gram occurences.
    # If all 3-grams that are in the fingerprint appear in the profile we assign the host as normal,
    # otherwise we assign the host as infected
    TP,FP,FN,TN = 0,0,0,0
    for i in range(len(X_test)):
        source = filtered_sources.loc[filtered_sources.SourceAddress == X_test.iloc[i]]
        true_label = y_test.iloc[i]

        n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
        n_grams_source_counts = [0]*len(all_n_grams)

        for j in range(len(n_grams_source.value_counts())):
            n_gram = n_grams_source.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[j])
            n_grams_source_counts[index_all_n_grams] = n_gram[j]

        # We look if all 3-grams that do not occur in benign hosts do not occur either
        count = 0
        for fing in fingerprint:
            if n_grams_source_counts[fing] == 0:
                count =+ 1

        predicted_label = 1
        if count == len(fingerprint):
            predicted_label = 0

        # Compute the evaluation metrics for the whole test set
        if predicted_label == true_label and true_label == 1:
            TP += 1
        elif predicted_label == 1 and true_label == 0:
            FP += 1
        elif predicted_label == 0 and true_label == 1:
            FN += 1
        elif predicted_label == 0 and true_label == 0:
            TN += 1

    accuracy = (TP+TN)/(TP+TN+FP+FN)
    
    # Apply the same logic to unknown hosts to see how many new infections can be detected with the determined
    # fingerprint. We label a source as infected if not all occurences of the 3-grams in the fingerprint are zero
    unknown_sources_unique = unknown_sources.groupby(['SourceAddress']).size().reset_index().rename(columns={0:'count'})
    new_infections = 0

    for i in range(len(unknown_sources_unique)):
        source = unknown_sources.loc[unknown_sources.SourceAddress == unknown_sources_unique.SourceAddress.iloc[i]]

        if len(source) > 2:

            n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
            n_grams_source_counts = [0]*len(all_n_grams)

            for j in range(len(n_grams_source.value_counts())):
                n_gram = n_grams_source.value_counts()
                index_all_n_grams = all_n_grams.index(n_gram.index[j])
                n_grams_source_counts[index_all_n_grams] = n_gram[j]

            # We look if all 3-grams that do not occur in benign hosts do not occur either
            count = 0
            for fing in fingerprint:
                if n_grams_source_counts[fing] == 0:
                    count =+ 1

            if count != len(fingerprint):
                new_infections += 1
    
    fingerprint_ngrams = [all_n_grams[i] for i in fingerprint]
    results = results.append({"Scenario": scenario, "Fingerprint": fingerprint_ngrams, "TP": TP,"FP": FP,"FN": FN,"TN": TN,"Accuracy": accuracy,"NEW_INFECTIONS": new_infections}, ignore_index=True)

results.head()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Unnamed: 0,Scenario,Fingerprint,TP,FP,FN,TN,Accuracy,NEW_INFECTIONS
0,9,"[(0, 2, 1), (1, 2, 0)]",4,1,0,0,0.8,12
1,10,[],0,0,4,1,0.2,0
2,11,"[(1, 0, 0), (2, 1, 0)]",2,1,0,0,0.666667,0
3,12,"[(0, 0, 0), (0, 2, 0), (1, 2, 2), (2, 1, 2), (...",2,1,0,0,0.666667,6


Unnamed: 0,Scenario,Fingerprint,TP,FP,FN,TN,Accuracy,NEW_INFECTIONS
0,9,"[(0, 2, 1), (1, 2, 0)]",4,1,0,0,0.8,12
1,10,[],0,0,4,1,0.2,0
2,11,"[(1, 0, 0), (2, 1, 0)]",2,1,0,0,0.666667,0
3,12,"[(0, 0, 0), (0, 2, 0), (1, 2, 2), (2, 1, 2), (...",2,1,0,0,0.666667,6


In [None]:
[4, 5, 7, 10, 15, 19]
TP:  2 FP: 3 FN: 0 TN: 0 Accuracy: 0.4 New Infections 12
[2, 15, 18]
TP:  3 FP: 2 FN: 0 TN: 0 Accuracy: 0.6 New Infections 10
[0, 9, 21]
TP:  1 FP: 2 FN: 0 TN: 0 Accuracy: 0.3333333333333333 New Infections 0
[17, 23, 25]
TP:  2 FP: 1 FN: 0 TN: 0 Accuracy: 0.6666666666666666 New Infections 6

[7, 15]
TP:  3 FP: 2 FN: 0 TN: 0 Accuracy: 0.6 New Infections 12
[24]
TP:  1 FP: 2 FN: 0 TN: 2 Accuracy: 0.6 New Infections 0
[9, 21]
TP:  2 FP: 1 FN: 0 TN: 0 Accuracy: 0.6666666666666666 New Infections 0
[17, 23, 25]
TP:  1 FP: 2 FN: 0 TN: 0 Accuracy: 0.3333333333333333 New Infections 6
                    
[7, 15]
TP:  2 FP: 3 FN: 0 TN: 0 Accuracy: 0.4 New Infections 12
[]
TP:  0 FP: 0 FN: 4 TN: 1 Accuracy: 0.2 New Infections 0
[9, 21]
TP:  2 FP: 1 FN: 0 TN: 0 Accuracy: 0.6666666666666666 New Infections 0
[0, 6, 17, 23, 25]
TP:  1 FP: 2 FN: 0 TN: 0 Accuracy: 0.3333333333333333 New Infections 6

In [None]:

scenarios = [9, 10, 11, 12]
grid_search = pd.DataFrame(columns=["Scenario", "Threshold","Unknown","TP","FP","FN","TN","Accuracy","NEW_INFECTIONS"])

for scenario in scenarios:
    if scenario == 9:
        pdata = pdata9.copy()
    elif scenario == 10:
        pdata = pdata10.copy()
    elif scenario == 11:
        pdata = pdata11.copy()
    elif scenario == 12:
        pdata = pdata12.copy() 

    # Feature to use, including its optimal binsize, and discretized to be used for n-grams
    feature = "Protocol"
    nbins = 3

    discretized_data = pd.DataFrame()
    discretized_data["StartTime"] = pdata["StartTime"].copy()
    discretized_data["SourceAddress"] = pdata["SourceAddress"].copy()
    discretized_data["DestinationAddress"] = pdata["DestinationAddress"].copy()
    pdata["Protocol"] = finger.encode_feature(pdata["Protocol"])
    discretized_data[feature], binsedges_infected = finger.discretize_feature(pdata, feature, nbins, "kmeans")

    # Select source with most entries in given dataset to deliver most reliable n-gram profile
    # Determine list of all possible n-grams for selected host
    # This is the 3-gram profile of this host
    selected_source = finger.select_infected_host(pdata,scenario)
    filtered_source = discretized_data.loc[discretized_data.SourceAddress == selected_source]

    ngram = 3
    all_n_grams = list(itertools.product(*[['0','1','2'],['0','1','2'],['0','1','2']]))

    n_grams = pd.Index(list(ngrams(filtered_source[feature].astype(str),ngram)))
    n_grams_counts = [0]*len(all_n_grams)

    for i in range(len(n_grams.value_counts())):
        n_gram = n_grams.value_counts()
        index_all_n_grams = all_n_grams.index(n_gram.index[i])
        n_grams_counts[index_all_n_grams] = n_gram[i]

    # Group netflows by source address (host) because we are interested in modeling per-host behavior
    unique_sources = pdata.groupby(["SourceAddress"]).size().reset_index()
    unique_sources["Infected"] = unique_sources.apply(lambda row : finger.label(row["SourceAddress"], scenario),axis=1)

    # Calculate the distance for every other source compared to the selected source/host
    distances = [100]*len(unique_sources)
    n_gram_count_list = list()
    
    sum_ngram_counts_infected = [0]*len(all_n_grams)
    sum_ngram_counts_benign_train = [0]*len(all_n_grams)
    sum_ngram_counts_benign_test = [0]*len(all_n_grams)
    sum_ngram_counts_unknown = [0]*len(all_n_grams)
    
    # Split benign hosts in a train and test set
    benign_hosts = unique_sources.loc[unique_sources.Infected == 0]
    X_train, X_test, y_train, y_test = train_test_split(benign_hosts.SourceAddress, benign_hosts.Infected, test_size=0.3)
    
    # Obtain all flows from the benign hosts in the train set
    benign_hosts_flows_train = discretized_data.loc[discretized_data.SourceAddress.isin(X_train)]
    
    for i in range(len(X_train)):
        source = benign_hosts_flows_train.loc[benign_hosts_flows_train.SourceAddress == X_train.iloc[i]]
        
        n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
        n_grams_source_counts = [0]*len(all_n_grams)
        
        for j in range(len(n_grams_source.value_counts())):
            n_gram = n_grams_source.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[j])
            n_grams_source_counts[index_all_n_grams] = n_gram[j]
        
        sum_ngram_counts_benign_train = np.add(sum_ngram_counts_benign_train,n_grams_source_counts)
    
    # Find the n-gram indices
    benign_no_count = list()
    
    for i in range(len(sum_ngram_counts_benign_train)):
        if sum_ngram_counts_benign_train[i] == 0:
            benign_no_count.append(i)
    
    print(benign_no_count, [all_n_grams[i] for i in benign_no_count])
    
    # Obtain all infects hosts and flows
    infected_hosts = unique_sources.loc[(unique_sources.Infected == 1) & (unique_sources.SourceAddress != selected_source)]
    infected_hosts_flows = discretized_data.loc[discretized_data.SourceAddress.isin(np.array(infected_hosts.SourceAddress))]
    
    # For each infected host we compute the occurence counts of all 3-grams
    for i in range(len(infected_hosts)):
        source = infected_hosts_flows.loc[infected_hosts_flows.SourceAddress == infected_hosts.SourceAddress.iloc[i]]
        
        n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
        n_grams_source_counts = [0]*len(all_n_grams)
        
        for j in range(len(n_grams_source.value_counts())):
            n_gram = n_grams_source.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[j])
            n_grams_source_counts[index_all_n_grams] = n_gram[j]
            
        sum_ngram_counts_infected = np.add(sum_ngram_counts_infected,n_grams_source_counts)
        
    print(sum_ngram_counts_infected)

    # We check whether the 3-gram that do not occur in the train set of benign hosts occur in all infected hosts.
    # If it occurs we assign the 3-gram to the fingerprint
    fingerprint_ngram_indices = list()

    for i in benign_no_count:
        if sum_ngram_counts_infected[i] > 0:
            fingerprint_ngram_indices.append(i)
            
    print("Fingerprint",fingerprint_ngram_indices)
        
    # For each source in the test set we check whether the 3-gram in the fingerprint exist.
    # If a 3-gram exists, we label the host as 'infected' and it is thus a false positive
    # If not, it is labeled as a true negative since we are only dealing with benign hosts tested for infection
    benign_hosts_flows_test = discretized_data.loc[discretized_data.SourceAddress.isin(X_test)]
    
    for i in range(len(X_test)):
        source = benign_hosts_flows_test.loc[benign_hosts_flows_test.SourceAddress == X_test.iloc[i]]
        
        n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
        n_grams_source_counts = [0]*len(all_n_grams)
        
        for j in range(len(n_grams_source.value_counts())):
            n_gram = n_grams_source.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[j])
            n_grams_source_counts[index_all_n_grams] = n_gram[j]
        
        sum_ngram_counts_benign_test = np.add(sum_ngram_counts_benign_test,n_grams_source_counts)
        
        for fingerprint in fingerprint_ngram_indices:
            if sum_ngram_counts_benign_test[fingerprint] == 0:
                print("TRUE NEGATIVE")
            else:
                print("FALSE POSITIVE")
                
    # Obtain all infects hosts and flows
    unknown_hosts = unique_sources.loc[unique_sources.Infected == -1]
    unknown_hosts_flows = discretized_data.loc[discretized_data.SourceAddress.isin(np.array(unknown_hosts.SourceAddress))]
    
    for i in range(len(unknown_hosts)):
        source = unknown_hosts_flows.loc[unknown_hosts_flows.SourceAddress == unknown_hosts.SourceAddress.iloc[i]]
        
        if len(source) > 2:
            n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
            n_grams_source_counts = [0]*len(all_n_grams)
            
            for j in range(len(n_grams_source.value_counts())):
                n_gram = n_grams_source.value_counts()
                index_all_n_grams = all_n_grams.index(n_gram.index[j])
                n_grams_source_counts[index_all_n_grams] = n_gram[j]

            sum_ngram_counts_unknown = np.add(sum_ngram_counts_unknown,n_grams_source_counts)
            
            for fingerprint in fingerprint_ngram_indices:
                if sum_ngram_counts_unknown[fingerprint] == 0:
                    print("UNKNOWN == NORMAL")
                else:
                    print("UNKNOWN == INFECTED")

   







In [None]:
    # Group netflows by source address (host) because we are interested in modeling per-host behavior
    unique_sources = pdata.groupby(["SourceAddress"]).size().reset_index()
    unique_sources["Infected"] = unique_sources.apply(lambda row : finger.label(row["SourceAddress"], scenario),axis=1)

    # Calculate the distance for every other source compared to the selected source/host
    distances = [100]*len(unique_sources)
    n_gram_count_list = list()
    
    sum_ngram_counts_infected = [0]*len(all_n_grams)
    sum_ngram_counts_benign_train = [0]*len(all_n_grams)
    sum_ngram_counts_benign_test = [0]*len(all_n_grams)
    sum_ngram_counts_unknown = [0]*len(all_n_grams)
    
    # Split benign hosts in a train and test set
    benign_hosts = unique_sources.loc[unique_sources.Infected == 0]
    X_train, X_test, y_train, y_test = train_test_split(benign_hosts.SourceAddress, benign_hosts.Infected, test_size=0.3)
    
    # Obtain all flows from the benign hosts in the train set
    benign_hosts_flows_train = discretized_data.loc[discretized_data.SourceAddress.isin(X_train)]
    
    for i in range(len(X_train)):
        source = benign_hosts_flows_train.loc[benign_hosts_flows_train.SourceAddress == X_train.iloc[i]]
        
        n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
        n_grams_source_counts = [0]*len(all_n_grams)
        
        for j in range(len(n_grams_source.value_counts())):
            n_gram = n_grams_source.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[j])
            n_grams_source_counts[index_all_n_grams] = n_gram[j]
        
        sum_ngram_counts_benign_train = np.add(sum_ngram_counts_benign_train,n_grams_source_counts)
    
    # Find the n-gram indices
    benign_no_count = list()
    
    for i in range(len(sum_ngram_counts_benign_train)):
        if sum_ngram_counts_benign_train[i] == 0:
            benign_no_count.append(i)
    
    print(benign_no_count, [all_n_grams[i] for i in benign_no_count])
    
    # Obtain all infects hosts and flows
    infected_hosts = unique_sources.loc[(unique_sources.Infected == 1) & (unique_sources.SourceAddress != selected_source)]
    infected_hosts_flows = discretized_data.loc[discretized_data.SourceAddress.isin(np.array(infected_hosts.SourceAddress))]
    
    # For each infected host we compute the occurence counts of all 3-grams
    for i in range(len(infected_hosts)):
        source = infected_hosts_flows.loc[infected_hosts_flows.SourceAddress == infected_hosts.SourceAddress.iloc[i]]
        
        n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
        n_grams_source_counts = [0]*len(all_n_grams)
        
        for j in range(len(n_grams_source.value_counts())):
            n_gram = n_grams_source.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[j])
            n_grams_source_counts[index_all_n_grams] = n_gram[j]
            
        sum_ngram_counts_infected = np.add(sum_ngram_counts_infected,n_grams_source_counts)
        
    print(sum_ngram_counts_infected)

    # We check whether the 3-gram that do not occur in the train set of benign hosts occur in all infected hosts.
    # If it occurs we assign the 3-gram to the fingerprint
    fingerprint_ngram_indices = list()

    for i in benign_no_count:
        if sum_ngram_counts_infected[i] > 0:
            fingerprint_ngram_indices.append(i)
            
    print("Fingerprint",fingerprint_ngram_indices)
        
    # For each source in the test set we check whether the 3-gram in the fingerprint exist.
    # If a 3-gram exists, we label the host as 'infected' and it is thus a false positive
    # If not, it is labeled as a true negative since we are only dealing with benign hosts tested for infection
    benign_hosts_flows_test = discretized_data.loc[discretized_data.SourceAddress.isin(X_test)]
    
    for i in range(len(X_test)):
        source = benign_hosts_flows_test.loc[benign_hosts_flows_test.SourceAddress == X_test.iloc[i]]
        
        n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
        n_grams_source_counts = [0]*len(all_n_grams)
        
        for j in range(len(n_grams_source.value_counts())):
            n_gram = n_grams_source.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[j])
            n_grams_source_counts[index_all_n_grams] = n_gram[j]
        
        sum_ngram_counts_benign_test = np.add(sum_ngram_counts_benign_test,n_grams_source_counts)
        
        for fingerprint in fingerprint_ngram_indices:
            if sum_ngram_counts_benign_test[fingerprint] == 0:
                print("TRUE NEGATIVE")
            else:
                print("FALSE POSITIVE")
                
    # Obtain all infects hosts and flows
    unknown_hosts = unique_sources.loc[unique_sources.Infected == -1]
    unknown_hosts_flows = discretized_data.loc[discretized_data.SourceAddress.isin(np.array(unknown_hosts.SourceAddress))]
    
    for i in range(len(unknown_hosts)):
        source = unknown_hosts_flows.loc[unknown_hosts_flows.SourceAddress == unknown_hosts.SourceAddress.iloc[i]]
        
        if len(source) > 2:
            n_grams_source = pd.Index(list(ngrams(source[feature].astype(str),ngram)))
            n_grams_source_counts = [0]*len(all_n_grams)
            
            for j in range(len(n_grams_source.value_counts())):
                n_gram = n_grams_source.value_counts()
                index_all_n_grams = all_n_grams.index(n_gram.index[j])
                n_grams_source_counts[index_all_n_grams] = n_gram[j]

            sum_ngram_counts_unknown = np.add(sum_ngram_counts_unknown,n_grams_source_counts)
            
            for fingerprint in fingerprint_ngram_indices:
                if sum_ngram_counts_unknown[fingerprint] == 0:
                    print("UNKNOWN == NORMAL")
                else:
                    print("UNKNOWN == INFECTED")

   




35


In [3]:
scenarios = [9, 10, 11, 12]
grid_search = pd.DataFrame(columns=["Scenario", "Threshold","Unknown","TP","FP","FN","TN","Accuracy","NEW_INFECTIONS"])

for scenario in scenarios:
    if scenario == 9:
        pdata = pdata9.copy()
    elif scenario == 10:
        pdata = pdata10.copy()
    elif scenario == 11:
        pdata = pdata11.copy()
    elif scenario == 12:
        pdata = pdata12.copy() 

    # Feature to use, including its optimal binsize, and discretized to be used for n-grams
    feature = "Protocol"
    nbins = 3

    discretized_data = pd.DataFrame()
    discretized_data["StartTime"] = pdata["StartTime"].copy()
    discretized_data["SourceAddress"] = pdata["SourceAddress"].copy()
    discretized_data["DestinationAddress"] = pdata["DestinationAddress"].copy()
    pdata["Protocol"] = finger.encode_feature(pdata["Protocol"])
    discretized_data[feature], binsedges_infected = finger.discretize_feature(pdata, feature, nbins, "kmeans")

    # Select source with most entries in given dataset to deliver most reliable n-gram profile
    # Determine list of all possible n-grams for selected host
    # This is the 3-gram profile of this host
    selected_source = finger.select_infected_host(pdata,scenario)
    filtered_source = discretized_data.loc[discretized_data.SourceAddress == selected_source]

    ngram = 3
    all_n_grams = list(itertools.product(*[['0','1','2'],['0','1','2'],['0','1','2']]))

    n_grams = pd.Index(list(ngrams(filtered_source[feature].astype(str),ngram)))
    n_grams_counts = [0]*len(all_n_grams)

    for i in range(len(n_grams.value_counts())):
        n_gram = n_grams.value_counts()
        index_all_n_grams = all_n_grams.index(n_gram.index[i])
        n_grams_counts[index_all_n_grams] = n_gram[i]

    # Group netflows by source address (host) because we are interested in modeling per-host behavior
    unique_sources = pdata.groupby(["SourceAddress"]).size().reset_index()

    # Calculate the distance for every other source compared to the selected source/host
    distances = [100]*len(unique_sources)
    n_gram_count_list = list()

    for i in range(len(unique_sources)):
        source = unique_sources.iloc[i]

        filtered = discretized_data.loc[discretized_data.SourceAddress == source.SourceAddress]
        n_grams_source = pd.Index(list(ngrams(filtered[feature].astype(str),ngram)))
        n_grams_source_counts = [0]*len(all_n_grams)

        for j in range(len(n_grams_source.value_counts())):
            n_gram = n_grams_source.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[j])
            n_grams_source_counts[index_all_n_grams] = n_gram[j]


        # Cosine distance between selected host and other hosts
        if np.sum(n_grams_source_counts) == 0:
            distances[i] = 1
        else:
            distances[i] = 1-dot(n_grams_counts, n_grams_source_counts)/(norm(n_grams_counts)*norm(n_grams_source_counts))


    # Create an overview of the results including the original label and predicted label
    results = pd.DataFrame(columns=["SourceAddress","Cosine_Distance","Label","Prediction"])

    for i in range(len(distances)):
        distance = distances[i]
        source = unique_sources.SourceAddress.iloc[i]
        infected = finger.is_infected(source, scenario)
        normal = finger.is_normal(source)
        label = -1

        if infected == True:
            label = 1
        elif normal == True:
            label = 0

        results = results.append({"SourceAddress": source, "Cosine_Distance": distance, "Label": label, "Prediction": ''}, ignore_index=True)
    
    # For each threshold ranging from 0 to 0.25, in steps of 0.01 we check the results to be able to choose
    # good threshold. This is only based on 'training' scenario 9, the other sets will use the threshold 
    # that has been chosen optimal in scenario 9
    thresholds = [0.18]
    for threshold in thresholds:
        results.Prediction = results.Cosine_Distance.apply(lambda x: 1 if x <= threshold else 0)

        TP, FP, FN, TN, new_infections, unknown = 0, 0, 0, 0, 0, 0

        for i in range(len(results)):
            row = results.iloc[i]
            if row.Label == row.Prediction and row.Label == 1:
                TP += 1
            elif row.Label == 0 and row.Prediction == 1:
                FP += 1
            elif row.Label == 0 and row.Prediction == 0:
                TN += 1
            elif row.Label == 1 and row.Prediction == 0:
                FN += 1
                
            # Number of unlabeled hosts
            if row.Label == -1:
                unknown += 1
                
            # Newly identified infected host
            if row.Label == -1 and row.Prediction == 1:
                new_infections += 1
                
        accuracy = (TP+TN)/(TP+FP+FN+TN)
                
        grid_search = grid_search.append({"Scenario": scenario, "Threshold": threshold, "Unknown": unknown, "TP": TP, "FP": FP, "FN": FN, "TN": TN, "Accuracy": accuracy,"NEW_INFECTIONS": new_infections}, ignore_index=True)

# Uncheck print statement to see all results of results
pd.set_option('display.max_rows', None)
print(grid_search)
        
# Manual evaluation of results of grid search delivered an optimal threshold of 0.18
# For scenario 9 this threshold delivers 12 new infections at the cost of a couple false positives
# For the other scenarios it didn't really matter which threshold to choose
filter_threshold = grid_search.loc[grid_search.Threshold == 0.18]
filter_threshold.head()

   Scenario  Threshold  Unknown    TP   FP   FN   TN  Accuracy  NEW_INFECTIONS
0       9.0       0.18     19.0   9.0  5.0  1.0  1.0  0.625000            12.0
1      10.0       0.18     12.0  10.0  1.0  0.0  5.0  0.937500             0.0
2      11.0       0.18      9.0   2.0  1.0  1.0  5.0  0.777778             0.0
3      12.0       0.18     14.0   3.0  5.0  0.0  1.0  0.444444             6.0


Unnamed: 0,Scenario,Threshold,Unknown,TP,FP,FN,TN,Accuracy,NEW_INFECTIONS
0,9.0,0.18,19.0,9.0,5.0,1.0,1.0,0.625,12.0
1,10.0,0.18,12.0,10.0,1.0,0.0,5.0,0.9375,0.0
2,11.0,0.18,9.0,2.0,1.0,1.0,5.0,0.777778,0.0
3,12.0,0.18,14.0,3.0,5.0,0.0,1.0,0.444444,6.0


### Fingerprinting

In [4]:
import operator
from math import nan
import profiling as prof
from sklearn.model_selection import train_test_split

# For each scenario pick the infected host with the most unique n-gram.
# Then check which n-gram does not occur in the non-infected host.
# Pick top k n-grams that satisfy above criteria, check for other hosts

scenarios = [9, 10, 11, 12]
pd_results = pd.DataFrame()
result_scenario = []
result_ip = []
result_TP = []
result_FP = []
result_TN = []
result_FN = []
result_newinf = []
result_acc = []
result_used_ngrams = []
for scenario in scenarios:
    if scenario == 9:
        pdata = pdata9.copy()
    elif scenario == 10:
        pdata = pdata10.copy()
    elif scenario == 11:
        pdata = pdata11.copy()
    elif scenario == 12:
        pdata = pdata12.copy() 

    # Feature to use, including its optimal binsize, and discretized to be used for n-grams
    feature = "Protocol"
    nbins = 3

    discretized_data = pd.DataFrame()
    discretized_data["StartTime"] = pdata["StartTime"].copy()
    discretized_data["SourceAddress"] = pdata["SourceAddress"].copy()
    discretized_data["DestinationAddress"] = pdata["DestinationAddress"].copy()
    pdata["Protocol"] = prof.encode_feature(pdata["Protocol"])
    discretized_data[feature], binsedges_infected = prof.discretize_feature(pdata, feature, nbins, "kmeans")

    # Select source with most entries in given dataset to deliver most reliable n-gram profile
    selected_sources = prof.select_all_infected_host(pdata,scenario)
    # ONLY FOR TEST LOOP OVER ALL INFECTED IPS     
    for selected_source in selected_sources:
        result_scenario.append(scenario)
        result_ip.append(selected_source)

#         selected_source = prof.select_infected_host(pdata,scenario)
        filtered_source = discretized_data.loc[discretized_data.SourceAddress == selected_source]

        ngram = 3
        all_n_grams = list(itertools.product(*[['0','1','2'],['0','1','2'],['0','1','2']]))

        n_grams = pd.Index(list(ngrams(filtered_source[feature].astype(str),ngram)))
        n_grams_counts = [0]*len(all_n_grams)
        for i in range(len(n_grams.value_counts())):
            n_gram = n_grams.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[i])
            n_grams_counts[index_all_n_grams] = n_gram[i]

        # Select all benign traffic(Non-infected)
        non_infected_host = prof.select_non_infected_host(pdata)
        train, test = train_test_split(non_infected_host, test_size=0.3)

        unique_sources_benign = pd.DataFrame()
        unique_sources_benign['SourceAddress'] = train

        # Sum all counts of all benign host(if n-gram == 0, does not occur in benign data)
        sum_ngrams_benign = [0]*len(all_n_grams)
        for i in range(len(unique_sources_benign)):
            source = unique_sources_benign.iloc[i]

            filtered = discretized_data.loc[discretized_data.SourceAddress == source.SourceAddress]
            n_grams_source = pd.Index(list(ngrams(filtered[feature].astype(str),ngram)))
            n_grams_source_counts = [0]*len(all_n_grams)

            for j in range(len(n_grams_source.value_counts())):
                n_gram = n_grams_source.value_counts()
                index_all_n_grams = all_n_grams.index(n_gram.index[j])
                n_grams_source_counts[index_all_n_grams] = n_gram[j]

            # Check which n-gram does NOT occur in benign data
            sum_ngrams_benign = np.add(sum_ngrams_benign, n_grams_source_counts)

        # Pick all ngrams that does not occur in benign traffic     
        n_grams_not_in_benign = np.array(sum_ngrams_benign)
        n_grams_not_in_benign = np.where(sum_ngrams_benign == 0)[0]

        # Pick all ngrams that occur in infected traffic     
        n_grams_in_infected = np.array(n_grams_counts)
        n_grams_in_infected = np.where(n_grams_in_infected > 0)[0]

        # Pick the ngrams that does not occur in benign but occur in infected data     
        possible_fingerprints = np.intersect1d(n_grams_in_infected, n_grams_not_in_benign)
        result_used_ngrams.append(possible_fingerprints)



        ### PART 2 (CHECK WITH THE NGRAMS FIND ABOVE IF ANY (NEW) INFECTED HOSTS COULD BE DETECTED) ####    

        # Group netflows by source address (host) because we are interested in modeling per-host behavior
        unique_sources = pdata.groupby(["SourceAddress"]).size().reset_index()
        pd_unique_source = pd.DataFrame()

        # Get the data of the infected host without selected infected host     
        all_infected_hosts = prof.select_all_infected_host(pdata, scenario)
        # Remove the selected host     
        all_infected_hosts.remove(selected_source)

        # Merge test data with all infected host except selected
        test_data = np.union1d(test,all_infected_hosts)
        pd_unique_source['SourceAddress'] = test_data

        finger_print_exists = [100]*len(pd_unique_source)

        # Check if fingerprint is contained in the data of the host
        for i in range(len(pd_unique_source)):
            source = pd_unique_source.iloc[i]

            filtered = discretized_data.loc[discretized_data.SourceAddress == source.SourceAddress]
            n_grams_source = pd.Index(list(ngrams(filtered[feature].astype(str),ngram)))
            n_grams_source_counts = [0]*len(all_n_grams)

            for j in range(len(n_grams_source.value_counts())):
                n_gram = n_grams_source.value_counts()
                index_all_n_grams = all_n_grams.index(n_gram.index[j])
                n_grams_source_counts[index_all_n_grams] = n_gram[j]

            # Pick the indexes of the ngram that occur          
            n_grams_in_source = np.array(n_grams_source_counts)
            n_grams_in_source = np.where(n_grams_in_source > 0)[0]

            # For each possible fingerprint check if it exists in the data
            for fingerprint in possible_fingerprints:
                # Raise alarm         
                if fingerprint in n_grams_in_source:
                    finger_print_exists[i] = 1

        # # Create an overview of the results including the original label and predicted label
        results = pd.DataFrame(columns=["SourceAddress","Fingerprint_exists","Label","Prediction"])

        for i in range(len(finger_print_exists)):
            finger_print = finger_print_exists[i]
            source = pd_unique_source.SourceAddress.iloc[i]
            infected = prof.is_infected(source, scenario)
            normal = prof.is_normal(source)
            label = -1

            if infected == True:
                label = 1
            elif normal == True:
                label = 0

            results = results.append({"SourceAddress": source, "Fingerprint_exists": finger_print, "Label": label, "Prediction": finger_print}, ignore_index=True)
        TP, FP, FN, TN, new_infections, unknown = 0, 0, 0, 0, 0, 0

        for i in range(len(results)):
            row = results.iloc[i]
            if row.Label == row.Prediction and row.Label == 1:
                TP += 1
            elif row.Label == 0 and row.Prediction == 1:
                FP += 1
            elif row.Label == 0 and row.Prediction == 0:
                TN += 1
            elif row.Label == 1 and row.Prediction == 0:
                FN += 1

            # Number of unlabeled hosts
            if row.Label == -1:
                unknown += 1

            # Newly identified infected host
            if row.Label == -1 and row.Prediction == 1:
                new_infections += 1
        if (TP+FP+FN+TN) == 0:
            accuracy = nan
        else: accuracy = (TP+TN)/(TP+FP+FN+TN)
        result_TP.append(TP)
        result_FP.append(FP)
        result_TN.append(TN)
        result_FN.append(FN)
        result_newinf.append(new_infections)
        result_acc.append(accuracy)

pd_results['Scenario'] = result_scenario
pd_results['Infected IP used'] = result_ip
pd_results['Used Ngrams'] = result_used_ngrams
pd_results['TP'] = result_TP
pd_results['FP'] = result_FP
pd_results['TN'] = result_TN
pd_results['FN'] = result_FN
pd_results['New infected'] = result_newinf
pd_results['Accuracy'] = result_acc
pd_results.head(50)


Unnamed: 0,Scenario,Infected IP used,Used Ngrams,TP,FP,TN,FN,New infected,Accuracy
0,9,147.32.84.165,[15],2,0,0,0,0,1.0
1,9,147.32.84.191,"[7, 15]",2,0,0,0,0,1.0
2,9,147.32.84.192,[],0,0,0,0,0,
3,9,147.32.84.193,[],0,0,0,0,0,
4,9,147.32.84.204,[],0,0,0,0,0,
5,9,147.32.84.205,[],0,0,0,0,0,
6,9,147.32.84.206,"[4, 10]",3,1,0,0,0,0.75
7,9,147.32.84.207,[],0,0,0,0,0,
8,9,147.32.84.208,[15],2,0,0,0,0,1.0
9,9,147.32.84.209,[],0,0,0,0,0,


The profiles found by the "Botnet profiling task" are here used to detect new infections by fingerprinting. For each scenario, the infected host with the most entries in the dataset is selected since this host has the most reliable n-gram profile. Then the benign data is split into a training and test set. The training set is used to learn to look for the occurrence of an n-gram that does not occur in any benign traffic. After that, we check if the selected fingerprint from the training data is present among all other infected hosts and the generate test set of the benign hosts.

Above the results of the fingerprinting algorithm can be seen. ...
 
Compared to the botnet profiling task, ...

