In [1]:
%reset -f
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import fingerprinting as finger
from nltk.util import ngrams
from collections import Counter
import itertools
from numpy import dot
from numpy.linalg import norm
from math import sqrt

data9 = finger.load_data("data/capture20110817.binetflow")
data10 = finger.load_data("data/capture20110818.binetflow")
data11 = finger.load_data("data/capture20110818-2.binetflow")
data12 = finger.load_data("data/capture20110819.binetflow")
pdata9 = finger.pre_process(data9,9)
pdata10 = finger.pre_process(data10,10)
pdata11 = finger.pre_process(data11,11)
pdata12 = finger.pre_process(data12,12)

In [2]:
print(pdata9.Infected.value_counts())
pdata9.head()

 1    184987
 0     29890
-1        77
Name: Infected, dtype: int64


Unnamed: 0,StartTime,Duration,Protocol,SourceAddress,SourcePort,Direction,DestinationAddress,DestinationPort,TotalPackets,TotalBytes,SourceBytes,Date,Infected
122,2011/08/17 12:01:01.799805,1752.578735,TCP,147.32.84.134,46402,<?>,69.63.189.16,443,467,274517,79210,2011/08/17 12:01:01.799805,0
209,2011/08/17 12:01:01.980445,0.0,UDP,147.32.80.9,53,->,147.32.84.59,33718,1,130,130,2011/08/17 12:01:01.980445,0
264,2011/08/17 12:01:02.176742,1532.028076,TCP,147.32.84.170,34004,<?>,64.12.73.165,443,71,4556,2154,2011/08/17 12:01:02.176742,0
265,2011/08/17 12:01:02.176750,3570.125732,TCP,147.32.84.170,39463,<?>,64.12.73.133,443,117,7020,3480,2011/08/17 12:01:02.176750,0
266,2011/08/17 12:01:02.176765,3570.125,TCP,147.32.84.170,40828,<?>,205.188.11.14,443,519,74059,19656,2011/08/17 12:01:02.176765,0


In [3]:
scenarios = [9]
grid_search = pd.DataFrame(columns=["Scenario", "Threshold","Unknown","TP","FP","FN","TN","Accuracy","NEW_INFECTIONS"])

for scenario in scenarios:
    if scenario == 9:
        pdata = pdata9.copy()
    elif scenario == 10:
        pdata = pdata10.copy()
    elif scenario == 11:
        pdata = pdata11.copy()
    elif scenario == 12:
        pdata = pdata12.copy() 

    # Feature to use, including its optimal binsize, and discretized to be used for n-grams
    feature = "Protocol"
    nbins = 3

    discretized_data = pd.DataFrame()
    discretized_data["StartTime"] = pdata["StartTime"].copy()
    discretized_data["SourceAddress"] = pdata["SourceAddress"].copy()
    discretized_data["DestinationAddress"] = pdata["DestinationAddress"].copy()
    pdata["Protocol"] = finger.encode_feature(pdata["Protocol"])
    discretized_data[feature], binsedges_infected = finger.discretize_feature(pdata, feature, nbins, "kmeans")

    # Select source with most entries in given dataset to deliver most reliable n-gram profile
    # Determine list of all possible n-grams for selected host
    # This is the 3-gram profile of this host
    selected_source = finger.select_infected_host(pdata,scenario)
    filtered_source = discretized_data.loc[discretized_data.SourceAddress == selected_source]

    ngram = 3
    all_n_grams = list(itertools.product(*[['0','1','2'],['0','1','2'],['0','1','2']]))

    n_grams = pd.Index(list(ngrams(filtered_source[feature].astype(str),ngram)))
    n_grams_counts = [0]*len(all_n_grams)

    for i in range(len(n_grams.value_counts())):
        n_gram = n_grams.value_counts()
        index_all_n_grams = all_n_grams.index(n_gram.index[i])
        n_grams_counts[index_all_n_grams] = n_gram[i]

    # Group netflows by source address (host) because we are interested in modeling per-host behavior
    unique_sources = pdata.groupby(["SourceAddress"]).size().reset_index()

    # Calculate the distance for every other source compared to the selected source/host
    distances = [100]*len(unique_sources)
    n_gram_count_list = list()

    for i in range(len(unique_sources)):
        source = unique_sources.iloc[i]

        filtered = discretized_data.loc[discretized_data.SourceAddress == source.SourceAddress]
        n_grams_source = pd.Index(list(ngrams(filtered[feature].astype(str),ngram)))
        n_grams_source_counts = [0]*len(all_n_grams)

        for j in range(len(n_grams_source.value_counts())):
            n_gram = n_grams_source.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[j])
            n_grams_source_counts[index_all_n_grams] = n_gram[j]


        # Cosine distance between selected host and other hosts
        if np.sum(n_grams_source_counts) == 0:
            distances[i] = 1
        else:
            distances[i] = 1-dot(n_grams_counts, n_grams_source_counts)/(norm(n_grams_counts)*norm(n_grams_source_counts))


    # Create an overview of the results including the original label and predicted label
    results = pd.DataFrame(columns=["SourceAddress","Cosine_Distance","Label","Prediction"])

    for i in range(len(distances)):
        distance = distances[i]
        source = unique_sources.SourceAddress.iloc[i]
        infected = finger.is_infected(source, scenario)
        normal = finger.is_normal(source)
        label = -1

        if infected == True:
            label = 1
        elif normal == True:
            label = 0

        results = results.append({"SourceAddress": source, "Cosine_Distance": distance, "Label": label, "Prediction": ''}, ignore_index=True)
    
    # For each threshold ranging from 0 to 0.25, in steps of 0.01 we check the results to be able to choose
    # good threshold. This is only based on 'training' scenario 9, the other sets will use the threshold 
    # that has been chosen optimal in scenario 9
    thresholds = [0.18]
    for threshold in thresholds:
        results.Prediction = results.Cosine_Distance.apply(lambda x: 1 if x <= threshold else 0)

        TP, FP, FN, TN, new_infections, unknown = 0, 0, 0, 0, 0, 0

        for i in range(len(results)):
            row = results.iloc[i]
            if row.Label == row.Prediction and row.Label == 1:
                TP += 1
            elif row.Label == 0 and row.Prediction == 1:
                FP += 1
            elif row.Label == 0 and row.Prediction == 0:
                TN += 1
            elif row.Label == 1 and row.Prediction == 0:
                FN += 1
                
            # Number of unlabeled hosts
            if row.Label == -1:
                unknown += 1
                
            # Newly identified infected host
            if row.Label == -1 and row.Prediction == 1:
                new_infections += 1
                
        accuracy = (TP+TN)/(TP+FP+FN+TN)
                
        grid_search = grid_search.append({"Scenario": scenario, "Threshold": threshold, "Unknown": unknown, "TP": TP, "FP": FP, "FN": FN, "TN": TN, "Accuracy": accuracy,"NEW_INFECTIONS": new_infections}, ignore_index=True)

# Uncheck print statement to see all results of results
pd.set_option('display.max_rows', None)
print(grid_search)
        
# Manual evaluation of results of grid search delivered an optimal threshold of 0.18
# For scenario 9 this threshold delivers 12 new infections at the cost of a couple false positives
# For the other scenarios it didn't really matter which threshold to choose
filter_threshold = grid_search.loc[grid_search.Threshold == 0.18]
filter_threshold.head()

   Scenario  Threshold  Unknown   TP   FP   FN   TN  Accuracy  NEW_INFECTIONS
0       9.0       0.18     19.0  9.0  5.0  1.0  1.0     0.625            12.0


Unnamed: 0,Scenario,Threshold,Unknown,TP,FP,FN,TN,Accuracy,NEW_INFECTIONS
0,9.0,0.18,19.0,9.0,5.0,1.0,1.0,0.625,12.0


### Fingerprinting

In [5]:
import operator
from math import nan
import profiling as prof
from sklearn.model_selection import train_test_split

# For each scenario pick the infected host with the most unique n-gram.
# Then check which n-gram does not occur in the non-infected host.
# Pick top k n-grams that satisfy above criteria, check for other hosts

scenarios = [9, 10, 11, 12]
pd_results = pd.DataFrame()
result_scenario = []
result_ip = []
result_TP = []
result_FP = []
result_TN = []
result_FN = []
result_newinf = []
result_acc = []
result_used_ngrams = []
for scenario in scenarios:
    if scenario == 9:
        pdata = pdata9.copy()
    elif scenario == 10:
        pdata = pdata10.copy()
    elif scenario == 11:
        pdata = pdata11.copy()
    elif scenario == 12:
        pdata = pdata12.copy() 

    # Feature to use, including its optimal binsize, and discretized to be used for n-grams
    feature = "Protocol"
    nbins = 3

    discretized_data = pd.DataFrame()
    discretized_data["StartTime"] = pdata["StartTime"].copy()
    discretized_data["SourceAddress"] = pdata["SourceAddress"].copy()
    discretized_data["DestinationAddress"] = pdata["DestinationAddress"].copy()
    pdata["Protocol"] = prof.encode_feature(pdata["Protocol"])
    discretized_data[feature], binsedges_infected = prof.discretize_feature(pdata, feature, nbins, "kmeans")

    # Select source with most entries in given dataset to deliver most reliable n-gram profile
    selected_sources = prof.select_all_infected_host(pdata,scenario)
    # ONLY FOR TEST LOOP OVER ALL INFECTED IPS     
    for selected_source in selected_sources:
        result_scenario.append(scenario)
        result_ip.append(selected_source)

#         selected_source = prof.select_infected_host(pdata,scenario)
        filtered_source = discretized_data.loc[discretized_data.SourceAddress == selected_source]

        ngram = 3
        all_n_grams = list(itertools.product(*[['0','1','2'],['0','1','2'],['0','1','2']]))

        n_grams = pd.Index(list(ngrams(filtered_source[feature].astype(str),ngram)))
        n_grams_counts = [0]*len(all_n_grams)
        for i in range(len(n_grams.value_counts())):
            n_gram = n_grams.value_counts()
            index_all_n_grams = all_n_grams.index(n_gram.index[i])
            n_grams_counts[index_all_n_grams] = n_gram[i]

        # Select all benign traffic(Non-infected)
        non_infected_host = prof.select_non_infected_host(pdata)
        train, test = train_test_split(non_infected_host, test_size=0.3)

        unique_sources_benign = pd.DataFrame()
        unique_sources_benign['SourceAddress'] = train

        # Sum all counts of all benign host(if n-gram == 0, does not occur in benign data)
        sum_ngrams_benign = [0]*len(all_n_grams)
        for i in range(len(unique_sources_benign)):
            source = unique_sources_benign.iloc[i]

            filtered = discretized_data.loc[discretized_data.SourceAddress == source.SourceAddress]
            n_grams_source = pd.Index(list(ngrams(filtered[feature].astype(str),ngram)))
            n_grams_source_counts = [0]*len(all_n_grams)

            for j in range(len(n_grams_source.value_counts())):
                n_gram = n_grams_source.value_counts()
                index_all_n_grams = all_n_grams.index(n_gram.index[j])
                n_grams_source_counts[index_all_n_grams] = n_gram[j]

            # Check which n-gram does NOT occur in benign data
            sum_ngrams_benign = np.add(sum_ngrams_benign, n_grams_source_counts)

        # Pick all ngrams that does not occur in benign traffic     
        n_grams_not_in_benign = np.array(sum_ngrams_benign)
        n_grams_not_in_benign = np.where(sum_ngrams_benign == 0)[0]

        # Pick all ngrams that occur in infected traffic     
        n_grams_in_infected = np.array(n_grams_counts)
        n_grams_in_infected = np.where(n_grams_in_infected > 0)[0]

        # Pick the ngrams that does not occur in benign but occur in infected data     
        possible_fingerprints = np.intersect1d(n_grams_in_infected, n_grams_not_in_benign)
        result_used_ngrams.append(possible_fingerprints)



        ### PART 2 (CHECK WITH THE NGRAMS FIND ABOVE IF ANY (NEW) INFECTED HOSTS COULD BE DETECTED) ####    

        # Group netflows by source address (host) because we are interested in modeling per-host behavior
        unique_sources = pdata.groupby(["SourceAddress"]).size().reset_index()
        pd_unique_source = pd.DataFrame()

        # Get the data of the infected host without selected infected host     
        all_infected_hosts = prof.select_all_infected_host(pdata, scenario)
        # Remove the selected host     
        all_infected_hosts.remove(selected_source)

        # Merge test data with all infected host except selected
        test_data = np.union1d(test,all_infected_hosts)
        pd_unique_source['SourceAddress'] = test_data

        finger_print_exists = [100]*len(pd_unique_source)

        # Check if fingerprint is contained in the data of the host
        for i in range(len(pd_unique_source)):
            source = pd_unique_source.iloc[i]

            filtered = discretized_data.loc[discretized_data.SourceAddress == source.SourceAddress]
            n_grams_source = pd.Index(list(ngrams(filtered[feature].astype(str),ngram)))
            n_grams_source_counts = [0]*len(all_n_grams)

            for j in range(len(n_grams_source.value_counts())):
                n_gram = n_grams_source.value_counts()
                index_all_n_grams = all_n_grams.index(n_gram.index[j])
                n_grams_source_counts[index_all_n_grams] = n_gram[j]

            # Pick the indexes of the ngram that occur          
            n_grams_in_source = np.array(n_grams_source_counts)
            n_grams_in_source = np.where(n_grams_in_source > 0)[0]

            # For each possible fingerprint check if it exists in the data
            for fingerprint in possible_fingerprints:
                # Raise alarm         
                if fingerprint in n_grams_in_source:
                    finger_print_exists[i] = 1

        # # Create an overview of the results including the original label and predicted label
        results = pd.DataFrame(columns=["SourceAddress","Fingerprint_exists","Label","Prediction"])

        for i in range(len(finger_print_exists)):
            finger_print = finger_print_exists[i]
            source = pd_unique_source.SourceAddress.iloc[i]
            infected = prof.is_infected(source, scenario)
            normal = prof.is_normal(source)
            label = -1

            if infected == True:
                label = 1
            elif normal == True:
                label = 0

            results = results.append({"SourceAddress": source, "Fingerprint_exists": finger_print, "Label": label, "Prediction": finger_print}, ignore_index=True)
        TP, FP, FN, TN, new_infections, unknown = 0, 0, 0, 0, 0, 0

        for i in range(len(results)):
            row = results.iloc[i]
            if row.Label == row.Prediction and row.Label == 1:
                TP += 1
            elif row.Label == 0 and row.Prediction == 1:
                FP += 1
            elif row.Label == 0 and row.Prediction == 0:
                TN += 1
            elif row.Label == 1 and row.Prediction == 0:
                FN += 1

            # Number of unlabeled hosts
            if row.Label == -1:
                unknown += 1

            # Newly identified infected host
            if row.Label == -1 and row.Prediction == 1:
                new_infections += 1
        if (TP+FP+FN+TN) == 0:
            accuracy = nan
        else: accuracy = (TP+TN)/(TP+FP+FN+TN)
        result_TP.append(TP)
        result_FP.append(FP)
        result_TN.append(TN)
        result_FN.append(FN)
        result_newinf.append(new_infections)
        result_acc.append(accuracy)

pd_results['Scenario'] = result_scenario
pd_results['Infected IP used'] = result_ip
pd_results['Used Ngrams'] = result_used_ngrams
pd_results['TP'] = result_TP
pd_results['FP'] = result_FP
pd_results['TN'] = result_TN
pd_results['FN'] = result_FN
pd_results['New infected'] = result_newinf
pd_results['Accuracy'] = result_acc
pd_results.head(50)


Unnamed: 0,Scenario,Infected IP used,Used Ngrams,TP,FP,TN,FN,New infected,Accuracy
0,9,147.32.84.165,[15],2,0,0,0,0,1.0
1,9,147.32.84.191,"[7, 15]",2,0,0,0,0,1.0
2,9,147.32.84.192,[],0,0,0,0,0,
3,9,147.32.84.193,[],0,0,0,0,0,
4,9,147.32.84.204,[],0,0,0,0,0,
5,9,147.32.84.205,[],0,0,0,0,0,
6,9,147.32.84.206,[],0,0,0,0,0,
7,9,147.32.84.207,[],0,0,0,0,0,
8,9,147.32.84.208,"[4, 10, 15, 19]",3,1,0,0,0,0.75
9,9,147.32.84.209,[],0,0,0,0,0,


The profiles found by the "Botnet profiling task" are here used to detect new infections by fingerprinting. For each scenario, the infected host with the most entries in the dataset is selected since this host has the most reliable n-gram profile. Then the benign data is split into a training and test set. The training set is used to learn to look for the occurrence of an n-gram that does not occur in any benign traffic. After that, we check if the selected fingerprint from the training data is present among all other infected hosts and the generate test set of the benign hosts.

Above the results of the fingerprinting algorithm can be seen. ...
 
Compared to the botnet profiling task, ...

