In [7]:
%reset -f
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import lsh_hyperplane as lsh
import math
import random
from nltk.util import ngrams
import time

data = lsh.load_data("data/capture20110818.binetflow")
pdata = lsh.pre_process(data)

# We first discretize the values for the features we selected in the discretization task. 
# We then make trigrams 
# Optimal number of bins for source address, based on elbow method
features = ["Protocol"]
nbins = [3]

# Descretize only one infected host (with most flows)
discretized = pd.DataFrame()
discretized["StartTime"] = pdata["StartTime"].copy()
discretized['StartTime'] = discretized['StartTime'].str[:-7]
discretized["SourceAddress"] = pdata["SourceAddress"].copy()
discretized["DestinationAddress"] = pdata["DestinationAddress"].copy()
pdata["Protocol"] = lsh.encode_feature(pdata["Protocol"])

for feature, nbin in zip(features,nbins):
    discretized[feature], binsedges_infected = lsh.discretize_feature(pdata, feature, nbin, "kmeans")
    
# Remove pairs with less than three occurrences (unable to form 3-gram)
ip_pairs = discretized.groupby(['SourceAddress','DestinationAddress']).size().reset_index().rename(columns={0:'count'})


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
# Obtain the list of unique ip pairs.
# Define the number of projections to apply and the number of bits as the bucket number (2^k bins for k bits)
ip_pairs = discretized.groupby(["SourceAddress","DestinationAddress"]).size().reset_index()
n_projections = 3
bin_size = 2 

# Create a dataframe where all computations will be added
column_names = list()
column_names.append("SourceAddress")
column_names.append("DestinationAddress")
column_names.append("trigrams_counts")

for i in range(n_projections):
    column_names.append("hash_code_"+str(i))
    column_names.append("bin_"+str(i))
    
column_names.append("bins_combined")
ip_pairs_trigrams = pd.DataFrame(columns=column_names)

# Create a set of 3-grams which exist in the dataset. 
possible_trigrams = list(set(ngrams(discretized["Protocol"].astype(str),3)))
dimension = len(possible_trigrams)

# For each pair we count the 3-gram occurences and save it to the dataframe
for pair in ip_pairs.iterrows():
    source = pair[1].SourceAddress
    dest = pair[1].DestinationAddress
    counts = [0]*len(possible_trigrams)
    
    filtered = discretized.loc[(discretized["SourceAddress"] == source) & (discretized["DestinationAddress"] == dest)]
    trigrams = list(ngrams(filtered["Protocol"].astype(str),3))
    
    for trigram in trigrams:
        counts[possible_trigrams.index(trigram)] += 1
    
    ip_pairs_trigrams = ip_pairs_trigrams.append({"SourceAddress": source, "DestinationAddress": dest, "trigrams_counts": counts}, ignore_index=True)
    
# Create random hyperplanes based on the bin size and 3-gram counts vector, and repeat for the number of projections
random_hyperplanes = list()

for i in range(n_projections):
    hyperplanes = np.random.randn(bin_size,dimension)
    random_hyperplanes.append(hyperplanes)

# For every unique pair we calculate the bin the 3-gram counts vector hashes to.
for pair_id in range(len(ip_pairs_trigrams)):
    pair = ip_pairs_trigrams.iloc[pair_id]
    
    hashed_bins = list()
    
    # Repeat this for every projection
    for i in range(n_projections):
        hash_bin = ""
            
        #For every random hyperplane we calculate the hash and add it to the list of all hash codes
        for hyperplane in random_hyperplanes[i]:
            hash_bin += str(lsh.sign_function(lsh.dot_product(pair.trigrams_counts,hyperplane)))
            
        # Save the hash code and the bin number to the dataframe
        ip_pairs_trigrams["hash_code_"+str(i)].iloc[pair_id] = hash_bin
        ip_pairs_trigrams["bin_"+str(i)].iloc[pair_id] = int(hash_bin,2)
        hashed_bins.append(int(hash_bin,2))
        
    ip_pairs_trigrams["bins_combined"].iloc[pair_id] = hashed_bins
ip_pairs_trigrams.head()

Unnamed: 0,SourceAddress,DestinationAddress,trigrams_counts,hash_code_0,bin_0,hash_code_1,bin_1,hash_code_2,bin_2,bins_combined
0,147.32.1.20,147.32.85.20,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,"[0, 0, 0]"
1,147.32.1.20,147.32.85.7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,1,1,0,0,"[1, 1, 0]"
2,147.32.1.20,147.32.85.8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,"[0, 0, 0]"
3,147.32.1.20,147.32.86.135,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,"[0, 0, 0]"
4,147.32.1.20,147.32.86.155,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,"[0, 0, 0]"


In [39]:
# For obtaining the correct frequencies ip-pairs we neglect the ip pairs that have no trigrams, 
# e.g. have less than three flows between them. These hosts influence the results quite a lot.
# From these removed pairs a lot of them are infected hosts, which only have a connection with other 
# hosts once or twice.
non_zero_indices = list()
for i in range(len(ip_pairs_trigrams)):
    row = ip_pairs_trigrams.iloc[i]
    
    if np.sum(row.trigrams_counts) > 0:
        non_zero_indices.append(i)
        
res_ip_pairs = ip_pairs_trigrams.iloc[non_zero_indices]

# res_ip_pairs = ip_pairs_trigrams

# Retrieve infected and non infected source hosts. Infected host is 1, normal host is 0, unknown host is -1
res_ip_pairs.loc[:,'Infected'] = -1

for i in range(len(res_ip_pairs)):
    ip = res_ip_pairs.SourceAddress.iloc[i]
    if lsh.is_infected(ip):
        res_ip_pairs.Infected.iloc[i] = 1
    elif lsh.is_normal(ip):
        res_ip_pairs.Infected.iloc[i] = 0

# Separate the label cases to derive statistics on bins most frequent used for each label
infected = res_ip_pairs.loc[res_ip_pairs["Infected"] == 1]
normal = res_ip_pairs.loc[res_ip_pairs["Infected"] == 0]
unknown = res_ip_pairs.loc[res_ip_pairs["Infected"] == -1]

print("Infected host bins:")
print(infected.bins_combined.value_counts()/len(infected)*100, "\n")
print("Normal host bins:")
print(normal.bins_combined.value_counts()/len(normal)*100, "\n")
print("Unknown host bins:")
print(unknown.bins_combined.value_counts()/len(unknown)*100, "\n")

Infected host bins:
[1, 1, 0]    64.516129
[1, 0, 0]    32.258065
[1, 1, 1]     3.225806
Name: bins_combined, dtype: float64 

Normal host bins:
[1, 1, 1]    81.992337
[1, 1, 0]    16.091954
[1, 0, 0]     1.915709
Name: bins_combined, dtype: float64 

Unknown host bins:
[1, 1, 0]    90.909091
[1, 1, 1]     9.090909
Name: bins_combined, dtype: float64 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guid

Unnamed: 0,SourceAddress,DestinationAddress,trigrams_counts,hash_code_0,bin_0,hash_code_1,bin_1,hash_code_2,bin_2,bins_combined
0,147.32.1.20,147.32.85.20,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,"[0, 0, 0]"
1,147.32.1.20,147.32.85.7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,3,11,3,10,2,"[3, 3, 2]"
2,147.32.1.20,147.32.85.8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,"[0, 0, 0]"
3,147.32.1.20,147.32.86.135,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,"[0, 0, 0]"
4,147.32.1.20,147.32.86.155,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,"[0, 0, 0]"


Infected host bins:
[3, 3, 2]    64.516129
[3, 0, 0]    32.258065
[2, 3, 2]     3.225806
Name: bins_combined, dtype: float64 

Normal host bins:
[2, 3, 2]    81.992337
[3, 3, 2]    16.091954
[3, 0, 0]     1.532567
[3, 0, 1]     0.383142
Name: bins_combined, dtype: float64 

Unknown host bins:
[3, 3, 2]    90.909091
[2, 3, 2]     9.090909
Name: bins_combined, dtype: float64 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [35]:
# Perform pair-wise distance computation between any pair
start = time.time()
distances_all = [[0 for i in range(len(res_ip_pairs))] for j in range(len(res_ip_pairs))]

for i in range(0,len(res_ip_pairs)):
    for j in range(i+1,len(res_ip_pairs)):
        distance = lsh.euclidean_distance(res_ip_pairs.trigrams_counts.iloc[i],res_ip_pairs.trigrams_counts.iloc[j])
        distances_all[i][j] = distance
        distances_all[j][i] = distance

end = time.time()
print("Time elapsed pair-wise computations:",end-start)

# Perform distance computation between pairs that are mapped to the same bin (only applied to one projection)
start = time.time()
distances_bins = list()

for bin_number in range(0,bin_size**2):
    filtered = res_ip_pairs.loc[res_ip_pairs["bin_0"] == bin_number]
    distances = [[0 for i in range(len(filtered))] for j in range(len(filtered))]
    
    for i in range(0,len(filtered)):
        for j in range(i+1,len(filtered)):
            distance = lsh.euclidean_distance(filtered.trigrams_counts.iloc[i],filtered.trigrams_counts.iloc[j])
            distances[i][j] = distance
            distances[j][i] = distance

    distances_bins.append(distances)

end = time.time()

print("Time elapsed for profiles in same bin:",end-start)

Time elapsed pair-wise computations: 2.333493947982788
Time elapsed for profiles in same bin: 1.1145451068878174


In [40]:
print(discretized)

                   StartTime  SourceAddress DestinationAddress Protocol
407      2011/08/18 10:19:15  147.32.84.164     74.125.232.215        1
697      2011/08/18 10:19:18  147.32.84.164     74.125.232.197        1
717      2011/08/18 10:19:18  147.32.84.164     209.85.149.138        1
896      2011/08/18 10:19:19  147.32.84.170        147.32.80.9        2
897      2011/08/18 10:19:19  147.32.84.170        147.32.80.9        2
...                      ...            ...                ...      ...
1309786  2011/08/18 15:04:59  147.32.84.192       147.32.96.69        0
1309787  2011/08/18 15:04:59  147.32.84.208       147.32.96.69        0
1309788  2011/08/18 15:04:59  147.32.84.208       147.32.96.69        0
1309789  2011/08/18 15:04:59  147.32.84.208       147.32.96.69        0
1309790  2011/08/18 15:04:59  147.32.84.205       147.32.96.69        0

[122199 rows x 4 columns]


In [None]:
## WORKING 

ip_pairs = discretized.groupby(["SourceAddress","DestinationAddress"]).size().reset_index()

column_names = list()
column_names.append("SourceAddress")
column_names.append("DestinationAddress")
column_names.append("trigrams")

n_projections = 3

for i in range(n_projections):
    column_names.append("hash_code_"+str(i))

# Create trigrams for each ip_pair
ip_pairs_trigrams = pd.DataFrame(columns=column_names)

for pair in ip_pairs.iterrows():
    source = pair[1].SourceAddress
    dest = pair[1].DestinationAddress
    
    filtered = discretized.loc[(discretized["SourceAddress"] == source) & (discretized["DestinationAddress"] == dest)]
    trigrams = list(set(ngrams(filtered["Protocol"].astype(str),3)))
    
    ip_pairs_trigrams = ip_pairs_trigrams.append({"SourceAddress": source, "DestinationAddress": dest, "trigrams": trigrams}, ignore_index=True)

hash_size = 3
dimension = 3

random_hyperplanes = list()

for i in range(n_projections):
    hyperplanes = np.random.randn(hash_size,dimension)
    random_hyperplanes.append(hyperplanes)
    print(hyperplanes)

# Every pair in the list
for pair_id in range(len(ip_pairs_trigrams)):
    pair = ip_pairs_trigrams.iloc[pair_id]
    pair_trigrams = pair.trigrams
    hash_codes = list()
    
    # For every trigram in the list with unique trigrams for this pair
    for i in range(n_projections):
        hash_codes = list()
        for trigram in pair_trigrams:
            code = list()
            
            #For every random hyperplane we calculate the hash and add it to the list of all hash codes
            for hyperplane in random_hyperplanes[i]:
                code.append(lsh.sign_function(lsh.dot_product(trigram,hyperplane)))
            
            hash_codes.append(code)
            
        ip_pairs_trigrams["hash_code_"+str(i)].iloc[pair_id] = hash_codes

ip_pairs_trigrams.head(110)