In [1]:
import pandas as pd
import numpy as np
import datetime
import csv
import math

In [2]:
# data source: http://www.unb.ca/cic/research/datasets/botnet.html
# to process pcap file, e.g. "ISCX_Botnet-Training.pcap" in tshark, 
# install Wireshark and call tshark from command line
# tshark -nr ISCX_Botnet-Training.pcap -T fields -E header=y -e frame.time_relative -e frame.len -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport  > iscxTrainPCAP.txt
# because of irregularity in frame.time_relative after line 1201967
# head -n 1201967 iscxTrainPCAP.txt > train.txt

In [3]:
# Load the tshark export and keep only the rows that carry a source IP.
a = datetime.datetime.now().replace(microsecond=0)
df1 = (
    pd.read_table("train.txt")
    .dropna(axis=0, subset=['ip.src'])
    .reset_index(drop=True)
)

# train.txt is the truncated export; the full file "iscxTrainPCAP.txt" is not
# loaded because its frame.time_relative column was unreliable after
# line 1201967.
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

0:00:02


In [4]:
# Adds a boolean 'bot' column to df1: True when the packet's source IP is one
# of the malicious addresses stored (tab-separated) in BotIPs.txt.
# Address list source: http://www.unb.ca/cic/research/datasets/botnet.html
a = datetime.datetime.now().replace(microsecond=0)
with open("BotIPs.txt", 'r') as f:
    # Flatten every row into one list and drop empty fields in the same pass.
    # Filtering here (rather than calling remove() while looping over the
    # list) matters: removing during iteration skips the element that follows
    # each removal, so consecutive blank fields would survive.
    botIPs = [
        ip
        for row in csv.reader(f, delimiter='\t')
        for ip in row
        if len(ip) > 0
    ]

# isin() builds its lookup structure once instead of scanning the whole
# address list for every packet.
df1['bot'] = df1['ip.src'].isin(botIPs)
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

0:00:01


In [5]:
# df is a second name for df1 (plain assignment, not a copy), so both names
# refer to the same DataFrame from here on.
df = df1
# expected shape for this dataset: (1187635, 9)
df.shape

(1187635, 9)

In [None]:
## Groups packets by source IP.
## sourceIPs maps each distinct 'ip.src' value to the list of its packets
## (one pandas Series per row), kept in the order they appear in df.
a = datetime.datetime.now().replace(microsecond=0)
SourceIPs = list(set(df['ip.src']))
print(len(SourceIPs))
# Keys are created in SourceIPs order so later cells iterate in that order.
sourceIPs = {s: [] for s in SourceIPs}
print("filling dict")
# Single pass over the frame with a dictionary lookup per row. Comparing
# every row against every address (and re-reading the row with iloc each
# time) costs rows x addresses operations and does not finish on this data.
for _, packet in df.iterrows():
    bucket = sourceIPs.get(packet['ip.src'])
    # A missing source IP matches no key and is skipped.
    if bucket is not None:
        bucket.append(packet)
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

12828
filling dict


In [None]:
## Regroups the packets collected per source IP by destination IP.
## destIPs maps each non-null 'ip.dst' value to a list of packets; within a
## list, packets are ordered by source (sourceIPs order) and then by their
## position in that source's list.
a = datetime.datetime.now().replace(microsecond=0)
DestIPs = list(set(df['ip.dst'][~pd.isnull(df['ip.dst'])]))
print(len(DestIPs))
destIPs = {d: [] for d in DestIPs}

print("filling dict")
# One pass over all packets with a dictionary lookup, instead of rescanning
# every source's packet list once per destination address.
for packets in sourceIPs.values():
    for packet in packets:
        bucket = destIPs.get(packet['ip.dst'])
        # Packets without a destination IP match no key and are skipped.
        if bucket is not None:
            bucket.append(packet)
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

In [None]:
# Sorts packets into TCP C-flows.
# Pass 1 buckets the packets held in destIPs by 'tcp.srcport'; pass 2
# re-buckets those by 'tcp.dstport'. Packets whose port is null match no
# bucket and drop out, so only TCP packets remain.
# NOTE(review): after both passes TCPdsts is keyed by destination port only
# (the earlier groupings affect ordering, not membership) -- confirm this is
# the intended C-flow definition.
a = datetime.datetime.now().replace(microsecond=0)

TCPSrcs = list(set(df['tcp.srcport'][~pd.isnull(df['tcp.srcport'])]))
TCPsrcs = {t: [] for t in TCPSrcs}
# Dictionary lookup per packet instead of rescanning every packet list once
# per port; order within each bucket is unchanged.
for packets in destIPs.values():
    for packet in packets:
        bucket = TCPsrcs.get(packet['tcp.srcport'])
        if bucket is not None:
            bucket.append(packet)

TCPDsts = list(set(df['tcp.dstport'][~pd.isnull(df['tcp.dstport'])]))
TCPdsts = {t: [] for t in TCPDsts}
for packets in TCPsrcs.values():
    for packet in packets:
        bucket = TCPdsts.get(packet['tcp.dstport'])
        if bucket is not None:
            bucket.append(packet)

# Keep only destination ports that actually received packets.
TCPdsts = {port: packets for port, packets in TCPdsts.items() if len(packets) > 0}

# should be 46 in this dataset
# (printed explicitly: a bare expression in the middle of a cell is not shown)
print(len(TCPdsts))
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

In [None]:
# Sorts packets into UDP C-flows.
# Pass 1 buckets the packets held in destIPs by 'udp.srcport'; pass 2
# re-buckets those by 'udp.dstport'. Packets whose port is null match no
# bucket and drop out, so only UDP packets remain.
# NOTE(review): after both passes UDPdsts is keyed by destination port only
# (the earlier groupings affect ordering, not membership) -- confirm this is
# the intended C-flow definition.
a = datetime.datetime.now().replace(microsecond=0)

UDPSrcs = list(set(df['udp.srcport'][~pd.isnull(df['udp.srcport'])]))
UDPsrcs = {u: [] for u in UDPSrcs}
# Dictionary lookup per packet instead of rescanning every packet list once
# per port; order within each bucket is unchanged.
for packets in destIPs.values():
    for packet in packets:
        bucket = UDPsrcs.get(packet['udp.srcport'])
        if bucket is not None:
            bucket.append(packet)

UDPDsts = list(set(df['udp.dstport'][~pd.isnull(df['udp.dstport'])]))
UDPdsts = {u: [] for u in UDPDsts}
for packets in UDPsrcs.values():
    for packet in packets:
        # A null port never equals a key, so no separate null check is needed.
        bucket = UDPdsts.get(packet['udp.dstport'])
        if bucket is not None:
            bucket.append(packet)

# Keep only destination ports that actually received packets.
UDPdsts = {port: packets for port, packets in UDPdsts.items() if len(packets) > 0}

# should be 5 in this dataset
# (printed explicitly: a bare expression in the middle of a cell is not shown)
print(len(UDPdsts))
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

In [None]:
# summarizes C-flow in 5-element vector: bot (0/1), bpp (bytes/packet), bps (bytes/s),
# pps (packets/s), ppf (packets/flow)
def CflowVectorizer(flow):
    """Summarize one C-flow as a 5-element numpy vector.

    Parameters
    ----------
    flow : sequence of mappings
        Packets of the flow. Each packet is indexed by the keys
        'frame.time_relative', 'frame.len' and 'bot'.

    Returns
    -------
    numpy.ndarray of shape (5,)
        [bot, bpp, bps, pps, ppf] where
        bot is 1 if the last packet's 'bot' value is True, else 0;
        bpp is the summed 'frame.len' divided by the packet count;
        bps is the summed 'frame.len' divided by the flow duration;
        pps is the packet count divided by the flow duration;
        ppf is the packet count.
        An empty flow yields all zeros. A flow whose first and last
        timestamps coincide (e.g. a single packet) gets bps = pps = 0
        because no rate can be computed.
    """
    cFlow = np.zeros(5)
    ppf = len(flow)
    if ppf == 0:
        # Nothing to summarize; previously this path hit an unbound variable.
        return cFlow

    timestamps = [packet['frame.time_relative'] for packet in flow]
    totalBytes = sum(packet['frame.len'] for packet in flow)
    duration = max(timestamps) - min(timestamps)

    # The flow takes the label of its last packet, as before.
    # NOTE(review): for flows mixing bot and non-bot packets this is
    # order-dependent -- confirm whether "any packet is bot" was intended.
    bot = 1 if flow[-1]['bot'] == True else 0

    bpp = totalBytes / ppf
    if duration > 0:
        pps = ppf / duration
        bps = totalBytes / duration
    else:
        # Zero-length flows would otherwise divide by zero.
        pps = 0.0
        bps = 0.0

    cFlow[0] = bot
    cFlow[1] = bpp
    cFlow[2] = bps
    cFlow[3] = pps
    cFlow[4] = ppf
    return cFlow

In [None]:
# Build one summary vector per C-flow (TCP flows first, then UDP) and collect
# them in a DataFrame with one row per flow.
a = datetime.datetime.now().replace(microsecond=0)
TCPflows = list(TCPdsts.values())
UDPflows = list(UDPdsts.values())
allFlows = TCPflows + UDPflows
CFLOWS = [CflowVectorizer(flow) for flow in allFlows]
cFlowDF = pd.DataFrame(
    CFLOWS,
    columns=['bot', 'bpp', 'bps', 'pps', 'ppf'],
)
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

In [None]:
def bins(flows):
    """Return the 13 percentile values of `flows` used as bin edges.

    The percentiles taken are 5 to 30 in steps of 5, then 40 to 100 in
    steps of 10. The result is a list with one np.percentile value per
    percentile, in ascending percentile order.
    """
    percentileSteps = (5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100)
    return [np.percentile(flows, step) for step in percentileSteps]

In [None]:
# Percentile bin edges (13 per feature) for each numeric C-flow column.
a = datetime.datetime.now().replace(microsecond=0)
bppBin, bpsBin, ppsBin, ppfBin = (
    bins(cFlowDF[feature]) for feature in ('bpp', 'bps', 'pps', 'ppf')
)
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

In [None]:
# Normalizes C-flows: replaces every raw feature value in cFlowDF with the
# index (0-12) of its percentile bin.
#
# For the 13 ascending edges e[0..12] of a feature, a value v is mapped to
#   0  if v <= e[0]
#   k  if e[k-1] < v <= e[k]
# i.e. the number of edges strictly below v, capped at 12.
#
# This replaces a row-by-row loop that (a) compared 'bpp' instead of 'pps'
# against the pps edges, (b) overwrote a value and then kept comparing the
# new bin index against the remaining edges, (c) left values at or below
# the first edge unbinned, and (d) wrote through `.ix`, which current pandas
# no longer provides and which may return a copy of the row.
#
# Note: this cell rewrites cFlowDF in place, so run it once per set of edges.
a = datetime.datetime.now().replace(microsecond=0)
# 13 bins
nBins = len(bppBin)
binEdges = (
    ('bpp', bppBin),
    ('bps', bpsBin),
    ('pps', ppsBin),
    ('ppf', ppfBin),
)
for feature, edges in binEdges:
    binIndex = np.searchsorted(edges, cFlowDF[feature].values, side='left')
    # Anything beyond the last edge (e.g. NaN) is clamped into the top bin.
    cFlowDF[feature] = np.minimum(binIndex, nBins - 1).astype(float)
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

In [None]:
# calculates entropy of each cluster
def calculateEntropy(cluster, totalFlows=None):
    """Return the size-weighted binary entropy of the 'bot' label in a cluster.

    Parameters
    ----------
    cluster : pandas.DataFrame
        Rows of one cluster; must have a 'bot' column holding 0/1 values.
    totalFlows : int, optional
        Number of flows across all clusters, used for the weight
        len(cluster) / totalFlows. Defaults to the row count of the global
        cFlowDF, which is what this function always used before the
        parameter existed.

    Returns
    -------
    float
        -(p*log2(p) + q*log2(q)) * len(cluster) / totalFlows, where p is the
        fraction of rows with bot == 1 and q = 1 - p. A term whose
        probability is 0 contributes 0. An empty cluster returns 0.0.
    """
    total = float(cluster.shape[0])
    if total == 0:
        # An empty cluster carries no weight; avoids dividing by zero.
        return 0.0
    if totalFlows is None:
        totalFlows = cFlowDF.shape[0]

    numBots = float(cluster['bot'].sum())
    pBot = numBots / total
    pNot = (total - numBots) / total

    entropy = 0.0
    for p in (pBot, pNot):
        # log2(0) is undefined; by convention 0 * log2(0) counts as 0.
        if p > 0:
            entropy -= p * math.log2(p)
    return entropy * (total / totalFlows)

In [None]:
# KMeans clustering on c-flows: for each candidate number of clusters, fit
# KMeans on the binned flow features and record the summed, size-weighted
# entropy of the 'bot' label across the resulting clusters.

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Candidate cluster counts 2..13. (A range starting at 0, such as range(5),
# fails because KMeans needs at least one cluster.)
kMeans = list(range(2, 14))

# Cluster on the four flow features only, taken as a copy. Using cFlowDF
# itself would feed the ground-truth 'bot' label into the clustering and,
# from the second iteration on, the previous 'predicted_cluster' column too.
featureColumns = ['bpp', 'bps', 'pps', 'ppf']
X = cFlowDF[featureColumns].copy()

clusts = {}
for k in kMeans:
    kmeans = KMeans(n_clusters=k, random_state=0)
    # fit_predict fits the model and returns the labels in one call.
    cFlowDF['predicted_cluster'] = kmeans.fit_predict(X)
    E = 0
    for i in range(k):
        c = cFlowDF[cFlowDF['predicted_cluster'] == i]
        E += calculateEntropy(c)
    clusts[k] = E

# Plot against the actual cluster counts so the x-axis matches its label.
plt.plot(list(clusts.keys()), list(clusts.values()))
plt.xlabel('n_clusters')
plt.ylabel('entropy')
plt.show()