## How to Label the ISCX Data


### 1) Load the Data

In [28]:
import pandas as pd
from time import time

In [29]:
%%time

data = pd.read_csv('ISCX_ISCX_Botnet.csv')
data.head(2)

Wall time: 12.1 s




### Shuffle the data

In [30]:
# Shuffle all rows. The fixed seed makes the shuffle -- and therefore the
# 10,000-row subset taken from the top of the frame further down -- identical
# on every Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [31]:
data.columns

Index(['Source IP', ' Source Port', ' Destination IP', ' Destination Port',
       ' Protocol', ' Flow Duration', ' Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min',
       'Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
       'Active Mean', ' Active Std', ' Active Max', ' Active Min', 'Idle Mean',
       ' Idle Std', ' Idle Max', ' Idle Min', 'label'],
      dtype='object')

In [32]:
data.shape

(309206, 29)

In [33]:

data.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,...,Bwd IAT Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label
0,158.65.110.24,57782,158.65.12.103,53,17,359,300836.0,5571.030640668524,359.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
1,158.65.110.24,40313,199.59.148.82,80,6,90217,0.0,22.16877085250008,90217.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
2,192.168.3.114,4489,67.20.126.228,80,6,1693732,403.252,4.13289,282288.666667,433687.043603,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
3,172.16.2.11,17962,10.0.0.254,53,17,319163,288.254,6.26639,319163.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
4,72.14.204.118,80,192.168.3.114,2534,6,255,0.0,7843.14,255.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX


In [34]:
# Take the first 10000 rows to save time.
# `.copy()` makes the subset an independent frame: new columns are assigned to it
# later on, and assigning into a slice of another frame can raise pandas'
# SettingWithCopyWarning. It also stops the subset from being a slice that
# refers back to the full dataset.
data = data.iloc[:10000, :].copy()
data.shape

(10000, 29)

### 2) Load list of IP addresses and their corresponding Botnet Names

Data taken from: https://www.unb.ca/cic/datasets/botnet.html

#### Functions to apply labelling according to Source and Destination IP addresses

In [36]:
# load ip addresses and the labels
ip1 = pd.read_csv('bots1.csv')
ip2 = pd.read_csv('bots2.csv')

In [37]:
def find_class1(row):
    """Label one flow by looking up its source IP in the `ip1` table.

    Intended to be used with ``data.apply(find_class1, axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        One row of the flow data; must provide the key 'Source IP'.

    Returns
    -------
    The 'Bot' value of the first `ip1` row whose 'IP' equals the source IP
    (compared as a string), or the string 'Other' when no row matches.
    """
    sourceIP = str(row['Source IP'])
    # Walk only the two columns that are needed, in table order, instead of
    # materialising a Series per lookup row with iterrows(); the first match
    # still wins.
    known_bots = zip(ip1['IP'], ip1['Bot'])
    return next((bot for ip, bot in known_bots if sourceIP == ip), 'Other')

In [38]:
def find_class2(row):
    """Label one flow by looking up its (source IP, destination IP) pair in `ip2`.

    Intended to be used with ``data.apply(find_class2, axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        One row of the flow data; must provide the keys 'Source IP' and
        ' Destination IP' (with the leading space, as in the loaded column names).

    Returns
    -------
    The 'Bot' value of the first `ip2` row whose 'SrcIP' and 'DestIP' both equal
    the flow's source and destination IPs (compared as strings), or the string
    'Other' when no row matches.
    """
    sourceIP = str(row['Source IP'])
    destIP = str(row[' Destination IP'])
    # Walk only the three columns that are needed, in table order, instead of
    # materialising a Series per lookup row with iterrows(); the first match
    # still wins.
    known_pairs = zip(ip2['SrcIP'], ip2['DestIP'], ip2['Bot'])
    return next(
        (bot for src, dst, bot in known_pairs if sourceIP == src and destIP == dst),
        'Other',
    )

In [39]:
labels1 = data.apply(find_class1, axis=1)

In [40]:
len(labels1[labels1 == 'Other']) 

4963

In [41]:
labels2 = data.apply(find_class2, axis=1)

In [42]:
len(labels2[labels2 == 'Other'])

9872

In [44]:
len(labels1)

10000

In [45]:
ls1 = list(labels1.values)

In [46]:
ls2 = list(labels2.values)

In [47]:
len(ls1)

10000

In [48]:
label = list()

In [49]:
# Merge the two label lists element by element:
#   - a label from list 1 (source-IP match) is used whenever it is not 'Other';
#   - otherwise the label from list 2 (source/destination pair match) is used;
#   - if both are 'Other', the flow is 'Normal'.
# `label` is rebuilt from scratch in a single expression so this cell can be
# re-run safely; appending to a list created in another cell would double its
# length on a second run and no longer match the number of rows in `data`.
label = [
    a if a != 'Other' else (b if b != 'Other' else 'Normal')
    for a, b in zip(ls1, ls2)
]

#### Add the 'BotNet_Label' column to the data, using the labels computed above

In [50]:
data['BotNet_Label'] = label

In [51]:
# Strip leading/trailing whitespace from every column name
# (e.g. ' Source Port' becomes 'Source Port').
data = data.rename(columns=str.strip)

In [52]:
# Explore BotNet_Label values
data['BotNet_Label'].value_counts()

Normal                    4835
Weasel Bot                2200
Virut                     1363
Neris                      823
Murlo                      387
Menti                      149
IRC                        128
Zero access                 58
TBot                        26
Zeus                         9
Black hole 2                 9
RBot                         3
Smoke bot                    2
Weasel Botmaster             2
Black hole 3                 2
IRCbot and black hole1       2
Osx_trojan                   1
Sogou                        1
Name: BotNet_Label, dtype: int64

In [53]:
# Remove columns that are no longer needed: the two IP address columns and the
# original 'label' column. `drop(columns=...)` with reassignment is used rather
# than `inplace=True`, which pandas style guidance discourages.
data = data.drop(columns=['Source IP', 'Destination IP', 'label'])

### This is how to apply one-hot encoding using Pandas

In [54]:
# Build one indicator (dummy) frame per categorical column; each new column is
# named '<prefix>_<value>'.
df_src_port, df_dest_port, df_protocol = (
    pd.get_dummies(data[column], prefix=prefix)
    for column, prefix in (
        ('Source Port', 'SrcPort'),
        ('Destination Port', 'DestPort'),
        ('Protocol', 'Protocol'),
    )
)

In [55]:
data = pd.concat([data, df_src_port,df_dest_port,df_protocol], axis=1)
data.shape

(10000, 7610)

In [56]:
# The raw categorical columns have been one-hot encoded, so drop them.
# `drop(columns=...)` with reassignment is used rather than `inplace=True`,
# which pandas style guidance discourages.
data = data.drop(columns=['Source Port', 'Destination Port', 'Protocol'])
data.shape

(10000, 7607)

### Save the Data — it is now ready for further analysis and machine learning

In [None]:
%%time

data.to_csv('ISCX_Botnet_Labelled.csv',index=False)
