In [1]:
import os
import glob
import pandas as pd
from os.path import join
import numpy as np
from datetime import datetime
from collections import defaultdict

dataroot = '/hdd/juma/data/net_intrusion/CIC-IDS-2018/CSVs/sk_sr_1.0'
outputroot = dataroot + '_l'
if not os.path.exists(outputroot):
    os.makedirs(outputroot)

In [2]:
def get_immediate_subdirectories(a_dir):
    return [name for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name)) and not 'archive' in name]


def merge(dataroot):
    folders = get_immediate_subdirectories(dataroot)
    print(folders)
    for folder in folders:
        filenames = [i for i in glob.glob(join(dataroot,folder,'*.pcap_Flow.csv'))]
        combined_csv = pd.concat([pd.read_csv(f) for f in filenames],sort=False)
        combined_csv.to_csv(join(dataroot,folder+'_TrafficForML_CICFlowMeter.csv'),index=False,encoding='utf-8-sig')



In [3]:
merge(dataroot)

['Friday-02-03-2018', 'Wednesday-21-02-2018', 'Friday-16-02-2018', 'Thursday-22-02-2018', 'Thursday-15-02-2018', 'Wednesday-14-02-2018', 'Wednesday-28-02-2018', 'Friday-23-02-2018', 'Thursday-01-03-2018', 'Tuesday-20-02-2018']


In [4]:


def label_flows (data, attackers, victims, attack_time, attack_names, check_only_source=False, infiltration=False):
    data['Label']='Benign'
    data['Timestamp'] = pd.to_datetime(data['Timestamp'],format='%d/%m/%Y %H:%M:%S %p')
    #not_flipped=True
    for ttx, attack_name in enumerate(attack_names):
        for attacker in attackers[ttx]:
            for victim in victims[ttx]:
                    print('Labeling {}->{}'.format(attacker,victim))
                    #if ttx == 0 and not_flipped:
                    #    data[idx][0]= order_flowid(record[0])
                    if infiltration:
                        if check_only_source:
                            attacker_flow = (data['Destination IP']==attacker) | (data['Source IP']==victim)
                        else:
                            attacker_flow = (data['Destination IP']==attacker) & (data['Source IP']==victim)
                    else:
                        if check_only_source:
                            attacker_flow = (data['Source IP']==attacker)
                        else:
                            attacker_flow = (data['Source IP']==attacker)&(data['Destination IP']==victim)
                        
                        
                    before = data['Timestamp']>=datetime.strptime(attack_time[ttx][0],'%d/%m/%Y %H:%M %p')
                    after = data['Timestamp']<= datetime.strptime(attack_time[ttx][1], '%d/%m/%Y %H:%M %p')
                            
                    data.loc[attacker_flow & before & after, 'Label'] = attack_name

            #not_flipped=False
    return data

# my rational for labeling bidirectionally is the fact that if the flow started from attacker is
#intrusion the same flowid initiated by victim is also intrusion

# I also saw that using bidirectional labeling I will have 4 flows in Day9
#while using single direction check would give us 3 flows


def label_flows_bidirectionally(data, attackers, victims, attack_time, attack_names):
    data['Label']='Benign'
    data['Timestamp'] = pd.to_datetime(data['Timestamp'],format='%d/%m/%Y %H:%M:%S %p')
    #not_flipped=True
    for ttx, attack_name in enumerate(attack_names):
        for attacker in attackers[ttx]:
            for victim in victims[ttx]:
                    print('Labeling: {}->{}'.format(attacker,victim))
                    #if ttx == 0 and not_flipped:
                    #    data[idx][0]= order_flowid(record[0])
                    attacker_flow1 = (data['Destination IP']==attacker) & (data['Source IP']==victim)
                    attacker_flow2 = (data['Source IP']==attacker)&(data['Destination IP']==victim)
                    attacker_flow = attacker_flow1 | attacker_flow2
                        
                    before = data['Timestamp']>=datetime.strptime(attack_time[ttx][0],'%d/%m/%Y %H:%M %p')
                    after = data['Timestamp']<= datetime.strptime(attack_time[ttx][1], '%d/%m/%Y %H:%M %p')
                            
                    data.loc[attacker_flow & before & after, 'Label'] = attack_name

            #not_flipped=False
    return data


#currently used for saving label_dist
def save_dict_to_csv(filename,d):
    with open(filename,'w') as f:
        for key in sorted(d.keys()):
            f.write('{},{}\n'.format(key,d[key]))

In [5]:
label_dist = defaultdict(lambda:0)

# Day 1: FTP-BruteForce and SSH-BruteForce

In [6]:
data = pd.read_csv(join(dataroot,'Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv'))

attack_names = ['FTP-BruteForce','SSH-BruteForce']

attackers = [['18.221.219.4'], ['13.58.98.64']]
victims = [['172.31.69.25'],['172.31.69.25']]

attack_times = [['14/02/2018 10:32 AM','14/02/2018 12:09 PM'], 
               ['14/02/2018 14:01 PM','14/02/2018 15:31 PM']]

data = label_flows_bidirectionally(data, attackers, victims, attack_times, attack_names)  
data.to_csv(join(outputroot,'Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv'),index=False,encoding='utf-8-sig')
dist = data.Label.value_counts()
print(dist)

for k,v in dist.items():
    label_dist[k]+= v

Labeling: 18.221.219.4->172.31.69.25
Labeling: 13.58.98.64->172.31.69.25
SSH-BruteForce    14146
FTP-BruteForce    14116
Benign             1145
Name: Label, dtype: int64


# Day 2: DoS-GoldenEye AND DoS-Slowloris


In [7]:
data = pd.read_csv(join(dataroot,'Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv'))

attack_names = ['DoS-GoldenEye','DoS-Slowloris']
attackers = [['18.219.211.138'], ['18.217.165.70']]
victims = [['172.31.69.25'],
    ['172.31.69.25']]
attack_times = [['15/02/2018 09:26 AM','15/02/2018 10:09 AM'], 
               ['15/02/2018 10:59 AM','15/02/2018 11:40 AM']]


data = label_flows_bidirectionally(data, attackers,victims, attack_times, attack_names)  
data.to_csv(join(outputroot,'Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv'),index=False,encoding='utf-8-sig')
dist = data.Label.value_counts()
print(dist)
for k,v in dist.items():
    label_dist[k]+= v
    


Labeling: 18.219.211.138->172.31.69.25
Labeling: 18.217.165.70->172.31.69.25
DoS-GoldenEye    14116
DoS-Slowloris     6983
Benign            1416
Name: Label, dtype: int64


# Day 3: DoS-SlowHTTPTest AND DoS-Hulk


In [8]:
data = pd.read_csv(join(dataroot,'Friday-16-02-2018_TrafficForML_CICFlowMeter.csv'))

attack_names = ['DoS-SlowHTTPTest','DoS-Hulk']
attackers = [['13.59.126.31'], ['18.219.193.20']]
victims = [['172.31.69.25'],'172.31.69.25']

attack_times = [['16/02/2018 10:12 AM','16/02/2018 11:08 AM'], 
               ['16/02/2018 13:45 PM','16/02/2018 14:19 PM']]


data = label_flows_bidirectionally(data, attackers, victims, attack_times, attack_names)  
data.to_csv(join(outputroot,'Friday-16-02-2018_TrafficForML_CICFlowMeter.csv'),index=False,encoding='utf-8-sig')
dist = data.Label.value_counts()

print(dist)
for k,v in dist.items():
    label_dist[k]+= v
    


Labeling: 13.59.126.31->172.31.69.25
Labeling: 18.219.193.20->1
Labeling: 18.219.193.20->7
Labeling: 18.219.193.20->2
Labeling: 18.219.193.20->.
Labeling: 18.219.193.20->3
Labeling: 18.219.193.20->1
Labeling: 18.219.193.20->.
Labeling: 18.219.193.20->6
Labeling: 18.219.193.20->9
Labeling: 18.219.193.20->.
Labeling: 18.219.193.20->2
Labeling: 18.219.193.20->5
Benign              15322
DoS-SlowHTTPTest    14116
Name: Label, dtype: int64


#  Day 4: DDoS attacks-LOIC-HTTP AND DDoS-LOIC-UDP



In [9]:
data = pd.read_csv(join(dataroot,'Tuesday-20-02-2018_TrafficForML_CICFlowMeter.csv'))

attack_names = ['DDoS attacks-LOIC-HTTP','DDoS-LOIC-UDP']
attackers = [['18.218.115.60',
                    '18.219.9.1',
                    '18.219.32.43',
                    '18.218.55.126',
                    '52.14.136.135',
                    '18.219.5.43',
                    '18.216.200.189',
                    '18.218.229.235',
                    '18.218.11.51',
                    '18.216.24.42'], 
                 ['18.218.115.60',
                    '18.219.9.1',
                    '18.219.32.43',
                    '18.218.55.126',
                    '52.14.136.135',
                    '18.219.5.43',
                    '18.216.200.189',
                    '18.218.229.235',
                    '18.218.11.51',
                    '18.216.24.42']]
victims = [['172.31.69.25'],
['172.31.69.25']]

attack_times = [['20/02/2018 10:12 AM','20/02/2018 11:17 AM'], 
               ['20/02/2018 13:13 PM','20/02/2018 13:32 PM']]

data = label_flows_bidirectionally(data, attackers, victims, attack_times, attack_names)  
data.to_csv(join(outputroot,'Tuesday-20-02-2018_TrafficForML_CICFlowMeter.csv'),index=False,encoding='utf-8-sig')

dist = data.Label.value_counts()
print(dist)
for k,v in dist.items():
    label_dist[k]+= v



Labeling: 18.218.115.60->172.31.69.25
Labeling: 18.219.9.1->172.31.69.25
Labeling: 18.219.32.43->172.31.69.25
Labeling: 18.218.55.126->172.31.69.25
Labeling: 52.14.136.135->172.31.69.25
Labeling: 18.219.5.43->172.31.69.25
Labeling: 18.216.200.189->172.31.69.25
Labeling: 18.218.229.235->172.31.69.25
Labeling: 18.218.11.51->172.31.69.25
Labeling: 18.216.24.42->172.31.69.25
Labeling: 18.218.115.60->172.31.69.25
Labeling: 18.219.9.1->172.31.69.25
Labeling: 18.219.32.43->172.31.69.25
Labeling: 18.218.55.126->172.31.69.25
Labeling: 52.14.136.135->172.31.69.25
Labeling: 18.219.5.43->172.31.69.25
Labeling: 18.216.200.189->172.31.69.25
Labeling: 18.218.229.235->172.31.69.25
Labeling: 18.218.11.51->172.31.69.25
Labeling: 18.216.24.42->172.31.69.25
DDoS attacks-LOIC-HTTP    163439
Benign                       604
DDoS-LOIC-UDP                100
Name: Label, dtype: int64


#  Day 5: DDOS attack-HOIC AND DDoS-LOIC-UDP


In [10]:
data = pd.read_csv(join(dataroot,'Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv'))

attack_names = ['DDoS-LOIC-UDP','DDoS-HOIC']
attackers = [['18.218.115.60',
                    '18.219.9.1',
                    '18.219.32.43',
                    '18.218.55.126',
                    '52.14.136.135',
                    '18.219.5.43',
                    '18.216.200.189',
                    '18.218.229.235',
                    '18.218.11.51',
                    '18.216.24.42'], 
                 ['18.218.115.60',
                    '18.219.9.1',
                    '18.219.32.43',
                    '18.218.55.126',
                    '52.14.136.135',
                    '18.219.5.43',
                    '18.216.200.189',
                    '18.218.229.235',
                    '18.218.11.51',
                    '18.216.24.42']]
victims = [['172.31.69.28'],['172.31.69.28']]
attack_times = [['21/02/2018 10:09 AM','21/02/2018 10:43 AM'], 
               ['21/02/2018 14:05 PM','21/02/2018 15:05 PM']]

data = label_flows_bidirectionally(data, attackers,victims, attack_times, attack_names)  
data.to_csv(join(outputroot,'Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv'),index=False,encoding='utf-8-sig')
dist = data.Label.value_counts()
print(dist)
for k,v in dist.items():
    label_dist[k]+= v
    


Labeling: 18.218.115.60->172.31.69.28
Labeling: 18.219.9.1->172.31.69.28
Labeling: 18.219.32.43->172.31.69.28
Labeling: 18.218.55.126->172.31.69.28
Labeling: 52.14.136.135->172.31.69.28
Labeling: 18.219.5.43->172.31.69.28
Labeling: 18.216.200.189->172.31.69.28
Labeling: 18.218.229.235->172.31.69.28
Labeling: 18.218.11.51->172.31.69.28
Labeling: 18.216.24.42->172.31.69.28
Labeling: 18.218.115.60->172.31.69.28
Labeling: 18.219.9.1->172.31.69.28
Labeling: 18.219.32.43->172.31.69.28
Labeling: 18.218.55.126->172.31.69.28
Labeling: 52.14.136.135->172.31.69.28
Labeling: 18.219.5.43->172.31.69.28
Labeling: 18.216.200.189->172.31.69.28
Labeling: 18.218.229.235->172.31.69.28
Labeling: 18.218.11.51->172.31.69.28
Labeling: 18.216.24.42->172.31.69.28
DDoS-HOIC        163750
Benign              878
DDoS-LOIC-UDP        94
Name: Label, dtype: int64


#  Day 6: Brute Force -Web AND Brute Force -XSS AND SQL Injection


# Brute Force -Web

In [11]:
data = pd.read_csv(join(dataroot,'Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv'))

attack_names = ['Brute Force-Web','Brute Force-XSS','SQL Injection']
attackers = [['18.218.115.60'], 
                 ['18.218.115.60'],
                 ['18.218.115.60']]
victims = [['172.31.69.28'],
          ['172.31.69.28'],
          ['172.31.69.28']]

attack_times = [['22/02/2018 10:17 AM','22/02/2018 11:24 AM'], 
               ['22/02/2018 13:50 PM','22/02/2018 14:29 PM'],
               ['22/02/2018 16:15 PM','22/02/2018 16:29 PM']]

data = label_flows_bidirectionally(data, attackers, victims, attack_times, attack_names)  
data.to_csv(join(outputroot,'Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv'),index=False,encoding='utf-8-sig')
dist = data.Label.value_counts()
print(dist)
for k,v in dist.items():
    label_dist[k]+= v



Labeling: 18.218.115.60->172.31.69.28
Labeling: 18.218.115.60->172.31.69.28
Labeling: 18.218.115.60->172.31.69.28
Benign             1164
Brute Force-Web     137
Brute Force-XSS      42
SQL Injection        17
Name: Label, dtype: int64


In [12]:
#data[(data['Source IP']=='18.218.115.60') & (data['Destination IP']=='172.31.69.28') ].sort_values(by=['Timestamp'])

#  Day 7: Brute Force -Web AND Brute Force -XSS AND SQL Injection


In [13]:
data = pd.read_csv(join(dataroot,'Friday-23-02-2018_TrafficForML_CICFlowMeter.csv'))

attack_names = ['Brute Force-Web','Brute Force-XSS','SQL Injection']
attackers = [['18.218.115.60'], 
                 ['18.218.115.60'],
                 ['18.218.115.60']]

attack_times = [['23/02/2018 10:03 AM','23/02/2018 11:03 AM'], 
               ['23/02/2018 13:00 PM','23/02/2018 14:10 PM'],
               ['23/02/2018 15:05 PM','23/02/2018 15:18 PM']]

data = label_flows_bidirectionally(data, attackers, victims, attack_times, attack_names)  
data.to_csv(join(outputroot,'Friday-23-02-2018_TrafficForML_CICFlowMeter.csv'),index=False,encoding='utf-8-sig')
dist = data.Label.value_counts()
print(dist)
for k,v in dist.items():
    label_dist[k]+= v



Labeling: 18.218.115.60->172.31.69.28
Labeling: 18.218.115.60->172.31.69.28
Labeling: 18.218.115.60->172.31.69.28
Benign             975
Brute Force-Web    123
Brute Force-XSS     72
SQL Injection       31
Name: Label, dtype: int64


#  Day 8: Infiltration

In [14]:
data = pd.read_csv(join(dataroot,'Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv'))

attack_names = ['Infiltration','Infiltration']
attackers = [['13.58.225.34'], 
                 ['13.58.225.34']]
victims = [['172.31.69.24'],
    ['172.31.69.24']]

attack_times = [['28/02/2018 10:50 AM','28/02/2018 12:05 PM'], 
               ['28/02/2018 13:42 PM','28/02/2018 14:40 PM']]

data = label_flows_bidirectionally(data, attackers, victims, attack_times, attack_names)  
data.to_csv(join(outputroot,'Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv'),index=False,encoding='utf-8-sig')
dist = data.Label.value_counts()
print(dist)

for k,v in dist.items():
    label_dist[k]+= v
    


Labeling: 13.58.225.34->172.31.69.24
Labeling: 13.58.225.34->172.31.69.24
Benign          197959
Infiltration         1
Name: Label, dtype: int64


#  Day 9: Infiltration

In [15]:
data = pd.read_csv(join(dataroot,'Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv'))

attack_names = ['Infiltration','Infiltration']
attackers = [['13.58.225.34'], 
                 ['13.58.225.34']]
victims = [['172.31.69.13'],
              ['172.31.69.13']]
attack_times = [['01/03/2018 09:57 AM','01/03/2018 10:55 AM'], 
               ['01/03/2018 14:00 PM','01/03/2018 15:37 PM']]


data = label_flows_bidirectionally(data, attackers, victims, attack_times, attack_names)  
data.to_csv(join(outputroot,'Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv'),index=False,encoding='utf-8-sig')

dist = data.Label.value_counts()
print(dist)
for k,v in dist.items():
    label_dist[k]+= v
    
save_dict_to_csv(join(outputroot,'label_dist.csv'),label_dist)

Labeling: 13.58.225.34->172.31.69.13
Labeling: 13.58.225.34->172.31.69.13
Benign          166639
Infiltration         4
Name: Label, dtype: int64
