Quick comparison of the two different datasets we have -- DDOS and regular traffic. Note that `ddostrace.pcap.csv` is on the Google Drive -- to get `equinix-chicago.pcap.csv`, download from GDrive and run `convert_all_pcaps` on it.

In [1]:
import pandas as pd
import numpy as np
import csv as csv
import glob
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Import both the ddos and csv traffic
ddos_df = pd.read_csv('resources/ddostrace.pcap.csv')
regular_df = pd.read_csv('resources/equinix-chicago.pcap.csv')

In [7]:
print("DDOS pcap shape: ", ddos_df.shape)
print("Regular pcap shape: ", regular_df.shape)

DDOS pcap shape:  (166448, 13)
Regular pcap shape:  (5357522, 13)


In [8]:
ddos_df.head()

Unnamed: 0,frame.number,frame.time,ip.src,ip.dst,ip.proto,frame.len,_ws.col.Info,syn,ack,fin,push,urgent,unreachable
0,1,"Aug 4, 2007 15:49:36.487629000 CDT",202.1.175.252,71.126.222.64,1,60,"Echo (ping) request id=0xce1d, seq=1280/5, tt...",0,0,0,0,0,0
1,2,"Aug 4, 2007 15:49:36.489552000 CDT",192.120.148.227,71.126.222.64,1,60,"Echo (ping) request id=0x0200, seq=1280/5, tt...",0,0,0,0,0,0
2,3,"Aug 4, 2007 15:49:36.491812000 CDT",51.81.166.201,71.126.222.64,1,60,"Echo (ping) request id=0xef41, seq=1280/5, tt...",0,0,0,0,0,0
3,4,"Aug 4, 2007 15:49:36.492189000 CDT",192.95.27.190,71.126.222.64,1,60,"Echo (ping) request id=0xc495, seq=1280/5, tt...",0,0,0,0,0,0
4,5,"Aug 4, 2007 15:49:36.496475000 CDT",51.173.229.255,71.126.222.64,1,60,"Echo (ping) request id=0x0200, seq=1280/5, tt...",0,0,0,0,0,0


In [9]:
regular_df.head()

Unnamed: 0,frame.number,frame.time,ip.src,ip.dst,ip.proto,frame.len,_ws.col.Info,syn,ack,fin,push,urgent,unreachable
0,1,"Sep 17, 2015 09:01:00.000002000 CDT",96.156.169.251,215.158.239.74,6,1504,443→41813 [ACK] Seq=1 Ack=1 Win=1320 Len=1448 ...,0,1,0,0,0,0
1,2,"Sep 17, 2015 09:01:00.000003000 CDT",131.144.114.218,215.158.238.134,6,56,44365→443 [ACK] Seq=1 Ack=1 Win=32575 Len=0 TS...,0,1,0,0,0,0
2,3,"Sep 17, 2015 09:01:00.000009000 CDT",8.240.117.27,133.178.226.49,6,56,58476→443 [ACK] Seq=1 Ack=1 Win=10353 Len=0 TS...,0,1,0,0,0,0
3,4,"Sep 17, 2015 09:01:00.000010000 CDT",93.210.14.111,8.9.165.184,6,50,51841→443 [ACK] Seq=1 Ack=1 Win=3252 Len=0,0,1,0,0,0,0
4,5,"Sep 17, 2015 09:01:00.000011000 CDT",8.240.117.27,133.178.226.49,6,56,58476→443 [ACK] Seq=1 Ack=2897 Win=10534 Len=0...,0,1,0,0,0,0


In [10]:
# Extract additional features
flags = ['syn', 'ack', 'fin', 'push', 'urgent', 'unreachable']

def extract_feature(dt, feature):
    if feature in dt.lower():
        return 1
    else:
        return 0

for flag in flags:
    ddos_df[flag] = np.vectorize(extract_feature)(ddos_df['_ws.col.Info'], flag)
    regular_df[flag] = np.vectorize(extract_feature)(regular_df['_ws.col.Info'], flag)

In [11]:
# Specifying protocol for later calculation
def protocol(dt, index):
    if dt == index:
        return 1
    else:
        return 0

ddos_df.insert(5, 'tcp', np.vectorize(protocol)(ddos_df['ip.proto'], 6))
ddos_df.insert(6, 'icmp', np.vectorize(protocol)(ddos_df['ip.proto'], 1))
regular_df.insert(5, 'tcp', np.vectorize(protocol)(regular_df['ip.proto'], 6))
regular_df.insert(6, 'icmp', np.vectorize(protocol)(regular_df['ip.proto'], 1))

In [12]:
def time(dt):
    date_and_time = dt.split('.')
    return datetime.strptime(date_and_time[0], "%b %d, %Y %H:%M:%S")

ddos_df.insert(2, 'time', ddos_df['frame.time'].map(time))
regular_df.insert(2, 'time', regular_df['frame.time'].map(time))

In [13]:
def add_label(dt, label):
    dt['label'] = label

ddos_df['label'] = 'attack'
regular_df['label'] = 'regular'

In [14]:
ddos_df.describe()

Unnamed: 0,frame.number,ip.proto,tcp,icmp,frame.len,syn,ack,fin,push,urgent,unreachable
count,166448.0,166448.0,166448.0,166448.0,166448.0,166448.0,166448.0,166448.0,166448,166448,166448.0
mean,83224.5,3.124886,0.424919,0.575062,222.729832,0.056258,0.438984,0.038096,0,0,0.016293
std,48049.54314,2.472347,0.494332,0.494335,420.838099,0.23042,0.496265,0.191429,0,0,0.126602
min,1.0,1.0,0.0,0.0,40.0,0.0,0.0,0.0,0,0,0.0
25%,41612.75,1.0,0.0,0.0,60.0,0.0,0.0,0.0,0,0,0.0
50%,83224.5,1.0,0.0,1.0,60.0,0.0,0.0,0.0,0,0,0.0
75%,124836.25,6.0,1.0,1.0,60.0,0.0,1.0,0.0,0,0,0.0
max,166448.0,17.0,1.0,1.0,1500.0,1.0,1.0,1.0,0,0,1.0


In [15]:
regular_df.describe()

Unnamed: 0,frame.number,ip.proto,tcp,icmp,frame.len,syn,ack,fin,push,urgent,unreachable
count,5357522.0,5083149.0,5357522.0,5357522.0,5357522.0,5357522.0,5357522.0,5357522.0,5357522,5357522,5357522.0
mean,2678761.5,7.468836,0.830506,0.001411,752.761653,0.014041,0.987306,0.00967,0,0,0.000678
std,1546583.528782,4.410471,0.375188,0.037541,682.367336,0.11766,0.111952,0.097858,0,0,0.026032
min,1.0,1.0,0.0,0.0,46.0,0.0,0.0,0.0,0,0,0.0
25%,1339381.25,6.0,1.0,0.0,56.0,0.0,1.0,0.0,0,0,0.0
50%,2678761.5,6.0,1.0,0.0,490.0,0.0,1.0,0.0,0,0,0.0
75%,4018141.75,6.0,1.0,0.0,1496.0,0.0,1.0,0.0,0,0,0.0
max,5357522.0,50.0,1.0,1.0,1504.0,1.0,1.0,1.0,0,0,1.0


In [16]:
ddos_df.head()

Unnamed: 0,frame.number,frame.time,time,ip.src,ip.dst,ip.proto,tcp,icmp,frame.len,_ws.col.Info,syn,ack,fin,push,urgent,unreachable,label
0,1,"Aug 4, 2007 15:49:36.487629000 CDT",2007-08-04 15:49:36,202.1.175.252,71.126.222.64,1,0,1,60,"Echo (ping) request id=0xce1d, seq=1280/5, tt...",0,0,0,0,0,0,attack
1,2,"Aug 4, 2007 15:49:36.489552000 CDT",2007-08-04 15:49:36,192.120.148.227,71.126.222.64,1,0,1,60,"Echo (ping) request id=0x0200, seq=1280/5, tt...",0,0,0,0,0,0,attack
2,3,"Aug 4, 2007 15:49:36.491812000 CDT",2007-08-04 15:49:36,51.81.166.201,71.126.222.64,1,0,1,60,"Echo (ping) request id=0xef41, seq=1280/5, tt...",0,0,0,0,0,0,attack
3,4,"Aug 4, 2007 15:49:36.492189000 CDT",2007-08-04 15:49:36,192.95.27.190,71.126.222.64,1,0,1,60,"Echo (ping) request id=0xc495, seq=1280/5, tt...",0,0,0,0,0,0,attack
4,5,"Aug 4, 2007 15:49:36.496475000 CDT",2007-08-04 15:49:36,51.173.229.255,71.126.222.64,1,0,1,60,"Echo (ping) request id=0x0200, seq=1280/5, tt...",0,0,0,0,0,0,attack


In [17]:
regular_df.head()

Unnamed: 0,frame.number,frame.time,time,ip.src,ip.dst,ip.proto,tcp,icmp,frame.len,_ws.col.Info,syn,ack,fin,push,urgent,unreachable,label
0,1,"Sep 17, 2015 09:01:00.000002000 CDT",2015-09-17 09:01:00,96.156.169.251,215.158.239.74,6,1,0,1504,443→41813 [ACK] Seq=1 Ack=1 Win=1320 Len=1448 ...,0,1,0,0,0,0,regular
1,2,"Sep 17, 2015 09:01:00.000003000 CDT",2015-09-17 09:01:00,131.144.114.218,215.158.238.134,6,1,0,56,44365→443 [ACK] Seq=1 Ack=1 Win=32575 Len=0 TS...,0,1,0,0,0,0,regular
2,3,"Sep 17, 2015 09:01:00.000009000 CDT",2015-09-17 09:01:00,8.240.117.27,133.178.226.49,6,1,0,56,58476→443 [ACK] Seq=1 Ack=1 Win=10353 Len=0 TS...,0,1,0,0,0,0,regular
3,4,"Sep 17, 2015 09:01:00.000010000 CDT",2015-09-17 09:01:00,93.210.14.111,8.9.165.184,6,1,0,50,51841→443 [ACK] Seq=1 Ack=1 Win=3252 Len=0,0,1,0,0,0,0,regular
4,5,"Sep 17, 2015 09:01:00.000011000 CDT",2015-09-17 09:01:00,8.240.117.27,133.178.226.49,6,1,0,56,58476→443 [ACK] Seq=1 Ack=2897 Win=10534 Len=0...,0,1,0,0,0,0,regular
