This notebook will extract information from all of our DDOS csv files and place information into a large samples/features matrix. The matrix will look something like:
```
SOURCE_IP | VICTIM_IP | <Protocol>TRAFFIC_FROM_SRC_TO_DEST ...
```
Where `<Protocol>` can be ICMP, UDP, TCP, and a few others. 

In [261]:
import pandas as pd 
import numpy as np 
import csv as csv 
import glob 
import pickle
from datetime import datetime 
import matplotlib.pyplot as plt 
%matplotlib inline

In [262]:
victimIP = "71.126.222.64"
to_drop = ['<->', 'rel_start']
names = ['src', '<->', 'dest', 'in_frames', 'in_bytes', 'out_frames', 'out_bytes', 'total_frames', 'total_bytes', 'rel_start', 'duration']

In [263]:
# Begin dataframe by importing the total data, which will have all IPs
protocol_names = ['total' + "_" + name for name in names]
print(protocol_names)
protocol_to_drop = ['total' + "_" + name for name in to_drop]
ddos_df = pd.read_csv('resources/smallddos_total.tshark.csv', header=None, skiprows=5, names=protocol_names)
ddos_df['label'] = 'ddos'
ddos_df = ddos_df.drop(ddos_df.tail(1).index)
ddos_df = ddos_df.drop(protocol_to_drop, axis=1)
ddos_df = ddos_df[ddos_df.total_src != victimIP]

['total_src', 'total_<->', 'total_dest', 'total_in_frames', 'total_in_bytes', 'total_out_frames', 'total_out_bytes', 'total_total_frames', 'total_total_bytes', 'total_rel_start', 'total_duration']


In [265]:
ddos_files = glob.glob('resources/smallddos*')
for i in range(len(ddos_files)):
    protocol = ddos_files[i][20:-11] # e.g., icmp, tcp, total, etc
    print("PROTOCOL : {0}".format(protocol))
    if protocol == 'total' or protocol == 'tcpPORTS':
        pass
    else:
        # Prefix the column names with the name of the protocol, e.g.:
        #     icmp_src icmp_dest icmp_out_bytes ...
        protocol_names = [protocol + "_" + name for name in names]
        protocol_to_drop = [protocol + "_" + col for col in to_drop]
        
        # Read the csv
        df = pd.read_csv(ddos_files[i], header=None, skiprows=5, names=protocol_names)
        
        # Drop rows we don't need, including those where the src IP is the victim
        df = df.drop(df.tail(1).index)
        # TODO: Change so that pairs where victimIP is src is labelled
        # victim_response instead of being dropped.
        df = df[df[protocol + "_src"] != victimIP]
        df = df.drop(protocol_to_drop, axis=1)
        
        protocol_names.remove(protocol + "_src")
        protocol_names.remove(protocol + "_<->")
        protocol_names.remove(protocol + "_dest")
        protocol_names.remove(protocol + "_rel_start")
        protocol_names.remove(protocol + "_in_frames")
        protocol_names.remove(protocol + "_in_bytes")
        protocol_names.remove(protocol + "_total_frames")
        protocol_names.remove(protocol + "_total_bytes")
           
        for protocol_name in protocol_names:
            ddos_df[protocol_name] = 0
        
        for row_index, row in df.iterrows():
            protocol_source_ip = row[protocol + '_src']
            idx = ddos_df.loc[ddos_df['total_src'] == protocol_source_ip].index
            for protocol_name in protocol_names:
                ddos_df.loc[idx[0], protocol_name] = row[protocol_name]

PROTOCOL : icmp
PROTOCOL : tcp
PROTOCOL : tcpPORTS
PROTOCOL : total
PROTOCOL : with_tcpsyn


In [248]:
# Pickle the data for later use:
with open('ddos_df_pickled.pickle', 'wb') as handle:
    pickle.dump(ddos_df, handle)

In [221]:
# Now let's import Carlos' tcpPORTS data and add it as a feature.
with open('ddos_tcpPORTS.pickle', 'rb') as handle:
    ddos_ports_dict = pickle.load(handle)
with open('passive_tcpPORTS.pickle', 'rb') as handle:
    passive_ports_dict = pickle.load(handle)

In [247]:
# Need to create a new column in the dataframe which contains the
# number of ports used to attack the victim
# 1. Create column called 'num_ports_used' filled with zeros
# 2. For each value-pair in the dictionary, find the source IP. Then use
#    this to index into the dataframe.

ddos_df['tcp_ports_used'] = 0
for ip_pair in ddos_ports_dict:
    idx = ddos_df.loc[ddos_df['total_src'] == ip_pair[0]].index
    try:
        ddos_df.loc[idx[0], 'tcp_ports_used'] = len(ddos_ports_dict[ip_pair])
    except:
        print("Didn't find!")

Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!
Didn't find!

### Converting Carlos' dictionary into passive_df

In [251]:
with open('passive_df_pickled.pickle', 'rb') as handle:
    passive_dict = pickle.load(handle)

In [253]:
len(passive_dict)

26675

In [256]:
passive_dict
    

defaultdict(list,
            {0: [],
             ('76.137.117.236',
              '136.25.188.239'): [['tcp',
               266.9352807856963,
               5.036514731805591], ['total',
               266.9352807856963,
               5.036514731805591]],
             ('109.50.181.214', '8.182.225.124'): [['tcp', 735000.0, 10000.0],
              ['total', 735000.0, 10000.0],
              ['with_tcpack', 735000.0, 10000.0]],
             ('194.237.11.86',
              '136.25.188.198'): [['tcp',
               5059.862610402356,
               31.403336604514234], ['total',
               5059.862610402356,
               31.403336604514234], ['with_tcpack',
               5059.862610402356,
               31.403336604514234], ['with_tcppush',
               5305.570578691184,
               13.52082206598161]],
             ('241.75.160.2',
              '221.102.113.250'): [['tcp',
               1551.3307984790874,
               10.139416983523446], ['total',
               

### Miscellaneo

In [218]:
# Verifying that all src IPs are distinct...
def anydup(thelist):
  seen = set()
  for x in thelist:
    if x in seen: return True
    seen.add(x)
  return False

print(anydup(ddos_df.total_src))

False


In [225]:
print(anydup(ddos_ports_dict))

False


In [242]:
ddos_df.head()

Unnamed: 0,total_src,total_dest,total_in_frames,total_in_bytes,total_out_frames,total_out_bytes,total_rel_start,total_duration,label,icmp_out_frames,...,icmp_duration,tcp_out_frames,tcp_out_bytes,tcp_rel_start,tcp_duration,with_tcpsyn_out_frames,with_tcpsyn_out_bytes,with_tcpsyn_rel_start,with_tcpsyn_duration,tcp_ports_used
1,195.216.1.86,71.126.222.64,1,76,74,3996,0.006975,0.4997,ddos,37,...,0.4714,37,1776,0.006975,0.4801,37,1776,0.006975,0.4801,37
2,195.52.240.138,71.126.222.64,1,76,60,3276,0.038455,0.4708,ddos,33,...,0.4708,27,1296,0.038496,0.4707,27,1296,0.038496,0.4707,27
4,209.210.127.200,71.126.222.64,0,0,59,3420,0.007926,0.4152,ddos,49,...,0.4152,10,480,0.018872,0.3901,10,480,0.018872,0.3901,10
5,167.46.3.209,71.126.222.64,1,88,55,2964,0.000971,0.5058,ddos,27,...,0.5058,28,1344,0.003696,0.4656,28,1344,0.003696,0.4656,23
6,217.238.32.123,71.126.222.64,1,76,53,2856,0.012702,0.4925,ddos,26,...,0.4766,27,1296,0.012702,0.481,27,1296,0.012702,0.481,23


In [254]:
ddos_df.describe()

Unnamed: 0,total_in_frames,total_in_bytes,total_out_frames,total_out_bytes,total_rel_start,total_duration,icmp_out_frames,icmp_out_bytes,icmp_rel_start,icmp_duration,tcp_out_frames,tcp_out_bytes,tcp_rel_start,tcp_duration,with_tcpsyn_out_frames,with_tcpsyn_out_bytes,with_tcpsyn_rel_start,with_tcpsyn_duration,tcp_ports_used
count,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0
mean,0.573314,49.988864,14.869251,877.641988,0.05629,0.402298,13.65993,819.595793,0.058103,0.396163,1.209322,58.046195,0.010068,0.037757,1.209322,58.046195,0.010068,0.037757,1.152403
std,0.633594,55.10291,8.2799,473.179998,0.086616,0.131937,7.103348,426.200887,0.088232,0.136961,4.139994,198.717998,0.047915,0.121305,4.139994,198.717998,0.047915,0.121305,3.917159
min,0.0,0.0,1.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,9.0,540.0,0.009707,0.3874,9.0,540.0,0.010258,0.3785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,88.0,15.0,900.0,0.025188,0.4537,14.0,840.0,0.026212,0.4516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,88.0,20.0,1164.0,0.058776,0.4842,19.0,1140.0,0.060962,0.4834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6.0,528.0,74.0,3996.0,0.512105,0.5145,51.0,3060.0,0.512116,0.5145,38.0,1824.0,0.512105,0.5101,38.0,1824.0,0.512105,0.5101,37.0
