This notebook will extract information from all of our DDOS csv files and place information into a large samples/features matrix. The matrix will look something like:
```
SOURCE_IP | VICTIM_IP | <Protocol>TRAFFIC_FROM_SRC_TO_DEST ...
```
Where `<Protocol>` can be ICMP, UDP, TCP, and a few others. 

In [123]:
import pandas as pd 
import numpy as np 
import csv as csv 
import glob 
import pickle
from datetime import datetime 
import matplotlib.pyplot as plt 
%matplotlib inline

In [124]:
victimIP = "71.126.222.64"
to_drop = ['<->', 'total_bytes', 'total_frames']
names = ['src', '<->', 'dest', 'in_frames', 'in_bytes', 'out_frames', 'out_bytes', 'total_frames', 'total_bytes', 'rel_start', 'duration']

In [127]:
# Begin dataframe by importing the total data, which will have all IPs
protocol_names = ['total' + "_" + name for name in names]
print(protocol_names)
protocol_to_drop = ['total' + "_" + name for name in to_drop]
ddos_df = pd.read_csv('resources/smallddos_total.tshark.csv', header=None, skiprows=5, names=protocol_names)
ddos_df.drop(ddos_df.tail(1).index)
ddos_df.drop(protocol_to_drop, axis=1)
ddos_df = ddos_df[ddos_df.total_src != victimIP]

['total_src', 'total_<->', 'total_dest', 'total_in_frames', 'total_in_bytes', 'total_out_frames', 'total_out_bytes', 'total_total_frames', 'total_total_bytes', 'total_rel_start', 'total_duration']


In [115]:
ddos_files = glob.glob('resources/smallddos*')
for i in range(len(ddos_files)):
    protocol = ddos_files[i][13:-11] # e.g., icmp, tcp, total, etc
    if protocol == 'total':
        pass
    else:
        # Prefix the column names with the name of the protocol, e.g.:
        #     icmp_src icmp_dest icmp_out_bytes ...
        protocol_names = [protocol + "_" + name for name in names]
        protocol_to_drop = [protocol + "_" + col for col in to_drop]
        
        # Read the csv
        df = pd.read_csv(ddos_files[i], header=None, skiprows=5, names=protocol_names)
        
        # Drop rows we don't need, including those where the src IP is the victim
        df.drop(df.tail(1).index)
        # TODO: Change so that pairs where victimIP is src is labelled
        # victim_response instead of being dropped.
        df = df[df[protocol + "_src"] != victimIP]
        df = df.drop(protocol_to_drop, axis=1)
        
        protocol_names.remove(protocol + "_src")
        protocol_names.remove(protocol + "_<->")
        protocol_names.remove(protocol + "_dest")
        protocol_names.remove(protocol + "_in_frames")
        protocol_names.remove(protocol + "_in_bytes")
        protocol_names.remove(protocol + "_total_frames")
        protocol_names.remove(protocol + "_total_bytes")
        # Now add information in df to ddos_df
        for name in protocol_names:
            ddos_df[name] = 0
        for row_index, row in df.iterrows():
            # Find row index in ddos_df of source IP from df
            source_IP = row[protocol + '_src']
            idx = ddos_df.loc[ddos_df['total_src'] == row[source_IP]].index
            # Then add information to ddos_df
            for name in protocol_names:
                ddos_df.loc[idx, name[len(protocol):]] = row[name]

In [38]:
x = ddos_df.loc[ddos_df['total_src'] == "195.52.240.138"].index
ddos_df.loc[x, 'src'] = 34
print(ddos_df.loc[x, 'src'])
print(x)

2    34
Name: src, dtype: int64
Int64Index([2], dtype='int64')


In [121]:
ddos_df.columns.values

array(['total_src', 'total_<->', 'total_dest', 'total_in_frames',
       'total_in_bytes', 'total_out_frames', 'total_out_bytes',
       'total_total_frames', 'total_total_bytes', 'total_rel_start',
       'total_duration', 'icmp_out_frames', 'icmp_out_bytes',
       'icmp_rel_start', 'icmp_duration', '_out_frames', '_out_bytes',
       '_rel_start', '_duration', 'tcp_out_frames', 'tcp_out_bytes',
       'tcp_rel_start', 'tcp_duration', 'tcpPORTS_out_frames',
       'tcpPORTS_out_bytes', 'tcpPORTS_rel_start', 'tcpPORTS_duration',
       'with_tcpsyn_out_frames', 'with_tcpsyn_out_bytes',
       'with_tcpsyn_rel_start', 'with_tcpsyn_duration'], dtype=object)

In [13]:
# Verifying that all src IPs are distinct...
def anydup(thelist):
  seen = set()
  for x in thelist:
    if x in seen: return True
    seen.add(x)
  return False

print(anydup(ddos_df.total_src))

False


In [18]:
type(ddos_df["total_src"])

pandas.core.series.Series

In [117]:
ddos_df.describe()

Unnamed: 0,total_in_frames,total_in_bytes,total_out_frames,total_out_bytes,total_total_frames,total_total_bytes,total_rel_start,total_duration,icmp_out_frames,icmp_out_bytes,...,tcp_rel_start,tcp_duration,tcpPORTS_out_frames,tcpPORTS_out_bytes,tcpPORTS_rel_start,tcpPORTS_duration,with_tcpsyn_out_frames,with_tcpsyn_out_bytes,with_tcpsyn_rel_start,with_tcpsyn_duration
count,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4849.0,4850,4850,...,4850,4850,4850,4850,4850,4850,4850,4850,4850,4850
mean,0.573314,49.988864,14.869251,877.641988,15.442565,927.630852,0.05629,0.402298,0,0,...,0,0,0,0,0,0,0,0,0,0
std,0.633594,55.10291,8.2799,473.179998,8.235602,470.304515,0.086616,0.131937,0,0,...,0,0,0,0,0,0,0,0,0,0
min,0.0,0.0,1.0,48.0,1.0,48.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
25%,0.0,0.0,9.0,540.0,10.0,600.0,0.009707,0.3874,0,0,...,0,0,0,0,0,0,0,0,0,0
50%,1.0,88.0,15.0,900.0,15.0,928.0,0.025188,0.4537,0,0,...,0,0,0,0,0,0,0,0,0,0
75%,1.0,88.0,20.0,1164.0,20.0,1228.0,0.058776,0.4842,0,0,...,0,0,0,0,0,0,0,0,0,0
max,6.0,528.0,74.0,3996.0,75.0,4072.0,0.512105,0.5145,0,0,...,0,0,0,0,0,0,0,0,0,0
