This notebook extracts information from all of our DDoS CSV files and places it into one large samples/features matrix. The matrix will look something like:
```
    SOURCE_IP | DEST_IP | <Protocol>TRAFFIC_FROM_SRC_TO_DEST
```
Where `<Protocol>` can be ICMP, UDP, TCP, and a few others.

In [1]:
import pandas as pd 
import numpy as np 
import csv as csv 
import glob 
import pickle
from datetime import datetime 
import matplotlib.pyplot as plt 

In [2]:
# Column labels applied to every CSV loaded below (the files are read with
# header=None, so these names are positional).
names = [
    'src', '<->', 'dest',
    'in_frames', 'in_bytes',
    'out_frames', 'out_bytes',
    'total_frames', 'total_bytes',
    'rel_start', 'duration',
]

# Columns discarded from every frame right after loading.
to_drop = ['<->', 'total_bytes', 'total_frames']

In [3]:
# Seed the feature matrix from the "total" capture, which is expected to
# contain every IP. Its columns get a "total_" prefix so they stay distinct
# from the per-protocol columns added later.
protocol_names = ["total_{0}".format(column) for column in names]
print(protocol_names)
protocol_to_drop = ["total_{0}".format(column) for column in to_drop]

# skiprows=5 skips the leading lines of the export (presumably the tshark
# report header - confirm against the raw file).
passive_df = pd.read_csv(
    'resources/smallpassive_total.tshark.csv',
    header=None,
    skiprows=5,
    names=protocol_names,
)
passive_df['label'] = 'passive'

# Remove the final row (presumably the report footer - confirm) and then the
# columns that are not used as features.
last_row_index = passive_df.tail(1).index
passive_df = passive_df.drop(last_row_index).drop(protocol_to_drop, axis=1)

['total_src', 'total_<->', 'total_dest', 'total_in_frames', 'total_in_bytes', 'total_out_frames', 'total_out_bytes', 'total_total_frames', 'total_total_bytes', 'total_rel_start', 'total_duration']


In [4]:
def merge_protocol_features(target_df, protocol_df, src_column, feature_columns,
                            target_src_column='total_src'):
    """Copy per-protocol feature columns onto ``target_df``, matched by source IP.

    Vectorised replacement for a row-by-row ``iterrows()`` / ``.loc`` loop
    (the saved output of that version ends in ``KeyboardInterrupt``). The
    matching rules of the loop are preserved:

    * every feature column is (re)created on ``target_df`` filled with 0;
    * if a source IP occurs several times in ``protocol_df``, the LAST row wins;
    * values are written only to the FIRST row of ``target_df`` with that source.

    Unlike the loop, a source IP that is absent from ``target_df`` is skipped
    instead of raising ``IndexError``.

    NOTE(review): rows are matched on source IP only, not on the
    (source, destination) pair - confirm this is the intended key.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Frame to update in place; must contain ``target_src_column``.
    protocol_df : pandas.DataFrame
        Per-protocol frame containing ``src_column`` and ``feature_columns``.
    src_column : str
        Name of the source-IP column in ``protocol_df``.
    feature_columns : list of str
        Columns to copy from ``protocol_df`` onto ``target_df``.
    target_src_column : str, default 'total_src'
        Name of the source-IP column in ``target_df``.

    Returns
    -------
    pandas.DataFrame
        ``target_df`` itself (mutated in place), returned for convenience.
    """
    # One row per source IP; keep='last' mirrors "later rows overwrite earlier".
    lookup = (
        protocol_df
        .dropna(subset=[src_column])
        .drop_duplicates(subset=src_column, keep='last')
        .set_index(src_column)
    )

    # Only the first occurrence of each source in the target receives values.
    is_first = ~target_df[target_src_column].duplicated(keep='first')
    first_sources = target_df.loc[is_first, target_src_column]
    matched = first_sources[first_sources.isin(lookup.index)]

    for column in feature_columns:
        # Create the zero-filled column with the dtype of the incoming data so
        # the assignment below does not need a silent int -> float/object upcast.
        dtype = lookup[column].dtype
        if not pd.api.types.is_numeric_dtype(dtype):
            dtype = object
        target_df[column] = pd.Series(0, index=target_df.index, dtype=dtype)
        if not matched.empty:
            target_df.loc[matched.index, column] = (
                lookup.loc[matched.to_numpy(), column].to_numpy()
            )
    return target_df


# File names look like "resources/smallpassive_<protocol>.tshark.csv"; the
# protocol is whatever sits between this prefix and suffix.
PASSIVE_PREFIX = 'resources/smallpassive_'
TSHARK_SUFFIX = '.tshark.csv'

# 'total' is already loaded into passive_df; 'tcpPORTS' is skipped as well
# (reason not visible here - presumably a different layout, confirm).
SKIPPED_PROTOCOLS = ('total', 'tcpPORTS')

# Columns that are keys or otherwise not copied into the feature matrix.
NON_FEATURE_COLUMNS = ('src', '<->', 'dest', 'in_frames', 'in_bytes',
                       'total_frames', 'total_bytes')

# sorted() makes the resulting column order independent of filesystem order.
passive_files = sorted(glob.glob('resources/smallpassive*'))
for passive_file in passive_files:
    protocol = passive_file[len(PASSIVE_PREFIX):-len(TSHARK_SUFFIX)]  # e.g., icmp, tcp, total, etc
    print("PROTOCOL : {0}".format(protocol))
    if protocol in SKIPPED_PROTOCOLS:
        continue

    # Prefix the column names with the name of the protocol, e.g.:
    #     icmp_src icmp_dest icmp_out_bytes ...
    csv_columns = [protocol + "_" + name for name in names]
    protocol_to_drop = [protocol + "_" + col for col in to_drop]

    # Read the csv
    df = pd.read_csv(passive_file, header=None, skiprows=5, names=csv_columns)

    # Drop the last row of the file and the columns we don't need.
    df = df.drop(df.tail(1).index)
    df = df.drop(protocol_to_drop, axis=1)

    # Remaining feature columns: <protocol>_out_frames, _out_bytes,
    # _rel_start and _duration.
    protocol_names = [
        protocol + "_" + name for name in names if name not in NON_FEATURE_COLUMNS
    ]

    merge_protocol_features(passive_df, df, protocol + '_src', protocol_names)

PROTOCOL : icmp
PROTOCOL : tcp
PROTOCOL : tcpPORTS
PROTOCOL : total
PROTOCOL : udp


KeyboardInterrupt: 