This notebook will extract information from all of our DDOS csv files and place information into a large samples/features matrix. The matrix will look something like:
```
    SOURCE_IP | DEST_IP | <Protocol>TRAFFIC_FROM_SRC_TO_DEST
```
Where `<Protocol>` can be ICMP, UDP, TCP, and a few others.

In [8]:
import pandas as pd 
import numpy as np 
import csv as csv 
import glob 
import pickle
from datetime import datetime 
import matplotlib.pyplot as plt 

In [9]:
to_drop = ['<->', 'total_bytes', 'total_frames']
names = ['src', '<->', 'dest', 'in_frames', 'in_bytes', 'out_frames', 'out_bytes', 'total_frames', 'total_bytes', 'rel_start', 'duration']

In [10]:
# Begin dataframe by importing the total data, which will have all IPs
protocol_names = ['total' + "_" + name for name in names]
print(protocol_names)
protocol_to_drop = ['total' + "_" + name for name in to_drop]
passive_df = pd.read_csv('resources/smallpassive_total.tshark.csv', header=None, skiprows=5, names=protocol_names)
passive_df['label'] = 'passive'
passive_df = passive_df.drop(passive_df.tail(1).index)
passive_df = passive_df.drop(protocol_to_drop, axis=1)

['total_src', 'total_<->', 'total_dest', 'total_in_frames', 'total_in_bytes', 'total_out_frames', 'total_out_bytes', 'total_total_frames', 'total_total_bytes', 'total_rel_start', 'total_duration']


In [None]:
for i, row in passive_df.iterrows():
        if row.outframes > 0:
            if row.duration != 0:
                d[row.src, row.dst].append([protocol, row.outbytes/row.duration, row.outframes/row.duration])
        if row.inframes > 0:
            if row.duration != 0:
                d[row.dst, row.src].append([protocol, row.inbytes/row.duration, row.inframes/row.duration])

In [11]:
passive_files = glob.glob('resources/smallpassive*')
for i in range(len(passive_files)):
    protocol = passive_files[i][23:-11] # e.g., icmp, tcp, total, etc
    print("PROTOCOL : {0}".format(protocol))
    if protocol == 'total' or protocol == 'tcpPORTS':
        pass
    else:
        # Prefix the column names with the name of the protocol, e.g.:
        #     icmp_src icmp_dest icmp_out_bytes ...
        protocol_names = [protocol + "_" + name for name in names]
        protocol_to_drop = [protocol + "_" + col for col in to_drop]
        
        # Read the csv
        df = pd.read_csv(passive_files[i], header=None, skiprows=5, names=protocol_names)
        
        # Drop rows we don't need, including those where the src IP is the victim
        df = df.drop(df.tail(1).index)
        df = df.drop(protocol_to_drop, axis=1)
        
        for i, row in passive_df.iterrows():
            if row.outframes > 0:
                if row.duration != 0:
                    
        
        protocol_names.remove(protocol + "_src")
        protocol_names.remove(protocol + "_<->")
        protocol_names.remove(protocol + "_dest")
        protocol_names.remove(protocol + "_in_frames")
        protocol_names.remove(protocol + "_in_bytes")
        protocol_names.remove(protocol + "_total_frames")
        protocol_names.remove(protocol + "_total_bytes")
           
        for protocol_name in protocol_names:
            passive_df[protocol_name] = 0
        
        for row_index, row in df.iterrows():
            protocol_source_ip = row[protocol + '_src']
            idx = passive_df.loc[passive_df['total_src'] == protocol_source_ip].index
            for protocol_name in protocol_names:
                passive_df.loc[idx[0], protocol_name] = row[protocol_name]

PROTOCOL : icmp
PROTOCOL : tcp
PROTOCOL : tcpPORTS
PROTOCOL : total
PROTOCOL : with_tcpsyn


In [12]:
# Pickle data for later use:
with open('passive_df_pickled.pickle', 'wb') as handle:
    pickle.dump(passive_df, handle)

In [13]:
# Import tcp port use information from Carlos:
with open('passive_tcpPORTS.pickle', 'rb') as handle:
    passive_ports_dict = pickle.load(handle)

In [None]:
for i, row in passive_df.iterrows():
        if row.outframes > 0:
            if row.duration != 0:
                d[row.src, row.dst].append([protocol, row.outbytes/row.duration, row.outframes/row.duration])
        if row.inframes > 0:
            if row.duration != 0:
                d[row.dst, row.src].append([protocol, row.inbytes/row.duration, row.inframes/row.duration])

In [18]:
passive_df['tcp_ports_used'] = 0
num_notfound = 0
for ip_pair in passive_ports_dict:
    idx = passive_df.loc[passive_df['total_src'] == ip_pair[0]].index
    try:
        passive_df.loc[idx[0], 'tcp_ports_used'] = len(passive_ports_dict[ip_pair])
    except:
        print(ip_pair)
        num_notfound = num_notfound + 1

('0.195.216.195', '136.181.53.206')
('192.229.40.29', '215.158.23.26')
('111.151.239.68', '218.4.168.149')
('57.199.250.198', '215.158.238.135')
('122.158.49.151', '215.158.238.132')
('199.20.31.127', '215.158.239.95')
('199.20.29.243', '215.158.239.76')
('80.47.111.58', '133.178.227.243')
('98.129.146.202', '139.193.141.139')
('38.104.11.27', '215.158.239.84')
('86.76.182.144', '98.236.235.232')
('199.20.29.250', '215.158.239.49')
('98.219.126.175', '215.158.238.201')
('143.10.57.201', '215.158.239.92')
('57.4.201.182', '136.25.189.33')
('196.90.83.92', '215.158.239.56')
('199.20.31.62', '215.158.239.50')
('8.222.6.39', '210.221.107.50')
('199.20.28.136', '215.158.239.78')
('96.156.169.254', '136.25.189.2')
('96.155.60.234', '133.178.226.14')
('199.20.28.18', '215.158.239.93')
('96.156.169.236', '136.25.189.37')
('195.127.43.47', '215.158.239.77')
('8.179.43.83', '136.25.188.173')
('215.158.43.1', '215.158.238.212')
('18.129.154.14', '89.183.106.118')
('215.158.33.107', '215.158.238.2

KeyboardInterrupt: 

In [16]:
num_notfound

15187

In [17]:
passive_df

Unnamed: 0,total_src,total_dest,total_in_frames,total_in_bytes,total_out_frames,total_out_bytes,total_rel_start,total_duration,label,icmp_out_frames,...,icmp_duration,tcp_out_frames,tcp_out_bytes,tcp_rel_start,tcp_duration,with_tcpsyn_out_frames,with_tcpsyn_out_bytes,with_tcpsyn_rel_start,with_tcpsyn_duration,tcp_ports_used
0,121.23.5.158,89.183.104.248,0,0,3799,4718748,0.002621,1.9953,passive,0,...,0.000,0,0,0.000000,0.0000,0,0,0.000000,0,1
1,144.93.121.86,144.93.120.177,3414,696456,0,0,0.001075,1.9987,passive,0,...,0.000,0,0,0.000000,0.0000,0,0,0.000000,0,0
2,215.158.238.110,56.20.122.75,3346,5026059,0,0,0.000020,1.9117,passive,0,...,0.000,0,0,0.756315,1.1641,0,0,0.000000,0,0
3,145.19.13.114,14.27.112.65,3258,4900032,0,0,0.010939,1.9877,passive,0,...,0.000,0,0,0.010939,1.9877,0,0,0.000000,0,0
4,133.178.226.214,100.224.243.106,3137,4515913,0,0,1.033264,0.9531,passive,0,...,0.000,0,0,1.885452,0.0819,0,0,1.000456,0,0
5,192.228.188.15,133.178.227.105,0,0,2986,4362558,0.574513,0.6483,passive,0,...,0.000,2986,4362558,0.574513,0.6483,0,0,0.000000,0,1
6,136.25.189.94,8.188.181.66,2954,4360876,0,0,0.092315,1.2373,passive,0,...,0.007,0,0,0.190227,0.0094,0,0,0.499983,0,0
7,144.93.121.48,144.93.120.177,2928,597312,0,0,0.000797,1.9992,passive,0,...,0.000,0,0,0.000000,0.0000,0,0,0.000000,0,0
8,144.93.121.10,144.93.120.177,2820,575280,0,0,0.000132,1.9988,passive,0,...,0.000,0,0,0.000000,0.0000,0,0,0.000000,0,0
9,218.4.183.196,111.151.238.110,2771,332480,0,0,0.038964,1.9473,passive,0,...,0.000,0,0,0.038964,1.9473,0,0,0.000000,0,0
