In [22]:
import pandas as pd 
import numpy as np 
import csv as csv 
import glob 
import pickle
from datetime import datetime 
import matplotlib.pyplot as plt 
%matplotlib inline

In [23]:
to_drop = ['<->', 'rel_start']
names = ['src', '<->', 'dest', 'in_frames', 'in_bytes', 'out_frames', 'out_bytes', 'total_frames', 'total_bytes', 'rel_start', 'duration']

In [24]:
# Begin dataframe by importing the total data, which will have all IPs
protocol_names = ['total' + "_" + name for name in names]
print(protocol_names)
protocol_to_drop = ['total' + "_" + name for name in to_drop]
passive_df = pd.read_csv('resources/smallpassive_total.tshark.csv', header=None, skiprows=5, names=protocol_names)
# passive_df = passive_df[passive_df.total_out_bytes != 0]
# passive_df = passive_df[passive_df.total_out_frames != 0]
passive_df['label'] = 'passive'
passive_df['total_out_bytes_rate'] = 0
passive_df['total_out_frames_rate'] = 0

['total_src', 'total_<->', 'total_dest', 'total_in_frames', 'total_in_bytes', 'total_out_frames', 'total_out_bytes', 'total_total_frames', 'total_total_bytes', 'total_rel_start', 'total_duration']


In [25]:
for i, row in passive_df.iterrows():
    if row.total_out_bytes > 0 and row.total_duration != 0:
        passive_df.loc[i, 'total_out_bytes_rate'] = row.total_out_bytes / row.total_duration
    if row.total_out_bytes > 0 and row.total_duration != 0:
        passive_df.loc[i, 'total_out_frames_rate'] = row.total_out_frames / row.total_duration    
passive_df = passive_df.drop(passive_df.tail(1).index)
# protocol_to_drop.remove(['total_<->', 'total_rel_start'])
passive_df = passive_df.drop(protocol_to_drop, axis=1)


In [20]:
passive_df.columns.values

array(['total_src', 'total_dest', 'total_in_frames', 'total_in_bytes',
       'total_out_frames', 'total_out_bytes', 'total_total_frames',
       'total_total_bytes', 'total_duration', 'label',
       'total_out_bytes_rate', 'total_out_frames_rate'], dtype=object)

In [26]:
passive_files = glob.glob('resources/smallpassive*')
for i in range(len(passive_files)):
    protocol = passive_files[i][23:-11] # e.g., icmp, tcp, total, etc
    print("PROTOCOL : {0}".format(protocol))
    if protocol == 'total' or protocol == 'tcpPORTS':
        pass
    else:
        # Prefix the column names with the name of the protocol, e.g.:
        #     icmp_src icmp_dest icmp_out_bytes ...
        protocol_names = [protocol + "_" + name for name in names]
        protocol_to_drop = [protocol + "_" + col for col in to_drop]
        
        # Read the csv
        df = pd.read_csv(passive_files[i], header=None, skiprows=5, names=protocol_names)
        
        # Drop rows we don't need, including those where the src IP is the victim
        df = df.drop(df.tail(1).index)
        # TODO: Change so that pairs where victimIP is src is labelled
        # victim_response instead of being dropped.
#         df = df[df[protocol + "_src"] != victimIP]
        df = df.drop(protocol_to_drop, axis=1)
        
        protocol_names.remove(protocol + "_src")
        protocol_names.remove(protocol + "_<->")
        protocol_names.remove(protocol + "_dest")
        protocol_names.remove(protocol + "_rel_start")
        protocol_names.remove(protocol + "_in_frames")
        protocol_names.remove(protocol + "_in_bytes")
        protocol_names.remove(protocol + "_total_frames")
        protocol_names.remove(protocol + "_total_bytes")
           
        passive_df[protocol + '_out_bytes_rate'] = 0 # icmp_out_bytes_rate
        passive_df[protocol + '_out_frames_rate'] = 0# icmp_out_frames_rate
        for protocol_name in protocol_names:
            passive_df[protocol_name] = 0
        
        for row_index, row in df.iterrows():
            protocol_source_ip = row[protocol + '_src']
            idx = passive_df.loc[passive_df['total_src'] == protocol_source_ip].index
            for protocol_name in protocol_names:
                passive_df.loc[idx[0], protocol_name] = row[protocol_name]
            if row[protocol + '_out_bytes'] > 0 and row[protocol + '_duration'] != 0:
                passive_df.loc[idx[0], protocol + '_out_bytes_rate'] = row[protocol + '_out_bytes'] / row[protocol + '_duration']
            if row[protocol + '_out_frames'] > 0 and row[protocol + '_duration'] != 0:
                passive_df.loc[idx[0], protocol + '_out_frames_rate'] = row[protocol +'_out_frames'] / row[protocol + '_duration']


PROTOCOL : icmp
PROTOCOL : tcp
PROTOCOL : tcpPORTS
PROTOCOL : total
PROTOCOL : with_tcpsyn


In [27]:
# Now let's import Carlos' tcpPORTS data and add it as a feature.
with open('passive_tcpPORTS.pickle', 'rb') as handle:
    passive_ports_dict = pickle.load(handle)

In [28]:
# Create a new column in the dataframe which contains the
# number of ports used to attack the victim:
passive_df['tcp_ports_used'] = 0
not_found = 0
for ip_pair in passive_ports_dict:
    idx = passive_df.loc[passive_df['total_src'] == ip_pair[0]].index
    try:
        passive_df.loc[idx[0], 'tcp_ports_used'] = len(passive_ports_dict[ip_pair])
    except:
        not_found = not_found + 1

In [29]:
# Now we drop the features that we're not interested in -- 
# that is, all data not having to do with rates. 
bad_items = ['total_in_frames','total_in_bytes',\
             'total_out_frames','total_out_bytes',\
             'total_total_frames','total_total_bytes',\
             'total_duration','icmp_out_frames',\
             'icmp_out_bytes','icmp_duration','tcp_out_frames',\
             'tcp_out_bytes','tcp_duration','with_tcpsyn_out_frames',\
             'with_tcpsyn_out_bytes','with_tcpsyn_duration']
passive_df = passive_df.drop(bad_items, axis=1)

In [30]:
cols = passive_df.columns.values
cols[0] = 'src'
cols[1] = 'dst'
passive_df.columns = cols
passive_df.head()

Unnamed: 0,src,dst,label,total_out_bytes_rate,total_out_frames_rate,icmp_out_bytes_rate,icmp_out_frames_rate,tcp_out_bytes_rate,tcp_out_frames_rate,with_tcpsyn_out_bytes_rate,with_tcpsyn_out_frames_rate,tcp_ports_used
0,220.20.200.164,84.236.193.94,passive,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0
1,109.97.67.71,92.27.27.185,passive,3043071.564599,3888.944573,0,0,0.0,0.0,0.0,0.0,0
2,247.136.134.84,22.249.49.193,passive,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0
3,220.28.73.211,215.158.140.162,passive,3461141.615055,2386.850881,0,0,1182187.5,1250.0,114.490161,1.788909,1
4,121.23.5.158,89.183.104.248,passive,2364931.589235,1903.97434,0,0,2364931.589235,1903.97434,0.0,0.0,1


In [31]:
passive_df.describe()

Unnamed: 0,total_out_bytes_rate,total_out_frames_rate,icmp_out_bytes_rate,icmp_out_frames_rate,tcp_out_bytes_rate,tcp_out_frames_rate,with_tcpsyn_out_bytes_rate,with_tcpsyn_out_frames_rate,tcp_ports_used
count,39784.0,39784.0,39784.0,39784.0,39784.0,39784.0,39784.0,39784.0,39784.0
mean,129054.4,191.050415,12.127614,0.071723,55477.93,124.833834,193.313656,3.39965,0.351523
std,4379830.0,3413.570275,1240.538359,6.469532,2604905.0,2275.925273,10828.008521,195.95128,0.575518
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,799.8185,8.192208,0.0,0.0,106.0556,1.695849,0.0,0.0,1.0
max,363300000.0,250000.0,174000.0,909.090909,331440000.0,230000.0,1120000.0,20000.0,20.0


In [32]:
# Pickle the data for later use:
with open('passive_df_pickled.pickle', 'wb') as handle:
    pickle.dump(passive_df, handle)

In [33]:
with open('ddos_df_pickled.pickle', 'rb') as handle:
    ddos_df = pickle.load(handle)

In [34]:
ddos_df.head()

Unnamed: 0,src,dst,label,total_out_bytes_rate,total_out_frames_rate,icmp_out_bytes_rate,icmp_out_frames_rate,tcp_out_bytes_rate,tcp_out_frames_rate,with_tcpsyn_out_bytes_rate,with_tcpsyn_out_frames_rate,tcp_ports_used
1,195.216.1.86,71.126.222.64,ddos,7996.798079,148.088853,4709.376326,78.489605,3699.229327,77.067278,3699.229327,77.067278,37
2,195.52.240.138,71.126.222.64,ddos,6958.368734,127.442651,4205.607477,70.093458,2753.34608,57.361377,2753.34608,57.361377,27
4,209.210.127.200,71.126.222.64,ddos,8236.99422,142.100193,7080.924855,118.015414,1230.45373,25.634453,1230.45373,25.634453,10
5,167.46.3.209,71.126.222.64,ddos,5860.023725,108.738632,3202.846975,53.380783,2886.597938,60.137457,2886.597938,60.137457,23
6,217.238.32.123,71.126.222.64,ddos,5798.984772,107.614213,3273.185061,54.553084,2694.386694,56.133056,2694.386694,56.133056,23


In [35]:
passive_df.head()

Unnamed: 0,src,dst,label,total_out_bytes_rate,total_out_frames_rate,icmp_out_bytes_rate,icmp_out_frames_rate,tcp_out_bytes_rate,tcp_out_frames_rate,with_tcpsyn_out_bytes_rate,with_tcpsyn_out_frames_rate,tcp_ports_used
0,220.20.200.164,84.236.193.94,passive,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0
1,109.97.67.71,92.27.27.185,passive,3043071.564599,3888.944573,0,0,0.0,0.0,0.0,0.0,0
2,247.136.134.84,22.249.49.193,passive,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0
3,220.28.73.211,215.158.140.162,passive,3461141.615055,2386.850881,0,0,1182187.5,1250.0,114.490161,1.788909,1
4,121.23.5.158,89.183.104.248,passive,2364931.589235,1903.97434,0,0,2364931.589235,1903.97434,0.0,0.0,1


In [36]:
df_list = []
df_list.append(passive_df)
df_list.append(ddos_df)

In [37]:
concat = pd.concat(df_list)

In [39]:
concat.head()

Unnamed: 0,src,dst,label,total_out_bytes_rate,total_out_frames_rate,icmp_out_bytes_rate,icmp_out_frames_rate,tcp_out_bytes_rate,tcp_out_frames_rate,with_tcpsyn_out_bytes_rate,with_tcpsyn_out_frames_rate,tcp_ports_used
0,220.20.200.164,84.236.193.94,passive,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0
1,109.97.67.71,92.27.27.185,passive,3043071.564599,3888.944573,0,0,0.0,0.0,0.0,0.0,0
2,247.136.134.84,22.249.49.193,passive,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0
3,220.28.73.211,215.158.140.162,passive,3461141.615055,2386.850881,0,0,1182187.5,1250.0,114.490161,1.788909,1
4,121.23.5.158,89.183.104.248,passive,2364931.589235,1903.97434,0,0,2364931.589235,1903.97434,0.0,0.0,1


In [40]:
# Pickle the data for later use:
with open('concat.pickle', 'wb') as handle:
    pickle.dump(concat, handle)