In [42]:
# %matplotlib notebook

import csv, gc, glob, os, platform, pprint, sys, urllib
import fastai as fai
import fastai.tabular as fat
# import keras
# import keras.layers as klayers
# import keras.metrics as kmetrics
# import keras.models as kmodels
# import keras.utils as kutils
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtk
import mpl_toolkits as mpt
import numpy as np
import pandas as pd
# import pandas_ml as pdm
import scipy
import scipy.stats as scistat
import seaborn as sn
import sklearn as sk
import sklearn.decomposition as skd
import sklearn.discriminant_analysis as skda
import sklearn.dummy as skdu
import sklearn.ensemble as ske
# import sklearn.externals.joblib as skjob
import sklearn.feature_selection as skf
import sklearn.linear_model as sklm
import sklearn.metrics as skme
import sklearn.model_selection as skm
import sklearn.naive_bayes as sknb
import sklearn.neighbors as skn
import sklearn.neural_network as sknn
import sklearn.pipeline as skpl
import sklearn.preprocessing as skp
import sklearn.svm as sksvm
import sklearn.tree as skt
import sklearn.utils as sku

RANDOM_STATE: int = 14
seed: int = 7

# set up pretty printer for easier data evaluation
p = pprint.PrettyPrinter(indent=4, width=30)
pretty = p.pprint

print(f"""
python:\t{platform.python_version()}\n
libraries loaded:
\tfastai:\t\t{fai.__version__}
\tmatplotlib:\t{mpl.__version__}
\tnumpy:\t\t{np.__version__}
\tpandas:\t\t{pd.__version__}
\tseaborn:\t{sn.__version__}
\tscipy:\t\t{scipy.__version__}
\tsklearn:\t{sk.__version__}
""")


python:	3.7.10

libraries loaded:
	fastai:		1.0.61
	matplotlib:	3.3.4
	numpy:		1.20.2
	pandas:		1.2.4
	seaborn:	0.11.1
	scipy:		1.6.2
	sklearn:	0.24.2



In [40]:
data_path_1: str = './original/01-12/'
data_path_2: str = './original/03-11/'
    
data_set_1: list = [
    'DrDoS_DNS.csv',
    'DrDoS_LDAP.csv',
    'DrDoS_MSSQL.csv',
    'DrDoS_NetBIOS.csv',
    'DrDoS_NTP.csv',
    'DrDoS_SNMP.csv',
    'DrDoS_SSDP.csv',
    'DrDoS_UDP.csv',
    'Syn.csv',
]
    
data_set_2: list = [
    'TFTP.csv',
    'UDPLag.csv',    
]
    
data_set_3: list = [
    'LDAP.csv',
    'MSSQL.csv',
    'NetBIOS.csv',
    'Portmap.csv',   
]
    
data_set_4: list = [
    'Syn.csv',
    'UDP.csv',
    'UDPLag.csv',
]

def get_file_path(directory):
    def func(file):
        return os.path.join(directory, file)

    return func

file_path_1: str = get_file_path(data_path_1)
file_path_2: str = get_file_path(data_path_2)

file_set_1: list = list(map(file_path_1, data_set_1))
file_set_2: list = list(map(file_path_1, data_set_2))
file_set_3: list = list(map(file_path_2, data_set_3))
file_set_4: list = list(map(file_path_2, data_set_4))
    
    
print(f'We will be cleaning {len(file_set_1)+len(file_set_2)+len(file_set_3)+len(file_set_4)} files:\n')
pretty(file_set_1)
print()
pretty(file_set_2)
print()
pretty(file_set_3)
print()
pretty(file_set_4)

We will be cleaning 18 files:

[   './original/01-12/DrDoS_DNS.csv',
    './original/01-12/DrDoS_LDAP.csv',
    './original/01-12/DrDoS_MSSQL.csv',
    './original/01-12/DrDoS_NetBIOS.csv',
    './original/01-12/DrDoS_NTP.csv',
    './original/01-12/DrDoS_SNMP.csv',
    './original/01-12/DrDoS_SSDP.csv',
    './original/01-12/DrDoS_UDP.csv',
    './original/01-12/Syn.csv']

[   './original/01-12/TFTP.csv',
    './original/01-12/UDPLag.csv']

[   './original/03-11/LDAP.csv',
    './original/03-11/MSSQL.csv',
    './original/03-11/NetBIOS.csv',
    './original/03-11/Portmap.csv']

[   './original/03-11/Syn.csv',
    './original/03-11/UDP.csv',
    './original/03-11/UDPLag.csv']


In [4]:
def load_data(filePath):
    
    # slice off the ./CSV/ from the filePath
    if filePath[0] == '.' and filePath[1] == '/':
        filePathClean: str = filePath[11::]
        pickleDump: str = f'./cache/{filePathClean}.pickle'
    else:
        pickleDump: str = f'./cache/{filePath}.pickle'
    
    print(f'Loading Dataset: {filePath}')
    print(f'\tTo Dataset Cache: {pickleDump}\n')
    
    # check if data already exists within cache
    if os.path.exists(pickleDump):
        df = pd.read_pickle(pickleDump)
        
    # if not, load data and clean it before caching it
    else:
        df = pd.read_csv(filePath, low_memory=False)
#         df = clean_data(df)
        
        df.to_pickle(pickleDump)
    
    return df

In [41]:
# df = load_data(file_set_1[0])
# df.shape
# df1: list = list(map(load_data, file_set_1))
# df2: list = list(map(load_data, file_set_2))
# df3: list = list(map(load_data, file_set_3))
df4: list = list(map(load_data, file_set_4))

Loading Dataset: ./original/03-11/Syn.csv
	To Dataset Cache: ./cache/03-11/Syn.csv.pickle

Loading Dataset: ./original/03-11/UDP.csv
	To Dataset Cache: ./cache/03-11/UDP.csv.pickle

Loading Dataset: ./original/03-11/UDPLag.csv
	To Dataset Cache: ./cache/03-11/UDPLag.csv.pickle



In [7]:
smalldf = df[df[' Label'] == 'BENIGN']
columns = smalldf.columns
vals = smalldf.values

In [34]:
smalldf.shape

(3402, 88)

In [35]:

for i in range(smalldf.shape[1]):
    print(f"Column: {i}\tType: {type(v[0][i])}\tLabel: {columns[i]}")

Column: 0	Type: <class 'int'>	Label: Unnamed: 0
Column: 1	Type: <class 'str'>	Label: Flow ID
Column: 2	Type: <class 'str'>	Label:  Source IP
Column: 3	Type: <class 'int'>	Label:  Source Port
Column: 4	Type: <class 'str'>	Label:  Destination IP
Column: 5	Type: <class 'int'>	Label:  Destination Port
Column: 6	Type: <class 'int'>	Label:  Protocol
Column: 7	Type: <class 'str'>	Label:  Timestamp
Column: 8	Type: <class 'int'>	Label:  Flow Duration
Column: 9	Type: <class 'int'>	Label:  Total Fwd Packets
Column: 10	Type: <class 'int'>	Label:  Total Backward Packets
Column: 11	Type: <class 'float'>	Label: Total Length of Fwd Packets
Column: 12	Type: <class 'float'>	Label:  Total Length of Bwd Packets
Column: 13	Type: <class 'float'>	Label:  Fwd Packet Length Max
Column: 14	Type: <class 'float'>	Label:  Fwd Packet Length Min
Column: 15	Type: <class 'float'>	Label:  Fwd Packet Length Mean
Column: 16	Type: <class 'float'>	Label:  Fwd Packet Length Std
Column: 17	Type: <class 'float'>	Label: Bwd Pa

In [16]:

columns = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
       ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length',
       ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance',
       'FIN Flag Count', ' SYN Flag Count', ' RST Flag Count',
       ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count',
       ' CWE Flag Count', ' ECE Flag Count', ' Down/Up Ratio',
       ' Average Packet Size', ' Avg Fwd Segment Size',
       ' Avg Bwd Segment Size', ' Fwd Header Length.1', 'Fwd Avg Bytes/Bulk',
       ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk',
       ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate', 'Subflow Fwd Packets',
       ' Subflow Fwd Bytes', ' Subflow Bwd Packets', ' Subflow Bwd Bytes',
       'Init_Win_bytes_forward', ' Init_Win_bytes_backward',
       ' act_data_pkt_fwd', ' min_seg_size_forward', 'Active Mean',
       ' Active Std', ' Active Max', ' Active Min', 'Idle Mean', ' Idle Std',
       ' Idle Max', ' Idle Min', 'SimillarHTTP', ' Inbound', ' Label']

In [28]:
len(columns)

88

In [None]:
df.values.dtype

In [37]:
v

array([[123, '192.168.50.8-125.56.201.115-59099-80-6', '192.168.50.8', 59099, ..., 8632089.0,
        'detectportal.firefox.com/success.txt', 0, 'BENIGN'],
       [23, '192.168.50.8-54.218.239.186-59102-443-6', '192.168.50.8', 59102, ..., 9933709.0, '0', 0, 'BENIGN'],
       [126, '192.168.50.253-224.0.0.5-0-0-0', '192.168.50.253', 0, ..., 6781893.0, '0', 0, 'BENIGN'],
       [91, '192.168.50.8-23.15.4.11-59155-80-6', '192.168.50.8', 59155, ..., 10000366.0, '0', 0, 'BENIGN'],
       ...,
       [349, '192.168.50.6-125.56.201.105-57177-80-6', '192.168.50.6', 57177, ..., 0.0, '0', 0, 'BENIGN'],
       [25306, '192.168.50.6-125.56.201.105-57177-80-6', '125.56.201.105', 80, ..., 0.0, '0', 1, 'BENIGN'],
       [24149, '172.217.11.34-192.168.50.6-443-57225-6', '192.168.50.6', 57225, ..., 0.0, '0', 0, 'BENIGN'],
       [480, '192.168.50.254-224.0.0.5-0-0-0', '192.168.50.254', 0, ..., 9290744.0, '0', 0, 'BENIGN']], dtype=object)