Here we import all relevant libraries, check their versions, and set a seed and RANDOM_STATE

In [1]:
# %matplotlib notebook

import csv, gc, glob, os, platform, pprint, sys, urllib
import fastai as fai
import fastai.tabular as fat
# import keras
# import keras.layers as klayers
# import keras.metrics as kmetrics
# import keras.models as kmodels
# import keras.utils as kutils
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtk
import mpl_toolkits as mpt
import numpy as np
import pandas as pd
# import pandas_ml as pdm
import scipy
import scipy.stats as scistat
import seaborn as sn
import sklearn as sk
import sklearn.decomposition as skd
import sklearn.discriminant_analysis as skda
import sklearn.dummy as skdu
import sklearn.ensemble as ske
# import sklearn.externals.joblib as skjob
import sklearn.feature_selection as skf
import sklearn.linear_model as sklm
import sklearn.metrics as skme
import sklearn.model_selection as skm
import sklearn.naive_bayes as sknb
import sklearn.neighbors as skn
import sklearn.neural_network as sknn
import sklearn.pipeline as skpl
import sklearn.preprocessing as skp
import sklearn.svm as sksvm
import sklearn.tree as skt
import sklearn.utils as sku

RANDOM_STATE: int = 14
seed: int = 7

# set up pretty printer for easier data evaluation
p = pprint.PrettyPrinter(indent=4, width=30)
pretty = p.pprint

print(f"""
python:\t{platform.python_version()}\n
libraries loaded:
\tfastai:\t\t{fai.__version__}
\tmatplotlib:\t{mpl.__version__}
\tnumpy:\t\t{np.__version__}
\tpandas:\t\t{pd.__version__}
\tseaborn:\t{sn.__version__}
\tscipy:\t\t{scipy.__version__}
\tsklearn:\t{sk.__version__}
""")


python:	3.7.10

libraries loaded:
	fastai:		1.0.61
	matplotlib:	3.3.4
	numpy:		1.20.2
	pandas:		1.2.4
	seaborn:	0.11.1
	scipy:		1.6.2
	sklearn:	0.24.2



Then we prepare the file paths we will be working with

In [12]:
def current_job_generator(num_files):
    """Generate the job indices 0, 1, ..., num_files - 1, one per ``next()`` call."""
    yield from range(num_files)


def get_file_path(directory):
    """Build a one-argument callable that joins ``directory`` with a file name.

    The returned callable takes the file name (parameter ``file``) and
    returns ``os.path.join(directory, file)``.
    """
    return lambda file: os.path.join(directory, file)


def print_features_with_bad_values(df):
    """Print, column by column, how many rows hold an invalid value.

    For every column a progress line is printed, followed by one line per
    invalid value that occurs in it: positive infinity, a real NaN, or one
    of the strings 'Infinity', 'inf', 'NaN', 'nan'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The findings are only printed.
    """
    invalid_values: list = [
        np.inf, np.nan, 'Infinity', 'inf', 'NaN', 'nan'
    ]
    for i, col in enumerate(df.columns):
        print(f'{i}-th pass; Column: {col}')
        column = df[col]
        for value in invalid_values:
            # NaN never compares equal to anything (itself included), so a
            # real NaN must be detected with isna(); `== np.nan` is always
            # False and would silently hide missing values.
            if isinstance(value, float) and np.isnan(value):
                matches = column.isna()
            else:
                matches = column == value
            count = int(matches.sum())
            if count:
                print(f'found {count} rows with {value} in column {col}')

In [3]:
# Directories holding the two groups of original CSV files
data_path_1: str = './original/01-12/'
data_path_2: str = './original/03-11/'

# File names located under data_path_1
data_set_1: list = [
    'DrDoS_DNS.csv', 'DrDoS_LDAP.csv', 'DrDoS_MSSQL.csv', 'DrDoS_NetBIOS.csv',
    'DrDoS_NTP.csv', 'DrDoS_SNMP.csv', 'DrDoS_SSDP.csv', 'DrDoS_UDP.csv',
    'Syn.csv', 'TFTP.csv', 'UDPLag.csv',
]

# File names located under data_path_2
data_set_2: list = [
    'LDAP.csv', 'MSSQL.csv', 'NetBIOS.csv', 'Portmap.csv',
    'Syn.csv', 'UDP.csv', 'UDPLag.csv',
]

# Each helper is a callable that prefixes a bare file name with its directory
file_path_1 = get_file_path(data_path_1)
file_path_2 = get_file_path(data_path_2)

# Full relative paths of every file, first group followed by the second
file_set: list = [file_path_1(name) for name in data_set_1]
file_set += [file_path_2(name) for name in data_set_2]

benign_part_path: str = './prepared/benign_parts/'
benign_path: str = './prepared/benign/'
file_number = len(file_set)
job_number = current_job_generator(file_number)

# One output path per input file for its extracted benign samples
benign_parts = [
    os.path.join(benign_part_path, f'file{i}_benign.csv')
    for i in range(file_number)
]

print(f'We will be cleaning {file_number} files:')
print('Benign samples will be grabbed from each dataset and saved separately\n')
pretty(file_set)

We will be cleaning 18 files:
Benign samples will be grabbed from each dataset and saved separately

[   './original/01-12/DrDoS_DNS.csv',
    './original/01-12/DrDoS_LDAP.csv',
    './original/01-12/DrDoS_MSSQL.csv',
    './original/01-12/DrDoS_NetBIOS.csv',
    './original/01-12/DrDoS_NTP.csv',
    './original/01-12/DrDoS_SNMP.csv',
    './original/01-12/DrDoS_SSDP.csv',
    './original/01-12/DrDoS_UDP.csv',
    './original/01-12/Syn.csv',
    './original/01-12/TFTP.csv',
    './original/01-12/UDPLag.csv',
    './original/03-11/LDAP.csv',
    './original/03-11/MSSQL.csv',
    './original/03-11/NetBIOS.csv',
    './original/03-11/Portmap.csv',
    './original/03-11/Syn.csv',
    './original/03-11/UDP.csv',
    './original/03-11/UDPLag.csv']


We will create some dataframes and lists for dataset statistics

In [4]:
# Column layout shared by both per-file composition tables
composition_columns = ['File', 'Benign', 'Malicious', 'Total']

# Two separate, initially empty tables: one for the files as loaded and one
# for the files after cleaning
data_composition, composition_after_cleaning = (
    pd.DataFrame(columns=composition_columns) for _ in range(2)
)

In [5]:
def load_data(filePath):
    """Load a CSV dataset through a pickle cache kept under ``./cache/``.

    The first call for a file reads the CSV and stores the resulting frame
    as a pickle; later calls read the pickle instead of parsing the CSV.

    Parameters
    ----------
    filePath : str
        Path of the CSV file. For paths starting with ``./`` the first 11
        characters (the length of ``./original/``) are cut off to form the
        cache name, so ``./original/01-12/x.csv`` is cached as
        ``./cache/01-12/x.csv.pickle``.

    Returns
    -------
    pandas.DataFrame
        The loaded, uncleaned data.
    """
    # Number of leading characters removed from './'-relative paths; equals
    # len('./original/') and is kept so existing cache files stay valid
    prefix_length = 11

    if filePath.startswith('./'):
        pickleDump: str = f'./cache/{filePath[prefix_length:]}.pickle'
    else:
        pickleDump: str = f'./cache/{filePath}.pickle'

    print(f'Loading Dataset: {filePath}')
    print(f'\tTo Dataset Cache: {pickleDump}\n')

    if os.path.exists(pickleDump):
        # NOTE: unpickling can execute arbitrary code; this is acceptable only
        # because the cache files are written by this notebook itself
        df = pd.read_pickle(pickleDump)
    else:
        df = pd.read_csv(filePath, low_memory=True)

        # The cache mirrors the sub-folder of the source file, which may not
        # exist yet; without this to_pickle fails on a fresh checkout
        os.makedirs(os.path.dirname(pickleDump), exist_ok=True)
        df.to_pickle(pickleDump)

    return df

In [6]:
# Raw header names to be renamed; most of them carry a leading space
_raw_column_names = [
    'Unnamed: 0', 'Flow ID',
    ' Source IP', ' Source Port',
    ' Destination IP', ' Destination Port',
    ' Protocol', ' Total Length of Bwd Packets',
    ' Flow Duration', ' Total Fwd Packets',
    ' Total Backward Packets', 'Total Length of Fwd Packets',
    ' Timestamp', ' Init_Win_bytes_backward',
    ' Fwd Packet Length Max', ' Fwd Packet Length Min',
    ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
    'Bwd Packet Length Max', ' Bwd Packet Length Min',
    ' Bwd Packet Length Mean', ' Bwd Packet Length Std',
    'Flow Bytes/s', ' Flow Packets/s',
    ' Flow IAT Mean', ' Flow IAT Std',
    ' Flow IAT Max', ' Flow IAT Min',
    'Fwd IAT Total', ' Fwd IAT Mean',
    ' Fwd IAT Std', ' Fwd IAT Max',
    ' Fwd IAT Min', 'Bwd IAT Total',
    ' Bwd IAT Mean', ' Bwd IAT Std',
    ' Bwd IAT Max', ' Bwd IAT Min',
    'Fwd PSH Flags', ' Bwd PSH Flags',
    ' Fwd URG Flags', ' Bwd URG Flags',
    ' Fwd Header Length', ' Bwd Header Length',
    'Fwd Packets/s', ' Bwd Packets/s',
    ' Min Packet Length', ' Max Packet Length',
    ' Packet Length Mean', ' Packet Length Std',
    ' Packet Length Variance', 'FIN Flag Count',
    ' SYN Flag Count', ' RST Flag Count',
    ' PSH Flag Count', ' ACK Flag Count',
    ' URG Flag Count', ' CWE Flag Count',
    ' ECE Flag Count', ' Down/Up Ratio',
    ' Average Packet Size', ' Avg Fwd Segment Size',
    ' Avg Bwd Segment Size', ' Fwd Header Length.1',
    'Fwd Avg Bytes/Bulk', ' Inbound',
    ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate',
    ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate', 'Subflow Fwd Packets',
    ' Subflow Fwd Bytes', ' Subflow Bwd Packets',
    ' Subflow Bwd Bytes', 'Init_Win_bytes_forward',
    ' act_data_pkt_fwd', ' min_seg_size_forward',
    'Active Mean', ' Active Std',
    ' Active Max', ' Active Min',
    'Idle Mean', ' Idle Std',
    ' Idle Max', ' Idle Min',
    'SimillarHTTP', ' Label',
]

# The few headers whose new name is more than the raw name without padding
_special_column_names = {
    'Unnamed: 0': 'Unnamed',
    ' Init_Win_bytes_backward': 'Init Win bytes backward',
    'Init_Win_bytes_forward': 'Init Win bytes forward',
    ' act_data_pkt_fwd': 'act data pkt fwd',
    ' min_seg_size_forward': 'min seg size forward',
}

# Maps each raw header to the name used in the rest of the notebook: the
# special cases above, otherwise the raw name with surrounding spaces removed
new_column_names = {
    raw: _special_column_names.get(raw, raw.strip())
    for raw in _raw_column_names
}

We will explore each .csv file in our file set, extract all the benign samples, record the statistics of each set, clean each set, and save a set of new, slimmer datafiles

In [7]:
# Take the next unprocessed file index from the job generator. next() needs
# an iterator, so calling it on a bare integer raises TypeError.
current_job = next(job_number)
df = load_data(file_set[current_job])
df = df.rename(columns=new_column_names)
benign_df = df[df['Label'] == 'BENIGN']

# Record the make-up of this file. The file name drops the first 11
# characters of its path (the length of './original/').
total_samples = df.shape[0]
benign_samples = benign_df.shape[0]
composition_row = pd.DataFrame(
    [[file_set[current_job][11:], benign_samples, total_samples - benign_samples, total_samples]],
    columns=composition_columns,
)
# pd.concat replaces DataFrame.append, which newer pandas releases removed;
# ignore_index keeps the row labels unique as files are added
data_composition = pd.concat([data_composition, composition_row], ignore_index=True)

print(f"""
File:\t\t\t{file_set[current_job]}  
Job Number:\t\t{current_job+1}
Shape:\t\t\t{df.shape}
Samples:\t\t{df.shape[0]} 
Features:\t\t{df.shape[1]}
Benign samples:\t\t{benign_df.shape[0]}
Malicious samples:\t{df.shape[0]-benign_df.shape[0]}
""")

Loading Dataset: ./original/01-12/DrDoS_DNS.csv
	To Dataset Cache: ./cache/01-12/DrDoS_DNS.csv.pickle


File:			./original/01-12/DrDoS_DNS.csv  
Job Number:		1
Shape:			(5074413, 88)
Samples:		5074413 
Features:		88
Benign samples:		3402
Malicious samples:	5071011



Now, we explore the features to see which ones we should get rid of, creating a "pruning" list of all the rejects
We want all of our features other than our label to be numbers

In [22]:
# Look at the first benign row: every column that holds a Python string there
# (the label excepted) is queued for removal, and each column's type is listed
values = benign_df.values
columns = benign_df.columns
prune = []
for i, name in enumerate(columns):
    cell_type = type(values[0][i])
    if cell_type == str and name != 'Label':
        prune.append(name)
    print(f"Column: {i}\tType: {cell_type}\tLabel: {name}")

Column: 0	Type: <class 'int'>	Label: Unnamed
Column: 1	Type: <class 'str'>	Label: Flow ID
Column: 2	Type: <class 'str'>	Label: Source IP
Column: 3	Type: <class 'int'>	Label: Source Port
Column: 4	Type: <class 'str'>	Label: Destination IP
Column: 5	Type: <class 'int'>	Label: Destination Port
Column: 6	Type: <class 'int'>	Label: Protocol
Column: 7	Type: <class 'str'>	Label: Timestamp
Column: 8	Type: <class 'int'>	Label: Flow Duration
Column: 9	Type: <class 'int'>	Label: Total Fwd Packets
Column: 10	Type: <class 'int'>	Label: Total Backward Packets
Column: 11	Type: <class 'float'>	Label: Total Length of Fwd Packets
Column: 12	Type: <class 'float'>	Label: Total Length of Bwd Packets
Column: 13	Type: <class 'float'>	Label: Fwd Packet Length Max
Column: 14	Type: <class 'float'>	Label: Fwd Packet Length Min
Column: 15	Type: <class 'float'>	Label: Fwd Packet Length Mean
Column: 16	Type: <class 'float'>	Label: Fwd Packet Length Std
Column: 17	Type: <class 'float'>	Label: Bwd Packet Length Max
C

In [13]:
# List, column by column, how many rows of the current file hold inf/NaN-like values
print_features_with_bad_values(df)

0-th pass; Column: Unnamed
1-th pass; Column: Flow ID
2-th pass; Column: Source IP
3-th pass; Column: Source Port
4-th pass; Column: Destination IP
5-th pass; Column: Destination Port
6-th pass; Column: Protocol
7-th pass; Column: Timestamp
8-th pass; Column: Flow Duration
9-th pass; Column: Total Fwd Packets
10-th pass; Column: Total Backward Packets
11-th pass; Column: Total Length of Fwd Packets
12-th pass; Column: Total Length of Bwd Packets
13-th pass; Column: Fwd Packet Length Max
14-th pass; Column: Fwd Packet Length Min
15-th pass; Column: Fwd Packet Length Mean
16-th pass; Column: Fwd Packet Length Std
17-th pass; Column: Bwd Packet Length Max
18-th pass; Column: Bwd Packet Length Min
19-th pass; Column: Bwd Packet Length Mean
20-th pass; Column: Bwd Packet Length Std
21-th pass; Column: Flow Bytes/s
found 162363 rows with inf in column Flow Bytes/s
22-th pass; Column: Flow Packets/s
found 162394 rows with inf in column Flow Packets/s
23-th pass; Column: Flow IAT Mean
24-th pa

Since print_features_with_bad_values showed more than 160,000 bad entries in Flow Bytes/s and Flow Packets/s, it would be more efficient to just delete the Flow Bytes/s and Flow Packets/s columns instead of removing 160,000 data samples.

In [23]:
# Both rate columns contain a large number of infinite entries
prune.extend(['Flow Bytes/s', 'Flow Packets/s'])

According to Jaafar et al. in "Recent Analysis of Forged Request Headers Constituted by HTTP DDoS", DDoS attacks can use headers stolen from legitimate parties; thus we can reason that header length won't be a feature that provides much value to the ML algorithms.

In [24]:
# All three header-length columns are queued for removal
prune.extend(['Bwd Header Length', 'Fwd Header Length', 'Fwd Header Length.1'])

Similar to IP addresses, port numbers differ greatly between attacks, and as qualitative (categorical) data they won't be processed well by ML algorithms

In [29]:
# Port columns are queued for removal
prune.extend(['Destination Port', 'Source Port'])

All research on this dataset cannot reveal what the unnamed column might be. Since we cannot verify that it is even meaningful or attached to anything, we add it to the prune list.

In [25]:
# The column of unknown meaning is queued for removal as well
prune += ['Unnamed']

We then drop all columns in the prune list from our dataframe so we can test the features for suitability

In [34]:
# Drop, in one call, every column of the frame that appears in the prune list;
# names in the list that the frame no longer has are simply ignored
df.drop(columns=[name for name in df.columns if name in prune], inplace=True)

We now start feature analysis. To do so, we split the features into X and Y groups

In [50]:
# Split the value matrix: every column except the last goes to X, the last
# column (kept two-dimensional) goes to Y
dfVals = df.values
X, Y = dfVals[:, :-1], dfVals[:, -1:]

In [85]:
# Same split as for the full frame, restricted to its first 100000 rows
smallDf = df.head(100000)
smallDfVals = smallDf.values
smallX, smally = smallDfVals[:, :-1], smallDfVals[:, -1:]

In [59]:
# Rank features by the impurity-based importances of an extra-trees ensemble.
# The ensemble is randomised, so it is seeded with the notebook-wide
# RANDOM_STATE; without a seed the importances change on every run.
model = ske.ExtraTreesClassifier(random_state=RANDOM_STATE)
model.fit(X, Y.T[0])

# One row of importances, labelled with every column except the last one
importance_df = pd.DataFrame([model.feature_importances_], columns=df.columns[0:-1])
importance_df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min seg size forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Inbound
0,0.098584,0.010731,0.005289,0.002757,0.001493,0.003071,0.00535,0.014826,0.008331,0.001502,...,0.085336,0.000709,5.7e-05,0.000606,0.000624,0.004195,0.000332,0.001085,0.003641,0.17811


In [67]:
# Importances as percentages, columns ordered from least to most important
importance_df.sort_values(by=0, axis=1).mul(100)

Unnamed: 0,ECE Flag Count,Fwd URG Flags,Bwd URG Flags,Bwd Avg Bulk Rate,Bwd Avg Packets/Bulk,Bwd PSH Flags,Fwd Avg Bytes/Bulk,FIN Flag Count,Fwd Avg Bulk Rate,PSH Flag Count,...,Min Packet Length,CWE Flag Count,Fwd PSH Flags,Bwd Packet Length Mean,Bwd Packet Length Min,Down/Up Ratio,min seg size forward,URG Flag Count,Protocol,Inbound
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.229795,2.401527,2.732366,2.801694,3.574448,8.477985,8.533562,9.411856,9.858405,17.810951


In [75]:
# Univariate selection: keep the 20 features with the highest ANOVA F-score
test = skf.SelectKBest(score_func=skf.f_classif, k=20)
fit = test.fit(X, Y.T[0])

# The support mask covers the feature columns only; a trailing True keeps the
# last column of the frame as well
selected_features = np.append(fit.get_support(), True)
features = list(df.columns[selected_features])
best4_univariate_df = df.filter(features)
best4_univariate_df

  f = msb / msw


Unnamed: 0,Protocol,Flow Duration,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow IAT Max,Fwd IAT Total,Fwd IAT Max,Bwd IAT Total,Fwd PSH Flags,...,RST Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,Down/Up Ratio,Avg Bwd Segment Size,Init Win bytes forward,Init Win bytes backward,Inbound,Label
0,17,28415,0.0,0.0,0.0,3596.0,28415.0,3596.0,0.0,0,...,0,0,0,0,0.0,0.0,-1,-1,1,DrDoS_DNS
1,17,2,0.0,0.0,0.0,2.0,2.0,2.0,0.0,0,...,0,0,0,0,0.0,0.0,-1,-1,0,DrDoS_DNS
2,17,48549,0.0,0.0,0.0,5418.0,48549.0,5418.0,0.0,0,...,0,0,0,0,0.0,0.0,-1,-1,1,DrDoS_DNS
3,17,48337,0.0,0.0,0.0,3337.0,48337.0,3337.0,0.0,0,...,0,0,0,0,0.0,0.0,-1,-1,1,DrDoS_DNS
4,17,32026,0.0,0.0,0.0,1236.0,32026.0,1236.0,0.0,0,...,0,0,0,0,0.0,0.0,-1,-1,1,DrDoS_DNS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5074408,17,1,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,...,0,0,0,0,0.0,0.0,-1,-1,1,DrDoS_DNS
5074409,17,30,0.0,0.0,0.0,30.0,30.0,30.0,0.0,0,...,0,0,0,0,0.0,0.0,-1,-1,1,DrDoS_DNS
5074410,17,1,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,...,0,0,0,0,0.0,0.0,-1,-1,1,DrDoS_DNS
5074411,17,1,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,...,0,0,0,0,0.0,0.0,-1,-1,1,DrDoS_DNS


In [82]:
# Hold out 33% of the full feature matrix for testing, seeded for repeatability
x_train, x_test, y_train, y_test = skm.train_test_split(
    X, Y.T[0], test_size=.33, random_state=seed
)

# Feature/target split of the univariate-selected frame (last column is the target)
univ_val = best4_univariate_df.values
uniX, uniY = univ_val[:, :-1], univ_val[:, -1:]

In [84]:
# Fit a k-nearest-neighbours classifier with default settings on the training
# split, predict the held-out split, and print the classification report
model = skn.KNeighborsClassifier()
model.fit(x_train, y_train)
prediction = model.predict(x_test)
report = skme.classification_report(y_test, prediction)
print(report)

KeyboardInterrupt: 

In [79]:
# 12-fold shuffled cross-validation of logistic regression on the
# univariate-selected features
folds = 12
splits = skm.KFold(n_splits=folds, random_state=RANDOM_STATE, shuffle=True)

# The solver hit its iteration limit on the unscaled features, so they are
# standardised first. Keeping the scaler inside the pipeline means it is
# fitted on the training part of each fold only.
model = skpl.make_pipeline(
    skp.StandardScaler(),
    sklm.LogisticRegression(max_iter=250),
)
results = skm.cross_val_score(model, uniX, uniY.T[0], cv=splits, scoring='accuracy')
print(
f"""Accuracy:\t{results.mean()*100}%
Deviation:\t{results.std()*100}%"""
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


KeyboardInterrupt: 

In [None]:
# 12-fold shuffled cross-validation of logistic regression on the full
# feature matrix
folds = 12
splits = skm.KFold(n_splits=folds, random_state=RANDOM_STATE, shuffle=True)

# Unscaled features keep the solver from converging within max_iter, so they
# are standardised first. Keeping the scaler inside the pipeline means it is
# fitted on the training part of each fold only.
model = skpl.make_pipeline(
    skp.StandardScaler(),
    sklm.LogisticRegression(max_iter=250),
)
results = skm.cross_val_score(model, X, Y.T[0], cv=splits, scoring='accuracy')
print(
f"""Accuracy:\t{results.mean()*100}%
Deviation:\t{results.std()*100}%"""
)

This tells us ./original/01-12/DrDoS_DNS.csv has 5,074,413 samples which consist of 3402 Benign samples and 5,071,011 DNS DDoS attack samples

In [None]:
# Show the dtype pandas assigned to each column of the benign subset
print(benign_df.dtypes)

In [None]:
# Display the column names collected for removal so far
prune

In [14]:
# Preview the first rows of the working frame
df.head()

Unnamed: 0,Unnamed,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,425,172.16.0.5-192.168.50.1-634-60495-17,172.16.0.5,634,192.168.50.1,60495,17,2018-12-01 10:51:39.813448,28415,97,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS
1,430,172.16.0.5-192.168.50.1-60495-634-17,192.168.50.1,634,172.16.0.5,60495,17,2018-12-01 10:51:39.820842,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,DrDoS_DNS
2,1654,172.16.0.5-192.168.50.1-634-46391-17,172.16.0.5,634,192.168.50.1,46391,17,2018-12-01 10:51:39.852499,48549,200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS
3,2927,172.16.0.5-192.168.50.1-634-11894-17,172.16.0.5,634,192.168.50.1,11894,17,2018-12-01 10:51:39.890213,48337,200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS
4,694,172.16.0.5-192.168.50.1-634-27878-17,172.16.0.5,634,192.168.50.1,27878,17,2018-12-01 10:51:39.941151,32026,200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS


In [None]:
# df = clean_data(df)

In [None]:
# Headers (after removing surrounding whitespace) whose clean name differs
# from the stripped text; every other header only needs its padding removed
_COLUMN_NAME_OVERRIDES: dict = {
    'Unnamed: 0': 'Unnamed',
    'Init_Win_bytes_backward': 'Init Win bytes backward',
    'Init_Win_bytes_forward': 'Init Win bytes forward',
    'act_data_pkt_fwd': 'act data pkt fwd',
    'min_seg_size_forward': 'min seg size forward',
}

# Columns with string, address, or time features that are removed outright
_PRUNED_COLUMNS: list = [
    'Destination IP', 'Destination Port',
    'Flow ID', 'Fwd Header Length',
    'Fwd Header Length.1', 'Protocol',
    'SimillarHTTP', 'Source IP',
    'Source Port', 'Timestamp',
    'Unnamed',
]

# Values that mark a row as unusable. A real NaN is not listed because it
# never compares equal to anything; dropna() takes care of it instead.
_INVALID_VALUES: list = [
    np.inf, -np.inf, 'Infinity', '-Infinity', 'inf', '-inf', 'NaN', 'nan'
]


def _clean_column_name(name):
    """Strip surrounding whitespace from a header and apply the known renames."""
    stripped = name.strip()
    return _COLUMN_NAME_OVERRIDES.get(stripped, stripped)


def clean_data(df):
    """Return a cleaned copy of a raw dataset frame.

    The steps, in order:

    1. normalise the column names (strip padding, apply the known renames);
    2. drop the string, address, and time columns listed in ``_PRUNED_COLUMNS``;
    3. drop every row with a missing value;
    4. drop every row holding an infinity or a textual inf/NaN marker,
       printing how many rows were removed per column and value;
    5. replace the value 'DrDoS_DNS' with 'DNS'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with string column names. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Stripping and renaming happen in one pass, so names that only match
    # their rename entry after stripping are renamed too
    ndf = df.rename(columns=_clean_column_name)

    # Only drop what is actually present so partially cleaned frames work too
    ndf = ndf.drop(columns=[col for col in _PRUNED_COLUMNS if col in ndf.columns])

    # drop missing values/NaN etc.
    ndf = ndf.dropna()

    # Remove rows with infinite or textual inf/NaN values that dropna() does
    # not recognise as missing
    for col in ndf.columns:
        for value in _INVALID_VALUES:
            bad = ndf[col] == value
            bad_count = int(bad.sum())
            if bad_count:
                print(f'deleting {bad_count} rows with {value} in column {col}')
                # Filter by mask rather than by index label so that duplicate
                # index labels cannot remove unrelated rows
                ndf = ndf[~bad]

    # Finally we shorten the attack label used by this file
    ndf = ndf.replace(['DrDoS_DNS'], 'DNS')

    return ndf