# Midterm - Network Request Detection
The data comes from the [Web Network Traffic dataset](https://www.kaggle.com/datasets/rudrakumar96/web-firewall-good-and-bad-request/data) on Kaggle.

## Importing Packages

In [63]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

## Read in and Evaluate Data

In [53]:
# Folder holding the CSV files, and the two labelled request files to merge.
base_path = '../datasets/network_requests/'
combine_paths = ['2bad_reqff.csv', '2good_reqff.csv']

# Keywords to look for in request paths/bodies (not used by the active cells shown here).
badwords = ['sleep', 'uid', 'select', 'waitfor', 'delay', 'system', 'union', 'order by', 'group by', 'admin', 'drop', 'script']

# Separate CSV loaded on its own; it is not part of `combined`.
test_validate = pd.read_csv(base_path + 'Testing_data.csv')

# Read every labelled file and stack the frames under a fresh 0..n-1 index.
combined = pd.concat(
    [pd.read_csv(base_path + filename) for filename in combine_paths],
    ignore_index=True,
)

In [54]:
# Preview the first rows of the merged frame.
combined.head()

Unnamed: 0,method,path,body,single_q,double_q,dashes,braces,spaces,percentages,semicolons,angle_brackets,special_chars,path_length,body_length,badwords_count,class
0,POST,/doLogin,uid=ZAP&passw=ZAP&btnSubmit=Login,0,0,0,0,0,0,0,0,0,8,33,1,bad
1,POST,/sendFeedback,cfile=comments.txt&name=ZAP&email_addr=ZAP&sub...,0,0,0,0,7,0,0,0,0,13,124,0,bad
2,GET,/admin/clients.xls,,0,0,0,0,0,0,0,0,0,18,0,1,bad
3,GET,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,0,0,0,0,3,0,0,0,0,57,0,0,bad
4,GET,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,0,0,0,0,3,0,0,0,0,82,0,0,bad


In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
combined.describe()

Unnamed: 0,single_q,double_q,dashes,braces,spaces,percentages,semicolons,angle_brackets,special_chars,path_length,body_length,badwords_count
count,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0
mean,0.795066,0.587373,0.183198,0.780231,4.159565,0.0,0.202691,0.27773,0.0,52.059686,51.558565,0.807314
std,1.787375,1.40276,0.427538,1.844078,6.078129,0.0,0.58989,0.91106,0.0,111.756514,116.9824,1.513214
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,33.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,7.0,0.0,0.0,0.0,0.0,69.0,86.0,1.0
max,12.0,14.0,2.0,10.0,40.0,0.0,3.0,8.0,0.0,2162.0,2224.0,10.0


In [7]:
# `.size` is the total number of cells (rows x columns), not the row count;
# `.shape` gives the two dimensions separately.
combined.size

92752

In [8]:
# Count the missing values in each column (axis=0 sums down the rows of every column).
combined.isnull().sum(axis = 0)

method               0
path                 0
body              3396
single_q             0
double_q             0
dashes               0
braces               0
spaces               0
percentages          0
semicolons           0
angle_brackets       0
special_chars        0
path_length          0
body_length          0
badwords_count       0
class                0
dtype: int64

In [9]:
# Total number of cells in the Testing_data.csv frame. No cell above defines
# `testing`; that file is loaded under the name `test_validate`, so use it here
# to keep the notebook runnable from a fresh kernel.
test_validate.size

1740

## Engineer Data

In [86]:
# Loop through badwords for each sample
# Return 1 if any badwords found in the path or body values
# NOTE(review): left disabled. As written, the callable passed to `assign` receives
# the whole DataFrame, so `word in x.path` tests membership in the Series index
# rather than in each row's text, and `any(...)` produces a single scalar for all
# rows. `body` also contains nulls. A row-wise version would need the `.str`
# accessor (e.g. `str.contains`) — TODO confirm whether this feature is still
# wanted given the existing `badwords_count` column.
# combined_keywords = combined.assign(badword_found = lambda x: 1 if any((word in x.path) or (word in x.body) for word in badwords) else 0)
# combined_keywords.head()

### Remove Path and Body Columns

In [10]:
# Discard the two free-text request fields; every other column is kept as-is.
dropped = combined.drop(labels=['path', 'body'], axis='columns')
dropped.head()

Unnamed: 0,method,single_q,double_q,dashes,braces,spaces,percentages,semicolons,angle_brackets,special_chars,path_length,body_length,badwords_count,class
0,POST,0,0,0,0,0,0,0,0,0,8,33,1,bad
1,POST,0,0,0,0,7,0,0,0,0,13,124,0,bad
2,GET,0,0,0,0,0,0,0,0,0,18,0,1,bad
3,GET,0,0,0,0,3,0,0,0,0,57,0,0,bad
4,GET,0,0,0,0,3,0,0,0,0,82,0,0,bad


### Split Dataset into `data`, `target`, and `target_names` Variables

In [79]:
# Single source of truth for the label encoding, so the integer codes and the
# display names cannot drift apart.
class_codes = {'bad': 0, 'good': 1}

# Features are every column except the label.
data = dropped.drop(columns='class')
target = dropped['class'].map(class_codes)

# Series.map yields NaN for any label missing from the mapping; fail loudly
# instead of handing NaN targets to the classifier.
unmapped = dropped.loc[target.isna(), 'class'].unique()
assert len(unmapped) == 0, f"unmapped class labels: {unmapped}"

# Order the names by their code so that target_names[i] names class i.
# (`unique()` returns labels in order of first appearance, which only matches
# the encoding when the 'bad' rows happen to come first.)
target_names = np.array(sorted(class_codes, key=class_codes.get), dtype=object)

print(target)
print(target_names)

0       0
1       0
2       0
3       0
4       0
       ..
5792    1
5793    1
5794    1
5795    1
5796    1
Name: class, Length: 5797, dtype: int64
['bad' 'good']


### Binary Encode 'method' Column

In [80]:
# List the distinct HTTP methods present, to decide how to encode the column.
print(data['method'].unique())

['POST' 'GET']


In [81]:
# Encode the HTTP method as 0 (GET) / 1 (POST).
# The strings are read from `dropped`, which still holds the original values and
# shares its index with `data`, so re-running this cell gives the same result.
# Mapping `data['method']` onto itself would turn the already-encoded 0/1 values
# into NaN on a second run.
data = data.assign(method=dropped['method'].map({'GET': 0, 'POST': 1}))
data.head()

Unnamed: 0,method,single_q,double_q,dashes,braces,spaces,percentages,semicolons,angle_brackets,special_chars,path_length,body_length,badwords_count
0,1,0,0,0,0,0,0,0,0,0,8,33,1
1,1,0,0,0,0,7,0,0,0,0,13,124,0
2,0,0,0,0,0,0,0,0,0,0,18,0,1
3,0,0,0,0,0,3,0,0,0,0,57,0,0
4,0,0,0,0,0,3,0,0,0,0,82,0,0


### Standardize Feature Columns

In [82]:
# Standardize every feature column and keep the fitted scaler for later reuse.
# After this cell `data` is a NumPy array, no longer a DataFrame.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-row statistics influence the scaling — consider fitting on the
# training split only.
scaler = StandardScaler()
data = scaler.fit_transform(data)
data

array([[ 1.19098267, -0.44486184, -0.41876262, ..., -0.39428115,
        -0.15865776,  0.12734653],
       [ 1.19098267, -0.44486184, -0.41876262, ..., -0.34953717,
         0.61930414, -0.53355572],
       [-0.83964278, -0.44486184, -0.41876262, ..., -0.30479319,
        -0.44077581,  0.12734653],
       ...,
       [-0.83964278, -0.44486184, -0.41876262, ..., -0.11686847,
        -0.44077581, -0.53355572],
       [-0.83964278, -0.44486184, -0.41876262, ..., -0.22425402,
        -0.44077581, -0.53355572],
       [-0.83964278, -0.44486184, -0.41876262, ..., -0.11686847,
        -0.44077581, -0.53355572]])

### Split Data into Train/Test (80/20)

In [83]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.20,
    random_state=81,
)

In [85]:
# Inspect the scaled training feature matrix (NumPy abbreviates the repr of large arrays).
x_train

array([[ 1.19098267,  0.67419396, -0.41876262, ..., -0.34953717,
         1.39726604, -0.53355572],
       [ 1.19098267,  0.67419396,  0.2941792 , ..., -0.39428115,
         0.10636223,  1.44915102],
       [-0.83964278,  0.11466606, -0.41876262, ...,  0.21423699,
        -0.44077581,  0.78824877],
       ...,
       [ 1.19098267, -0.44486184,  0.2941792 , ..., -0.35848596,
         0.01232288,  0.12734653],
       [-0.83964278,  1.79324975, -0.41876262, ...,  0.67957439,
        -0.44077581,  0.12734653],
       [-0.83964278, -0.44486184, -0.41876262, ..., -0.37638356,
        -0.44077581, -0.53355572]])

## K Nearest Neighbors