# Midterm - Network Request Detection
Datasets from [Web Network Traffic dataset](https://www.kaggle.com/datasets/rudrakumar96/web-firewall-good-and-bad-request/data).

## Importing Packages

In [97]:
import numpy as np
import pandas as pd
from sklearn import datasets, tree
from sklearn.utils import resample, shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler

## Read in and Evaluate Data

In [2]:
# Location of the raw CSV files, relative to this notebook
base_path = '../datasets/network_requests/'
# Files that are stacked together to form the train/test pool
combine_paths = ['2bad_reqff.csv', '2good_reqff.csv']
# Suspicious keywords; not referenced by any other cell shown in this notebook
# (presumably the source of the `badwords_count` feature -- TODO confirm)
badwords = ['sleep', 'uid', 'select', 'waitfor', 'delay', 'system', 'union', 'order by', 'group by', 'admin', 'drop', 'script']

# Separate file kept apart from the train/test pool
validation_raw = pd.read_csv(base_path + 'Testing_data.csv')

# Stack the listed files into one frame with a fresh 0..n-1 index
combined = pd.concat(
    [pd.read_csv(base_path + file_name) for file_name in combine_paths],
    ignore_index=True,
)

In [3]:
combined.head()

Unnamed: 0,method,path,body,single_q,double_q,dashes,braces,spaces,percentages,semicolons,angle_brackets,special_chars,path_length,body_length,badwords_count,class
0,POST,/doLogin,uid=ZAP&passw=ZAP&btnSubmit=Login,0,0,0,0,0,0,0,0,0,8,33,1,bad
1,POST,/sendFeedback,cfile=comments.txt&name=ZAP&email_addr=ZAP&sub...,0,0,0,0,7,0,0,0,0,13,124,0,bad
2,GET,/admin/clients.xls,,0,0,0,0,0,0,0,0,0,18,0,1,bad
3,GET,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,0,0,0,0,3,0,0,0,0,57,0,0,bad
4,GET,/my%20documents/JohnSmith/Bank%20Site%20Docume...,,0,0,0,0,3,0,0,0,0,82,0,0,bad


In [4]:
combined.describe()

Unnamed: 0,single_q,double_q,dashes,braces,spaces,percentages,semicolons,angle_brackets,special_chars,path_length,body_length,badwords_count
count,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0,5797.0
mean,0.795066,0.587373,0.183198,0.780231,4.159565,0.0,0.202691,0.27773,0.0,52.059686,51.558565,0.807314
std,1.787375,1.40276,0.427538,1.844078,6.078129,0.0,0.58989,0.91106,0.0,111.756514,116.9824,1.513214
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,33.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,7.0,0.0,0.0,0.0,0.0,69.0,86.0,1.0
max,12.0,14.0,2.0,10.0,40.0,0.0,3.0,8.0,0.0,2162.0,2224.0,10.0


In [5]:
combined.size

92752

In [6]:
# Find the count of nulls in each column
combined.isnull().sum(axis = 0)

method               0
path                 0
body              3396
single_q             0
double_q             0
dashes               0
braces               0
spaces               0
percentages          0
semicolons           0
angle_brackets       0
special_chars        0
path_length          0
body_length          0
badwords_count       0
class                0
dtype: int64

In [7]:
validation_raw.size

1740

## Engineer Data
We end up with:
- `data`
- `target`
- `target_names`
- `x_train_std`, `x_test_std`, `y_train_std`, and `y_test_std`
- `x_train_norm`, `x_test_norm`, `y_train_norm`, and `y_test_norm`
- `x_train_down`, `x_test_norm`, `y_train_down`, and `y_test_norm`
- `x_smoteenn`, `y_smoteenn`
- `x_oversampled`, `y_oversampled`
- `validation`

TODO
- [x] Try with normalized rather than standard scaler (Didn't seem to have any change)
- [x] Downsample
- [ ] Try Chi-square test to limit features
- [ ] Find confidence interval for significant results

### Remove Path and Body Columns

In [8]:
dropped = combined.drop(columns=['path', 'body'])
dropped.head()

Unnamed: 0,method,single_q,double_q,dashes,braces,spaces,percentages,semicolons,angle_brackets,special_chars,path_length,body_length,badwords_count,class
0,POST,0,0,0,0,0,0,0,0,0,8,33,1,bad
1,POST,0,0,0,0,7,0,0,0,0,13,124,0,bad
2,GET,0,0,0,0,0,0,0,0,0,18,0,1,bad
3,GET,0,0,0,0,3,0,0,0,0,57,0,0,bad
4,GET,0,0,0,0,3,0,0,0,0,82,0,0,bad


### Split Dataset into `data`, `target`, and `target_names` Variables

In [9]:
# Features: every column except the label
data = dropped.drop(columns='class')

# Single source of truth for the label encoding, so the integer codes and the
# display names used in the reports can never disagree.
class_codes = {'bad': 0, 'good': 1}
target = dropped['class'].map(class_codes)
# `.map` turns any label missing from `class_codes` into NaN; fail loudly instead
assert target.notna().all(), "unexpected value in 'class' column"

# Order the names by their integer code so target_names[i] describes label i.
# (Using dropped['class'].unique() would depend on row order in the CSV files.)
target_names = np.array(sorted(class_codes, key=class_codes.get))

print(target)
print(target_names)

0       0
1       0
2       0
3       0
4       0
       ..
5792    1
5793    1
5794    1
5795    1
5796    1
Name: class, Length: 5797, dtype: int64
['bad' 'good']


### Binary Encode 'method' Column

In [10]:
print(data['method'].unique())

['POST' 'GET']


In [11]:
# Encode the HTTP method as 0/1. The source is `dropped['method']` (never
# modified) rather than `data['method']`, so re-running this cell is safe:
# mapping the already-encoded 0/1 values again would turn them all into NaN.
data['method'] = dropped['method'].map({'GET': 0, 'POST': 1})
# `.map` yields NaN for any method other than GET/POST; surface that immediately
assert data['method'].notna().all(), "unexpected value in 'method' column"
data.head()

Unnamed: 0,method,single_q,double_q,dashes,braces,spaces,percentages,semicolons,angle_brackets,special_chars,path_length,body_length,badwords_count
0,1,0,0,0,0,0,0,0,0,0,8,33,1
1,1,0,0,0,0,7,0,0,0,0,13,124,0
2,0,0,0,0,0,0,0,0,0,0,18,0,1
3,0,0,0,0,0,3,0,0,0,0,57,0,0
4,0,0,0,0,0,3,0,0,0,0,82,0,0


### Normalize Count Columns (Both MinMax and Standard Scalers)

In [15]:
# NOTE(review): both scalers are fitted on the full `data` frame, i.e. before
# the train/test split performed in a later cell, so rows that end up in the
# test set contribute to the fitted statistics.

# Min-max scaling of every feature column
minmax_scaler = MinMaxScaler()
minmax = minmax_scaler.fit_transform(data)
print(minmax[0])

# Standard (zero-mean, unit-variance) scaling of every feature column
scaler = StandardScaler()
standard = scaler.fit_transform(data)
print(standard[0])

[1.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.00323924 0.01483813
 0.1       ]
[ 1.19098267 -0.44486184 -0.41876262 -0.4285327  -0.42313745 -0.68440867
  0.         -0.34363762 -0.30486888  0.         -0.39428115 -0.15865776
  0.12734653]


### Split Data into Train/Test (80/20)

In [16]:
x_train_std, x_test_std, y_train_std, y_test_std = train_test_split(standard, target, test_size=0.20, stratify=target, random_state=81)

In [18]:
x_train_norm, x_test_norm, y_train_norm, y_test_norm = train_test_split(minmax, target, test_size=0.20, stratify=target, random_state=81)

In [19]:
print(x_train_std.shape)
x_train_std

(4637, 13)


array([[-0.83964278, -0.44486184, -0.41876262, ..., -0.07212449,
        -0.44077581, -0.53355572],
       [-0.83964278, -0.44486184, -0.41876262, ..., -0.08107328,
        -0.44077581, -0.53355572],
       [ 1.19098267, -0.44486184,  0.2941792 , ..., -0.34953717,
         0.850128  ,  0.12734653],
       ...,
       [ 1.19098267, -0.44486184,  1.00712102, ..., -0.35848596,
        -0.00477519, -0.53355572],
       [-0.83964278,  1.79324975,  1.00712102, ...,  1.36863169,
        -0.44077581,  2.11005327],
       [-0.83964278,  1.79324975,  2.43300465, ...,  1.01067984,
        -0.44077581,  0.12734653]])

In [20]:
print(np.unique(y_train_std, return_counts=True))
print(np.unique(y_test_std, return_counts=True))

(array([0, 1]), array([4407,  230]))
(array([0, 1]), array([1103,   57]))


### Resample by Downsampling 'Bad' Class (Using MinMax Normalized Data)

In [105]:
print("Original Shape:", x_train_norm.shape)

# Boolean masks, one per class label
y_mask_0, y_mask_1 = (y_train_norm == 0), (y_train_norm == 1)
# Rows and labels of class 0, then of class 1
x_0, y_0 = x_train_norm[y_mask_0], y_train_norm[y_mask_0]
x_1, y_1 = x_train_norm[y_mask_1], y_train_norm[y_mask_1]

print("x_0 shape:", x_0.shape, "y_0 shape:", y_0.shape)
print("x_1 shape:", x_1.shape, "y_1 shape:", y_1.shape)

# Keep 500 class-0 rows, drawn without replacement (a bit more than class 1)
x_0_down, y_0_down = resample(x_0, y_0, replace=False, n_samples=500, random_state=39)

# Put the reduced class 0 back together with all of class 1, then shuffle
# features and labels with the same permutation
x_train_down, y_train_down = shuffle(
    np.concatenate([x_0_down, x_1]),
    np.concatenate([y_0_down, y_1]),
    random_state=72,
)
print(x_train_down.shape)

Original Shape: (4637, 13)
x_0 shape: (4407, 13) y_0 shape: (4407,)
x_1 shape: (230, 13) y_1 shape: (230,)
(730, 13)


### Resample Data with SMOTEENN

In [22]:
smoteenn = SMOTEENN(random_state=38)
x_smoteenn, y_smoteenn= smoteenn.fit_resample(x_train_std, y_train_std)

### Resample Data by Random Oversampling of the Minority Class

In [23]:
oversample = RandomOverSampler(sampling_strategy='minority', random_state=13, shrinkage=0.3)
x_oversampled, y_oversampled = oversample.fit_resample(x_train_std, y_train_std)

### Perform Same Operations on Validation Dataset (Unused)

In [24]:
# Drop 'path' and 'body' columns
validation = validation_raw.drop(columns=['path', 'body'])
# Bindary Encode 'method' column
validation['method'] = validation['method'].map({'GET': 0, 'POST': 1})
# Normalize Columns
val_scaler = StandardScaler()
val_scaler.fit(validation)
validation = val_scaler.transform(validation)
print(validation.shape)
validation

(116, 13)


array([[ 2.94392029,  0.        ,  0.        , ..., -1.50750536,
         0.67923546,  1.70722012],
       [ 2.94392029,  0.        ,  0.        , ..., -1.28875302,
         3.37878724, -0.44536177],
       [-0.33968311,  0.        ,  0.        , ..., -1.07000068,
        -0.29972288,  1.70722012],
       ...,
       [-0.33968311,  0.        ,  0.        , ..., -0.15124084,
        -0.29972288, -0.44536177],
       [-0.33968311,  0.        ,  0.        , ..., -0.67624646,
        -0.29972288, -0.44536177],
       [-0.33968311,  0.        ,  0.        , ..., -0.15124084,
        -0.29972288, -0.44536177]])

## K Nearest Neighbors (5 Methods)

In [25]:
def knn(x, y, x_test, y_test, names, neighbors):
    """Fit a k-nearest-neighbours classifier and print its test-set reports.

    Parameters:
        x, y: training features and labels.
        x_test, y_test: features and labels the fitted model is evaluated on.
        names: class display names passed to the classification report.
        neighbors: value used for `n_neighbors`.

    Prints the confusion matrix followed by the classification report;
    returns nothing.
    """
    classifier = KNeighborsClassifier(n_neighbors = neighbors).fit(x, y)
    predictions = classifier.predict(x_test)
    # The extra '\n' argument leaves a blank line between the two reports
    print(confusion_matrix(y_test, predictions), '\n')
    print(classification_report(y_test, predictions, target_names=names))

### Training with Raw Standard Scaler Data

In [26]:
knn(x_train_std, y_train_std, x_test_std, y_test_std, target_names, 10)

[[1098    5]
 [  23   34]] 

              precision    recall  f1-score   support

         bad       0.98      1.00      0.99      1103
        good       0.87      0.60      0.71        57

    accuracy                           0.98      1160
   macro avg       0.93      0.80      0.85      1160
weighted avg       0.97      0.98      0.97      1160



### Training with Raw Normalized Data

In [45]:
knn(x_train_norm, y_train_norm, x_test_norm, y_test_norm, target_names, 10)

[[1098    5]
 [  23   34]] 

              precision    recall  f1-score   support

         bad       0.98      1.00      0.99      1103
        good       0.87      0.60      0.71        57

    accuracy                           0.98      1160
   macro avg       0.93      0.80      0.85      1160
weighted avg       0.97      0.98      0.97      1160



### Training with SMOTEENN Data

In [27]:
knn(x_smoteenn, y_smoteenn, x_test_std, y_test_std, target_names, 10)

[[1052   51]
 [  12   45]] 

              precision    recall  f1-score   support

         bad       0.99      0.95      0.97      1103
        good       0.47      0.79      0.59        57

    accuracy                           0.95      1160
   macro avg       0.73      0.87      0.78      1160
weighted avg       0.96      0.95      0.95      1160



### Training with Oversampled Data

In [28]:
knn(x_oversampled, y_oversampled, x_test_std, y_test_std, target_names, 10)

[[1098    5]
 [  23   34]] 

              precision    recall  f1-score   support

         bad       0.98      1.00      0.99      1103
        good       0.87      0.60      0.71        57

    accuracy                           0.98      1160
   macro avg       0.93      0.80      0.85      1160
weighted avg       0.97      0.98      0.97      1160



### Training with Downsampled Data

In [106]:
knn(x_train_down, y_train_down, x_test_norm, y_test_norm, target_names, 10)

[[1014   89]
 [   1   56]] 

              precision    recall  f1-score   support

         bad       1.00      0.92      0.96      1103
        good       0.39      0.98      0.55        57

    accuracy                           0.92      1160
   macro avg       0.69      0.95      0.76      1160
weighted avg       0.97      0.92      0.94      1160



## Decision Trees (4 Methods)

In [29]:
def treeModel(x, y, x_test, y_test, names, seed = 1):
    """Fit a decision tree classifier and print its test-set reports.

    Parameters:
        x, y: training features and labels.
        x_test, y_test: features and labels the fitted model is evaluated on.
        names: class display names passed to the classification report.
        seed: `random_state` for the tree, so repeated runs build the same
            tree (the other seeded helpers in this notebook do the same).

    Prints the confusion matrix followed by the classification report;
    returns nothing.
    """
    # Train Model (seeded: without it the fitted tree can differ between runs)
    tree_model = tree.DecisionTreeClassifier(random_state = seed)
    tree_model.fit(x, y)
    # Make Predictions
    trees_pred = tree_model.predict(x_test)
    # Get Reports on Test
    print(confusion_matrix(y_test, trees_pred))
    print(classification_report(y_test, trees_pred, target_names=names))

### Train with Raw Data

In [30]:
treeModel(x_train_std, y_train_std, x_test_std, y_test_std, target_names)

[[1093   10]
 [  21   36]]
              precision    recall  f1-score   support

         bad       0.98      0.99      0.99      1103
        good       0.78      0.63      0.70        57

    accuracy                           0.97      1160
   macro avg       0.88      0.81      0.84      1160
weighted avg       0.97      0.97      0.97      1160



### Train with SMOTEENN Data

In [31]:
treeModel(x_smoteenn, y_smoteenn, x_test_std, y_test_std, target_names)

[[1051   52]
 [  29   28]]
              precision    recall  f1-score   support

         bad       0.97      0.95      0.96      1103
        good       0.35      0.49      0.41        57

    accuracy                           0.93      1160
   macro avg       0.66      0.72      0.69      1160
weighted avg       0.94      0.93      0.94      1160



### Train with Oversampled Data

In [32]:
treeModel(x_oversampled, y_oversampled, x_test_std, y_test_std, target_names)

[[1094    9]
 [  21   36]]
              precision    recall  f1-score   support

         bad       0.98      0.99      0.99      1103
        good       0.80      0.63      0.71        57

    accuracy                           0.97      1160
   macro avg       0.89      0.81      0.85      1160
weighted avg       0.97      0.97      0.97      1160



### Train with Downsampled Data

In [107]:
treeModel(x_train_down, y_train_down, x_test_norm, y_test_norm, target_names)

[[1020   83]
 [   6   51]]
              precision    recall  f1-score   support

         bad       0.99      0.92      0.96      1103
        good       0.38      0.89      0.53        57

    accuracy                           0.92      1160
   macro avg       0.69      0.91      0.75      1160
weighted avg       0.96      0.92      0.94      1160



## Logistic Regression (3 Methods)

In [33]:
def logres(x, y, x_test, y_test, names):
    """Fit a logistic regression model and print its test-set reports.

    Parameters:
        x, y: training features and labels.
        x_test, y_test: features and labels the fitted model is evaluated on.
        names: class display names passed to the classification report.

    Prints the confusion matrix followed by the classification report;
    returns nothing.
    """
    classifier = LogisticRegression(random_state=38, max_iter=1000)
    classifier.fit(x, y)
    predictions = classifier.predict(x_test)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions, target_names=names))

### Training with Raw Data

In [34]:
logres(x_train_std, y_train_std, x_test_std, y_test_std, target_names)

[[1103    0]
 [  57    0]]
              precision    recall  f1-score   support

         bad       0.95      1.00      0.97      1103
        good       0.00      0.00      0.00        57

    accuracy                           0.95      1160
   macro avg       0.48      0.50      0.49      1160
weighted avg       0.90      0.95      0.93      1160



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Training with SMOTEENN Data

In [35]:
logres(x_smoteenn, y_smoteenn, x_test_std, y_test_std, target_names)

[[945 158]
 [  0  57]]
              precision    recall  f1-score   support

         bad       1.00      0.86      0.92      1103
        good       0.27      1.00      0.42        57

    accuracy                           0.86      1160
   macro avg       0.63      0.93      0.67      1160
weighted avg       0.96      0.86      0.90      1160



### Training with Oversampled Data

In [36]:
logres(x_oversampled, y_oversampled, x_test_std, y_test_std, target_names)

[[907 196]
 [  0  57]]
              precision    recall  f1-score   support

         bad       1.00      0.82      0.90      1103
        good       0.23      1.00      0.37        57

    accuracy                           0.83      1160
   macro avg       0.61      0.91      0.64      1160
weighted avg       0.96      0.83      0.88      1160



## Support Vector Machines (5 Methods)

In [37]:
def svm(x, y, x_test, y_test, names):
    """Fit a support vector classifier (default settings) and print its reports.

    Parameters:
        x, y: training features and labels.
        x_test, y_test: features and labels the fitted model is evaluated on.
        names: class display names passed to the classification report.

    Prints the confusion matrix followed by the classification report;
    returns nothing.
    """
    classifier = SVC().fit(x, y)
    predictions = classifier.predict(x_test)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions, target_names=names))

### Train with Raw Standard Scaler Data

In [38]:
svm(x_train_std, y_train_std, x_test_std, y_test_std, target_names)

[[1103    0]
 [  57    0]]
              precision    recall  f1-score   support

         bad       0.95      1.00      0.97      1103
        good       0.00      0.00      0.00        57

    accuracy                           0.95      1160
   macro avg       0.48      0.50      0.49      1160
weighted avg       0.90      0.95      0.93      1160



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Train with Raw MinMax Scaler Data

In [109]:
svm(x_train_norm, y_train_norm, x_test_norm, y_test_norm, target_names)

[[1103    0]
 [  57    0]]
              precision    recall  f1-score   support

         bad       0.95      1.00      0.97      1103
        good       0.00      0.00      0.00        57

    accuracy                           0.95      1160
   macro avg       0.48      0.50      0.49      1160
weighted avg       0.90      0.95      0.93      1160



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Train with SMOTEENN Data

In [39]:
svm(x_smoteenn, y_smoteenn, x_test_std, y_test_std, target_names)

[[927 176]
 [  0  57]]
              precision    recall  f1-score   support

         bad       1.00      0.84      0.91      1103
        good       0.24      1.00      0.39        57

    accuracy                           0.85      1160
   macro avg       0.62      0.92      0.65      1160
weighted avg       0.96      0.85      0.89      1160



### Train with Oversampled Data

In [40]:
svm(x_oversampled, y_oversampled, x_test_std, y_test_std, target_names)

[[903 200]
 [  0  57]]
              precision    recall  f1-score   support

         bad       1.00      0.82      0.90      1103
        good       0.22      1.00      0.36        57

    accuracy                           0.83      1160
   macro avg       0.61      0.91      0.63      1160
weighted avg       0.96      0.83      0.87      1160



### Train with Downsampled Data

In [108]:
svm(x_train_down, y_train_down, x_test_norm, y_test_norm, target_names)

[[810 293]
 [  0  57]]
              precision    recall  f1-score   support

         bad       1.00      0.73      0.85      1103
        good       0.16      1.00      0.28        57

    accuracy                           0.75      1160
   macro avg       0.58      0.87      0.56      1160
weighted avg       0.96      0.75      0.82      1160



## Artificial Neural Networks (5 Methods)

In [41]:
def ann(x, y, x_test, y_test, names, layers = (100,), seed = 1, epochs = 1000):
    """Fit a multi-layer perceptron classifier and print its test-set reports.

    Parameters:
        x, y: training features and labels.
        x_test, y_test: features and labels the fitted model is evaluated on.
        names: class display names passed to the classification report.
        layers: value used for `hidden_layer_sizes`.
        seed: value used for `random_state`.
        epochs: value used for `max_iter`.

    Prints the confusion matrix, the classification report and the model's
    score on the test data; returns nothing.
    """
    network = MLPClassifier(
        random_state = seed,
        max_iter = epochs,
        hidden_layer_sizes = layers,
    )
    network.fit(x, y)
    predictions = network.predict(x_test)
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions, target_names=names))
    print(network.score(x_test, y_test))

### Training with Raw Standard Scaler Data

In [42]:
ann(x_train_std, y_train_std, x_test_std, y_test_std, target_names)

[[1103    0]
 [  57    0]]
              precision    recall  f1-score   support

         bad       0.95      1.00      0.97      1103
        good       0.00      0.00      0.00        57

    accuracy                           0.95      1160
   macro avg       0.48      0.50      0.49      1160
weighted avg       0.90      0.95      0.93      1160

0.9508620689655173


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Training with Raw MinMax Scaler Data

In [46]:
ann(x_train_norm, y_train_norm, x_test_norm, y_test_norm, target_names)

[[1103    0]
 [  57    0]]
              precision    recall  f1-score   support

         bad       0.95      1.00      0.97      1103
        good       0.00      0.00      0.00        57

    accuracy                           0.95      1160
   macro avg       0.48      0.50      0.49      1160
weighted avg       0.90      0.95      0.93      1160

0.9508620689655173


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Training with SMOTEENN Data 

In [43]:
ann(x_smoteenn, y_smoteenn, x_test_std, y_test_std, target_names)

[[1024   79]
 [   9   48]]
              precision    recall  f1-score   support

         bad       0.99      0.93      0.96      1103
        good       0.38      0.84      0.52        57

    accuracy                           0.92      1160
   macro avg       0.68      0.89      0.74      1160
weighted avg       0.96      0.92      0.94      1160

0.9241379310344827


### Training with Random Oversampled Data

In [44]:
ann(x_oversampled, y_oversampled, x_test_std, y_test_std, target_names)

[[936 167]
 [  0  57]]
              precision    recall  f1-score   support

         bad       1.00      0.85      0.92      1103
        good       0.25      1.00      0.41        57

    accuracy                           0.86      1160
   macro avg       0.63      0.92      0.66      1160
weighted avg       0.96      0.86      0.89      1160

0.8560344827586207


### Training with Downsampled Data

In [101]:
ann(x_train_down, y_train_down, x_test_norm, y_test_norm, target_names)

[[951 152]
 [  0  57]]
              precision    recall  f1-score   support

         bad       1.00      0.86      0.93      1103
        good       0.27      1.00      0.43        57

    accuracy                           0.87      1160
   macro avg       0.64      0.93      0.68      1160
weighted avg       0.96      0.87      0.90      1160

0.8689655172413793
