# K-Nearest Neighbors

In [22]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from COMP8085_Project1.scripts import preprocess as ref

# Read the balanced UNSW-NB15 training CSV. The file is decoded as
# ISO-8859-1; low_memory=False has pandas parse the file in a single pass
# rather than in chunks, which avoids mixed-dtype column inference.
original_csv = pd.read_csv(
    "../data/UNSW-NB15-BALANCED-TRAIN.csv",
    encoding="ISO-8859-1",
    low_memory=False,
)

# Hand the raw frame to the project's preprocessing helper. Per the original
# note here, it changes the port and null values and factorizes the data;
# its implementation lives outside this notebook.
df = ref.preprocess_data(original_csv)

## 'Label' Feature

In [21]:
# First cut: 70 % of the preprocessed rows become the training split and the
# remaining 30 % are held out. Rows are shuffled with a fixed seed and the
# split is stratified on 'attack_cat', so both parts keep the same
# attack-category proportions.
train_df, validate_test_df = train_test_split(
    df,
    train_size=0.7,
    shuffle=True,
    stratify=df["attack_cat"],
    random_state=32,
)

# Second cut: halve the held-out rows into validation and test splits,
# again stratified on 'attack_cat' (overall ratio 0.70 : 0.15 : 0.15).
holdout_strata = validate_test_df["attack_cat"]
validate_df, test_df = train_test_split(
    validate_test_df,
    train_size=0.5,
    shuffle=True,
    stratify=holdout_strata,
    random_state=34,
)

In [20]:
# Columns that are never fed to the model: 'srcip' and 'dstip' (source and
# destination addresses, going by their names) plus both target columns.
non_feature_columns = ['srcip', 'dstip', 'Label', 'attack_cat']

# For every split the 'Label' column is the target and all remaining
# columns are the features.
X_train = train_df.drop(columns=non_feature_columns)
Y_label_train = train_df["Label"]

X_test = test_df.drop(columns=non_feature_columns)
Y_label_test = test_df["Label"]

X_validate = validate_df.drop(columns=non_feature_columns)
Y_label_validate = validate_df["Label"]

In [19]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to all three splits so the test
# and validation rows never influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled, X_validate_scaled = (
    scaler.transform(split) for split in (X_train, X_test, X_validate)
)

In [32]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier


# The final model is fitted on the training and validation rows together;
# the test split stays untouched until the report below.
X_combined = np.concatenate((X_train_scaled, X_validate_scaled), axis=0)
Y_combined = np.concatenate((Y_label_train, Y_label_validate), axis=0)

# KNN configuration: 9 neighbours, Manhattan distance, neighbours weighted
# by the inverse of their distance, neighbour search spread over 5 jobs.
knn_settings = {
    "n_neighbors": 9,
    "weights": 'distance',
    "metric": 'manhattan',
    "n_jobs": 5,
}
final_model = KNeighborsClassifier(**knn_settings)
final_model.fit(X_combined, Y_combined)

# Predict the 'Label' value for every held-out test row.
Y_test_pred = final_model.predict(X_test_scaled)

# Per-class precision / recall / F1 restricted to labels 0 and 1;
# zero_division=0 reports 0 (without a warning) for any undefined ratio.
label_report = classification_report(
    Y_label_test,
    Y_test_pred,
    labels=[0, 1],
    zero_division=0,
)
print("\n==========Label Scores for all features==========")
print(label_report)


              precision    recall  f1-score   support

           0       1.00      0.99      0.99     33735
           1       0.99      1.00      0.99     33735

    accuracy                           0.99     67470
   macro avg       0.99      0.99      0.99     67470
weighted avg       0.99      0.99      0.99     67470


## 'Attack Cat' Feature

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from COMP8085_Project1.scripts import preprocess as ref

# Load the balanced UNSW-NB15 training CSV (ISO-8859-1 encoded).
# low_memory=False makes pandas parse the whole file at once instead of in
# chunks, so column dtypes are inferred consistently.
csv_path = "../data/UNSW-NB15-BALANCED-TRAIN.csv"
original_csv = pd.read_csv(csv_path, encoding="ISO-8859-1", low_memory=False)

# Run the project's preprocessing helper on the raw frame. According to the
# original note here it changes the port and null values and factorizes the
# data; the helper itself is defined outside this notebook.
df = ref.preprocess_data(original_csv)
# Stage 1: keep 70 % of the rows for training and hold out the other 30 %.
# The rows are shuffled with a fixed seed and stratified on 'attack_cat' so
# each part preserves the attack-category proportions.
stage_one = train_test_split(
    df,
    train_size=0.7,
    shuffle=True,
    stratify=df["attack_cat"],
    random_state=32,
)
train_df, validate_test_df = stage_one

# Stage 2: divide the held-out rows evenly into validation and test splits,
# stratified on 'attack_cat' as well (0.70 : 0.15 : 0.15 overall).
stage_two = train_test_split(
    validate_test_df,
    train_size=0.5,
    shuffle=True,
    stratify=validate_test_df["attack_cat"],
    random_state=34,
)
validate_df, test_df = stage_two

In [2]:
# Columns excluded from the model inputs: 'srcip' and 'dstip' (source and
# destination addresses, judging by their names) and the two target columns.
excluded_columns = ['srcip', 'dstip', 'Label', 'attack_cat']

# Targets: the 'attack_cat' column of each split.
Y_attack_train = train_df["attack_cat"]
Y_attack_test = test_df["attack_cat"]
Y_attack_validate = validate_df["attack_cat"]

# Features: everything that is left after removing the excluded columns.
X_train = train_df.drop(columns=excluded_columns)
X_test = test_df.drop(columns=excluded_columns)
X_validate = validate_df.drop(columns=excluded_columns)

In [3]:
from sklearn.preprocessing import StandardScaler

# Feature scaling: the scaler's statistics come from the training split
# alone; the validation and test splits are only transformed with them.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)


In [4]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# Fit the final model on the training and validation rows together; the
# test split is reserved for the report below.
X_combined = np.concatenate((X_train_scaled, X_validate_scaled), axis=0)
Y_combined = np.concatenate((Y_attack_train, Y_attack_validate), axis=0)

# KNN configuration: 5 neighbours, Manhattan distance, neighbours weighted
# by the inverse of their distance, neighbour search spread over 5 jobs.
knn_settings = {
    "n_neighbors": 5,
    "weights": 'distance',
    "metric": 'manhattan',
    "n_jobs": 5,
}
final_model = KNeighborsClassifier(**knn_settings)
final_model.fit(X_combined, Y_combined)

# Predict the attack category of every held-out test row.
Y_test_pred = final_model.predict(X_test_scaled)

# No `labels=` argument is passed, so the report covers every class that
# occurs in the true or the predicted labels. zero_division=0 reports 0
# (without a warning) for classes whose precision or recall is undefined.
attack_report = classification_report(
    Y_attack_test,
    Y_test_pred,
    zero_division=0,
)
print("\n==========Attack Category Scores for all features==========")
print(attack_report)


              precision    recall  f1-score   support

           0       1.00      0.99      0.99     33735
           1       1.00      0.98      0.99     22626
           2       0.67      0.81      0.73      2545
           3       0.60      0.77      0.67      4674
           4       0.31      0.21      0.25      1713
           5       0.65      0.67      0.66      1477
           6       0.31      0.02      0.04       247
           7       0.51      0.09      0.16       279
           8       0.52      0.22      0.31       157
           9       0.00      0.00      0.00        17

    accuracy                           0.93     67470
   macro avg       0.56      0.48      0.48     67470
weighted avg       0.93      0.93      0.92     67470
