In [None]:
from time import time
from os.path import exists
from os import mkdir
from numpy import unique
from numpy.random import rand
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import *

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

First, we must import the datasets we plan to use. Be sure to confirm that the training dataset is larger than the testing dataset. Download the datasets from https://www.kaggle.com/datasets/mrwellsdavid/unsw-nb15.

In [None]:
# Read both CSV files from the current working directory into DataFrames.
trainDS = pd.read_csv("UNSW_NB15_training-set.csv")
testDS = pd.read_csv("UNSW_NB15_testing-set.csv")

In [None]:
# Preview the first five rows of the training frame.
trainDS.head(5)

In [None]:
# Preview the first five rows of the testing frame.
testDS.head(5)

In [None]:
# (rows, columns) of the training frame.
trainDS.shape

In [None]:
# (rows, columns) of the testing frame.
testDS.shape

To see the disparities in the different categories, we can count all the categories in each dataset.

In [None]:
def CountCategories(dataset, datasetLabel):
    """Print how often each ``attack_cat`` value occurs in ``dataset``.

    The report consists of a header naming ``datasetLabel``, the list of
    distinct categories, one ``category: count`` line per category (in order
    of first appearance) and a final ``Total`` line.

    Args:
        dataset: DataFrame that contains an ``attack_cat`` column.
        datasetLabel: Text inserted into the header line.

    Returns:
        None. The report is written to stdout.
    """
    print(f'\n\nFrequency Statistics of {datasetLabel} dataset...')
    print("------------------------------------------------------")
    categories = dataset["attack_cat"]
    uniqueCat = list(categories.unique())
    print(uniqueCat)

    # One vectorised pass instead of comparing every row against every
    # category. value_counts() skips missing values, so after reindexing on
    # the first-appearance order a missing-value category is reported as 0.
    counts = categories.value_counts().reindex(uniqueCat, fill_value=0)

    for category, count in counts.items():
        print(f'{category}: {count}')

    print(f'Total: {int(counts.sum())}')

In [None]:
# Print the attack_cat frequency report for the training frame.
CountCategories(trainDS, "Training")

In [None]:
# Print the attack_cat frequency report for the testing frame.
CountCategories(testDS, "Testing")

The datasets have been imported, so now it is time to explore what they contain. First, let's enumerate all string values, i.e. map each distinct string to an integer code.

In [None]:
def CreateNumCol(column, index, DF, uniqueItems):
    """Insert an integer-coded copy of a column directly after it, in place.

    Every value of the column at position ``index`` is translated to its
    position in ``uniqueItems``; the codes are stored in a new column named
    ``column + "Num"`` at position ``index + 1``.

    Args:
        column: Name of the source column; used only to name the new column.
        index: Positional index of the source column in ``DF``.
        DF: DataFrame that is modified in place.
        uniqueItems: Sequence of hashable values whose positions become the
            integer codes.

    Returns:
        None.
    """
    # Build the lookup once instead of scanning uniqueItems for every row.
    # setdefault keeps the first position of a repeated item, matching a
    # left-to-right search. Items that are not equal to themselves (NaN) can
    # never match a row value, so they get no entry.
    codeByItem = {}
    for code, item in enumerate(uniqueItems):
        if item == item:
            codeByItem.setdefault(item, code)

    # Values without an entry fall back to 0, i.e. they share the code of
    # uniqueItems[0]; callers should pass every value that occurs.
    codes = [codeByItem.get(value, 0) for value in DF.iloc[:, index].tolist()]

    # NOTE(review): allow_duplicates=True means that calling this twice for
    # the same column inserts a second "<column>Num" column rather than
    # raising - confirm that this is intended.
    DF.insert(index + 1, column + "Num", codes, allow_duplicates=True)

In [None]:
def AddNumCol(column, trainDS, testDS):
    """Add a ``<column>Num`` integer-coded column to both frames, in place.

    The code table is shared: values seen in the training frame come first
    (in order of first appearance), followed by values that occur only in the
    testing frame, so the same value gets the same code in both frames.

    Args:
        column: Name of the column to encode; must exist in both frames.
        trainDS: Training DataFrame, modified in place.
        testDS: Testing DataFrame, modified in place.

    Returns:
        None.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order.
    uniqueItems = list(dict.fromkeys(
        list(trainDS[column].unique()) + list(testDS[column].unique())
    ))

    # Resolve the column position per frame so that a testing frame whose
    # columns are ordered differently is still read from the right column.
    CreateNumCol(column, trainDS.columns.get_loc(column), trainDS, uniqueItems)
    CreateNumCol(column, testDS.columns.get_loc(column), testDS, uniqueItems)

In [None]:
# Encode the three text columns as integers in both frames.
# NOTE: AddNumCol modifies trainDS/testDS in place; running this cell again on
# the same frames inserts the *Num columns a second time.
AddNumCol("proto", trainDS, testDS)
AddNumCol("service", trainDS, testDS)
AddNumCol("state", trainDS, testDS)

In [None]:
# Preview the first 20 training rows, now including the *Num columns.
trainDS.head(20)

In [None]:
# Preview the first 20 testing rows, now including the *Num columns.
testDS.head(20)

Now that all string values are enumerated, we can continue wrangling. Let's get rid of the columns we do not need in both tables.

In [None]:
# Columns removed from both frames before modeling. New frames are created;
# trainDS and testDS themselves are left untouched.
unneededCol = ['id', 'proto', 'service', 'state', 'attack_cat']
purgedTrainDS = trainDS.drop(columns=unneededCol)
purgedTestDS = testDS.drop(columns=unneededCol)

This leaves only numerical values in both of our datasets. We can now start normalizing the data to make it easier for the machine learning models to work with. For this specific dataset, we choose between MaxAbsScaler and MinMaxScaler.

In [None]:
# List the dtype of every remaining column.
purgedTrainDS.dtypes

In [None]:
# Column labels of the purged training frame.
PTDS = list(purgedTrainDS)

# Features are every column except the last one; the target is the last column.
xTrain, yTrain = purgedTrainDS.iloc[:, :-1], purgedTrainDS.iloc[:, -1]
xTest, yTest = purgedTestDS.iloc[:, :-1], purgedTestDS.iloc[:, -1]

In [None]:
# Each scaler is fitted on the training features only; the fitted scaling is
# then applied to both the training and the testing features.
minMax = MinMaxScaler().fit(xTrain)
mmTrain, mmTest = minMax.transform(xTrain), minMax.transform(xTest)

maxAbs = MaxAbsScaler().fit(xTrain)
maTrain, maTest = maxAbs.transform(xTrain), maxAbs.transform(xTest)

Our data has now been pruned and standardized, and is ready for modeling. The function below generates our confusion matrices.

In [None]:
def CreateCM(yTest, yPred, fullName, path, clf):
    """Plot the confusion matrix of one classifier and save it as a PNG.

    The image is written to ``path + fullName + "_CM.png"``. If that file
    already exists, a numeric prefix (``1_``, ``2_``, ...) is added until an
    unused name is found, so earlier images are never overwritten.

    Args:
        yTest: True labels.
        yPred: Predicted labels.
        fullName: Label used as the plot title and inside the file name.
        path: Prefix prepended verbatim to the file name (the caller supplies
            any trailing separator).
        clf: Fitted classifier; its ``classes_`` fix the row/column order and
            the tick labels.

    Returns:
        None.
    """
    cm = confusion_matrix(yTest, yPred, labels=clf.classes_)
    cmDisplay = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
    cmDisplay.plot()
    # Title the plot so the image identifies its model without the file name.
    cmDisplay.ax_.set_title(fullName)

    fileName = path + fullName + "_CM.png"
    cloneNum = 0
    while exists(fileName):
        cloneNum += 1
        fileName = path + str(cloneNum) + "_" + fullName + "_CM.png"

    # Save through the display's own figure instead of pyplot's "current
    # figure", so the right figure is written whatever else is open.
    cmDisplay.figure_.savefig(fileName, dpi=300)

The function below computes all the metrics we will compare.

In [None]:
def ResultsReturn(yTest, yPred, name, clf, elapsed):
    """Collect the evaluation metrics of one classifier as a flat list.

    Precision, recall and F1 are macro-averaged when ``yTest`` holds more than
    two distinct labels and use binary averaging otherwise. All ratio metrics
    are expressed as percentages rounded to two decimals.

    Args:
        yTest: True labels.
        yPred: Predicted labels.
        name: Display name stored in the first field.
        clf: Classifier; only its ``str()`` representation is stored.
        elapsed: Training time, passed through unchanged.

    Returns:
        list: ``[name, str(clf), distinct label count, correct predictions,
        incorrect predictions, accuracy, balanced accuracy, precision,
        recall, F1, zero-one loss, elapsed]``.
    """
    def asPercent(score):
        # Ratio in [0, 1] -> percentage with two decimals.
        return round(score * 100, 2)

    uniCount = len(unique(yTest))
    averaging = "macro" if uniCount > 2 else "binary"

    correct = accuracy_score(yTest, yPred, normalize=False)
    incorrect = zero_one_loss(yTest, yPred, normalize=False)

    return [
        name,
        str(clf),
        uniCount,
        correct,
        incorrect,
        asPercent(accuracy_score(yTest, yPred)),
        asPercent(balanced_accuracy_score(yTest, yPred)),
        asPercent(precision_score(yTest, yPred, average=averaging)),
        asPercent(recall_score(yTest, yPred, average=averaging)),
        asPercent(f1_score(yTest, yPred, average=averaging)),
        asPercent(zero_one_loss(yTest, yPred)),
        elapsed,
    ]

The function below writes all results to a CSV file.

In [None]:
def ExportCSV(allResults, fullName, path):
    """Write the collected result rows to a CSV file without overwriting.

    The target is ``<path><fullName>.csv``; when that file already exists a
    numeric prefix (``1_``, ``2_``, ...) is tried until a free name is found.

    Args:
        allResults: Rows of twelve values each, in the column order below.
        fullName: Base name of the CSV file.
        path: Prefix prepended verbatim to the file name.

    Returns:
        None.
    """
    scoreListLabels = [
        "Name", "Classifier", "Unique_Classifications", "Correct_Predictions",
        "Incorrect_Predictions", "Accuracy", "Balanced_Accuracy", "Precision",
        "Recall", "F1_Score", "Loss", "Execution Time"
    ]

    # Pick the first file name that is not taken yet.
    fileName = f'{path}{fullName}.csv'
    cloneNum = 0
    while exists(fileName):
        cloneNum += 1
        fileName = f'{path}{str(cloneNum)}_{fullName}.csv'

    pd.DataFrame(allResults, columns=scoreListLabels).to_csv(fileName, index=False)

The function below generates all the bar charts we need for visualization.

In [None]:
def GenerateCharts(allResults, fullName, path):
    """Save one bar chart per metric comparing the evaluated classifiers.

    For every result column from "Correct_Predictions" onwards a bar chart
    (one bar per row of ``allResults``) is written to
    ``<path><fullName>_<metric>.png``. Existing files are not overwritten; a
    numeric prefix (``1_``, ``2_``, ...) is added instead.

    Args:
        allResults: Rows of twelve values each, in the column order below.
        fullName: Label used inside the file names.
        path: Prefix prepended verbatim to the file names.

    Returns:
        None.
    """
    scoreListLabels = [
        "Name", "Classifier", "Unique_Classifications", "Correct_Predictions",
        "Incorrect_Predictions", "Accuracy", "Balanced_Accuracy", "Precision",
        "Recall", "F1_Score", "Loss", "Execution Time"
    ]

    resultsDF = pd.DataFrame(allResults, columns=scoreListLabels)
    # A single random colour, repeated so that every bar of this call matches.
    colorList = [rand(3)] * resultsDF.shape[0]

    # The first three columns (name, classifier text, label count) are not charted.
    for metric in scoreListLabels[3:]:
        # Keep the Axes that pandas returns and style it directly rather than
        # relying on pyplot's "current axes" global state.
        ax = resultsDF.plot(
            x='Name',
            y=metric,
            kind='bar',
            color=colorList,
            legend=None
        )
        fig = ax.figure

        # Filename
        fileName = f'{path}{fullName}_{metric}.png'
        cloneNum = 0
        while exists(fileName):
            cloneNum += 1
            fileName = f'{path}{str(cloneNum)}_{fullName}_{metric}.png'

        # Adjust the y-axis for visuals: pad the value range by 25 % (plus a
        # small constant so identical values still get some room) and never
        # start below zero.
        yRange = resultsDF[metric]
        bufferPerc = 0.25
        buffer = (yRange.max() - yRange.min()) * bufferPerc + 0.05
        yMin = max(yRange.min() - buffer, 0)
        yMax = yRange.max() + buffer
        ax.set_ylim((yMin, yMax))

        # Fine tuning visuals
        ax.set_title(metric + " Comparison")
        ax.set_xlabel("Classifiers")
        ax.set_ylabel("Scores/Percentages")
        plt.setp(ax.get_xticklabels(), rotation=30)
        fig.tight_layout()
        # Write each value at the top of its bar. Distinct loop names keep the
        # outer metric loop variable from being shadowed.
        for barPos, value in enumerate(yRange.tolist()):
            ax.text(barPos, value, str(value), ha="center")

        fig.savefig(fileName)

This function puts it all together: it reduces the features with PCA, trains and evaluates each classifier, and exports everything we need to compare the models.

In [None]:
def ModelResults(xTrain, yTrain, xTest, yTest, stdType, path,
                 nComponents=15, randomState=42):
    """Train and evaluate every classifier on PCA-reduced features and export the results.

    A PCA fitted on ``xTrain`` is applied to both feature sets. Each
    classifier is then fitted (only the fit is timed), used to predict
    ``xTest``, and summarised with a confusion-matrix image and a metrics row.
    Finally all rows are exported as a CSV file and as bar charts.

    Args:
        xTrain: Scaled training features.
        yTrain: Training labels.
        xTest: Scaled testing features.
        yTest: Testing labels.
        stdType: Name of the scaling variant; used in the progress messages
            and in all output file names.
        path: Prefix prepended verbatim to every output file name.
        nComponents: Number of principal components to keep (default 15).
        randomState: Seed for PCA and for every classifier that accepts one
            (default 42).

    Returns:
        None.
    """
    # Principal Component Analysis
    # Seeded like the classifiers: PCA may select a randomized SVD solver, in
    # which case an unseeded run is not reproducible.
    pcaTrain = PCA(n_components=nComponents, random_state=randomState)
    pcaTrain.fit(xTrain)

    xTrain = pcaTrain.transform(xTrain)
    xTest = pcaTrain.transform(xTest)

    # Start Modeling
    allResults = []
    # Name and estimator are kept side by side so they cannot drift apart.
    namedModels = [
        ("Neural Network",
         MLPClassifier(random_state=randomState, hidden_layer_sizes=(50, 20, 5,))),
        ("Random Forest",
         RandomForestClassifier(random_state=randomState, n_estimators=50)),
        ("Decision Tree", DecisionTreeClassifier(random_state=randomState)),
        ("Naive Bayes", GaussianNB()),
        ("K-Nearest Neighbor", KNeighborsClassifier(n_neighbors=2)),
    ]

    # Iterate through models
    for name, clf in namedModels:
        print(f'Tailoring {name} for {stdType} Standard')
        # Only the training step is timed; prediction is excluded.
        start = time()
        clf.fit(xTrain, yTrain)
        end = time()
        elapsed = round(end - start, 6)
        yPred = clf.predict(xTest)

        # Results
        fullName = f'{stdType}-{name}'
        CreateCM(yTest, yPred, fullName, path, clf)
        rr = ResultsReturn(yTest, yPred, fullName, clf, elapsed)
        allResults.append(rr)

    # Export all Visualizations
    fullName = f'{stdType}'
    ExportCSV(allResults, fullName, path)
    GenerateCharts(allResults, fullName, path)

In [None]:
# Create a fresh results directory for this run: "OpenShift Results/" if it
# is free, otherwise the first unused "OpenShift Results <n>/".
pathNum = 0
path = "OpenShift Results/"
while True:
    if not exists(path):
        mkdir(path)
        break
    pathNum += 1
    path = f'OpenShift Results {pathNum}/'

In [None]:
# Run the full pipeline once per scaling variant; all outputs go to `path`.
ModelResults(mmTrain, yTrain, mmTest, yTest, "Min Max", path)
ModelResults(maTrain, yTrain, maTest, yTest, "Max Abs", path)