In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim


### Read datafile

In [19]:
# Location of the preprocessed, tab-separated dataset, relative to this notebook.
filePath = "../../project_course_data/"
fileName = "preprocessedData.csv"

# Join directory and file name with os.path.join rather than string concatenation.
data = pd.read_csv(os.path.join(filePath, fileName), sep="\t")

### Function for debugging

In [18]:
def printRow(df, id):
    """Print each column/value pair of the first row whose "Id" equals ``id``.

    Args:
        df: DataFrame to search. The lookup uses its "Id" column.
        id: Value to match against the "Id" column. The name shadows the
            builtin ``id``; it is kept so existing callers keep working.

    Returns:
        None. Output goes to stdout. A message is printed, and nothing is
        raised, when the "Id" column is missing or when no row matches.
    """
    if "Id" not in df.columns:
        # Used to be reported as "row not found", which hid the real cause.
        print('Column "Id" not found in DataFrame.')
        return

    matches = df[df["Id"] == id]
    if matches.empty:
        # .iloc[0] on an empty selection raises IndexError, which the former
        # `except KeyError` did not catch.
        print(f"Row with id {id} not found in DataFrame.")
        return

    for col, val in matches.iloc[0].items():
        print(f"{col}: {val}")

In [22]:
# Quick sanity check of the loaded data: show the row with this "Id" value.
id = 2
printRow(df=data, id=id)

Row with id 2 not found in DataFrame.


### Drop uninteresting attributes

In [17]:
# Columns removed before the features are built.
attributesToDrop = [
    "Id",
    "Domain Name",
    "Flows",
    "Datetime",
    "Host Port",
    "Src IP Addr",
    "Dst IP Addr",
    "Client IP",
    "Host IP",
    "Src Port",
    "Dst Port",
]

# drop() returns a new frame, so `data` itself is left untouched.
dataDropped = data.drop(columns=attributesToDrop)

     Duration  Packets    Bytes  TCP  UDP  1900  27018  27025  27036  27043  \
0  180312.365     1200  49200.0    1    0     0      0      0      0      0   
1  180312.365      400      3.0    1    0     0      0      0      0      0   
2     135.108      100   4000.0    1    0     0      0      0      0      0   
3     135.108      100   4600.0    1    0     0      0      0      0      0   
4       0.000      100  97000.0    1    0     0      0      0      0      0   

   27047  27051  27053  27057  27060  443  5228  80  1900.1  27018.1  27025.1  \
0      0      0      0      0      0    1     0   0       0        0        0   
1      0      0      0      0      0    1     0   0       0        0        0   
2      0      0      0      0      0    1     0   0       0        0        0   
3      0      0      0      0      0    1     0   0       0        0        0   
4      0      0      0      0      0    1     0   0       0        0        0   

   27036.1  27043.1  27047.1  27051.1 

In [None]:
# NOTE(review): "Id" is one of the columns listed in attributesToDrop, so
# printRow has no "Id" column to search in dataDropped and can only print its
# error message here. Confirm whether this check is still wanted.
printRow(df=dataDropped, id=id)

## Convert Service label into integer

In [None]:
dataNumLabel = dataDropped.copy()

# Label frequencies; value_counts() lists the most frequent label first.
n = dataNumLabel["Label"].value_counts()
print(n)

# Map every label to its position in the frequency table. The index is turned
# into a list once, not once per loop iteration as before.
LABELS_DICT = {label: number for number, label in enumerate(n.index.tolist())}

print()
print("LABELS AND CORRESPONDING NUMBER:")
for label, number in LABELS_DICT.items():
    print(f"{number}\t{label}")


#### Conversion functions for labels

In [None]:
def label2num(label):
    """Return the integer that LABELS_DICT assigns to ``label``.

    Raises:
        KeyError: if ``label`` is not a key of LABELS_DICT.
    """
    return LABELS_DICT[label]
def num2label(num):
    """Return the label whose number in LABELS_DICT equals ``num``, or None if no label has it."""
    for label, number in LABELS_DICT.items():
        if number == num:
            return label
    return None

## Split the data from the labels

In [None]:
# Feature matrix: every column except the label.
X = dataNumLabel.drop(columns=["Label"]).to_numpy()

# Target vector: each label replaced by its integer via label2num.
Y = dataNumLabel["Label"].map(label2num).to_numpy()

### Training/testing data split & conversion to tensors

In [None]:
# TRAIN TEST SPLIT (20 % of the rows held out for testing, fixed random_state)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.2, random_state=13
)

# TO TENSOR: float32 for the features, long (int64) for the class indices
Xtrain, Xtest = (torch.tensor(part, dtype=torch.float32) for part in (Xtrain, Xtest))
Ytrain, Ytest = (torch.tensor(part, dtype=torch.long) for part in (Ytrain, Ytest))

## THE NETWORK

In [None]:
# NETWORK CLASS
class Net(nn.Module):
    """Fully connected network: four hidden ReLU layers and a linear output layer.

    Dropout is applied after the first and the third hidden layer. ``forward``
    returns the raw output of the last linear layer (no softmax is applied).

    Args:
        input_size: Number of input features (in_features of the first layer).
        h1, h2, h3, h4: Widths of the four hidden layers.
        output_size: Number of output units.
        dropout_rate: Probability given to both ``nn.Dropout`` layers.
            Defaults to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, input_size, h1, h2, h3, h4, output_size, dropout_rate=0.5):
        super().__init__()

        # Attribute names are kept as-is so state_dict keys stay unchanged.
        self.fc1 = nn.Linear(input_size, h1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)

        self.fc2 = nn.Linear(h1, h2)
        self.relu2 = nn.ReLU()

        self.fc3 = nn.Linear(h2, h3)
        self.relu3 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)

        self.fc4 = nn.Linear(h3, h4)
        self.relu4 = nn.ReLU()

        self.fc5 = nn.Linear(h4, output_size)

    def forward(self, x):
        """Pass ``x`` through all layers and return the output of ``fc5``.

        Args:
            x: Input tensor whose last dimension is ``input_size``.
        """
        x = self.dropout1(self.relu1(self.fc1(x)))
        x = self.relu2(self.fc2(x))
        x = self.dropout2(self.relu3(self.fc3(x)))
        x = self.relu4(self.fc4(x))
        return self.fc5(x)

### Create model

In [None]:
# Seed PyTorch's global RNG so the weight initialisation below (and random
# operations that later draw from the same generator, such as dropout) repeat
# across Restart & Run All. 13 is the value also used as the split's random_state.
torch.manual_seed(13)

# NETWORK LAYER SIZES
input_size = X.shape[1]  # number of feature columns in X
h1, h2, h3, h4 = 200, 150, 100, 50
output_size = len(LABELS_DICT)  # one output unit per label in LABELS_DICT

# CREATE NEURAL NETWORK MODEL
model = Net(input_size, h1, h2, h3, h4, output_size)

# LOSS & OPTIMIZER
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

## TRAINING

In [None]:
# TRAINING
num_epochs = 500

# Make sure dropout is active. The accuracy cell switches the model to eval
# mode, so re-running this cell afterwards would otherwise train without dropout.
model.train()

for epoch in range(num_epochs):
    # The whole training set is passed through the model in a single step.
    outputs = model(Xtrain)  # forward pass (predictions)
    loss = criterion(outputs, Ytrain)  # loss for this epoch

    optimizer.zero_grad()  # clear gradients from the previous step
    loss.backward()  # backpropagate the loss
    optimizer.step()  # update the weights

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

### Accuracy print

In [None]:
# PRINT ACCURACY
# Evaluation mode turns dropout off; no_grad skips gradient bookkeeping.
model.eval()
with torch.no_grad():
    outputs_test = model(Xtest)
    # Predicted class = index of the largest output per row.
    predicted_labels = outputs_test.argmax(dim=1)
    correct = (predicted_labels == Ytest).sum().item()
    acc = correct / len(Ytest)
    print(f"Test Accuracy: {acc}")

## Export model

In [None]:
# export model weights here