In [25]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim


### Read datafile

In [26]:
# Location of the preprocessed, labelled dataset (relative to this notebook).
filePath = "../../project_course_data/"
fileName = "preprocessedDataLabeled.csv"

# Load the whole CSV into a DataFrame.
csvPath = os.path.join(filePath, fileName)
data = pd.read_csv(csvPath)

### Function for debugging

In [27]:
def printRow(df, id):
    """Print each column/value pair of the first row whose "Id" equals ``id``.

    Debugging helper. One ``column: value`` line is printed per column of the
    matching row. If the frame has no "Id" column, or no row carries the
    requested id, a "not found" message is printed instead of raising.

    Args:
        df: pandas DataFrame to search.
        id: Value to look up in the "Id" column. (The name shadows the
            built-in ``id``; it is kept so existing callers keep working.)

    Returns:
        None. Output goes to stdout.
    """
    try:
        matches = df[df["Id"] == id]
    except KeyError:
        # The frame has no "Id" column at all.
        matches = None

    # An empty selection used to crash on ``iloc[0]`` with IndexError.
    if matches is None or matches.empty:
        print(f"Row with id {id} not found in DataFrame.")
        return

    for col, val in matches.iloc[0].items():
        print(f"{col}: {val}")

In [28]:
# Row id to inspect. Note: this module-level name shadows Python's built-in id().
id = 1
# Dump every column of that row for a quick sanity check of the loaded data.
printRow(data, id)

Datetime: 2023-11-22 12:18:39
Label: Unknown
Duration: 0.0
Packets: 0.0
Bytes: 5.82e-05
Flows: 1
Id: 1
Src IP Addr: 13.107.42.18
Src Port: 443
Dst IP Addr: 192.168.8.235
Dst Port: 63935
Host IP: 13.107.42.18
Client IP: 192.168.8.235
Domain Name: 0
IGMP: 0
TCP: 1
UDP: 0
Host Port: 443
0: 0
1900: 0
22222: 0
27018: 0
27025: 0
27036: 0
27043: 0
27047: 0
27051: 0
27053: 0
27057: 0
27060: 0
3478: 0
3480: 0
34820: 0
3702: 0
4070: 0
44142: 0
443: 1
50002: 0
50012: 0
50022: 0
50027: 0
5228: 0
5353: 0
5355: 0
67: 0
80: 0
8009: 0
1drv: 0
1.00E+100: 0
82f3dc: 0
833aec: 0
a104: 0
a184: 0
a2: 0
a23: 0
a95: 0
aa784e235de7c8b14: 0
adobedc: 0
akamaitechnologies: 0
all: 0
amazonaws: 0
ams: 0
ams1: 0
ams15s47: 0
ams15s51: 0
ams17s02: 0
ams17s04: 0
ams17s13: 0
ams17s17: 0
ams58: 0
andreas: 0
arn: 0
arn001: 0
arn04: 0
arn09s18: 0
arn09s19: 0
arn09s20: 0
arn09s21: 0
arn09s22: 0
arn09s23: 0
arn09s25: 0
arn09s26: 0
arn09s27: 0
arn1: 0
arn11s03: 0
arn11s04: 0
arn11s09: 0
arn11s10: 0
arn11s11: 0
arn11s12: 0
arn

### Drop uninteresting attributes

In [29]:
# Keep only the rows that carry a real service label (discard "Unknown").
isLabeled = data["Label"] != "Unknown"
dataDropped = data[isLabeled].copy()

# Identifier, address, port and timestamp columns are removed before training.
attributesToDrop = ["Id" ,"Domain Name", "Flows", "Datetime", "Host Port", "Src IP Addr", "Dst IP Addr", "Client IP", "Host IP", "Src Port", "Dst Port"]
dataDropped = dataDropped.drop(columns=attributesToDrop)

In [30]:
# "Id" is one of the columns removed via attributesToDrop, so the lookup inside
# printRow cannot find it and the "not found" message is printed.
printRow(dataDropped, id)

Row with id 1 not found in DataFrame.


## Convert Service label into integer

In [31]:
dataNumLabel = dataDropped.copy()

# Occurrence count per label; the order of this index defines the class ids.
n = dataNumLabel["Label"].value_counts()

# Map every label to its position in the value_counts() index.
LABELS_DICT = {label: position for position, label in enumerate(n.index.tolist())}

print()
print("LABELS AND CORRESPONDING NUMBER:")
for key, val in LABELS_DICT.items():
    print(f"{val}\t{key}")



LABELS AND CORRESPONDING NUMBER:
0	Youtube
1	Netflix
2	Browsing/Shopping
3	Twitch TV
4	Prime Video
5	SVT Play
6	Spotify
7	Facebook
8	Playstation
9	Soundcloud
10	Discord
11	Reddit
12	Amazon SHOP
13	Google Drive
14	Skype
15	Disney+
16	Steam Gaming
17	Gmail
18	Instagram
19	Outlook Mail
20	X


#### Conversion functions for labels

In [32]:
def label2num(label):
    """Return the integer class id that LABELS_DICT stores for ``label``.

    Raises:
        KeyError: if ``label`` is not a key of LABELS_DICT.
    """
    number = LABELS_DICT[label]
    return number
def num2label(num):
    """Return the label whose class id in LABELS_DICT equals ``num``.

    Scans LABELS_DICT in insertion order and returns the first match;
    returns None when no label is mapped to ``num``.
    """
    for label, number in LABELS_DICT.items():
        if number == num:
            return label
    return None

## Split the data from the labels

In [33]:
# Feature matrix: every column except "Label", cast to float32.
X = dataNumLabel.drop(columns="Label").astype("float32").to_numpy()
# Target vector: each label translated to its integer class id.
Y = dataNumLabel["Label"].map(label2num).to_numpy()

### Training data and testing data split & convert into tensors

In [34]:
# TRAIN TEST SPLIT
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=13)

# TO TENSOR
# Features become float32 tensors, targets become long (int64) tensors.
Xtrain, Xtest = (torch.tensor(features, dtype=torch.float32) for features in (Xtrain, Xtest))
Ytrain, Ytest = (torch.tensor(targets, dtype=torch.long) for targets in (Ytrain, Ytest))

## THE NETWORK

In [35]:
# NETWORK CLASS
class Net(nn.Module):
    """Feed-forward classifier with four hidden ReLU layers.

    Layer order: fc1 -> ReLU -> dropout1 -> fc2 -> ReLU -> fc3 -> ReLU ->
    dropout2 -> fc4 -> ReLU -> fc5. The forward pass returns the raw output
    of fc5; no softmax is applied inside the network.

    Args:
        input_size: Number of input features per sample.
        h1, h2, h3, h4: Widths of the four hidden layers.
        output_size: Number of output scores per sample.
        dropoutrate: Drop probability used by both dropout layers.
            Defaults to 0.5, the value that was previously hard-coded.
    """

    def __init__(self, input_size, h1, h2, h3, h4, output_size, dropoutrate=0.5):
        super().__init__()

        self.fc1 = nn.Linear(input_size, h1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropoutrate)

        self.fc2 = nn.Linear(h1, h2)
        self.relu2 = nn.ReLU()

        self.fc3 = nn.Linear(h2, h3)
        self.relu3 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropoutrate)

        self.fc4 = nn.Linear(h3, h4)
        self.relu4 = nn.ReLU()

        self.fc5 = nn.Linear(h4, output_size)

    def forward(self, x):
        """Run ``x`` through all layers and return the output of fc5."""
        # Block 1: linear -> ReLU -> dropout
        x = self.dropout1(self.relu1(self.fc1(x)))
        # Block 2: linear -> ReLU
        x = self.relu2(self.fc2(x))
        # Block 3: linear -> ReLU -> dropout
        x = self.dropout2(self.relu3(self.fc3(x)))
        # Block 4: linear -> ReLU
        x = self.relu4(self.fc4(x))
        # Output layer, no activation
        return self.fc5(x)

### Create model

In [36]:
# Seed PyTorch's global RNG so weight initialisation (and the dropout masks
# drawn later during training) are repeatable on Restart & Run All.
# 13 matches the random_state used for the train/test split.
torch.manual_seed(13)

# NETWORK LAYER SIZES
input_size = X.shape[1]  # one input unit per feature column
h1 = 200
h2 = 150
h3 = 100
h4 = 50
output_size = len(LABELS_DICT)  # one output score per known label

# CREATE NEURAL NETWORK MODEL
model = Net(input_size, h1, h2, h3, h4, output_size)

# LOSS & EVALUATION
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001) # Adam optimizer over all model parameters

## TRAINING

In [42]:
# TRAINING
num_epochs = 500

# Make sure dropout is active. The accuracy cell switches the model to eval
# mode, so re-running this cell afterwards would otherwise train without dropout.
model.train()

for epoch in range(num_epochs):
    outputs = model(Xtrain)  # forward pass over the whole training set
    loss = criterion(outputs, Ytrain)  # loss between predictions and targets

    optimizer.zero_grad()  # clear gradients from the previous step
    loss.backward()  # backpropagate the loss
    optimizer.step()  # update the weights

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

Epoch [1/500], Loss: 0.849406361579895
Epoch [2/500], Loss: 0.8498115539550781
Epoch [3/500], Loss: 0.8508362770080566
Epoch [4/500], Loss: 0.8517848253250122
Epoch [5/500], Loss: 0.8512610793113708
Epoch [6/500], Loss: 0.8497948050498962
Epoch [7/500], Loss: 0.8486401438713074
Epoch [8/500], Loss: 0.8488523364067078
Epoch [9/500], Loss: 0.8499324321746826
Epoch [10/500], Loss: 0.8502976298332214
Epoch [11/500], Loss: 0.8498690724372864
Epoch [12/500], Loss: 0.8487566113471985
Epoch [13/500], Loss: 0.84803307056427
Epoch [14/500], Loss: 0.8484055995941162
Epoch [15/500], Loss: 0.8492395281791687
Epoch [16/500], Loss: 0.8496139049530029
Epoch [17/500], Loss: 0.8493883609771729
Epoch [18/500], Loss: 0.8494486212730408
Epoch [19/500], Loss: 0.8500787019729614
Epoch [20/500], Loss: 0.85160893201828
Epoch [21/500], Loss: 0.8512500524520874
Epoch [22/500], Loss: 0.8507909774780273
Epoch [23/500], Loss: 0.8491981625556946
Epoch [24/500], Loss: 0.8481243252754211
Epoch [25/500], Loss: 0.847798

### Accuracy print

In [43]:
# PRINT ACCURACY
model.eval()  # evaluation mode: dropout layers are disabled
with torch.no_grad():  # no gradients are needed for scoring
    outputs_test = model(Xtest)
    # Predicted class = index of the largest output score per sample.
    predicted_labels = outputs_test.argmax(dim=1)
    # Fraction of test samples whose prediction equals the target.
    acc = (predicted_labels == Ytest).sum().item() / len(Ytest)
    print(f"Test Accuracy: {acc}")

Test Accuracy: 0.6101485148514851


## Export Model !

In [39]:
# export model weights here