In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim


### Read datafile

In [2]:
# Directory and file name of the input CSV (the path is relative to this notebook).
filePath = "../../project_course_data/"
fileName = "preprocessedDataLabeled.csv"

# filePath already ends with a separator, so the two parts are simply concatenated.
data = pd.read_csv(f"{filePath}{fileName}")

### Function for debugging

In [3]:
def printRow(df, id):
    """Print every column of the first row whose "Id" equals ``id``.

    Debug helper: each column is printed on its own line as
    ``<column>: <value>``.

    Args:
        df: DataFrame to search; rows are matched on its "Id" column.
        id: Value to look for in the "Id" column. (The name shadows the
            builtin ``id``; it is kept so existing callers keep working.)

    Returns:
        None. If ``df`` has no "Id" column, or no row matches ``id``, a
        "not found" message is printed instead of raising.
    """
    try:
        matches = df[df["Id"] == id]
    except KeyError:
        # df has no "Id" column at all.
        matches = None

    # An empty selection previously escaped as an uncaught IndexError from
    # ``iloc[0]``; report it the same way as a missing "Id" column.
    if matches is None or matches.empty:
        print(f"Row with id {id} not found in DataFrame.")
        return

    for col, val in matches.iloc[0].items():
        print(f"{col}: {val}")

In [4]:
# Print the row with Id 1 as a quick look at the loaded data.
# NOTE: `id` shadows the builtin; the name is kept because a later cell reuses it.
id = 1
printRow(df=data, id=id)

Datetime: 2023-11-22 12:18:39
Label: Unknown
Duration: 0.0
Packets: 0.0
Bytes: 5.82e-05
Flows: 1
Id: 1
Src IP Addr: 13.107.42.18
Src Port: 443
Dst IP Addr: 192.168.8.235
Dst Port: 63935
Host IP: 13.107.42.18
Client IP: 192.168.8.235
Domain Name: 0
IGMP: 0
TCP: 1
UDP: 0
Host Port: 443
0: 0
1900: 0
22222: 0
27018: 0
27025: 0
27036: 0
27043: 0
27047: 0
27051: 0
27053: 0
27057: 0
27060: 0
3478: 0
3480: 0
34820: 0
3702: 0
4070: 0
44142: 0
443: 1
50002: 0
50012: 0
50022: 0
50027: 0
5228: 0
5353: 0
5355: 0
67: 0
80: 0
8009: 0
1drv: 0
1.00E+100: 0
82f3dc: 0
833aec: 0
a104: 0
a184: 0
a2: 0
a23: 0
a95: 0
aa784e235de7c8b14: 0
adobedc: 0
akamaitechnologies: 0
all: 0
amazonaws: 0
ams: 0
ams1: 0
ams15s47: 0
ams15s51: 0
ams17s02: 0
ams17s04: 0
ams17s13: 0
ams17s17: 0
ams58: 0
andreas: 0
arn: 0
arn001: 0
arn04: 0
arn09s18: 0
arn09s19: 0
arn09s20: 0
arn09s21: 0
arn09s22: 0
arn09s23: 0
arn09s25: 0
arn09s26: 0
arn09s27: 0
arn1: 0
arn11s03: 0
arn11s04: 0
arn11s09: 0
arn11s10: 0
arn11s11: 0
arn11s12: 0
arn

### Drop uninteresting attributes

In [5]:
# Columns that are removed from the data set.
attributesToDrop = [
    "Id", "Domain Name", "Flows", "Datetime", "Host Port",
    "Src IP Addr", "Dst IP Addr", "Client IP", "Host IP",
    "Src Port", "Dst Port",
]

# Index labels of the rows whose label is "Unknown"; those rows are discarded.
unknownRows = data.index[data.Label == "Unknown"]

# DataFrame.drop returns a new frame, so `data` itself is left untouched.
dataDropped = data.drop(index=unknownRows).drop(columns=attributesToDrop)

In [6]:
# "Id" is one of the columns removed via attributesToDrop, so printRow cannot
# look the row up here and prints its "not found" message instead.
printRow(dataDropped, id)

Row with id 1 not found in DataFrame.


## Convert service labels into integers

In [7]:
dataNumLabel = dataDropped.copy()

# Occurrences per label; the order returned by value_counts() defines the numbering.
n = dataNumLabel["Label"].value_counts()

# Map every service label to its position in `n`.
LABELS_DICT = {label: number for number, label in enumerate(n.index.tolist())}

print()
print("LABELS AND CORRESPONDING NUMBER:")
for label, number in LABELS_DICT.items():
    print(f"{number}\t{label}")



LABELS AND CORRESPONDING NUMBER:
0	Youtube
1	Netflix
2	Browsing/Shopping
3	Twitch TV
4	Prime Video
5	SVT Play
6	Spotify
7	Facebook
8	Playstation
9	Soundcloud
10	Discord
11	Reddit
12	Amazon SHOP
13	Google Drive
14	Skype
15	Disney+
16	Steam Gaming
17	Gmail
18	Instagram
19	Outlook Mail
20	X


In [8]:
# Save LABELS_DICT to disk as a pickle file.
import pickle

dictPath = "labelsDictionary.pkl"

# The file is written into the data directory (filePath followed by dictPath).
with open(f"{filePath}{dictPath}", "wb") as labelsFile:
    pickle.dump(LABELS_DICT, labelsFile)

In [9]:
# Service to activity translation

# Activity names, spelled once so every service of a category shares one string.
_VIDEO = "Video streaming"
_SOUND = "Sound streaming"
_BROWSING = "Browsing/Shopping"
_SOCIAL = "Social media"
_GAMING = "Gaming"
_VOICE = "Voice chat"

# Maps each service label to the activity category it belongs to.
ACTIVITY_DICT = {
    "Youtube": _VIDEO,
    "Netflix": _VIDEO,
    "Browsing/Shopping": _BROWSING,
    "Twitch TV": _VIDEO,
    "Prime Video": _VIDEO,
    "SVT Play": _VIDEO,
    "Spotify": _SOUND,
    "Facebook": _SOCIAL,
    "Playstation": _GAMING,
    "Soundcloud": _SOUND,
    "Discord": _VOICE,
    "Reddit": _SOCIAL,
    "Amazon SHOP": _BROWSING,
    "Google Drive": _BROWSING,
    "Skype": _VOICE,
    "Disney+": _VIDEO,
    "Steam Gaming": _GAMING,
    "Gmail": _BROWSING,
    "Instagram": _SOCIAL,
    "Outlook Mail": _BROWSING,
    "X": _SOCIAL,
}


def service2activity(service):
    """Return the activity category of ``service``.

    Raises:
        KeyError: if ``service`` is not a key of ACTIVITY_DICT.
    """
    activity = ACTIVITY_DICT[service]
    return activity

#### Conversion functions for labels

In [10]:
def label2num(label):
    """Return the integer stored for ``label`` in LABELS_DICT (KeyError if absent)."""
    number = LABELS_DICT[label]
    return number
def num2label(num):
    """Return the first label in LABELS_DICT whose number equals ``num``, or None."""
    for label, number in LABELS_DICT.items():
        if number == num:
            return label
    return None

In [11]:
# Save the column names of the finished training data, one name per line.
# NOTE: the list still contains the "Label" column.
columns = dataNumLabel.columns.tolist()

dataColNamesPath = "dataColNames.txt"
with open(f"{filePath}{dataColNamesPath}", "w") as namesFile:
    namesFile.writelines(f"{name}\n" for name in columns)


## Split the data from the labels

In [12]:
# Features: every column except "Label", converted to a float32 NumPy array.
featureFrame = dataNumLabel.drop(columns="Label")
X = featureFrame.astype("float32").to_numpy()

# Targets: each service label translated to its integer via label2num.
labelColumn = dataNumLabel["Label"]
Y = labelColumn.apply(label2num).to_numpy()

### Train/test split and conversion to tensors

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
deviceName = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(deviceName)
print(device)

# TRAIN TEST SPLIT
# 20 % of the samples are held out for testing; random_state fixes the split.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=13)

# TO TENSOR
# Features become float32 tensors, labels int64 (torch.long) tensors,
# and everything is moved to the selected device.
Xtrain, Xtest = (
    torch.tensor(part, dtype=torch.float32).to(device) for part in (Xtrain, Xtest)
)
Ytrain, Ytest = (
    torch.tensor(part, dtype=torch.long).to(device) for part in (Ytrain, Ytest)
)

cuda


## THE NETWORK

In [14]:
# NETWORK CLASS
# The network actually used is imported from the modelClass module at the end
# of this cell. The commented-out class below is kept only as a reference.
# NOTE(review): it takes four hidden sizes (h1..h4), while the model created
# later in this notebook is called with six (h1..h6), so it presumably no
# longer matches modelClass.Net — confirm, and consider deleting it.
#
# class Net(nn.Module):
#     def __init__(self, input_size, h1, h2, h3, h4, output_size):
#         super(Net, self).__init__()
#         dropoutrate = 0.5
#         self.fc1 = nn.Linear(input_size, h1)
#         self.relu1 = nn.ReLU()
#         self.dropout1 = nn.Dropout(dropoutrate)

#         self.fc2 = nn.Linear(h1, h2)
#         self.relu2 = nn.ReLU()

#         self.fc3 = nn.Linear(h2, h3)
#         self.relu3 = nn.ReLU()
#         self.dropout2 = nn.Dropout(dropoutrate)

#         self.fc4 = nn.Linear(h3, h4)
#         self.relu4 = nn.ReLU()

#         self.fc5 = nn.Linear(h4, output_size)

#     def forward(self, x):
#         x = self.fc1(x)
#         x = self.relu1(x)
#         x = self.dropout1(x)

#         x = self.fc2(x)
#         x = self.relu2(x)

#         x = self.fc3(x)
#         x = self.relu3(x)
#         x = self.dropout2(x)

#         x = self.fc4(x)
#         x = self.relu4(x)

#         x = self.fc5(x)
#         return x

from modelClass import Net

### Create model

In [15]:
# NETWORK LAYER SIZES
# One input per feature column of X. `columns` was taken from the frame that
# still contained "Label", so it must be exactly one longer than X is wide.
input_size = X.shape[1]
if input_size != len(columns) - 1:
    # Both values describe the same feature set; a mismatch means cells were
    # run out of order or the data changed between them.
    raise ValueError(
        f"Feature count mismatch: X has {input_size} columns, "
        f"but the saved column list implies {len(columns) - 1}."
    )

# Hidden layer widths, from the first hidden layer to the last.
h1 = 20000
h2 = 16000
h3 = 8000
h4 = 3000
h5 = 600
h6 = 80

# One output per service label.
output_size = len(LABELS_DICT)

# CREATE NEURAL NETWORK MODEL
model = Net(input_size, h1, h2, h3, h4, h5, h6, output_size).to(device)

# LOSS & EVALUATION
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)  # create optimizer

## TRAINING

In [16]:
# TRAINING
# Every epoch performs one optimizer step on the complete training set.
num_epochs = 120

# Put the model in training mode explicitly. The accuracy cell calls
# model.eval(), so re-running this cell afterwards would otherwise continue
# training with mode-dependent layers (e.g. dropout, if the model has any)
# still in evaluation mode.
model.train()

for epoch in range(num_epochs):
    outputs = model(Xtrain)  # prediction for the whole training set
    loss = criterion(outputs, Ytrain)  # loss calculation

    optimizer.zero_grad()  # reset gradients from the previous step
    loss.backward()  # backward pass
    optimizer.step()  # update weights

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

Epoch [1/120], Loss: 3.071791648864746
Epoch [2/120], Loss: 14.155570030212402
Epoch [3/120], Loss: 3.0649302005767822
Epoch [4/120], Loss: 3.0657567977905273
Epoch [5/120], Loss: 3.064931631088257
Epoch [6/120], Loss: 3.061868190765381
Epoch [7/120], Loss: 3.0556225776672363
Epoch [8/120], Loss: 3.0414249897003174
Epoch [9/120], Loss: 3.014357328414917
Epoch [10/120], Loss: 2.9703993797302246
Epoch [11/120], Loss: 2.909635543823242
Epoch [12/120], Loss: 2.856029987335205
Epoch [13/120], Loss: 2.820303201675415
Epoch [14/120], Loss: 2.7625973224639893
Epoch [15/120], Loss: 2.7011725902557373
Epoch [16/120], Loss: 2.6748337745666504
Epoch [17/120], Loss: 2.6559836864471436
Epoch [18/120], Loss: 2.633361577987671
Epoch [19/120], Loss: 2.606959342956543
Epoch [20/120], Loss: 2.5974085330963135
Epoch [21/120], Loss: 2.586439371109009
Epoch [22/120], Loss: 2.5703227519989014
Epoch [23/120], Loss: 2.5624914169311523
Epoch [24/120], Loss: 2.5499556064605713
Epoch [25/120], Loss: 2.53437876701

### Accuracy print

In [17]:
# PRINT ACCURACY
model.eval()  # switch the model to evaluation mode
with torch.no_grad():
    outputs_test = model(Xtest)  # one score vector per test sample
    predicted_labels = outputs_test.argmax(dim=1)  # index of the highest score

    # Bring predictions and ground truth to the CPU once, for comparison and NumPy use.
    predicted_labels_cpu = predicted_labels.cpu()
    true_labels_cpu = Ytest.cpu()

    # Service accuracy: fraction of samples whose predicted number equals the true one.
    n_correct = (predicted_labels_cpu == true_labels_cpu).sum().item()
    service_acc = n_correct / len(true_labels_cpu)

    # Predictions: number -> service -> activity.
    predicted_services = list(map(num2label, predicted_labels_cpu.numpy()))
    predicted_activity = list(map(service2activity, predicted_services))

    # Ground truth: number -> service -> activity.
    true_services = list(map(num2label, true_labels_cpu.numpy()))
    activity_truth = list(map(service2activity, true_services))

    # Activity accuracy: fraction of samples whose predicted activity matches the true one.
    n_activity_hits = sum(
        guess == truth for guess, truth in zip(predicted_activity, activity_truth)
    )
    activity_acc = n_activity_hits / len(activity_truth)

    # PRINTS
    print(f"Service test Accuracy: {service_acc}")
    print(f"Activity test Accuracy: {activity_acc}")

Service test Accuracy: 0.5915841584158416
Activity test Accuracy: 0.7029702970297029


## Export model

In [19]:
# Export the model weights (state_dict only) into the data directory.
modelWeights = "modelWeights.pth"

weightsState = model.state_dict()
torch.save(weightsState, f"{filePath}{modelWeights}")