In [8]:
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from modelClass import Net

import pickle

### Load training data columns

In [10]:
filePath =  "../../project_course_data/"

colNamesPath = "dataColNames.txt"

with open(filePath + colNamesPath, "r") as f:
    loadedColumns = [str(line.strip()) for line in f]

print(loadedColumns)




['Label', 'Duration', 'Packets', 'Bytes', 'IGMP', 'TCP', 'UDP', '0', '1900', '22222', '27018', '27025', '27036', '27043', '27047', '27051', '27053', '27057', '27060', '3478', '3480', '34820', '3702', '4070', '44142', '443', '50002', '50012', '50022', '50027', '5228', '5353', '5355', '67', '80', '8009', '1drv', '1.00E+100', '82f3dc', '833aec', 'a104', 'a184', 'a2', 'a23', 'a95', 'aa784e235de7c8b14', 'adobedc', 'akamaitechnologies', 'all', 'amazonaws', 'ams', 'ams1', 'ams15s47', 'ams15s51', 'ams17s02', 'ams17s04', 'ams17s13', 'ams17s17', 'ams58', 'andreas', 'arn', 'arn001', 'arn04', 'arn09s18', 'arn09s19', 'arn09s20', 'arn09s21', 'arn09s22', 'arn09s23', 'arn09s25', 'arn09s26', 'arn09s27', 'arn1', 'arn11s03', 'arn11s04', 'arn11s09', 'arn11s10', 'arn11s11', 'arn11s12', 'arn11s13', 'arn11s14', 'arn2', 'arn54', 'arn56', 'awsglobalaccelerator', 'bc', 'berlin', 'betterttv', 'bkk03s02', 'bl', 'bunnyinfra', 'c061', 'c062', 'c063', 'c064', 'c066', 'c069', 'c073', 'c076', 'c078', 'c079', 'c081', '

LABELS AND CORRESPONDING NUMBER:
0	Youtube
1	Netflix
2	Browsing/Shopping
3	Twitch TV
4	Prime Video
5	SVT Play
6	Spotify
7	Facebook
8	Playstation
9	Soundcloud
10	Discord
11	Reddit
12	Amazon SHOP
13	Google Drive
14	Skype
15	Disney+
16	Steam Gaming
17	Gmail
18	Instagram
19	Outlook Mail
20	X


<All keys matched successfully>

### Load new data

In [11]:
dataPath = "pilot3.txt"

with open(filePath + dataPath, "r") as f:
    content = f.read().replace("->", " ").replace(" K ", "K ").replace(" M ", "M ").replace(" G ", "G ")

### Do all data processing again

In [12]:
import io

In [13]:
csvStringIO = io.StringIO(content)
columnNames = ["Datetime", "Time", "Duration",  "Proto", "Src IP Addr:Port", "Dst IP Addr:Port", "Packets", "Bytes", "Flows"]

# dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f')

data = pd.read_csv(csvStringIO, sep = '\s+', names = columnNames, header = None, usecols=range(len(columnNames)), parse_dates=True, engine = "python")
data = data.iloc[1:-4] #Removes the summary lines and col names

data["Datetime"] += " " + data["Time"]
data = data.drop(columns = ["Time"])

data["Id"] = data.reset_index(drop = True).index

#### Debug function

In [15]:
def printRow(df, id):
    try:
        row = df[df["Id"] == id]
        for col, val in row.iloc[0].items():
            print(f"{col}: {val}")
    except KeyError:
        print(f"Row with id {id} not found in DataFrame.")

In [21]:
id = 9999
printRow(data, id)

Datetime: 2023-10-19 19:15:55.334
Duration: 0.000
Proto: TCP
Src IP Addr:Port: 192.168.8.177:56024
Dst IP Addr:Port: 109.74.196.205:443
Packets: 100
Bytes: 5200
Flows: 1
Id: 9999


#### Sorting and splitting and combining

In [10]:
 # Convert the datetime column to pandas datetime format
data['Datetime'] = pd.to_datetime(data['Datetime'])

# Sort the DataFrame by the datetime column
data = data.sort_values(by='Datetime')

# Split IP address and port to two columns, and drops the old column
data[['Src IP Addr', 'Src Port']] = data['Src IP Addr:Port'].str.split(':', n=1, expand=True)
data[['Dst IP Addr', 'Dst Port']] = data['Dst IP Addr:Port'].str.split(':', n=1, expand=True)
data = data.drop(columns=["Src IP Addr:Port", "Dst IP Addr:Port"])

In [11]:
printRow(data, id)

Datetime: 2023-10-18 18:20:45.577000
Duration: 0.000
Proto: TCP
Packets: 100
Bytes: 5200
Flows: 1
Id: 332
Src IP Addr: 208.67.220.220
Src Port: 443
Dst IP Addr: 192.168.8.177
Dst Port: 53934


#### Preprocess data

In [12]:
# Changes Duration column to float
data['Duration'] = data['Duration'].astype(float)
# Drop outliers
data = data.drop(data[data.Duration > 10000].index)

# Find clients IP address
client = data['Src IP Addr'].value_counts().idxmax().split('.')
client = '.'.join(client[:3])

# Initialize 'Host IP'-column from 'Src Ip Addr'
data['Host IP'] = data['Src IP Addr']
data['Client IP'] = data['Dst IP Addr']

# Removes all internal flows
# If the destination IP is not equal to the clients IP, adds it to 'Host IP'-column
for index, row in data.iterrows():
    if client in row['Dst IP Addr'] and client in row['Src IP Addr']:
        data.drop(index, inplace=True)
    elif client not in row['Dst IP Addr']:
        data.at[index, 'Host IP'] = row['Dst IP Addr']
        data.at[index, 'Client IP'] = row['Src IP Addr']

In [13]:
printRow(data, id)

Datetime: 2023-10-18 18:20:45.577000
Duration: 0.0
Proto: TCP
Packets: 100
Bytes: 5200
Flows: 1
Id: 332
Src IP Addr: 208.67.220.220
Src Port: 443
Dst IP Addr: 192.168.8.177
Dst Port: 53934
Host IP: 208.67.220.220
Client IP: 192.168.8.177


#### Prefix fixing

In [14]:
def convert_bytes(value):
    value = str(value).upper()
    multipliers = {'K': 1000, 'M': 1000**2, 'G': 1000**3}

    if value[-1] in multipliers:
        return int(float(value[0:-1]) * multipliers[value[-1]])
    else:
        return int(value)

# Apply the conversion function to the 'Bytes' column
data['Bytes'] = data['Bytes'].apply(convert_bytes)
data['Packets'] = data['Packets'].apply(convert_bytes)

In [15]:
printRow(data, id)

Datetime: 2023-10-18 18:20:45.577000
Duration: 0.0
Proto: TCP
Packets: 100
Bytes: 5200
Flows: 1
Id: 332
Src IP Addr: 208.67.220.220
Src Port: 443
Dst IP Addr: 192.168.8.177
Dst Port: 53934
Host IP: 208.67.220.220
Client IP: 192.168.8.177


#### Reverse DNS

In [16]:
import socket

In [17]:
# Find all unique addresses
unique_ip = data['Host IP'].unique()

# Creates a new dataframe
data_DNS = pd.DataFrame(columns=['IP', 'Host'])

data_DNS['IP']=unique_ip
host = []

# for-loop for doing reverse DNS lookup
i=0
for ip in unique_ip:
    try:
        host_name = socket.gethostbyaddr(ip)[0]
        host.append(host_name)
    except socket.herror:
        host.append(None)
    
    if i % 100 == 0: # Used to keep track how far along we've come
        print(f"{i} / {len(unique_ip)}")
    i += 1

# Adds the corresponding domain names to the IP-addresses and creates a CSV-file
data_DNS['Host'] = host
# data_DNS.to_csv('./host_names', index=False)

0 / 5168
100 / 5168
200 / 5168
300 / 5168
400 / 5168
500 / 5168
600 / 5168
700 / 5168
800 / 5168
900 / 5168
1000 / 5168
1100 / 5168
1200 / 5168
1300 / 5168
1400 / 5168
1500 / 5168
1600 / 5168
1700 / 5168
1800 / 5168
1900 / 5168
2000 / 5168
2100 / 5168
2200 / 5168
2300 / 5168
2400 / 5168
2500 / 5168
2600 / 5168
2700 / 5168
2800 / 5168
2900 / 5168
3000 / 5168
3100 / 5168
3200 / 5168
3300 / 5168
3400 / 5168
3500 / 5168
3600 / 5168
3700 / 5168
3800 / 5168
3900 / 5168
4000 / 5168
4100 / 5168
4200 / 5168
4300 / 5168
4400 / 5168
4500 / 5168
4600 / 5168
4700 / 5168
4800 / 5168
4900 / 5168
5000 / 5168
5100 / 5168


In [18]:
DNS_dict = {}
for index, row in data_DNS.iterrows():
    DNS_dict[row["IP"]] = row["Host"]

print(DNS_dict)


dataWithDomains = data.copy()
dataWithDomains["Domain Name"] = dataWithDomains.apply(lambda row: DNS_dict[row["Host IP"]], axis= 1)

{'224.0.0.22': 'igmp.mcast.net', '23.192.152.19': 'a23-192-152-19.deploy.static.akamaitechnologies.com', '193.182.111.141': 'ntp6.flashdance.cx', '17.253.57.211': None, '17.57.146.175': None, '115.167.7.199': '115-167-7-199.telecomsukkah.com', '52.233.234.218': None, '84.200.70.40': None, '146.112.41.2': 'doh.opendns.com', '146.112.41.5': 'doh.umbrella.com', '18.195.249.137': 'ec2-18-195-249-137.eu-central-1.compute.amazonaws.com', '40.114.169.224': None, '2.19.224.19': 'a2-19-224-19.deploy.static.akamaitechnologies.com', '17.57.146.136': None, '208.67.222.222': 'dns.umbrella.com', '208.67.220.220': 'dns.opendns.com', '172.64.171.5': None, '142.250.179.106': 'par21s20-in-f10.1e100.net', '172.217.20.163': 'waw02s07-in-f3.1e100.net', '13.248.212.111': 'ac88393aca5853df7.awsglobalaccelerator.com', '34.241.4.102': 'ec2-34-241-4-102.eu-west-1.compute.amazonaws.com', '17.32.194.34': None, '17.8.153.240': None, '84.200.69.80': None, '17.32.194.2': None, '17.188.23.79': None, '192.229.221.95':

#### Protocol into feature columns

In [19]:
dataWithDummies = dataWithDomains.copy()

# Get the dummies and store it in a variable
dummies = pd.get_dummies(dataWithDummies.Proto).astype(int)
 
# Concatenate the dummies to original dataframe
dataWithDummies = pd.concat([dataWithDummies, dummies], axis='columns')

# drop the values
dataWithDummies = dataWithDummies.drop(['Proto'], axis='columns')
dataWithDummies = dataWithDummies.fillna('0')

#### Port into feature columns

In [20]:
#  Creates a column for non-client ports
dataWithDummies['Host Port'] = None

# Adds the non-client port to the new column by checking that the ports (Src & Dst) does not contain the client IP
for index, row in dataWithDummies.iterrows():
    if row["Host IP"] ==  row['Src IP Addr']:
        dataWithDummies.at[index, 'Host Port'] = row['Src Port']
    else:
        dataWithDummies.at[index, 'Host Port'] = row['Dst Port'] 

# Get the dummies and store it in a variable
dummies = pd.get_dummies(dataWithDummies["Host Port"]).astype(int)
 
# Concatenate the dummies to original dataframe
dataWithDummies = pd.concat([dataWithDummies, dummies], axis='columns')

# drop the values
dataWithDummies = dataWithDummies.fillna('0')

In [21]:
print("A row of data after port dummies:\n")
printRow(dataWithDummies, id)

A row of data after port dummies:

Datetime: 2023-10-18 18:20:45.577000
Duration: 0.0
Packets: 100
Bytes: 5200
Flows: 1
Id: 332
Src IP Addr: 208.67.220.220
Src Port: 443
Dst IP Addr: 192.168.8.177
Dst Port: 53934
Host IP: 208.67.220.220
Client IP: 192.168.8.177
Domain Name: dns.opendns.com
ICMP: 0
IGMP: 0
TCP: 1
UDP: 0
Host Port: 443
0: 0
10001: 0
10171: 0
1195: 0
123: 0
1234: 0
1236: 0
13447: 0
137: 0
16403: 0
16760: 0
19113: 0
19302: 0
22995: 0
27015: 0
27017: 0
27018: 0
27020: 0
27021: 0
27024: 0
27029: 0
27031: 0
27032: 0
27034: 0
27035: 0
27036: 0
27037: 0
27043: 0
27045: 0
27048: 0
27049: 0
27050: 0
27057: 0
27060: 0
27948: 0
29866: 0
3.1: 0
3.3: 0
3478: 0
3479: 0
3481: 0
3482: 0
37684: 0
38285: 0
38512: 0
38768: 0
38824: 0
38893: 0
4070: 0
41347: 0
41947: 0
43790: 0
443: 1
44369: 0
45095: 0
45966: 0
46220: 0
465: 0
47524: 0
48669: 0
49416: 0
50002: 0
50003: 0
50004: 0
50016: 0
50021: 0
50649: 0
5222: 0
5223: 0
5228: 0
53: 0
55323: 0
55692: 0
57021: 0
587: 0
58757: 0
58805: 0
590

#### BOW for domain names

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
dataWithBOW = dataWithDummies.copy()

# Create and fit vectorizer
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(dataWithBOW['Domain Name'])

# vectorize
df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
# find columns with only numeric names and drop them
numeric_columns = df_bow.columns[df_bow.columns.str.isnumeric()]
df_bow.drop(numeric_columns, axis=1, inplace=True)

dataWithBOW.reset_index(drop=True, inplace=True)
df_bow.reset_index(drop=True, inplace=True)
dataWithBOW = pd.concat([dataWithBOW, df_bow], axis=1)

In [29]:
print("A row of data after Bag of Words:\n")
printRow(dataWithBOW, id)

A row of data after Bag of Words:

Datetime: 2023-10-18 18:20:45.577000
Duration: 0.0
Packets: 100
Bytes: 5200
Flows: 1
Id: 332
Src IP Addr: 208.67.220.220
Src Port: 443
Dst IP Addr: 192.168.8.177
Dst Port: 53934
Host IP: 208.67.220.220
Client IP: 192.168.8.177
Domain Name: dns.opendns.com
ICMP: 0
IGMP: 0
TCP: 1
UDP: 0
Host Port: 443
0: 0
10001: 0
10171: 0
1195: 0
123: 0
1234: 0
1236: 0
13447: 0
137: 0
16403: 0
16760: 0
19113: 0
19302: 0
22995: 0
27015: 0
27017: 0
27018: 0
27020: 0
27021: 0
27024: 0
27029: 0
27031: 0
27032: 0
27034: 0
27035: 0
27036: 0
27037: 0
27043: 0
27045: 0
27048: 0
27049: 0
27050: 0
27057: 0
27060: 0
27948: 0
29866: 0
3.1: 0
3.3: 0
3478: 0
3479: 0
3481: 0
3482: 0
37684: 0
38285: 0
38512: 0
38768: 0
38824: 0
38893: 0
4070: 0
41347: 0
41947: 0
43790: 0
443: 1
44369: 0
45095: 0
45966: 0
46220: 0
465: 0
47524: 0
48669: 0
49416: 0
50002: 0
50003: 0
50004: 0
50016: 0
50021: 0
50649: 0
5222: 0
5223: 0
5228: 0
53: 0
55323: 0
55692: 0
57021: 0
587: 0
58757: 0
58805: 0
590

#### Cert lookup

In [30]:
# WHOIS FETCH FUNCTION
def get_ip_info(ip):
    
    ipwhois_obj = IPWhois(ip)

    result = ipwhois_obj.lookup_rdap()

    # Available information from ipwhois:

    # print("IP Address:", result['query'])
    # print("ASN:", result['asn'])
    # print("CIDR:", result['asn_cidr'])
    # print("Name:", result['network']['name'])
    # print("Country:", result['asn_country_code'])
    # print("Description:", result['asn_description'])

    return {"name": result['network']['name'], "country": result['asn_country_code']}

In [31]:
dataWithWhoIs = dataWithBOW.copy()

sz = len(dataWithWhoIs)
t = 0
memo = {}
## ITERATING THROUGH DATASET / CERTIFICATE LOOKUP OF IP
for index, row in dataWithWhoIs.iterrows():

    host_IP = row["Host IP"]

    if(host_IP not in memo): # NEW IP
        print("not in")
        try: 
            # fetch ipwhois info
            whoIsResult = get_ip_info(row["Host IP"])

            # add info
            dataWithWhoIs.at[index, "ipwhois_name"] = whoIsResult["name"]
            dataWithWhoIs.at[index, "ipwhois_country"] = whoIsResult["country"]

            # save for later
            memo[host_IP] = whoIsResult

        except:
            # default option
            dataWithWhoIs.at[index, "ipwhois_name"] = np.nan
            dataWithWhoIs.at[index, "ipwhois_country"] = np.nan

    else: # IP ALREADY CHECKED
        # get saved whois info
        whoIsResult = memo[host_IP]

        # add info
        dataWithWhoIs.at[index, "ipwhois_name"] = whoIsResult["name"]
        dataWithWhoIs.at[index, "ipwhois_country"] = whoIsResult["country"]

    t+=1
    
    print(f"{t} / {sz}")

not in
1 / 99995
not in
2 / 99995
not in
3 / 99995
not in
4 / 99995
not in
5 / 99995
not in
6 / 99995
not in
7 / 99995
not in
8 / 99995
not in
9 / 99995
not in
10 / 99995
not in
11 / 99995
not in
12 / 99995
not in
13 / 99995
not in
14 / 99995
not in
15 / 99995
not in
16 / 99995
not in
17 / 99995
not in
18 / 99995
not in
19 / 99995
not in
20 / 99995
not in
21 / 99995
not in
22 / 99995
not in
23 / 99995
not in
24 / 99995
not in
25 / 99995
not in
26 / 99995
not in
27 / 99995
not in
28 / 99995
not in
29 / 99995
not in
30 / 99995
not in
31 / 99995
not in
32 / 99995
not in
33 / 99995
not in
34 / 99995
not in
35 / 99995
not in
36 / 99995
not in
37 / 99995
not in
38 / 99995
not in
39 / 99995
not in
40 / 99995
not in
41 / 99995
not in
42 / 99995
not in
43 / 99995
not in
44 / 99995
not in
45 / 99995
not in
46 / 99995
not in
47 / 99995
not in
48 / 99995
not in
49 / 99995
not in
50 / 99995
not in
51 / 99995
not in
52 / 99995
not in
53 / 99995
not in
54 / 99995
not in
55 / 99995
not in
56 / 99995
n

In [32]:
# MAKE DUMMIES FROM WHOIS DATA

# get the dummies and store it in a variable
dummies_name = pd.get_dummies(dataWithWhoIs.ipwhois_name).astype(int)
dummies_country = pd.get_dummies(dataWithWhoIs.ipwhois_country).astype(int)

 
# Concatenate the dummies to original dataframe
dataWithWhoIs = pd.concat([dataWithWhoIs, dummies_name], axis='columns')
dataWithWhoIs = pd.concat([dataWithWhoIs, dummies_country], axis='columns')


# drop the values
dataWithWhoIs = dataWithWhoIs.drop(["ipwhois_name", "ipwhois_country"], axis='columns')
dataWithWhoIs = dataWithWhoIs.fillna(0)

In [33]:
print("A row of data after whois:\n")
printRow(dataWithWhoIs, id)

A row of data after whois:

Datetime: 2023-10-18 18:20:45.577000
Duration: 0.0
Packets: 100
Bytes: 5200
Flows: 1
Id: 332
Src IP Addr: 208.67.220.220
Src Port: 443
Dst IP Addr: 192.168.8.177
Dst Port: 53934
Host IP: 208.67.220.220
Client IP: 192.168.8.177
Domain Name: dns.opendns.com
ICMP: 0
IGMP: 0
TCP: 1
UDP: 0
Host Port: 443
0: 0
10001: 0
10171: 0
1195: 0
123: 0
1234: 0
1236: 0
13447: 0
137: 0
16403: 0
16760: 0
19113: 0
19302: 0
22995: 0
27015: 0
27017: 0
27018: 0
27020: 0
27021: 0
27024: 0
27029: 0
27031: 0
27032: 0
27034: 0
27035: 0
27036: 0
27037: 0
27043: 0
27045: 0
27048: 0
27049: 0
27050: 0
27057: 0
27060: 0
27948: 0
29866: 0
3.1: 0
3.3: 0
3478: 0
3479: 0
3481: 0
3482: 0
37684: 0
38285: 0
38512: 0
38768: 0
38824: 0
38893: 0
4070: 0
41347: 0
41947: 0
43790: 0
443: 1
44369: 0
45095: 0
45966: 0
46220: 0
465: 0
47524: 0
48669: 0
49416: 0
50002: 0
50003: 0
50004: 0
50016: 0
50021: 0
50649: 0
5222: 0
5223: 0
5228: 0
53: 0
55323: 0
55692: 0
57021: 0
587: 0
58757: 0
58805: 0
59013: 0
5

#### Min max norm

In [34]:
dataNorm = dataWithWhoIs.copy()
dataNorm["Packets"] = pd.to_numeric(dataNorm["Packets"])

columns = ['Bytes', 'Duration', 'Packets']
for column in columns:
    dataNorm[column] = (dataNorm[column] - dataNorm[column].min()) / (dataNorm[column].max() - dataNorm[column].min()) 

In [35]:
print("A row of data after normalization:\n")
printRow(dataNorm, id)

A row of data after normalization:

Datetime: 2023-10-18 18:20:45.577000
Duration: 0.0
Packets: 0.0
Bytes: 1.709225673766078e-06
Flows: 1
Id: 332
Src IP Addr: 208.67.220.220
Src Port: 443
Dst IP Addr: 192.168.8.177
Dst Port: 53934
Host IP: 208.67.220.220
Client IP: 192.168.8.177
Domain Name: dns.opendns.com
ICMP: 0
IGMP: 0
TCP: 1
UDP: 0
Host Port: 443
0: 0
10001: 0
10171: 0
1195: 0
123: 0
1234: 0
1236: 0
13447: 0
137: 0
16403: 0
16760: 0
19113: 0
19302: 0
22995: 0
27015: 0
27017: 0
27018: 0
27020: 0
27021: 0
27024: 0
27029: 0
27031: 0
27032: 0
27034: 0
27035: 0
27036: 0
27037: 0
27043: 0
27045: 0
27048: 0
27049: 0
27050: 0
27057: 0
27060: 0
27948: 0
29866: 0
3.1: 0
3.3: 0
3478: 0
3479: 0
3481: 0
3482: 0
37684: 0
38285: 0
38512: 0
38768: 0
38824: 0
38893: 0
4070: 0
41347: 0
41947: 0
43790: 0
443: 1
44369: 0
45095: 0
45966: 0
46220: 0
465: 0
47524: 0
48669: 0
49416: 0
50002: 0
50003: 0
50004: 0
50016: 0
50021: 0
50649: 0
5222: 0
5223: 0
5228: 0
53: 0
55323: 0
55692: 0
57021: 0
587: 0
587

### DATA PROCESSING DONE

## ONLY KEEP COLUMNS THAT WERE PRESENT IN TRAINING DATA

In [36]:
trainingColumns = loadedColumns.copy()
trainingColumns.remove("Label")
print(len(trainingColumns))

columnsInData = [col for col in trainingColumns if col in dataNorm.columns]
print(columnsInData)

columnsNotInData = [col for col in trainingColumns if col not in columnsInData]
print(columnsNotInData)

364
['Duration', 'Packets', 'Bytes', 'IGMP', 'TCP', 'UDP', '0', '27018', '27036', '27043', '27057', '27060', '3478', '4070', '443', '50002', '5228', '80', '1drv', 'a104', 'a184', 'a2', 'a23', 'a95', 'adobedc', 'akamaitechnologies', 'amazonaws', 'ams', 'ams1', 'ams17s17', 'ams58', 'arn001', 'arn09s18', 'arn09s19', 'arn09s20', 'arn09s21', 'arn09s22', 'arn09s23', 'arn09s25', 'arn09s26', 'arn09s27', 'arn1', 'arn11s03', 'arn11s09', 'arn11s10', 'arn11s11', 'arn11s12', 'arn11s13', 'arn2', 'arn54', 'arn56', 'awsglobalaccelerator', 'bc', 'bl', 'bunnyinfra', 'c079', 'c084', 'c089', 'c090', 'c102', 'c108', 'c110', 'cdn', 'central', 'clients', 'cloudfront', 'com', 'compute', 'data', 'dc', 'de', 'deploy', 'dgw', 'east', 'ec2', 'ed0', 'ed13', 'ed14', 'ed2', 'edge', 'eu', 'f1', 'f10', 'f113', 'f13', 'f138', 'f14', 'f17', 'f170', 'f188', 'f19', 'f2', 'f22', 'f27', 'f3', 'f4', 'f6', 'f7', 'f8', 'f9', 'f94', 'facebook', 'fbcdn', 'fra', 'fra60', 'github', 'google', 'googleusercontent', 'https', 'igmp', '

In [37]:
filteredDataframe = dataNorm[columnsInData]

zeroDataframe = pd.DataFrame(0, index = filteredDataframe.index, columns = [col for col in columnsNotInData])

completeDataframe = filteredDataframe.join(zeroDataframe)

### Convert into tensor

In [38]:
X = completeDataframe.astype("float32").to_numpy()
X = torch.tensor(X, dtype=torch.float32)

### Import model data

In [39]:
# Load labels dictionary

labelsDictPath = "labelsDictionary.pkl"

with open(filePath + labelsDictPath, "rb") as pkl_f:
    LABELS_DICT = pickle.load(pkl_f)

print("LABELS AND CORRESPONDING NUMBER:")
for key,val in LABELS_DICT.items():
    print(f"{val}\t{key}")



LABELS AND CORRESPONDING NUMBER:
0	Youtube
1	Netflix
2	Browsing/Shopping
3	Twitch TV
4	Prime Video
5	SVT Play
6	Spotify
7	Facebook
8	Playstation
9	Soundcloud
10	Discord
11	Reddit
12	Amazon SHOP
13	Google Drive
14	Skype
15	Disney+
16	Steam Gaming
17	Gmail
18	Instagram
19	Outlook Mail
20	X


In [40]:
def label2num(label):
    return(LABELS_DICT[label])
def num2label(num):
    return next((key for key, val in LABELS_DICT.items() if val == num), None)

In [41]:
# Service to activity translation

ACTIVITY_DICT = {
    "Youtube":           "Video streaming",
    "Netflix":           "Video streaming",
    "Browsing/Shopping": "Browsing/Shopping",
    "Twitch TV":         "Video streaming",
    "Prime Video":       "Video streaming",
    "SVT Play":          "Video streaming",
    "Spotify":           "Sound streaming",
    "Facebook":          "Social media",
    "Playstation":       "Gaming",
    "Soundcloud":        "Sound streaming",
    "Discord":           "Voice chat",
    "Reddit":            "Social media",
    "Amazon SHOP":       "Browsing/Shopping",
    "Google Drive":      "Browsing/Shopping",
    "Skype":             "Voice chat",
    "Disney+":           "Video streaming",
    "Steam Gaming":      "Gaming",
    "Gmail":             "Browsing/Shopping",
    "Instagram":         "Social media",
    "Outlook Mail":      "Browsing/Shopping",
    "X":                 "Social media",
}

def service2activity(service):
    return(ACTIVITY_DICT[service])

In [42]:
# NETWORK LAYER SIZES
input_size = len(trainingColumns)
h1 = 20000
h2 = 16000
h3 = 8000
h4 = 3000
h5 = 600
h6 = 80
output_size = len(LABELS_DICT)



In [43]:
# Import trained model weights


modelWeights = "modelWeights.pth"

model = Net(input_size, h1, h2, h3, h4, h5, h6, output_size)

model.load_state_dict(torch.load(filePath + modelWeights))

<All keys matched successfully>

### Make classifications on pilot file

In [46]:
# with torch.no_grad():
#     model.eval()
#     logits = model(X) #model made guess vectors (logits)
#     probs = torch.softmax(logits, dim=1) # probabilites
#     preds = torch.argmax(probs, dim = 1)

chunk_size = 10  # num chunks
chunks = torch.split(X, chunk_size, dim=0)

all_preds = torch.tensor([], dtype=torch.long)

with torch.no_grad():
    model.eval()

    for X_chunk in chunks:
        # Make predictions for the current chunk
        logits_chunk = model(X_chunk)
        probs_chunk = torch.softmax(logits_chunk, dim=1)
        preds_chunk = torch.argmax(probs_chunk, dim=1)

        # Concatenate the current predictions to the overall predictions
        all_preds = torch.cat((all_preds, preds_chunk), dim=0)


# logits = logits.tolist()   
# probs = probs.tolist()  
servicePredsNum = all_preds.tolist()
servicePredsStr = [num2label(num) for num in servicePredsNum]
activityPreds = [service2activity(service) for service in servicePredsStr]

# print(logits[0])
# print(probs[0])

print(servicePredsNum)
print(servicePredsStr)
print(activityPreds)


[6, 6, 5, 6, 3, 3, 5, 6, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 5, 3, 2, 6, 0, 6, 6, 5, 6, 6, 6, 6, 6, 5, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 6, 6, 5, 6, 6, 5, 5, 6, 6, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 5, 5, 6, 5, 6, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 2, 6, 6, 6, 0, 6, 6, 5, 6, 2, 6, 6, 6, 6, 5, 5, 5, 5, 0, 6, 5, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 5, 6, 3, 5, 6, 5, 6, 5, 5, 6, 6, 6, 6, 6, 3, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 2, 6, 6, 2, 5, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 6, 5, 6, 5, 5, 5, 6, 6, 5, 5, 6, 6, 6, 6, 5, 6, 5, 6, 6, 6, 6, 5, 6, 0, 5, 3, 5, 6, 6, 6, 5, 5, 5, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 5, 3, 10, 5, 6, 6, 5, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 5, 6,

### Handle predictions

In [47]:
print("SERVICE PREDICTIONS:\n")
for service, num in LABELS_DICT.items():
    count = servicePredsStr.count(service)
    ratio = count/len(servicePredsStr) * 100
    if count:
        print(f"{service} was predicted to be {round(ratio,3)}%")

print("ACTIVITY PREDICTIONS:\n")
uniqueActivites = set(ACTIVITY_DICT.values())
for activity in uniqueActivites:
    count = activityPreds.count(activity)
    ratio = count / len(activityPreds) * 100
    if count:
        print(f"{activity} was predicted to be {round(ratio,3)}%")


SERVICE PREDICTIONS:

Youtube was predicted to be 0.351%
Browsing/Shopping was predicted to be 1.99%
Twitch TV was predicted to be 12.289%
SVT Play was predicted to be 29.236%
Spotify was predicted to be 55.66%
Facebook was predicted to be 0.001%
Playstation was predicted to be 0.335%
Discord was predicted to be 0.089%
Skype was predicted to be 0.001%
Steam Gaming was predicted to be 0.048%
ACTIVITY PREDICTIONS:

Sound streaming was predicted to be 55.66%
Gaming was predicted to be 0.383%
Video streaming was predicted to be 41.876%
Social media was predicted to be 0.001%
Browsing/Shopping was predicted to be 1.99%
Voice chat was predicted to be 0.09%


In [None]:
import csv

with open('servocePredsNum.csv', 'w', newline = '') as csvfile:
    my_writer = csv.writer(csvfile, delimiter = ' ')
    my_writer.writerow(servicePredsNum)

with open('servicePredsStr.csv', 'w', newline = '') as csvfile:
    my_writer = csv.writer(csvfile, delimiter = ' ')
    my_writer.writerow(servicePredsStr)

with open('activityPreds.csv', 'w', newline = '') as csvfile:
    my_writer = csv.writer(csvfile, delimiter = ' ')
    my_writer.writerow(activityPreds)
