In [3]:
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from modelClass import Net

import pickle

### Load training data columns

In [79]:
filePath =  "../../project_course_data/"

colNamesPath = "dataColNames.txt"

with open(filePath + colNamesPath, "r") as f:
    loadedColumns = [str(line.strip()) for line in f]

print(loadedColumns)




['Label', 'Duration', 'Packets', 'Bytes', 'IGMP', 'TCP', 'UDP', '0', '1900', '22222', '27018', '27025', '27036', '27043', '27047', '27051', '27053', '27057', '27060', '3478', '3480', '34820', '3702', '4070', '44142', '443', '50002', '50012', '50022', '50027', '5228', '5353', '5355', '67', '80', '8009', '1drv', '1.00E+100', '82f3dc', '833aec', 'a104', 'a184', 'a2', 'a23', 'a95', 'aa784e235de7c8b14', 'adobedc', 'akamaitechnologies', 'all', 'amazonaws', 'ams', 'ams1', 'ams15s47', 'ams15s51', 'ams17s02', 'ams17s04', 'ams17s13', 'ams17s17', 'ams58', 'andreas', 'arn', 'arn001', 'arn04', 'arn09s18', 'arn09s19', 'arn09s20', 'arn09s21', 'arn09s22', 'arn09s23', 'arn09s25', 'arn09s26', 'arn09s27', 'arn1', 'arn11s03', 'arn11s04', 'arn11s09', 'arn11s10', 'arn11s11', 'arn11s12', 'arn11s13', 'arn11s14', 'arn2', 'arn54', 'arn56', 'awsglobalaccelerator', 'bc', 'berlin', 'betterttv', 'bkk03s02', 'bl', 'bunnyinfra', 'c061', 'c062', 'c063', 'c064', 'c066', 'c069', 'c073', 'c076', 'c078', 'c079', 'c081', '

LABELS AND CORRESPONDING NUMBER:
0	Youtube
1	Netflix
2	Browsing/Shopping
3	Twitch TV
4	Prime Video
5	SVT Play
6	Spotify
7	Facebook
8	Playstation
9	Soundcloud
10	Discord
11	Reddit
12	Amazon SHOP
13	Google Drive
14	Skype
15	Disney+
16	Steam Gaming
17	Gmail
18	Instagram
19	Outlook Mail
20	X


<All keys matched successfully>

### Load new data

In [6]:
dataPath = "pilottest.txt"

with open(filePath + dataPath, "r") as f:
    content = f.read().replace("->", " ").replace(" K ", "K ").replace(" M ", "M ").replace(" G ", "G ")

### Do all data processing again

In [7]:
import io

In [8]:
csvStringIO = io.StringIO(content)
columnNames = ["Datetime", "Time", "Duration",  "Proto", "Src IP Addr:Port", "Dst IP Addr:Port", "Packets", "Bytes", "Flows"]

# dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f')

data = pd.read_csv(csvStringIO, sep = '\s+', names = columnNames, header = None, usecols=range(len(columnNames)), parse_dates=True, engine = "python")
data = data.iloc[1:-4] #Removes the summary lines and col names

data["Datetime"] += " " + data["Time"]
data = data.drop(columns = ["Time"])

data["Id"] = data.reset_index(drop = True).index

#### Debug function

In [9]:
def printRow(df, id):
    try:
        row = df[df["Id"] == id]
        for col, val in row.iloc[0].items():
            print(f"{col}: {val}")
    except KeyError:
        print(f"Row with id {id} not found in DataFrame.")

In [10]:
id = 332
printRow(data, id)

Datetime: 2023-12-04 20:17:14.325
Duration: 0.126
Proto: UDP
Src IP Addr:Port: 192.168.8.195:53709
Dst IP Addr:Port: 74.125.111.73:443
Packets: 100
Bytes: 6100
Flows: 1
Id: 332


#### Sorting and splitting and combining

In [11]:
 # Convert the datetime column to pandas datetime format
data['Datetime'] = pd.to_datetime(data['Datetime'])

# Sort the DataFrame by the datetime column
data = data.sort_values(by='Datetime')

# Split IP address and port to two columns, and drops the old column
data[['Src IP Addr', 'Src Port']] = data['Src IP Addr:Port'].str.split(':', n=1, expand=True)
data[['Dst IP Addr', 'Dst Port']] = data['Dst IP Addr:Port'].str.split(':', n=1, expand=True)
data = data.drop(columns=["Src IP Addr:Port", "Dst IP Addr:Port"])

In [12]:
printRow(data, id)

Datetime: 2023-12-04 20:17:14.325000
Duration: 0.126
Proto: UDP
Packets: 100
Bytes: 6100
Flows: 1
Id: 332
Src IP Addr: 192.168.8.195
Src Port: 53709
Dst IP Addr: 74.125.111.73
Dst Port: 443


#### Preprocess data

In [13]:
# Changes Duration column to float
data['Duration'] = data['Duration'].astype(float)
# Drop outliers
data = data.drop(data[data.Duration > 10000].index)

# Find clients IP address
client = data['Src IP Addr'].value_counts().idxmax().split('.')
client = '.'.join(client[:3])

# Initialize 'Host IP'-column from 'Src Ip Addr'
data['Host IP'] = data['Src IP Addr']
data['Client IP'] = data['Dst IP Addr']

# Removes all internal flows
# If the destination IP is not equal to the clients IP, adds it to 'Host IP'-column
for index, row in data.iterrows():
    if client in row['Dst IP Addr'] and client in row['Src IP Addr']:
        data.drop(index, inplace=True)
    elif client not in row['Dst IP Addr']:
        data.at[index, 'Host IP'] = row['Dst IP Addr']
        data.at[index, 'Client IP'] = row['Src IP Addr']

In [14]:
printRow(data, id)

Datetime: 2023-12-04 20:17:14.325000
Duration: 0.126
Proto: UDP
Packets: 100
Bytes: 6100
Flows: 1
Id: 332
Src IP Addr: 192.168.8.195
Src Port: 53709
Dst IP Addr: 74.125.111.73
Dst Port: 443
Host IP: 74.125.111.73
Client IP: 192.168.8.195


#### Prefix fixing

In [15]:
def convert_bytes(value):
    value = str(value).upper()
    multipliers = {'K': 1000, 'M': 1000**2, 'G': 1000**3}

    if value[-1] in multipliers:
        return int(float(value[0:-1]) * multipliers[value[-1]])
    else:
        return int(value)

# Apply the conversion function to the 'Bytes' column
data['Bytes'] = data['Bytes'].apply(convert_bytes)
data['Packets'] = data['Packets'].apply(convert_bytes)

In [16]:
printRow(data, id)

Datetime: 2023-12-04 20:17:14.325000
Duration: 0.126
Proto: UDP
Packets: 100
Bytes: 6100
Flows: 1
Id: 332
Src IP Addr: 192.168.8.195
Src Port: 53709
Dst IP Addr: 74.125.111.73
Dst Port: 443
Host IP: 74.125.111.73
Client IP: 192.168.8.195


#### Reverse DNS

In [18]:
import socket

In [19]:
# Find all unique addresses
unique_ip = data['Host IP'].unique()

# Creates a new dataframe
data_DNS = pd.DataFrame(columns=['IP', 'Host'])

data_DNS['IP']=unique_ip
host = []

# for-loop for doing reverse DNS lookup
i=0
for ip in unique_ip:
    try:
        host_name = socket.gethostbyaddr(ip)[0]
        host.append(host_name)
    except socket.herror:
        host.append(None)
    
    if i % 100 == 0: # Used to keep track how far along we've come
        print(f"{i} / {len(unique_ip)}")
    i += 1

# Adds the corresponding domain names to the IP-addresses and creates a CSV-file
data_DNS['Host'] = host
# data_DNS.to_csv('./host_names', index=False)

0 / 196
100 / 196


In [20]:
DNS_dict = {}
for index, row in data_DNS.iterrows():
    DNS_dict[row["IP"]] = row["Host"]

print(DNS_dict)


dataWithDomains = data.copy()
dataWithDomains["Domain Name"] = dataWithDomains.apply(lambda row: DNS_dict[row["Host IP"]], axis= 1)

{'255.255.255.255': None, '23.58.201.11': 'a23-58-201-11.deploy.static.akamaitechnologies.com', '40.91.80.89': None, '23.34.234.75': 'a23-34-234-75.deploy.static.akamaitechnologies.com', '20.123.104.105': None, '23.201.43.185': 'a23-201-43-185.deploy.static.akamaitechnologies.com', '23.201.43.155': 'a23-201-43-155.deploy.static.akamaitechnologies.com', '20.190.177.84': None, '204.79.197.203': 'a-0003.a-msedge.net', '20.223.36.55': None, '204.79.197.222': None, '13.107.42.254': None, '52.98.151.66': None, '34.98.74.57': '57.74.98.34.bc.googleusercontent.com', '35.186.224.25': '25.224.186.35.bc.googleusercontent.com', '34.36.232.77': '77.232.36.34.bc.googleusercontent.com', '35.186.224.17': '17.224.186.35.bc.googleusercontent.com', '13.107.42.16': None, '142.250.74.98': 'arn11s10-in-f2.1e100.net', '204.79.197.239': None, '20.199.58.43': None, '132.245.231.8': None, '142.250.74.33': 'arn09s22-in-f1.1e100.net', '142.250.74.66': 'arn09s23-in-f2.1e100.net', '80.239.138.80': None, '142.250.74

#### Protocol into feature columns

In [21]:
dataWithDummies = dataWithDomains.copy()

# Get the dummies and store it in a variable
dummies = pd.get_dummies(dataWithDummies.Proto).astype(int)
 
# Concatenate the dummies to original dataframe
dataWithDummies = pd.concat([dataWithDummies, dummies], axis='columns')

# drop the values
dataWithDummies = dataWithDummies.drop(['Proto'], axis='columns')
dataWithDummies = dataWithDummies.fillna('0')

#### Port into feature columns

In [22]:
#  Creates a column for non-client ports
dataWithDummies['Host Port'] = None

# Adds the non-client port to the new column by checking that the ports (Src & Dst) does not contain the client IP
for index, row in dataWithDummies.iterrows():
    if row["Host IP"] ==  row['Src IP Addr']:
        dataWithDummies.at[index, 'Host Port'] = row['Src Port']
    else:
        dataWithDummies.at[index, 'Host Port'] = row['Dst Port'] 

# Get the dummies and store it in a variable
dummies = pd.get_dummies(dataWithDummies["Host Port"]).astype(int)
 
# Concatenate the dummies to original dataframe
dataWithDummies = pd.concat([dataWithDummies, dummies], axis='columns')

# drop the values
dataWithDummies = dataWithDummies.fillna('0')

In [23]:
print("A row of data after port dummies:\n")
printRow(dataWithDummies, id)

A row of data after port dummies:

Datetime: 2023-12-04 20:17:14.325000
Duration: 0.126
Packets: 100
Bytes: 6100
Flows: 1
Id: 332
Src IP Addr: 192.168.8.195
Src Port: 53709
Dst IP Addr: 74.125.111.73
Dst Port: 443
Host IP: 74.125.111.73
Client IP: 192.168.8.195
Domain Name: arn09s26-in-f9.1e100.net
TCP: 0
UDP: 1
Host Port: 443
1900: 0
3702: 0
443: 1
5228: 0
67: 0
80: 0


#### BOW for domain names

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
dataWithBOW = dataWithDummies.copy()

# Create and fit vectorizer
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(dataWithBOW['Domain Name'])

# vectorize
df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# find columns with only numeric names and drop them
numeric_columns = df_bow.columns[df_bow.columns.str.isnumeric()]
df_bow.drop(numeric_columns, axis=1, inplace=True)

dataWithBOW.reset_index(drop=True, inplace=True)
df_bow.reset_index(drop=True, inplace=True)
dataWithBOW = pd.concat([dataWithBOW, df_bow], axis=1)

In [27]:
print("A row of data after Bag of Words:\n")
printRow(dataWithBOW, id)

A row of data after Bag of Words:

Datetime: 2023-12-04 20:17:14.325000
Duration: 0.126
Packets: 100
Bytes: 6100
Flows: 1
Id: 332
Src IP Addr: 192.168.8.195
Src Port: 53709
Dst IP Addr: 74.125.111.73
Dst Port: 443
Host IP: 74.125.111.73
Client IP: 192.168.8.195
Domain Name: arn09s26-in-f9.1e100.net
TCP: 0
UDP: 1
Host Port: 443
1900: 0
3702: 0
443: 1
5228: 0
67: 0
80: 0
1e100: 1
a184: 0
a2: 0
a23: 0
a95: 0
akamaitechnologies: 0
amazonaws: 0
ams58: 0
arn001: 0
arn09s18: 0
arn09s19: 0
arn09s20: 0
arn09s21: 0
arn09s22: 0
arn09s23: 0
arn09s25: 0
arn09s26: 1
arn09s27: 0
arn1: 0
arn11s03: 0
arn11s04: 0
arn11s09: 0
arn11s10: 0
arn11s11: 0
arn11s12: 0
arn11s13: 0
arn11s14: 0
arn2: 0
arn54: 0
arn56: 0
bc: 0
bl: 0
c062: 0
c063: 0
c064: 0
c069: 0
c073: 0
c076: 0
c078: 0
c084: 0
c085: 0
c086: 0
c087: 0
c089: 0
c091: 0
c093: 0
c094: 0
c095: 0
c096: 0
c097: 0
c100: 0
c102: 0
c104: 0
c105: 0
c108: 0
c109: 0
cdn: 0
cloudfront: 0
com: 0
compute: 0
cph2: 0
dc: 0
deploy: 0
ec2: 0
edge: 0
eu: 0
f1: 0
f10: 

#### Cert lookup

In [28]:
# WHOIS FETCH FUNCTION
def get_ip_info(ip):
    
    ipwhois_obj = IPWhois(ip)

    result = ipwhois_obj.lookup_rdap()

    # Available information from ipwhois:

    # print("IP Address:", result['query'])
    # print("ASN:", result['asn'])
    # print("CIDR:", result['asn_cidr'])
    # print("Name:", result['network']['name'])
    # print("Country:", result['asn_country_code'])
    # print("Description:", result['asn_description'])

    return {"name": result['network']['name'], "country": result['asn_country_code']}

In [29]:
dataWithWhoIs = dataWithBOW.copy()

sz = len(dataWithWhoIs)
t = 0
memo = {}
## ITERATING THROUGH DATASET / CERTIFICATE LOOKUP OF IP
for index, row in dataWithWhoIs.iterrows():

    host_IP = row["Host IP"]

    if(host_IP not in memo): # NEW IP
        print("not in")
        try: 
            # fetch ipwhois info
            whoIsResult = get_ip_info(row["Host IP"])

            # add info
            dataWithWhoIs.at[index, "ipwhois_name"] = whoIsResult["name"]
            dataWithWhoIs.at[index, "ipwhois_country"] = whoIsResult["country"]

            # save for later
            memo[host_IP] = whoIsResult

        except:
            # default option
            dataWithWhoIs.at[index, "ipwhois_name"] = np.nan
            dataWithWhoIs.at[index, "ipwhois_country"] = np.nan

    else: # IP ALREADY CHECKED
        # get saved whois info
        whoIsResult = memo[host_IP]

        # add info
        dataWithWhoIs.at[index, "ipwhois_name"] = whoIsResult["name"]
        dataWithWhoIs.at[index, "ipwhois_country"] = whoIsResult["country"]

    t+=1
    
    print(f"{t} / {sz}")

not in
1 / 884
not in
2 / 884
not in
3 / 884
not in
4 / 884
not in
5 / 884
not in
6 / 884
not in
7 / 884
not in
8 / 884
not in
9 / 884
not in
10 / 884
not in
11 / 884
not in
12 / 884
not in
13 / 884
not in
14 / 884
not in
15 / 884
not in
16 / 884
not in
17 / 884
not in
18 / 884
not in
19 / 884
not in
20 / 884
not in
21 / 884
not in
22 / 884
not in
23 / 884
not in
24 / 884
not in
25 / 884
not in
26 / 884
not in
27 / 884
not in
28 / 884
not in
29 / 884
not in
30 / 884
not in
31 / 884
not in
32 / 884
not in
33 / 884
not in
34 / 884
not in
35 / 884
not in
36 / 884
not in
37 / 884
not in
38 / 884
not in
39 / 884
not in
40 / 884
not in
41 / 884
not in
42 / 884
not in
43 / 884
not in
44 / 884
not in
45 / 884
not in
46 / 884
not in
47 / 884
not in
48 / 884
not in
49 / 884
not in
50 / 884
not in
51 / 884
not in
52 / 884
not in
53 / 884
not in
54 / 884
not in
55 / 884
not in
56 / 884
not in
57 / 884
not in
58 / 884
not in
59 / 884
not in
60 / 884
not in
61 / 884
not in
62 / 884
not in
63 / 884
n

In [30]:
# MAKE DUMMIES FROM WHOIS DATA

# get the dummies and store it in a variable
dummies_name = pd.get_dummies(dataWithWhoIs.ipwhois_name).astype(int)
dummies_country = pd.get_dummies(dataWithWhoIs.ipwhois_country).astype(int)

 
# Concatenate the dummies to original dataframe
dataWithWhoIs = pd.concat([dataWithWhoIs, dummies_name], axis='columns')
dataWithWhoIs = pd.concat([dataWithWhoIs, dummies_country], axis='columns')


# drop the values
dataWithWhoIs = dataWithWhoIs.drop(["ipwhois_name", "ipwhois_country"], axis='columns')
dataWithWhoIs = dataWithWhoIs.fillna(0)

In [31]:
print("A row of data after whois:\n")
printRow(dataWithWhoIs, id)

A row of data after whois:

Datetime: 2023-12-04 20:17:14.325000
Duration: 0.126
Packets: 100
Bytes: 6100
Flows: 1
Id: 332
Src IP Addr: 192.168.8.195
Src Port: 53709
Dst IP Addr: 74.125.111.73
Dst Port: 443
Host IP: 74.125.111.73
Client IP: 192.168.8.195
Domain Name: arn09s26-in-f9.1e100.net
TCP: 0
UDP: 1
Host Port: 443
1900: 0
3702: 0
443: 1
5228: 0
67: 0
80: 0
1e100: 1
a184: 0
a2: 0
a23: 0
a95: 0
akamaitechnologies: 0
amazonaws: 0
ams58: 0
arn001: 0
arn09s18: 0
arn09s19: 0
arn09s20: 0
arn09s21: 0
arn09s22: 0
arn09s23: 0
arn09s25: 0
arn09s26: 1
arn09s27: 0
arn1: 0
arn11s03: 0
arn11s04: 0
arn11s09: 0
arn11s10: 0
arn11s11: 0
arn11s12: 0
arn11s13: 0
arn11s14: 0
arn2: 0
arn54: 0
arn56: 0
bc: 0
bl: 0
c062: 0
c063: 0
c064: 0
c069: 0
c073: 0
c076: 0
c078: 0
c084: 0
c085: 0
c086: 0
c087: 0
c089: 0
c091: 0
c093: 0
c094: 0
c095: 0
c096: 0
c097: 0
c100: 0
c102: 0
c104: 0
c105: 0
c108: 0
c109: 0
cdn: 0
cloudfront: 0
com: 0
compute: 0
cph2: 0
dc: 0
deploy: 0
ec2: 0
edge: 0
eu: 0
f1: 0
f10: 0
f14: 

#### Min max norm

In [32]:
dataNorm = dataWithWhoIs.copy()
dataNorm["Packets"] = pd.to_numeric(dataNorm["Packets"])

columns = ['Bytes', 'Duration', 'Packets']
for column in columns:
    dataNorm[column] = (dataNorm[column] - dataNorm[column].min()) / (dataNorm[column].max() - dataNorm[column].min()) 

In [33]:
print("A row of data after normalization:\n")
printRow(dataNorm, id)

A row of data after normalization:

Datetime: 2023-12-04 20:17:14.325000
Duration: 3.606138678355898e-05
Packets: 0.0
Bytes: 1.7500058333527779e-06
Flows: 1
Id: 332
Src IP Addr: 192.168.8.195
Src Port: 53709
Dst IP Addr: 74.125.111.73
Dst Port: 443
Host IP: 74.125.111.73
Client IP: 192.168.8.195
Domain Name: arn09s26-in-f9.1e100.net
TCP: 0
UDP: 1
Host Port: 443
1900: 0
3702: 0
443: 1
5228: 0
67: 0
80: 0
1e100: 1
a184: 0
a2: 0
a23: 0
a95: 0
akamaitechnologies: 0
amazonaws: 0
ams58: 0
arn001: 0
arn09s18: 0
arn09s19: 0
arn09s20: 0
arn09s21: 0
arn09s22: 0
arn09s23: 0
arn09s25: 0
arn09s26: 1
arn09s27: 0
arn1: 0
arn11s03: 0
arn11s04: 0
arn11s09: 0
arn11s10: 0
arn11s11: 0
arn11s12: 0
arn11s13: 0
arn11s14: 0
arn2: 0
arn54: 0
arn56: 0
bc: 0
bl: 0
c062: 0
c063: 0
c064: 0
c069: 0
c073: 0
c076: 0
c078: 0
c084: 0
c085: 0
c086: 0
c087: 0
c089: 0
c091: 0
c093: 0
c094: 0
c095: 0
c096: 0
c097: 0
c100: 0
c102: 0
c104: 0
c105: 0
c108: 0
c109: 0
cdn: 0
cloudfront: 0
com: 0
compute: 0
cph2: 0
dc: 0
deploy:

### DATA PROCESSING DONE

## ONLY KEEP COLUMNS THAT WERE PRESENT IN TRAINING DATA

In [81]:
trainingColumns = loadedColumns.copy()
trainingColumns.remove("Label")
print(len(trainingColumns))

columnsInData = [col for col in trainingColumns if col in dataNorm.columns]
print(columnsInData)

columnsNotInData = [col for col in trainingColumns if col not in columnsInData]
print(columnsNotInData)

364
['Duration', 'Packets', 'Bytes', 'TCP', 'UDP', '1900', '3702', '443', '5228', '67', '80', 'a184', 'a2', 'a23', 'a95', 'akamaitechnologies', 'amazonaws', 'ams58', 'arn001', 'arn09s18', 'arn09s19', 'arn09s20', 'arn09s21', 'arn09s22', 'arn09s23', 'arn09s25', 'arn09s26', 'arn09s27', 'arn1', 'arn11s03', 'arn11s04', 'arn11s09', 'arn11s10', 'arn11s11', 'arn11s12', 'arn11s13', 'arn11s14', 'arn2', 'arn54', 'arn56', 'bc', 'bl', 'c062', 'c063', 'c064', 'c069', 'c073', 'c076', 'c078', 'c084', 'c085', 'c086', 'c087', 'c089', 'c091', 'c093', 'c094', 'c095', 'c096', 'c097', 'c100', 'c102', 'c104', 'c105', 'c108', 'c109', 'cdn', 'cloudfront', 'com', 'compute', 'cph2', 'dc', 'deploy', 'ec2', 'edge', 'eu', 'f1', 'f10', 'f14', 'f17', 'f170', 'f188', 'f2', 'f22', 'f27', 'f3', 'f4', 'f6', 'f7', 'f8', 'f84', 'f9', 'f94', 'facebook', 'fbcdn', 'fra07s64', 'fra16s58', 'github', 'googleusercontent', 'iad23s04', 'in', 'ipv4', 'ix', 'kul09s13', 'lk', 'lr', 'lt', 'mini', 'msedge', 'muc03s13', 'net', 'nflxvideo

In [82]:
filteredDataframe = dataNorm[columnsInData]

zeroDataframe = pd.DataFrame(0, index = filteredDataframe.index, columns = [col for col in columnsNotInData])

completeDataframe = filteredDataframe.join(zeroDataframe)

### Convert into tensor

In [88]:
X = completeDataframe.astype("float32").to_numpy()
X = torch.tensor(X, dtype=torch.float32)

### Import model data

In [114]:
# Load labels dictionary

labelsDictPath = "labelsDictionary.pkl"

with open(filePath + labelsDictPath, "rb") as pkl_f:
    LABELS_DICT = pickle.load(pkl_f)

print("LABELS AND CORRESPONDING NUMBER:")
for key,val in LABELS_DICT.items():
    print(f"{val}\t{key}")



LABELS AND CORRESPONDING NUMBER:
0	Youtube
1	Netflix
2	Browsing/Shopping
3	Twitch TV
4	Prime Video
5	SVT Play
6	Spotify
7	Facebook
8	Playstation
9	Soundcloud
10	Discord
11	Reddit
12	Amazon SHOP
13	Google Drive
14	Skype
15	Disney+
16	Steam Gaming
17	Gmail
18	Instagram
19	Outlook Mail
20	X


In [116]:
def label2num(label):
    return(LABELS_DICT[label])
def num2label(num):
    return next((key for key, val in LABELS_DICT.items() if val == num), None)

In [115]:
# Service to activity translation

ACTIVITY_DICT = {
    "Youtube":           "Video streaming",
    "Netflix":           "Video streaming",
    "Browsing/Shopping": "Browsing/Shopping",
    "Twitch TV":         "Video streaming",
    "Prime Video":       "Video streaming",
    "SVT Play":          "Video streaming",
    "Spotify":           "Sound streaming",
    "Facebook":          "Social media",
    "Playstation":       "Gaming",
    "Soundcloud":        "Sound streaming",
    "Discord":           "Voice chat",
    "Reddit":            "Social media",
    "Amazon SHOP":       "Browsing/Shopping",
    "Google Drive":      "Browsing/Shopping",
    "Skype":             "Voice chat",
    "Disney+":           "Video streaming",
    "Steam Gaming":      "Gaming",
    "Gmail":             "Browsing/Shopping",
    "Instagram":         "Social media",
    "Outlook Mail":      "Browsing/Shopping",
    "X":                 "Social media",
}

def service2activity(service):
    return(ACTIVITY_DICT[service])

In [90]:
# NETWORK LAYER SIZES
input_size = len(trainingColumns)
h1 = 4000
h2 = 1000
h3 = 500
h4 = 100
output_size = len(LABELS_DICT)



In [91]:
# Import trained model weights


modelWeights = "modelWeights.pth"

model = Net(input_size, h1, h2, h3, h4, output_size)

model.load_state_dict(torch.load(filePath + modelWeights))

<All keys matched successfully>

### Make classifications on pilot file

In [117]:
with torch.no_grad():
    model.eval()
    logits = model(X) #model made guess vectors (logits)
    probs = torch.softmax(logits, dim=1) # probabilites
    preds = torch.argmax(probs, dim = 1)

logits = logits.tolist()   
probs = probs.tolist()  
servicePredsNum = preds.tolist()
servicePredsStr = [num2label(num) for num in servicePredsNum]
activityPreds = [service2activity(service) for service in servicePredsStr]

print(logits[0])
print(probs[0])

print(servicePredsNum)
print(servicePredsStr)
print(activityPreds)


[0.21074485778808594, 0.5534424781799316, 0.6883357763290405, 0.8085711002349854, 0.5488860011100769, 0.5113452076911926, 0.362014502286911, 0.22955948114395142, 0.3633122146129608, 0.28089815378189087, -0.22266137599945068, -0.5072119235992432, -0.5257412791252136, -0.6020470261573792, -0.5387327671051025, -0.62226802110672, -0.9602039456367493, -0.5502543449401855, -0.4327058792114258, -0.862647294998169, -0.6808798313140869]
[0.05545608326792717, 0.07812335342168808, 0.08940551429986954, 0.100828155875206, 0.07776819169521332, 0.07490283250808716, 0.06451263278722763, 0.0565093457698822, 0.06459639966487885, 0.05948621779680252, 0.035952016711235046, 0.027048582211136818, 0.026552002876996994, 0.02460130676627159, 0.026209285482764244, 0.02410884015262127, 0.017195412889122963, 0.025909047573804855, 0.029140839353203773, 0.01895749196410179, 0.02273639105260372]
[3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 3, 2, 1, 2, 2, 3, 2

### Handle predictions

In [132]:
print("SERVICE PREDICTIONS:\n")
for service, num in LABELS_DICT.items():
    count = servicePredsStr.count(service)
    ratio = count/len(servicePredsStr) * 100
    if count:
        print(f"{service} was predicted to be {round(ratio,3)}%")

print("ACTIVITY PREDICTIONS:\n")
uniqueActivites = set(ACTIVITY_DICT.values())
for activity in uniqueActivites:
    count = activityPreds.count(activity)
    ratio = count / len(activityPreds) * 100
    if count:
        print(f"{activity} was predicted to be {round(ratio,3)}%")


SERVICE PREDICTIONS:

Youtube was predicted to be 1.131%
Netflix was predicted to be 53.846%
Browsing/Shopping was predicted to be 38.009%
Twitch TV was predicted to be 7.014%
ACTIVITY PREDICTIONS:

Browsing/Shopping was predicted to be 38.009%
Video streaming was predicted to be 61.991%
