In [94]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

file_path = "../../web_service_dataset.csv"
df_original = pd.read_csv(file_path)

print(df_original.shape)

(2704839, 46)


In [81]:
df_test = df_original.sample(n = 100)

df_test

Unnamed: 0,src_ip,src_port,dst_ip,dst_port,proto,pktTotalCount,octetTotalCount,min_ps,max_ps,avg_ps,...,b_std_dev_ps,b_flowStart,b_flowEnd,b_flowDuration,b_min_piat,b_max_piat,b_avg_piat,b_std_dev_piat,category,web_service
973842,192.168.125.177,52511,172.16.255.183,53,UDP,2,154,63,91,77.000000,...,0.000000,1.556117e+09,1.556117e+09,0.000000e+00,0.000000,0.000000,0.000000,0.000000,Web,Microsoft
276926,192.168.121.42,50609,69.195.219.126,443,TCP,15,4506,40,1500,300.400000,...,154.590133,1.556032e+09,1.556032e+09,1.554475e+12,0.000002,0.125039,0.028932,0.044626,Web,TLS
1754237,192.168.125.91,63887,23.56.199.24,443,TCP,20,2740,40,857,137.000000,...,100.846864,1.556640e+09,1.556640e+09,1.555083e+12,0.001052,45.016578,20.024559,22.347417,Web,TLS
489358,192.168.125.190,50976,205.185.216.10,80,TCP,10,1227,40,469,122.700000,...,160.289582,1.556029e+09,1.556029e+09,1.554473e+12,0.000511,29.637302,10.007042,13.881570,Web,Microsoft
1020243,192.168.127.39,64531,172.16.255.183,53,UDP,2,217,64,153,108.500000,...,0.000000,1.556137e+09,1.556137e+09,0.000000e+00,0.000000,0.000000,0.000000,0.000000,Network,DNS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2688379,192.168.128.141,45122,118.214.160.43,443,TCP,1,52,52,52,52.000000,...,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,Web,TLS
572761,192.168.127.76,51912,31.13.67.35,443,TCP,12,3499,40,2511,291.583333,...,920.003336,1.556052e+09,1.556052e+09,1.554496e+12,0.000057,0.115992,0.038850,0.048612,SocialNetwork,Facebook
1556097,192.168.128.61,59277,190.36.161.232,46118,TCP,3,152,48,52,50.666667,...,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,Unspecified,Unknown
477708,192.168.125.87,50051,205.185.216.10,80,TCP,7,304,40,52,43.428571,...,5.656854,1.556029e+09,1.556029e+09,1.554473e+12,0.004051,0.141718,0.072884,0.068833,Web,HTTP


In [95]:
# 2.1
# Do you think all the features are useful for our target, and explain why? If no, indicate the redundant features, and remove them from the dataset.

# We can delete irrelevant features (because they cannot help a lot with prediction of the target variable)

drop_features = ["src_ip", "src_port", "dst_ip", "dst_port"]
df_original.drop(columns=drop_features, inplace=True, errors='ignore')

# We need to delete constant features first
constant_features = [col for col in df_original.columns if df_original[col].nunique() == 1]

if len(constant_features) > 1: 
    print('They need to be deleted!')

# We need to delete duplicate features

df_original.drop_duplicates()

# We can delete features with high correlation 

numeric_df = df_original.select_dtypes(include=['number']) 
correlation_matrix = numeric_df.corr().abs()

c = correlation_matrix[correlation_matrix > 0.8]
s = c.unstack()
so = s.sort_values(ascending=False).reset_index()
so = so[(so[0].isnull() == False) & (so["level_0"] != so["level_1"])]

to_be_deleted = []
candidates = list(so["level_0"])
subset_so = so

for candidate in candidates:
    if (candidate in list(subset_so["level_0"])):
        to_be_deleted.append(candidate)
        subset_so = subset_so[(subset_so["level_0"] != candidate) & (subset_so["level_1"] != candidate)]
        
df_original.drop(columns=to_be_deleted, inplace=True, errors='ignore')

print(f"Size of modified dataframe: {df_original.shape}")

Size of modified dataframe: (2704839, 21)


In [96]:
# 2.2
# For all classes (web service name), do we have a balanced number of samples (flows)? If no, how many services only have few samples, and indicate some examples? Define a reasonable threshold of sample quantity and remove services that do not have enough samples (lower than the threshold). How many services do you have left? Note that the threshold should not be too high, e.g., 1000.

threshold = 1000

web_service_counts = df_original['web_service'].value_counts()

low_web_service = web_service_counts[web_service_counts < threshold]

print("Services with low web counts")
print(low_web_service.head())

# So we need to delete them 

df_original = df_original[~df_original['web_service'].isin(low_web_service.index)]

# So, it is the size of modified dataframe

print(f"Size of modified dataframe: {df_original.shape}")

Services with low web counts
web_service
TeamViewer     990
Telegram       832
SMTP           815
Steam          796
AppleiTunes    707
Name: count, dtype: int64
Size of modified dataframe: (2689966, 21)


In [97]:
df_original

Unnamed: 0,proto,min_ps,avg_ps,flowEnd,min_piat,std_dev_piat,f_pktTotalCount,f_octetTotalCount,f_max_ps,f_avg_ps,...,f_flowDuration,f_std_dev_piat,b_octetTotalCount,b_min_ps,b_flowStart,b_min_piat,b_max_piat,b_std_dev_piat,category,web_service
0,UDP,328,346.363636,1.555955e+09,3.128052e-04,78.913442,17,5670,354,333.529412,...,1701.385427,74.034708,1950,389,1.555955e+09,1.035380e-02,198.657965,84.916348,Network,DHCP
1,UDP,328,333.529412,1.555955e+09,1.035595e-02,74.034707,17,5670,354,333.529412,...,1701.385515,74.034707,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,Network,DHCP
2,UDP,328,351.720930,1.555965e+09,2.391338e-04,58.972989,30,10062,352,335.400000,...,1450.967340,65.290032,5062,389,1.555964e+09,2.233195e-02,340.268454,124.270745,Network,DHCP
3,UDP,328,336.200000,1.555965e+09,1.533008e-02,65.510537,30,10086,352,336.200000,...,1450.967130,65.510537,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,Network,DHCP
4,ICMP,56,56.000000,1.555966e+09,0.000000e+00,0.000000,1,56,56,56.000000,...,0.000000,0.000000,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,Network,ICMP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2704834,UDP,78,78.000000,1.559771e+09,7.469001e-01,261.659496,9,702,78,78.000000,...,979.409645,261.659496,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,System,NetBIOS
2704835,UDP,229,229.000000,1.559771e+09,7.190894e+02,0.164189,3,687,229,229.000000,...,1438.507195,0.164189,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,System,NetBIOS
2704836,UDP,229,229.000000,1.559771e+09,7.190895e+02,0.164083,3,687,229,229.000000,...,1438.507230,0.164083,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,System,NetBIOS
2704837,TCP,40,46.933333,1.559771e+09,1.150203e-02,24.747414,37,1540,66,41.621622,...,1710.028548,14.809387,1980,40,1.559770e+09,1.150203e-02,90.155711,14.822745,Web,Google


In [98]:
# 2.3
# Perform a stratified split (based on the label) to segment the dataset into training (50%), validation (20%), and test (30%) dataset. Standardize your dataset, by fitting the scaler on training set and then transforming all data.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One hot encoding for categorical features
# df_original = pd.get_dummies(df_original, columns=['proto', 'category', 'web_service'])

# web_service will be our label 
X = df_original.drop(columns=['proto', 'category', 'web_service'])
Y = df_original['web_service']

# segmentation of dataset
# here we will obtain the test data (30% for test data => test_size = 0.3, and X_temp and Y_temp will he total amount 70 %)
X_temp, X_test, Y_temp, Y_test = train_test_split( X, Y, test_size=0.3, stratify=Y, random_state=42)

# here we will obtain the training (50% of 70%) and validation data (20% / 70% = 0.2857) from X_temp and Y_temp 
X_train, X_valid, Y_train, Y_valid = train_test_split(X_temp, Y_temp, test_size=0.2857, stratify=Y_temp, random_state=42)

# Let's see what we have 

print(f"Size of training data: {X_train.shape}")
print(f"Size of validation data: {X_valid.shape}")
print(f"Size of testing data: {X_test.shape}")

# We need to standardize our dataset

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)



Size of training data: (1345009, 18)
Size of validation data: (537967, 18)
Size of testing data: (806990, 18)
