In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

file_path = "../../web_service_dataset.csv"
df_original = pd.read_csv(file_path)

df_original_copy = df_original.copy()

print(df_original.shape)

(2704839, 46)


In [2]:
df_test = df_original.sample(n = 100)

df_test

Unnamed: 0,src_ip,src_port,dst_ip,dst_port,proto,pktTotalCount,octetTotalCount,min_ps,max_ps,avg_ps,...,b_std_dev_ps,b_flowStart,b_flowEnd,b_flowDuration,b_min_piat,b_max_piat,b_avg_piat,b_std_dev_piat,category,web_service
141795,192.168.127.13,50432,3.211.192.227,443,TCP,1,52,52,52,52.000000,...,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,Web,TLS
122301,192.168.125.19,137,172.16.255.185,137,UDP,11,858,78,78,78.000000,...,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,System,NetBIOS
701162,192.168.128.92,34203,172.16.255.183,53,UDP,2,176,68,108,88.000000,...,0.000000,1.556031e+09,1.556031e+09,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,Chat,Messenger
2067725,192.168.122.140,55140,10.200.7.5,3128,TCP,2,93,41,52,46.500000,...,0.000000,1.559685e+09,1.559685e+09,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,Web,HTTP_Proxy
837175,192.168.122.47,50734,74.119.119.149,443,TCP,24,8396,40,1489,349.833333,...,509.282521,1.556138e+09,1.556138e+09,1.554582e+12,6.794930e-05,12.312555,1.175347,3.368119,Web,TLS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508846,192.168.127.13,40378,157.240.14.32,443,TCP,13,3361,52,2549,258.538462,...,929.985260,1.556048e+09,1.556048e+09,1.554492e+12,7.295609e-05,0.114207,0.036115,0.043176,Chat,Messenger
758333,192.168.121.42,60582,69.195.219.126,443,TCP,31,20115,40,1613,648.870968,...,675.900651,1.556124e+09,1.556124e+09,1.554568e+12,9.536743e-07,0.119653,0.014148,0.031073,Web,TLS
867711,192.168.122.52,54421,172.217.2.206,443,TCP,34,6185,40,1458,181.911765,...,461.818176,1.556142e+09,1.556142e+09,1.554586e+12,5.960464e-06,7.443639,0.517683,1.851508,Web,Google
1588435,192.168.128.88,56150,172.217.1.110,443,TCP,18,3130,52,1352,173.888889,...,120.033432,1.556290e+09,1.556290e+09,1.554734e+12,3.800392e-04,58.983977,7.525739,19.451064,SoftwareUpdate,PlayStore


In [3]:
# 2.1
# Do you think all the features are useful for our target, and explain why? If no, indicate the redundant features, and remove them from the dataset.

# We can delete irrelevant features (because they cannot help a lot with prediction of the target variable)

drop_features = ["src_ip", "src_port", "dst_ip", "dst_port"]
df_original.drop(columns=drop_features, inplace=True, errors='ignore')

# We need to delete constant features first
constant_features = [col for col in df_original.columns if df_original[col].nunique() == 1]

if len(constant_features) > 1: 
    print('They need to be deleted!')

# We need to delete duplicate features

df_original.drop_duplicates()

# We can delete features with high correlation 

numeric_df = df_original.select_dtypes(include=['number']) 
correlation_matrix = numeric_df.corr().abs()

c = correlation_matrix[correlation_matrix > 0.8]
s = c.unstack()
so = s.sort_values(ascending=False).reset_index()
so = so[(so[0].isnull() == False) & (so["level_0"] != so["level_1"])]

to_be_deleted = []
candidates = list(so["level_0"])
subset_so = so

for candidate in candidates:
    if (candidate in list(subset_so["level_0"])):
        to_be_deleted.append(candidate)
        subset_so = subset_so[(subset_so["level_0"] != candidate) & (subset_so["level_1"] != candidate)]
        
df_original.drop(columns=to_be_deleted, inplace=True, errors='ignore')

print(f"Size of modified dataframe: {df_original.shape}")

Size of modified dataframe: (2704839, 21)


In [4]:
# 2.2
# For all classes (web service name), do we have a balanced number of samples (flows)? If no, how many services only have few samples, and indicate some examples? Define a reasonable threshold of sample quantity and remove services that do not have enough samples (lower than the threshold). How many services do you have left? Note that the threshold should not be too high, e.g., 1000.

threshold = 1000

web_service_counts = df_original['web_service'].value_counts()

low_web_service = web_service_counts[web_service_counts < threshold]

print("Services with low web counts")
print(low_web_service.head())

# So we need to delete them 

df_original = df_original[~df_original['web_service'].isin(low_web_service.index)]

# So, it is the size of modified dataframe

print(f"Size of modified dataframe: {df_original.shape}")

Services with low web counts
web_service
TeamViewer     990
Telegram       832
SMTP           815
Steam          796
AppleiTunes    707
Name: count, dtype: int64
Size of modified dataframe: (2689966, 21)


In [5]:
df_original.loc[:, 'min_ps':'b_std_dev_piat']

Unnamed: 0,min_ps,avg_ps,flowEnd,min_piat,std_dev_piat,f_pktTotalCount,f_octetTotalCount,f_max_ps,f_avg_ps,f_flowEnd,f_flowDuration,f_std_dev_piat,b_octetTotalCount,b_min_ps,b_flowStart,b_min_piat,b_max_piat,b_std_dev_piat
0,328,346.363636,1.555955e+09,3.128052e-04,78.913442,17,5670,354,333.529412,1.555955e+12,1701.385427,74.034708,1950,389,1.555955e+09,1.035380e-02,198.657965,84.916348
1,328,333.529412,1.555955e+09,1.035595e-02,74.034707,17,5670,354,333.529412,1.555955e+12,1701.385515,74.034707,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000
2,328,351.720930,1.555965e+09,2.391338e-04,58.972989,30,10062,352,335.400000,1.555965e+12,1450.967340,65.290032,5062,389,1.555964e+09,2.233195e-02,340.268454,124.270745
3,328,336.200000,1.555965e+09,1.533008e-02,65.510537,30,10086,352,336.200000,1.555965e+12,1450.967130,65.510537,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000
4,56,56.000000,1.555966e+09,0.000000e+00,0.000000,1,56,56,56.000000,1.555966e+09,0.000000,0.000000,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2704834,78,78.000000,1.559771e+09,7.469001e-01,261.659496,9,702,78,78.000000,1.559771e+12,979.409645,261.659496,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000
2704835,229,229.000000,1.559771e+09,7.190894e+02,0.164189,3,687,229,229.000000,1.559771e+12,1438.507195,0.164189,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000
2704836,229,229.000000,1.559771e+09,7.190895e+02,0.164083,3,687,229,229.000000,1.559771e+12,1438.507230,0.164083,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000
2704837,40,46.933333,1.559771e+09,1.150203e-02,24.747414,37,1540,66,41.621622,1.559771e+12,1710.028548,14.809387,1980,40,1.559770e+09,1.150203e-02,90.155711,14.822745


In [14]:
# 2.3
# Perform a stratified split (based on the label) to segment the dataset into training (50%), validation (20%), and test (30%) dataset. Standardize your dataset, by fitting the scaler on training set and then transforming all data.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# web_service will be our label 
X = df_original.loc[:, 'min_ps':'b_std_dev_piat']
Y = df_original['web_service']

# segmentation of dataset
# here we will obtain the test data (30% for test data => test_size = 0.3, and X_temp and Y_temp will he total amount 70 %)
X_temp, X_train, Y_temp, Y_train = train_test_split(X, Y, train_size=0.5, stratify=Y, random_state=1)

# here we will obtain the training (50%) and validation data from X_temp and Y_temp 
X_valid, X_test, Y_valid, Y_test = train_test_split(X_temp, Y_temp, test_size=0.6, stratify=Y_temp, random_state=15)

# Let's see what we have 

print(f"Size of training data: {X_train.shape}")
print(f"Size of validation data: {X_valid.shape}")
print(f"Size of testing data: {X_test.shape}")

# We need to standardize our dataset

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

Size of training data: (1344983, 18)
Size of validation data: (537993, 18)
Size of testing data: (806990, 18)


In [27]:
# 2.4
# Select a basic classification method, training the model on training set while testing on test set. What kind of phenomenon can you observe, and why?

# Our classification method will be LogisticRegression, so let's try to apply it

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# creating model and train it
# model = LogisticRegression(solver='saga', verbose=1, random_state=42, n_jobs=-1,) - to long!!
# model.fit(X_train_scaled, Y_train)

# Let's try to use RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=150, random_state=28)
model_rf.fit(X_train_scaled[1:20000], Y_train[1:20000])

In [30]:
# 2.4
# It is important to compute the predictions of the model
Y_pred = model_rf.predict(X_test_scaled[0:10000])

# End estimate the accuracy
accuracy = accuracy_score(Y_test[0:10000], Y_pred[0:10000])

print(f"Accuracy on the test data: {accuracy}")

# The problem that we have is overfitting, but we will discuss it later 

Accuracy on the test data: 0.7084


In [125]:
# 2.5

# Redefine the threshold to be 100,000 and remove services that do not meet the threshold. What are the services do we have left? Repeat a similar process in 3 except that you just need to split the dataset into 70%/30%, so that the 70% can be used in the following question.

threshold = 100000

service_counts = df_original['web_service'].value_counts()

removed_services = service_counts[service_counts < threshold].index

# after that we need to delete them 
df_delete_web = df_original[~df_original['web_service'].isin(removed_services)]

# and check how many services we obtain after deleting
remaining_services = df_delete_web['web_service'].unique()
print(f"Remaining services after applying new threshold: {remaining_services}")


# Repeat a similar process in 3 except
X = df_delete_web.loc[:, 'min_ps':'b_std_dev_piat']
Y = df_delete_web['web_service']

# Split data for the following question

X_train_following, X_test_following, Y_train_following, Y_test_following = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=42)

print(f"Size of training data: {X_train_following.shape}")

print(f"Size of testing data: {X_test_following.shape}")

Remaining services after applying new threshold: ['HTTP' 'Microsoft' 'Unknown' 'DNS' 'TLS' 'Google' 'Facebook']
Size of training data: (1361045, 18)
Size of testing data: (583305, 18)


In [126]:
X_train_following

Unnamed: 0,min_ps,avg_ps,flowEnd,min_piat,std_dev_piat,f_pktTotalCount,f_octetTotalCount,f_max_ps,f_avg_ps,f_flowEnd,f_flowDuration,f_std_dev_piat,b_octetTotalCount,b_min_ps,b_flowStart,b_min_piat,b_max_piat,b_std_dev_piat
2388497,61,69.000000,1.559655e+09,0.000597,0.000000,1,61,61,61.000000,1.559655e+09,0.000000,0.000000,77,77,1.559655e+09,0.000000,0.000000,0.000000
1980969,65,73.000000,1.559654e+09,0.000726,0.000000,1,65,65,65.000000,1.559654e+09,0.000000,0.000000,81,81,1.559654e+09,0.000000,0.000000,0.000000
2197797,72,86.000000,1.559655e+09,0.001313,0.000000,1,72,72,72.000000,1.559655e+09,0.000000,0.000000,100,100,1.559655e+09,0.000000,0.000000,0.000000
455727,68,121.000000,1.556049e+09,0.000726,0.000000,1,68,68,68.000000,1.556049e+09,0.000000,0.000000,174,174,1.556049e+09,0.000000,0.000000,0.000000
882208,40,234.105263,1.556119e+09,0.000002,0.111488,10,1998,1185,199.800000,1.556119e+12,0.921821,0.142630,2450,40,1.556119e+09,0.000002,0.430382,0.144184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363138,40,237.740741,1.556291e+09,0.000004,8.655506,13,1902,575,146.307692,1.556291e+12,48.290500,12.393739,4517,40,1.556291e+09,0.000008,45.031500,11.955703
1721414,67,110.500000,1.556638e+09,0.172608,0.000000,1,67,67,67.000000,1.556638e+09,0.000000,0.000000,154,154,1.556638e+09,0.000000,0.000000,0.000000
1171662,40,377.746032,1.556125e+09,0.000002,14.746815,26,2327,557,89.500000,1.556125e+12,240.291031,22.004356,21471,40,1.556125e+09,0.000003,90.092020,18.863907
1644287,40,152.111111,1.556311e+09,0.000130,0.000612,5,457,285,91.400000,1.556311e+12,0.003745,0.000838,912,40,1.556311e+09,0.000354,0.002065,0.000725


In [127]:
Y_train

1601725            Google
1702331               TLS
1133820            Google
2258642            Amazon
2492836            Google
                ...      
1685485               DNS
1936735    GoogleServices
2598862         Microsoft
1066991          Facebook
1544356         Instagram
Name: web_service, Length: 1344983, dtype: object

In [140]:
# 2.6
# Select at least 3 classification methods, performing model development with grid search. Choose the best-performed model (why?) and evaluate the final performance on test set (indicate the classification report and confusion matrix). Comparing to the one derived from 4 (compare each class), do you have better or worse performance?

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()

X_train_following_scaled = scaler.fit_transform(X_train_following)
X_test_following_scaled = scaler.transform(X_test_following)

print(f"Size of training data: {X_train_following_scaled.shape}")

print(f"Size of testing data: {X_test_following_scaled.shape}")
 

Size of training data: (1361045, 18)
Size of testing data: (583305, 18)


In [141]:
# 2.6

from sklearn.model_selection import GridSearchCV

params_forest = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

grid_forest = GridSearchCV(RandomForestClassifier(n_estimators=150, random_state=28), params_forest, cv = 5, scoring='accuracy')

grid_forest.fit(X_train_following_scaled[1:20000], Y_train_following[1:20000])

best_forest = grid_forest.best_estimator_

print(f"Best parameters for RandomForestClassifier: {best_forest}")

param_gaussian = {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
}

grid_gaussian = GridSearchCV(GaussianNB(), param_gaussian, cv=5, scoring='accuracy')

grid_gaussian.fit(X_train_following_scaled[1:20000], Y_train_following[1:20000])

best_gaussian = grid_gaussian.best_estimator_

print(f"Best parameters for GaussianNB: {best_gaussian}")

params_logistic = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear'],
}

grid_logistic = GridSearchCV(LogisticRegression(max_iter=2000, random_state=28), params_logistic, cv=5, scoring='accuracy')

grid_logistic.fit(X_train_following_scaled[1:20000], Y_train_following[1:20000])

best_logistic = grid_logistic.best_estimator_

print(f"Best parameters for LogisticRegression: {best_logistic}")


Best parameters for RandomForestClassifier: RandomForestClassifier(max_depth=30, min_samples_split=5, random_state=28)
Best parameters for GaussianNB: GaussianNB()
Best parameters for LogisticRegression: LogisticRegression(C=10, max_iter=2000, random_state=28)


In [142]:
# 2.6

Y_pred_forest = grid_forest.predict(X_test_following_scaled[0:10000])

Y_pred_gaussian = grid_gaussian.predict(X_test_following_scaled[0:10000])

Y_pred_logistic = grid_logistic.predict(X_test_following_scaled[0:10000])

print(f"Y_pred_forest shape: {Y_pred_forest.shape}")

print(f"Y_pred_gaussian shape: {Y_pred_gaussian.shape}")

print(f"Y_pred_logistic shape: {Y_pred_logistic.shape}")

# End estimate the accuracy

accuracy_forest = accuracy_score(Y_test_following[0:10000], Y_pred_forest[0:10000])

accuracy_gaussian = accuracy_score(Y_test_following[0:10000], Y_pred_gaussian[0:10000])

accuracy_logistic = accuracy_score(Y_test_following[0:10000], Y_pred_logistic[0:10000])


print(f"Accuracy forest: {accuracy_forest}")

print(f"Accuracy gaussian: {accuracy_gaussian}")

print(f"Accuracy logistic: {accuracy_logistic}")

Y_pred_forest shape: (10000,)
Y_pred_gaussian shape: (10000,)
Y_pred_logistic shape: (10000,)
Accuracy forest: 0.8228
Accuracy gaussian: 0.3471
Accuracy logistic: 0.5227
