In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

file_path = "../../web_service_dataset.csv"
df_original = pd.read_csv(file_path)

print(df_original.shape)

(2704839, 46)


In [2]:
# 2.1
# Do you think all the features are useful for our target, and explain why? If no, indicate the redundant features, and remove them from the dataset.

# We can delete irrelevant features (because they cannot help a lot with prediction of the target variable)

drop_features = ["src_ip", "src_port", "dst_ip", "dst_port"]
df_original.drop(columns=drop_features, inplace=True, errors='ignore')

# We need to delete constant features first
constant_features = [col for col in df_original.columns if df_original[col].nunique() == 1]

if len(constant_features) > 1: 
    print('They need to be deleted!')

# We need to delete duplicate features

df_original.drop_duplicates()

# We can delete features with high correlation 

numeric_df = df_original.select_dtypes(include=['number']) 
correlation_matrix = numeric_df.corr().abs()

c = correlation_matrix[correlation_matrix > 0.8]
s = c.unstack()
so = s.sort_values(ascending=False).reset_index()
so = so[(so[0].isnull() == False) & (so["level_0"] != so["level_1"])]

to_be_deleted = []
candidates = list(so["level_0"])
subset_so = so

for candidate in candidates:
    if (candidate in list(subset_so["level_0"])):
        to_be_deleted.append(candidate)
        subset_so = subset_so[(subset_so["level_0"] != candidate) & (subset_so["level_1"] != candidate)]
        
df_original.drop(columns=to_be_deleted, inplace=True, errors='ignore')

print(f"Size of modified dataframe: {df_original.shape}")



Size of modified dataframe: (2704839, 21)


In [3]:
df_original

Unnamed: 0,proto,min_ps,avg_ps,flowEnd,min_piat,std_dev_piat,f_pktTotalCount,f_octetTotalCount,f_avg_ps,f_std_dev_ps,...,f_flowDuration,f_std_dev_piat,b_octetTotalCount,b_min_ps,b_flowStart,b_min_piat,b_max_piat,b_std_dev_piat,category,web_service
0,UDP,328,346.363636,1.555955e+09,3.128052e-04,78.913442,17,5670,333.529412,9.140200,...,1701.385427,74.034708,1950,389,1.555955e+09,1.035380e-02,198.657965,84.916348,Network,DHCP
1,UDP,328,333.529412,1.555955e+09,1.035595e-02,74.034707,17,5670,333.529412,9.140200,...,1701.385515,74.034707,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,Network,DHCP
2,UDP,328,351.720930,1.555965e+09,2.391338e-04,58.972989,30,10062,335.400000,9.718024,...,1450.967340,65.290032,5062,389,1.555964e+09,2.233195e-02,340.268454,124.270745,Network,DHCP
3,UDP,328,336.200000,1.555965e+09,1.533008e-02,65.510537,30,10086,336.200000,10.057833,...,1450.967130,65.510537,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,Network,DHCP
4,ICMP,56,56.000000,1.555966e+09,0.000000e+00,0.000000,1,56,56.000000,0.000000,...,0.000000,0.000000,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,Network,ICMP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2704834,UDP,78,78.000000,1.559771e+09,7.469001e-01,261.659496,9,702,78.000000,0.000000,...,979.409645,261.659496,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,System,NetBIOS
2704835,UDP,229,229.000000,1.559771e+09,7.190894e+02,0.164189,3,687,229.000000,0.000000,...,1438.507195,0.164189,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,System,NetBIOS
2704836,UDP,229,229.000000,1.559771e+09,7.190895e+02,0.164083,3,687,229.000000,0.000000,...,1438.507230,0.164083,0,0,0.000000e+00,0.000000e+00,0.000000,0.000000,System,NetBIOS
2704837,TCP,40,46.933333,1.559771e+09,1.150203e-02,24.747414,37,1540,41.621622,4.069341,...,1710.028548,14.809387,1980,40,1.559770e+09,1.150203e-02,90.155711,14.822745,Web,Google


In [4]:
# 2.2
# For all classes (web service name), do we have a balanced number of samples (flows)? If no, how many services only have few samples, and indicate some examples? Define a reasonable threshold of sample quantity and remove services that do not have enough samples (lower than the threshold). How many services do you have left? Note that the threshold should not be too high, e.g., 1000.

threshold = 1000

# We extract a series with all the counts
web_service_counts = df_original['web_service'].value_counts()

low_web_service = web_service_counts[web_service_counts < threshold]

print("Services with low web counts")
print(low_web_service.head())

# So we need to delete them 

df_original = df_original[~df_original['web_service'].isin(low_web_service.index)]

# So, it is the size of modified dataframe

print(f"Size of modified dataframe: {df_original.shape}")




Services with low web counts
web_service
TeamViewer     990
Telegram       832
SMTP           815
Steam          796
AppleiTunes    707
Name: count, dtype: int64
Size of modified dataframe: (2689966, 21)


In [5]:
# 2.3
# Perform a stratified split (based on the label) to segment the dataset into training (50%), validation (20%), and test (30%) dataset. Standardize your dataset, by fitting the scaler on training set and then transforming all data.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# web_service will be our label 
X = df_original.loc[:, 'min_ps':'b_std_dev_piat'] #We exclude web_service, category and proto columns
Y = df_original['web_service']

# segmentation of dataset
# here we will obtain the test data (30% for test data => test_size = 0.3, and X_temp and Y_temp will he total amount 70 %)
X_temp, X_train, Y_temp, Y_train = train_test_split(X, Y, train_size=0.5, stratify=Y, random_state=1)

# here we will obtain the training (50%) and validation data (20% = 0.2857) from X_temp and Y_temp 
X_valid, X_test, Y_valid, Y_test = train_test_split(X_temp, Y_temp, test_size=0.6, stratify=Y_temp, random_state=15)

# Let's see what we have 

print(f"Size of training data: {X_train.shape}")
print(f"Size of validation data: {X_valid.shape}")
print(f"Size of testing data: {X_test.shape}")

# We need to standardize our dataset

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)


Size of training data: (1344983, 18)
Size of validation data: (537993, 18)
Size of testing data: (806990, 18)


In [6]:
# 2.4
# Select a basic classification method, training the model on training set while testing on test set. What kind of phenomenon can you observe, and why?

# Our classification method will be LogisticRegression, so let's try to apply it

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# creating model and train it
# model = LogisticRegression(solver='saga', verbose=1, random_state=42, n_jobs=-1,) - to long!!
# model.fit(X_train_scaled, Y_train)

# Let's try to use RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=150, random_state=28)
model_rf.fit(X_train_scaled[1:20000], Y_train[1:20000])




In [7]:
# 2.4
# It is important to compute the predictions of the model
Y_pred = model_rf.predict(X_test_scaled[0:10000])

# End estimate the accuracy
accuracy = accuracy_score(Y_test[0:10000], Y_pred[0:10000])

print(f"Accuracy on the test data: {accuracy}")

# The problem that we have is overfitting, but we will discuss it later 



Accuracy on the test data: 0.7009


In [9]:
# 2.5

# Redefine the threshold to be 100,000 and remove services that do not meet the threshold. What are the services do we have left? Repeat a similar process in 3 except that you just need to split the dataset into 70%/30%, so that the 70% can be used in the following question.

threshold = 100000

service_counts = df_original['web_service'].value_counts()

removed_services = service_counts[service_counts < threshold].index

# after that we need to delete them 
df_delete_web = df_original[~df_original['web_service'].isin(removed_services)]

# and check how many services we obtain after deleting
remaining_services = df_delete_web['web_service'].unique()
print(f"Remaining services after applying new threshold: {remaining_services}")


# Repeat a similar process in 3 except
X = df_delete_web.loc[:, 'min_ps':'b_std_dev_piat']
Y = df_delete_web['web_service']

# Split data for the following question

X_train_following, X_test_following, Y_train_following, Y_test_following = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=42)

X_train_following



Remaining services after applying new threshold: ['HTTP' 'Microsoft' 'Unknown' 'DNS' 'TLS' 'Google' 'Facebook']


Unnamed: 0,min_ps,avg_ps,flowEnd,min_piat,std_dev_piat,f_pktTotalCount,f_octetTotalCount,f_avg_ps,f_std_dev_ps,f_flowEnd,f_flowDuration,f_std_dev_piat,b_octetTotalCount,b_min_ps,b_flowStart,b_min_piat,b_max_piat,b_std_dev_piat
2388497,61,69.000000,1.559655e+09,0.000597,0.000000,1,61,61.000000,0.000000,1.559655e+09,0.000000,0.000000,77,77,1.559655e+09,0.000000,0.000000,0.000000
1980969,65,73.000000,1.559654e+09,0.000726,0.000000,1,65,65.000000,0.000000,1.559654e+09,0.000000,0.000000,81,81,1.559654e+09,0.000000,0.000000,0.000000
2197797,72,86.000000,1.559655e+09,0.001313,0.000000,1,72,72.000000,0.000000,1.559655e+09,0.000000,0.000000,100,100,1.559655e+09,0.000000,0.000000,0.000000
455727,68,121.000000,1.556049e+09,0.000726,0.000000,1,68,68.000000,0.000000,1.556049e+09,0.000000,0.000000,174,174,1.556049e+09,0.000000,0.000000,0.000000
882208,40,234.105263,1.556119e+09,0.000002,0.111488,10,1998,199.800000,333.847510,1.556119e+12,0.921821,0.142630,2450,40,1.556119e+09,0.000002,0.430382,0.144184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363138,40,237.740741,1.556291e+09,0.000004,8.655506,13,1902,146.307692,157.598997,1.556291e+12,48.290500,12.393739,4517,40,1.556291e+09,0.000008,45.031500,11.955703
1721414,67,110.500000,1.556638e+09,0.172608,0.000000,1,67,67.000000,0.000000,1.556638e+09,0.000000,0.000000,154,154,1.556638e+09,0.000000,0.000000,0.000000
1171662,40,377.746032,1.556125e+09,0.000002,14.746815,26,2327,89.500000,113.788687,1.556125e+12,240.291031,22.004356,21471,40,1.556125e+09,0.000003,90.092020,18.863907
1644287,40,152.111111,1.556311e+09,0.000130,0.000612,5,457,91.400000,96.911506,1.556311e+12,0.003745,0.000838,912,40,1.556311e+09,0.000354,0.002065,0.000725


In [12]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# 2.6

# Define models and their parameters for GridSearchCV
models = {
    'RandomForest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [100, 150, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
    }),
    'SVM': (SVC(random_state=42), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
    }),
    'GradientBoosting': (GradientBoostingClassifier(random_state=42), {
        'n_estimators': [100, 150],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [3, 5],
    }),
}

# Perform Grid Search for each model
best_models = {}
for model_name, (model, params) in models.items():
    grid_search = GridSearchCV(model, params, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train_following[1:20000], Y_train_following[1:20000])
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

# Evaluate each best model on the test set
for model_name, model in best_models.items():
    print(f"\nEvaluating {model_name} on the test set...")
    Y_pred = model.predict(X_test_following[1:10000])
    
    print("Classification Report:")
    print(classification_report(Y_test_following, Y_pred))
    
    print("Confusion Matrix:")
    print(confusion_matrix(Y_test_following, Y_pred))

# Compare with earlier results
# Earlier test accuracy was stored in the variable 'accuracy'
print("\nComparing with earlier results:")
print(f"Earlier test accuracy: {accuracy}")
for model_name, model in best_models.items():
    Y_pred = model.predict(X_test_following)
    test_accuracy = accuracy_score(Y_test_following, Y_pred)
    print(f"{model_name} test accuracy: {test_accuracy}")


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for RandomForest: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Fitting 3 folds for each of 6 candidates, totalling 18 fits


KeyboardInterrupt: 