## Training Multiclass Classifier with top 25 features from Random Forest feature selection

In [17]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight


import xgboost as xgb

warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('data/train_multiclass.csv')

In [4]:
df.columns

Index(['ID', 'flow_duration', 'Header_Length', 'Protocol type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
       'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
       'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'Label',
       'Multiclass'],
      dtype='object')

In [6]:
df = df.drop(columns=['ID', 'Label',
                      'TCP', 'fin_count', 'ack_count', 'psh_flag_number', 'HTTPS',
                      'syn_flag_number', 'fin_flag_number', 'rst_flag_number', 'HTTP',
                      'SSH', 'DNS', 'LLC', 'IPv', 'ARP', 'ece_flag_number', 'Drate',
                      'cwr_flag_number', 'DHCP', 'IRC', 'Telnet', 'SMTP'])

In [7]:
df.shape

(1950138, 26)

In [9]:
encoder = LabelEncoder()
df['Multiclass'] = encoder.fit_transform(df['Multiclass'])

In [10]:
X = df.drop(columns=['Multiclass'])
y = df['Multiclass']

In [11]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
# Fit on training data
scaler.fit(X)
X = scaler.transform(X)

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Further split the testing set into testing and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Print the sizes of the resulting datasets
print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Testing set size:", X_test.shape[0])

Training set size: 1365096
Validation set size: 292521
Testing set size: 292521


In [15]:
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
weights = dict(zip(np.unique(y_train), class_weights))

In [16]:
sample_weight = np.array([weights[label] for label in y_train])
sample_weight

array([0.90096426, 0.24715064, 1.29258704, ..., 0.86325978, 0.71441446,
       2.56849497], shape=(1365096,))

In [18]:
model = xgb.XGBClassifier(random_state=42)
model.fit(X_train, y_train, sample_weight=sample_weight)

In [19]:
y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Precision: ",precision)
print("Recall: ",recall)
print("F1 Score: ",f1)

Precision:  0.9593880723168781
Recall:  0.9394299896417693
F1 Score:  0.945845344968793


In [20]:
y_pred_train =  model.predict(X_train)
y_pred_test = model.predict(X_test)

In [21]:
f1_train = f1_score(y_train, y_pred_train, average='weighted')
f1_test = f1_score(y_test, y_pred_test, average='weighted')
print("F1 score on the training set: ", f1_train)
print("F1 score on the test set: ", f1_test)

F1 score on the training set:  0.9520245158218135
F1 score on the test set:  0.945845344968793


In [22]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'reg_alpha': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=5, verbose=2)

# Fit the grid search to the validation data
grid_search.fit(X_val, y_val)

# Print the best parameters and the best average F1 score
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", grid_search.best_score_)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] END ........................................reg_alpha=0; total time=   7.4s
[CV] END ........................................reg_alpha=0; total time=   7.4s
[CV] END ........................................reg_alpha=0; total time=   7.3s
[CV] END ........................................reg_alpha=0; total time=   7.3s
[CV] END ........................................reg_alpha=0; total time=   7.3s
[CV] END ......................................reg_alpha=0.1; total time=   7.4s
[CV] END ......................................reg_alpha=0.1; total time=   7.2s
[CV] END ......................................reg_alpha=0.1; total time=   7.0s
[CV] END ......................................reg_alpha=0.1; total time=   7.3s
[CV] END ......................................reg_alpha=0.1; total time=   7.1s
[CV] END ......................................reg_alpha=0.2; total time=   7.2s
[CV] END ......................................r

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'reg_lambda': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=5, verbose=2)

# Fit the grid search to the validation data
grid_search.fit(X_val, y_val)

# Print the best parameters and the best average F1 score
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", grid_search.best_score_)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] END .......................................reg_lambda=0; total time=   7.2s
[CV] END .......................................reg_lambda=0; total time=   6.9s
[CV] END .......................................reg_lambda=0; total time=   6.9s
[CV] END .......................................reg_lambda=0; total time=   7.1s
[CV] END .......................................reg_lambda=0; total time=   7.1s
[CV] END .....................................reg_lambda=0.1; total time=   7.3s
[CV] END .....................................reg_lambda=0.1; total time=   6.7s
[CV] END .....................................reg_lambda=0.1; total time=   6.7s
[CV] END .....................................reg_lambda=0.1; total time=   6.7s
[CV] END .....................................reg_lambda=0.1; total time=   7.1s
[CV] END .....................................reg_lambda=0.2; total time=   7.0s
[CV] END .....................................re

In [24]:
model = xgb.XGBClassifier(reg_alpha=0.3, reg_lambda=0.1, random_state=42)
model.fit(X_train, y_train, sample_weight=sample_weight)

In [25]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))

Accuracy: 0.9391
Precision: 0.9592
Recall: 0.9391
F1 Score: 0.9455


In [26]:
f1_train = f1_score(y_train, y_pred_train, average='weighted')
f1_test = f1_score(y_test, y_pred_test, average='weighted')
print("F1 score on the training set: ", f1_train)
print("F1 score on the test set: ", f1_test)

F1 score on the training set:  0.9520245158218135
F1 score on the test set:  0.945845344968793
