## Training Binary Classifier with top 25 features from Random Forest feature selection

### Load the dataset

In [1]:
import pandas as pd
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

warnings.filterwarnings('ignore')

### Load the dataset

In [2]:
df = pd.read_csv('data/train_binary.csv')

### Drop unnecessary columns

In [3]:
df.columns

Index(['ID', 'flow_duration', 'Header_Length', 'Protocol type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
       'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
       'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'Label',
       'Binary Class'],
      dtype='object')

In [4]:
df[['Binary Class']].value_counts()

Binary Class
Attack          1866053
Benign            84085
Name: count, dtype: int64

In [5]:
df = df.drop(columns=['ID', 'Label',
                      'fin_count', 'ack_count', 'HTTP', 'psh_flag_number', 'UDP',
                      'syn_flag_number', 'rst_flag_number', 'ICMP', 'SSH', 'DNS',
                      'fin_flag_number', 'LLC', 'IPv', 'ARP', 'ece_flag_number',
                      'cwr_flag_number', 'DHCP', 'IRC', 'Drate', 'Telnet', 'SMTP'])

In [6]:
df.shape

(1950138, 26)

### Encode the binary class

In [7]:
encoder = LabelEncoder()
df['Binary Class'] = encoder.fit_transform(df['Binary Class'])

In [8]:
X = df.drop(columns=['Binary Class'])
y = df['Binary Class']

In [9]:
scaler = RobustScaler()
scaler.fit(X)
X = scaler.transform(X)

### Split the dataset

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Testing set size:", X_test.shape[0])

Training set size: 1365096
Validation set size: 292521
Testing set size: 292521


In [11]:
scale_pos_weight = (1 / 0.0235)

In [12]:
scale_pos_weight

42.5531914893617

### Train the model

In [13]:
import xgboost as xgb

model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
model.fit(X_train, y_train)

### Evaluate the model

In [14]:
y_pred = model.predict(X_test)
# Evaluate the model
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Precision: ",precision)
print("Recall: ",recall)
print("F1 Score: ",f1)

Precision:  0.9786040887279208
Recall:  0.9602079850677387
F1 Score:  0.9659413286164009


### Calculate F1 score on the training set

In [15]:
# Predict on the test set
y_pred_train =  model.predict(X_train)
y_pred_test = model.predict(X_test)

In [16]:
# Calculate F1 score on the training set
f1_train = f1_score(y_train, y_pred_train, average='weighted')
# Calculate F1 score on the test set
f1_test = f1_score(y_test, y_pred_test, average='weighted')
print("F1 score on the training set: ", f1_train)
print("F1 score on the test set: ", f1_test)

F1 score on the training set:  0.9666233228922582
F1 score on the test set:  0.9659413286164009


### Hyperparameter tuning

In [17]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'reg_alpha': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=5, verbose=2)

# Fit the grid search to the validation data
grid_search.fit(X_val, y_val)

# Print the best parameters and the best average F1 score
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", grid_search.best_score_)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] END ........................................reg_alpha=0; total time=   0.8s
[CV] END ........................................reg_alpha=0; total time=   0.8s
[CV] END ........................................reg_alpha=0; total time=   0.8s
[CV] END ........................................reg_alpha=0; total time=   0.8s
[CV] END ........................................reg_alpha=0; total time=   0.8s
[CV] END ......................................reg_alpha=0.1; total time=   0.8s
[CV] END ......................................reg_alpha=0.1; total time=   0.8s
[CV] END ......................................reg_alpha=0.1; total time=   0.7s
[CV] END ......................................reg_alpha=0.1; total time=   0.8s
[CV] END ......................................reg_alpha=0.1; total time=   0.7s
[CV] END ......................................reg_alpha=0.2; total time=   0.7s
[CV] END ......................................r

### Hyperparameter tuning

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'reg_lambda': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=5, verbose=2)

# Fit the grid search to the validation data
grid_search.fit(X_val, y_val)

# Print the best parameters and the best average F1 score
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", grid_search.best_score_)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] END .......................................reg_lambda=0; total time=   0.7s
[CV] END .......................................reg_lambda=0; total time=   0.7s
[CV] END .......................................reg_lambda=0; total time=   0.8s
[CV] END .......................................reg_lambda=0; total time=   0.8s
[CV] END .......................................reg_lambda=0; total time=   0.7s
[CV] END .....................................reg_lambda=0.1; total time=   0.7s
[CV] END .....................................reg_lambda=0.1; total time=   0.7s
[CV] END .....................................reg_lambda=0.1; total time=   0.7s
[CV] END .....................................reg_lambda=0.1; total time=   0.7s
[CV] END .....................................reg_lambda=0.1; total time=   0.7s
[CV] END .....................................reg_lambda=0.2; total time=   0.8s
[CV] END .....................................re

### Training the model with the best hyperparameters

In [19]:
# Create an XGBoost model with custom class weights
model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, reg_alpha=0.8, reg_lambda=0, random_state=42)
model.fit(X_train, y_train)

### Evaluate the model

In [20]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))

Accuracy: 0.9589
Precision: 0.9783
Recall: 0.9589
F1 Score: 0.9649


### Calculate F1 score on the training set

In [21]:
# Calculate F1 score on the training set
f1_train = f1_score(y_train, y_pred_train, average='weighted')
# Calculate F1 score on the test set
f1_test = f1_score(y_test, y_pred_test, average='weighted')
print("F1 score on the training set: ", f1_train)
print("F1 score on the test set: ", f1_test)

F1 score on the training set:  0.9666233228922582
F1 score on the test set:  0.9659413286164009


### Save the model

In [22]:
import joblib
joblib.dump(model, 'model/xgb_binary_top25.joblib')

['model/xgb_binary_top25.joblib']