# Load model and predict on the train set

## Import libraries

In [1]:
import warnings

import joblib
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and metric warnings raised by
# pandas / scikit-learn; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")


## Load the model

In [2]:
# Deserialize the saved estimator from disk (path is relative to the working directory).
# joblib.load unpickles the file, which can execute arbitrary code, so only load
# artifacts that come from a trusted source.
model = joblib.load('model/xgb_multiclass_top25.joblib')

In [3]:
# Show the loaded object's repr as the cell output to confirm what was deserialized.
model

### Load the train set

In [4]:
# Read the training CSV (path relative to the working directory) into a DataFrame.
df_train = pd.read_csv('data/train_multiclass.csv')

### Drop unnecessary columns

In [5]:
# Discard the columns that are not fed to the model; what remains is the
# 25 feature columns plus the 'Multiclass' target.
# Note: this cell is not re-runnable on its own, because a second run raises
# KeyError once the columns are already gone.
df_train = df_train.drop(
    [
        'ID', 'Label',
        'TCP', 'fin_count', 'ack_count', 'psh_flag_number', 'HTTPS',
        'syn_flag_number', 'fin_flag_number', 'rst_flag_number', 'HTTP',
        'SSH', 'DNS', 'LLC', 'IPv', 'ARP', 'ece_flag_number', 'Drate',
        'cwr_flag_number', 'DHCP', 'IRC', 'Telnet', 'SMTP',
    ],
    axis='columns',
)

In [6]:
# Inspect which columns remain after the drop.
df_train.columns

Index(['flow_duration', 'Header_Length', 'Protocol type', 'Duration', 'Rate',
       'Srate', 'ack_flag_number', 'syn_count', 'urg_count', 'rst_count',
       'UDP', 'ICMP', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT',
       'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
       'Multiclass'],
      dtype='object')

In [7]:
# (rows, columns) of the remaining frame.
df_train.shape

(1950138, 26)

### Encode the labels

In [8]:
# Count rows per class in the 'Multiclass' target column (still the raw class names here).
df_train['Multiclass'].value_counts()

Multiclass
DDoS          876708
DoS           302825
Mirai         251721
Recon         240063
Spoofing      168006
Benign         84085
Web            15265
BruteForce      9213
Malware         2252
Name: count, dtype: int64

In [9]:
# Learn the class-name -> integer mapping from the target column, then overwrite
# the column with the integer codes. LabelEncoder numbers classes in sorted order.
# NOTE(review): the encoder is re-fitted here rather than loaded; confirm this yields
# the same mapping the model was trained with.
encoder = LabelEncoder().fit(df_train['Multiclass'])
df_train['Multiclass'] = encoder.transform(df_train['Multiclass'])

### Split features and labels

In [10]:
# The encoded 'Multiclass' column is the target; every other remaining column is a feature.
y = df_train['Multiclass']
X = df_train.drop('Multiclass', axis='columns')

In [11]:
### Scale the data

In [12]:
# Scale the features with a RobustScaler (imported in the notebook's import cell).
#
# The features are rebuilt from df_train instead of reusing X, so re-running this
# cell can never scale data that has already been scaled. On a fresh top-to-bottom
# run the result is identical to fitting and transforming X.
#
# NOTE(review): the scaler is fitted on the data being scored rather than loaded
# from the training run; confirm the model was trained on features scaled the
# same way, otherwise load the persisted scaler instead.
scaler = RobustScaler()
X = scaler.fit_transform(df_train.drop(columns=['Multiclass']))

### Predict

In [13]:
# Predict a class for every row of the scaled feature matrix.
# NOTE(review): the predictions are compared directly with the LabelEncoder codes in y
# below, so this assumes the model emits integer codes using the same mapping.
y_pred_train = model.predict(X)

### Evaluate the model

In [14]:
# Score the train-set predictions against the encoded labels.
# average='weighted' averages the per-class scores weighted by each class's
# support, which accounts for the strong class imbalance in this dataset.
accuracy = accuracy_score(y, y_pred_train)
precision = precision_score(y, y_pred_train, average='weighted')
recall = recall_score(y, y_pred_train, average='weighted')
f1 = f1_score(y, y_pred_train, average='weighted')
print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
# Report F1 as well so the computed value is not silently discarded.
print("F1: {:.4f}".format(f1))

Accuracy: 0.9447
Precision: 0.9638
Recall: 0.9447
