In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub

In [3]:
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.combine import SMOTETomek
    IMBALANCED_LEARN_AVAILABLE = True
except ImportError:
    print("Warning: imbalanced-learn not available. Install with: pip install imbalanced-learn")
    IMBALANCED_LEARN_AVAILABLE = False

data_test = pd.read_csv('../raw_data/merged_data.csv', sep='|', low_memory=False)

In [4]:
# data_test=data.copy()

In [5]:
# data.head()
data_test.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
0,1545403000.0,CdNmOg26ZIaBRzPvWj,192.168.1.196,59932.0,104.248.160.24,80.0,tcp,-,3.097754,0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,Malicious C&C,
1,1545403000.0,CgzGV333k9WCximeu8,192.168.1.196,59932.0,104.248.160.24,80.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious C&C,
2,1545403000.0,CLm5Pd3ZnqmYVjrZ44,192.168.1.196,59932.0,104.248.160.24,80.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious C&C,
3,1545403000.0,CDn2pd1rDD1lCMXAia,192.168.1.196,35883.0,192.168.1.1,53.0,udp,dns,5.005148,78,...,-,0.0,D,2.0,134.0,0.0,0.0,-,Benign,-
4,1545403000.0,C1NKkV3tB4rImzbpDj,192.168.1.196,43531.0,192.168.1.1,53.0,udp,dns,5.005145,78,...,-,0.0,D,2.0,134.0,0.0,0.0,-,Benign,-


In [6]:
# data.info()
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000363 entries, 0 to 25000362
Data columns (total 23 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ts              float64
 1   uid             object 
 2   id.orig_h       object 
 3   id.orig_p       float64
 4   id.resp_h       object 
 5   id.resp_p       float64
 6   proto           object 
 7   service         object 
 8   duration        object 
 9   orig_bytes      object 
 10  resp_bytes      object 
 11  conn_state      object 
 12  local_orig      object 
 13  local_resp      object 
 14  missed_bytes    float64
 15  history         object 
 16  orig_pkts       float64
 17  orig_ip_bytes   float64
 18  resp_pkts       float64
 19  resp_ip_bytes   float64
 20  tunnel_parents  object 
 21  label           object 
 22  detailed-label  object 
dtypes: float64(8), object(15)
memory usage: 4.3+ GB


In [7]:
print((data_test.isnull().sum()/len(data_test)*100).sort_values(ascending=False))

detailed-label    36.695223
local_orig         0.000000
label              0.000000
tunnel_parents     0.000000
resp_ip_bytes      0.000000
resp_pkts          0.000000
orig_ip_bytes      0.000000
orig_pkts          0.000000
history            0.000000
missed_bytes       0.000000
local_resp         0.000000
ts                 0.000000
uid                0.000000
resp_bytes         0.000000
orig_bytes         0.000000
duration           0.000000
service            0.000000
proto              0.000000
id.resp_p          0.000000
id.resp_h          0.000000
id.orig_p          0.000000
id.orig_h          0.000000
conn_state         0.000000
dtype: float64


In [8]:
# data.replace('-', np.nan, inplace=True)
data_test.replace('-', np.nan, inplace=True)

  data_test.replace('-', np.nan, inplace=True)


In [9]:
print((data_test.isnull().sum()/len(data_test)*100).sort_values(ascending=False))

tunnel_parents    100.000000
local_orig        100.000000
local_resp        100.000000
service            99.928057
detailed-label     71.805777
duration           61.062341
orig_bytes         61.062341
resp_bytes         61.062341
history             0.100463
label               0.000000
resp_ip_bytes       0.000000
resp_pkts           0.000000
orig_ip_bytes       0.000000
orig_pkts           0.000000
ts                  0.000000
missed_bytes        0.000000
uid                 0.000000
proto               0.000000
id.resp_p           0.000000
id.resp_h           0.000000
id.orig_p           0.000000
id.orig_h           0.000000
conn_state          0.000000
dtype: float64


In [10]:
# data['label'].value_counts()
data_test['label'].value_counts()

label
Benign                                   8777766
Malicious                                7046785
Malicious   DDoS                         5778153
Malicious   PartOfAHorizontalPortScan    3386241
Malicious   C&C                             8660
Malicious   Attack                          2755
Malicious   FileDownload                       3
Name: count, dtype: int64

In [11]:
# data['detailed-label'].value_counts()
data_test['detailed-label'].value_counts()

detailed-label
PartOfAHorizontalPortScan    7041695
Attack                          5962
HeartBeat                        940
Torii                             30
C&C                               16
FileDownload                      15
Name: count, dtype: int64

In [12]:
data_test['label'].value_counts()

label
Benign                                   8777766
Malicious                                7046785
Malicious   DDoS                         5778153
Malicious   PartOfAHorizontalPortScan    3386241
Malicious   C&C                             8660
Malicious   Attack                          2755
Malicious   FileDownload                       3
Name: count, dtype: int64

In [13]:
# data['merged']= data['label'] + '-' + data['detailed-label']
data_test['merged'] = data_test['label'] + ' - ' + data_test['detailed-label'].fillna('')

In [14]:
# data['merged'].value_counts()
data_test['merged'].value_counts()

merged
Benign -                                       8777766
Malicious - PartOfAHorizontalPortScan          7040807
Malicious   DDoS -                             5778153
Malicious   PartOfAHorizontalPortScan -        3386241
Malicious   C&C -                                 6787
Malicious - Attack                                5962
Malicious   Attack -                              2755
Malicious   C&C - HeartBeat                        940
Malicious   C&C - PartOfAHorizontalPortScan        888
Malicious   C&C - Torii                             30
Malicious - C&C                                     16
Malicious   C&C - FileDownload                      15
Malicious   FileDownload -                           3
Name: count, dtype: int64

In [15]:
# Split the 'merged' column into 'label' and 'detailed-label' after 'Benign' or 'Malicious'
# data[['label', 'detailed-label']] = data['merged'].str.extract(r'^(Malicious)\s*[-]?\s*(.*)$')
data_test[['label', 'detailed-label']] = data_test['merged'].str.extract(r'^(Benign|Malicious)\s*[-]?\s*(.*)$')

In [16]:
data_test['label'].value_counts()

label
Malicious    16222597
Benign        8777766
Name: count, dtype: int64

In [17]:
print((data_test.isnull().sum()/len(data_test)*100).sort_values(ascending=False))

local_orig        100.000000
tunnel_parents    100.000000
local_resp        100.000000
service            99.928057
duration           61.062341
orig_bytes         61.062341
resp_bytes         61.062341
history             0.100463
detailed-label      0.000000
label               0.000000
resp_ip_bytes       0.000000
resp_pkts           0.000000
orig_ip_bytes       0.000000
orig_pkts           0.000000
ts                  0.000000
missed_bytes        0.000000
uid                 0.000000
conn_state          0.000000
proto               0.000000
id.resp_p           0.000000
id.resp_h           0.000000
id.orig_p           0.000000
id.orig_h           0.000000
merged              0.000000
dtype: float64


In [18]:
data_test['detailed-label'].replace('', 'Benign', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_test['detailed-label'].replace('', 'Benign', inplace=True)


In [19]:
data_test['detailed-label'] = data_test['detailed-label'].str.replace(r'[\s\-]+$', '', regex=True)

In [20]:
data_test['detailed-label'].value_counts()

detailed-label
PartOfAHorizontalPortScan          10427048
Benign                              8777766
DDoS                                5778153
Attack                                 8717
C&C                                    6803
C&C - HeartBeat                         940
C&C - PartOfAHorizontalPortScan         888
C&C - Torii                              30
C&C - FileDownload                       15
FileDownload                              3
Name: count, dtype: int64

In [21]:
# data['label'].value_counts()
data_test['label'].value_counts()

label
Malicious    16222597
Benign        8777766
Name: count, dtype: int64

In [22]:
# data.drop('merged', axis=1, inplace=True)
data_test.drop('merged', axis=1, inplace=True)

In [23]:
# data.head()
data_test.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
0,1545403000.0,CdNmOg26ZIaBRzPvWj,192.168.1.196,59932.0,104.248.160.24,80.0,tcp,,3.097754,0.0,...,,0.0,S,3.0,180.0,0.0,0.0,,Malicious,C&C
1,1545403000.0,CgzGV333k9WCximeu8,192.168.1.196,59932.0,104.248.160.24,80.0,tcp,,,,...,,0.0,S,1.0,60.0,0.0,0.0,,Malicious,C&C
2,1545403000.0,CLm5Pd3ZnqmYVjrZ44,192.168.1.196,59932.0,104.248.160.24,80.0,tcp,,,,...,,0.0,S,1.0,60.0,0.0,0.0,,Malicious,C&C
3,1545403000.0,CDn2pd1rDD1lCMXAia,192.168.1.196,35883.0,192.168.1.1,53.0,udp,dns,5.005148,78.0,...,,0.0,D,2.0,134.0,0.0,0.0,,Benign,Benign
4,1545403000.0,C1NKkV3tB4rImzbpDj,192.168.1.196,43531.0,192.168.1.1,53.0,udp,dns,5.005145,78.0,...,,0.0,D,2.0,134.0,0.0,0.0,,Benign,Benign


In [24]:
# data.drop_duplicates(inplace=True)
data_test.drop_duplicates(inplace=True)

In [25]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000363 entries, 0 to 25000362
Data columns (total 23 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ts              float64
 1   uid             object 
 2   id.orig_h       object 
 3   id.orig_p       float64
 4   id.resp_h       object 
 5   id.resp_p       float64
 6   proto           object 
 7   service         object 
 8   duration        object 
 9   orig_bytes      object 
 10  resp_bytes      object 
 11  conn_state      object 
 12  local_orig      float64
 13  local_resp      float64
 14  missed_bytes    float64
 15  history         object 
 16  orig_pkts       float64
 17  orig_ip_bytes   float64
 18  resp_pkts       float64
 19  resp_ip_bytes   float64
 20  tunnel_parents  float64
 21  label           object 
 22  detailed-label  object 
dtypes: float64(11), object(12)
memory usage: 4.3+ GB


In [26]:
cols = ['orig_bytes', 'resp_bytes', 'resp_ip_bytes', 'orig_ip_bytes', 'resp_pkts', 'orig_pkts']

for col in cols:
    data_test[col] = pd.to_numeric(data_test[col], errors='coerce')
    data_test.loc[data_test[col].notna(), col] = data_test.loc[data_test[col].notna(), col].astype(int)


In [27]:
columns = list(data_test.columns)
zero_ratios = {col: (data_test[col] == 0).mean() for col in columns}

for col, ratio in zero_ratios.items():
    print(f"{col}: {ratio:.2%} zeros")

ts: 0.00% zeros
uid: 0.00% zeros
id.orig_h: 0.00% zeros
id.orig_p: 0.00% zeros
id.resp_h: 0.00% zeros
id.resp_p: 0.03% zeros
proto: 0.00% zeros
service: 0.00% zeros
duration: 0.00% zeros
orig_bytes: 38.47% zeros
resp_bytes: 38.80% zeros
conn_state: 0.00% zeros
local_orig: 0.00% zeros
local_resp: 0.00% zeros
missed_bytes: 100.00% zeros
history: 0.00% zeros
orig_pkts: 14.38% zeros
orig_ip_bytes: 14.38% zeros
resp_pkts: 99.79% zeros
resp_ip_bytes: 99.79% zeros
tunnel_parents: 0.00% zeros
label: 0.00% zeros
detailed-label: 0.00% zeros


In [28]:
drop_cols = ['resp_pkts','resp_ip_bytes','uid', 'tunnel_parents', 'local_resp', 'local_orig', 'missed_bytes', 'ts', 'id.orig_h', 'id.resp_h', 'id.resp_p', 'id.orig_p', 'label', 'service']
data_test.drop(columns=drop_cols, inplace=True, errors='ignore')

data_test.dropna(subset=[ 'duration', 'orig_bytes', 'resp_bytes'], inplace=True)

In [29]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9734556 entries, 0 to 25000361
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   proto           object 
 1   duration        object 
 2   orig_bytes      float64
 3   resp_bytes      float64
 4   conn_state      object 
 5   history         object 
 6   orig_pkts       float64
 7   orig_ip_bytes   float64
 8   detailed-label  object 
dtypes: float64(4), object(5)
memory usage: 742.7+ MB


In [30]:
columns = list(data_test.columns)
zero_ratios = {col: (data_test[col] == 0).mean() for col in columns}

for col, ratio in zero_ratios.items():
    print(f"{col}: {ratio:.2%} zeros")

proto: 0.00% zeros
duration: 0.00% zeros
orig_bytes: 98.79% zeros
resp_bytes: 99.65% zeros
conn_state: 0.00% zeros
history: 0.00% zeros
orig_pkts: 0.00% zeros
orig_ip_bytes: 0.00% zeros
detailed-label: 0.00% zeros


In [31]:
data_test.drop(columns=['orig_bytes', 'resp_bytes'], inplace=True, errors='ignore')

In [32]:
columns = list(data_test.columns)
zero_ratios = {col: (data_test[col] == 0).mean() for col in columns}

for col, ratio in zero_ratios.items():
    print(f"{col}: {ratio:.2%} zeros")

proto: 0.00% zeros
duration: 0.00% zeros
conn_state: 0.00% zeros
history: 0.00% zeros
orig_pkts: 0.00% zeros
orig_ip_bytes: 0.00% zeros
detailed-label: 0.00% zeros


In [33]:
data_test.duplicated().sum()

8005029

In [34]:
data_test.drop_duplicates(inplace=True)

In [35]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1729527 entries, 0 to 25000361
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   proto           object 
 1   duration        object 
 2   conn_state      object 
 3   history         object 
 4   orig_pkts       float64
 5   orig_ip_bytes   float64
 6   detailed-label  object 
dtypes: float64(2), object(5)
memory usage: 105.6+ MB


In [36]:
# empty_history_count = (data['history'].isna() | (data['history'] == '-') | (data['history'] == '0')).sum()
empty_history_count = (data_test['history'].isna() | (data_test['history'] == '-') | (data_test['history'] == '0')).sum()
print(f"Number of empty or '0' values in 'history' column: {empty_history_count}")

Number of empty or '0' values in 'history' column: 1522


In [37]:
data_test['history'].isna().sum()

1522

In [38]:
data_test.dropna(subset=['history'], inplace=True)

In [39]:
data_test['history'].isna().sum()

0

In [40]:
data_test['history'].unique()

array(['S', 'D', 'Dd', 'DdAaFf', 'ShADadttfF', 'ShADadtfF', 'Sr',
       'ShAdfFa', 'ShAdDaR', 'ShAr', 'ShAdDaTR', 'ShAfF', 'ShAF',
       'ShAdDafr', 'ShAaw', 'ShAfFa', 'ShAdDaTRr', 'ShAdDaTFf', 'ShAdfDF',
       'ShAdDafF', 'ShwA', 'ShAdDaTfF', 'ShAdfr', 'ShAdDaTFR', 'Fa',
       'ShAdDar', 'ShwAr', 'ShAdDaRr', 'SaR', 'ShAdfDr', 'ShAdDaFf',
       'ShAdDaTfRr', 'I', 'DTT', 'DT', '^dtt', 'ShAdr', 'ShAdDaTRft',
       'ShAdDafR', 'DrF', 'DFr', 'ShAdfF', 'ShAdtfFa', 'SI', 'ShADdfFa',
       'ShADadFf', 'ShAdaDR', 'ShADdfF', 'ShADdattFfR', 'ShADdf', 'F',
       'ShAfdtF', 'ShADdFaf', 'ShADdFf', 'ShADdtatFfR', 'ShAdDaFRR', 'Dr',
       'ShAdDaRRR', 'ShAdDaTRf', 'HaDdAr', 'ShADad', 'ShADadfrr', 'DdA',
       'ShAdFaf', 'ShAadDr', 'ShAdDaFr', 'ShAdDaFfR', 'ShAdDaFRf',
       'ShADFr', 'ShADF', 'ShADFfR', 'ShADFa', 'ShR', '^aA', 'ShADFar',
       'ShAdDaFRRfR', 'ShAdDaFRRRf', 'ShADFaR', 'ShArR', 'ShADadFfR',
       'ShAdDaF', 'ShAa', 'ShAdDaFT', 'ShAdDFf', 'ShAdDatFf', 'ShAdDaFTf',
       'S

In [41]:
def add_history_bucket(df, col='history', drop_original=True):
    PURE_MALICIOUS = {'I', 'DTT'}
    SUSPICIOUS_COMBOS = {
        'ShAdDaFf', 'ShAdDafF', 'ShADadfF', 'ShADafF',
        'ShADar', 'ShAdDaFr', 'ShAdDfFr', 'ShAdDaft',
        'ShADr', 'ShADdfFa'
    }
    PURE_BENIGN = {'D', 'Dd', 'R'}
    def bucket_history(val):
        if val == 'S':
            return 'majority_S'
        elif val in PURE_MALICIOUS:
            return 'pure_malicious'
        elif val in SUSPICIOUS_COMBOS:
            return 'known_suspicious_combos'
        elif val in PURE_BENIGN:
            return 'pure_benign'
        else:
            return 'rare_mixed'
    df[f'{col}_bucket'] = df[col].apply(bucket_history)
    if drop_original:
        df.drop(columns=[col], inplace=True)
    return df


In [42]:
add_history_bucket(data_test)

Unnamed: 0,proto,duration,conn_state,orig_pkts,orig_ip_bytes,detailed-label,history_bucket
0,tcp,3.097754,S0,3.0,180.0,C&C,majority_S
3,udp,5.005148,S0,2.0,134.0,Benign,pure_benign
4,udp,5.005145,S0,2.0,134.0,Benign,pure_benign
6,udp,5.005149,S0,2.0,134.0,Benign,pure_benign
7,udp,0.143648,SF,1.0,76.0,Benign,pure_benign
...,...,...,...,...,...,...,...
25000348,tcp,0.054468,SF,7.0,388.0,Benign,rare_mixed
25000349,udp,0.007744,SF,2.0,136.0,Benign,pure_benign
25000350,udp,0.000994,SF,2.0,136.0,Benign,pure_benign
25000360,udp,0.005252,SF,1.0,76.0,Benign,pure_benign


In [43]:
data_test['detailed-label'].value_counts()

detailed-label
DDoS                               1560817
Benign                               92592
PartOfAHorizontalPortScan            60716
Attack                                8682
C&C                                   3998
C&C - HeartBeat                        835
C&C - PartOfAHorizontalPortScan        331
C&C - Torii                             16
C&C - FileDownload                      15
FileDownload                             3
Name: count, dtype: int64

In [44]:
labels_to_drop = data_test['detailed-label'].value_counts().tail(3).index
data_test = data_test[~data_test['detailed-label'].isin(labels_to_drop)]

In [45]:
data_test['detailed-label'].value_counts()

detailed-label
DDoS                               1560817
Benign                               92592
PartOfAHorizontalPortScan            60716
Attack                                8682
C&C                                   3998
C&C - HeartBeat                        835
C&C - PartOfAHorizontalPortScan        331
Name: count, dtype: int64

In [46]:
data_test.describe()

Unnamed: 0,orig_pkts,orig_ip_bytes
count,1727971.0,1727971.0
mean,172.0536,6245.499
std,66615.63,2238537.0
min,0.0,0.0
25%,2.0,120.0
50%,3.0,160.0
75%,5.0,200.0
max,66027350.0,1914793000.0


In [47]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.compose import ColumnTransformer


# Define features (X) and target (y)
X = data_test.drop('detailed-label', axis=1)
y = data_test['detailed-label']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
print("Data split into training and testing sets.")

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Data split into training and testing sets.
Training set shape: (1209579, 6)
Test set shape: (518392, 6)


In [None]:
# Identify categorical and numerical features
categorical_features = ['proto', 'conn_state', 'history_bucket']
numerical_features = X.select_dtypes(include=np.number).columns.tolist()

# Create and apply the scaler and encoder
scaler = RobustScaler()
X_train_num = scaler.fit_transform(X_train[numerical_features])
X_test_num = scaler.transform(X_test[numerical_features])

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat = encoder.fit_transform(X_train[categorical_features])
X_test_cat = encoder.transform(X_test[categorical_features])

# Combine processed numerical and categorical features
num_feature_names = numerical_features
cat_feature_names = encoder.get_feature_names_out(categorical_features)
all_feature_names = num_feature_names + list(cat_feature_names)

X_train_processed = np.hstack([X_train_num, X_train_cat])
X_test_processed = np.hstack([X_test_num, X_test_cat])

X_train_processed = pd.DataFrame(X_train_processed, columns=all_feature_names)
X_test_processed = pd.DataFrame(X_test_processed, columns=all_feature_names)

print("Preprocessing complete.")
print(f"X_train_processed shape: {X_train_processed.shape}")
print(f"X_test_processed shape: {X_test_processed.shape}")

In [52]:
scaler = RobustScaler()
X_train_num = scaler.fit_transform(X_train[numerical_features])
X_test_num = scaler.transform(X_test[numerical_features])
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat = encoder.fit_transform(X_train[categorical_features])
X_test_cat = encoder.transform(X_test[categorical_features])
num_feature_names = numerical_features
cat_feature_names = encoder.get_feature_names_out(categorical_features)
all_feature_names = num_feature_names + list(cat_feature_names)
X_train_processed = np.hstack([X_train_num, X_train_cat])
X_test_processed = np.hstack([X_test_num, X_test_cat])
X_train_processed = pd.DataFrame(X_train_processed, columns=all_feature_names)
X_test_processed = pd.DataFrame(X_test_processed, columns=all_feature_names)

In [None]:
# X_processed = preprocessor.fit_transform(X_train)

In [53]:
X_train_processed.shape

(1209579, 22)

In [None]:
# Get feature names after one-hot encoding
# ohe_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
# features = numerical_features + list(ohe_feature_names)

# # Convert the processed data back to a DataFrame
# X_train_processed = pd.DataFrame(X_processed, columns=features)

In [None]:
# X_test_processed = preprocessor.transform(X_test)

In [None]:
# X_test_processed = pd.DataFrame(X_test_processed, columns=features)

In [54]:

print("\n=== APPLYING DATA BALANCING ===")
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_train_processed, y_train)
print("Balancing method used: SMOTE")
balanced_distribution = Counter(y_balanced)
print(f"Balanced class distribution: {balanced_distribution}")
print(f"New dataset size: {len(y_balanced)} (original: {len(y_train)})")


=== APPLYING DATA BALANCING ===
Balancing method used: SMOTE
Balanced class distribution: Counter({'DDoS': 1092571, 'Benign': 1092571, 'PartOfAHorizontalPortScan': 1092571, 'Attack': 1092571, 'C&C': 1092571, 'C&C - HeartBeat': 1092571, 'C&C - PartOfAHorizontalPortScan': 1092571})
New dataset size: 7647997 (original: 1209579)


In [62]:
X_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7647997 entries, 0 to 7647996
Data columns (total 22 columns):
 #   Column                                  Dtype  
---  ------                                  -----  
 0   orig_pkts                               float64
 1   orig_ip_bytes                           float64
 2   proto_tcp                               float64
 3   proto_udp                               float64
 4   conn_state_OTH                          float64
 5   conn_state_REJ                          float64
 6   conn_state_RSTO                         float64
 7   conn_state_RSTOS0                       float64
 8   conn_state_RSTR                         float64
 9   conn_state_RSTRH                        float64
 10  conn_state_S0                           float64
 11  conn_state_S1                           float64
 12  conn_state_S2                           float64
 13  conn_state_S3                           float64
 14  conn_state_SF                     

In [55]:
label = LabelEncoder()

y_train_processed = label.fit_transform(y_balanced)
y_test_processed = label.transform(y_test)


---
### Using GPU with XGBoost

To leverage the GPU, we can use the `XGBoost` library, which supports Apple Silicon GPUs. First, we need to install it.


In [None]:

%pip install xgboost



Now, let's train an XGBoost classifier using the GPU.


In [57]:

import xgboost as xgb

# Check if MPS is available
if xgb.config.get_config().get("USE_MPS", False):
    print("MPS is available, using GPU.")
    device = "mps"
else:
    print("MPS not available, using CPU.")
    device = "cpu"

# Create an XGBoost classifier
# The `device` parameter is set to "mps" to enable GPU acceleration on Apple Silicon.
# All other parameters are set to be similar to the RandomForestClassifier for comparison.
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    random_state=42,
    device=device,  # Use 'mps' for Apple Silicon GPU
    eval_metric='logloss',
    use_label_encoder=False
)


MPS not available, using CPU.


In [58]:
print("\n=== XGBoost CROSS-VALIDATION ANALYSIS (GPU) ===")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_clf, X_balanced, y_train_processed, cv=cv, scoring='accuracy')

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")


=== XGBoost CROSS-VALIDATION ANALYSIS (GPU) ===
Parameters: { "device" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "device" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "device" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: 

In [59]:
print("\n=== XGBoost FINAL MODEL TRAINING (GPU) ===")
xgb_clf.fit(X_balanced, y_train_processed)

y_pred_xgb = xgb_clf.predict(X_test_processed)
print("Test Set Results (XGBoost):")
print(f"Accuracy: {accuracy_score(y_test_processed, y_pred_xgb):.4f}")
print(f"Classification Report:\n{classification_report(y_test_processed, y_pred_xgb)}")


=== XGBoost FINAL MODEL TRAINING (GPU) ===
Parameters: { "device" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Test Set Results (XGBoost):
Accuracy: 0.9382
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.71      0.82      2605
           1       0.98      0.40      0.57     27778
           2       0.06      1.00      0.11      1199
           3       0.18      1.00      0.31       250
           4       0.01      0.94      0.02        99
           5       1.00      1.00      1.00    468246
           6       0.76      0.20      0.32     18215

    accuracy                           0.94    518392
   macro avg       0.57      0.75      0.45    518392
weighted avg       0.99      0.94

In [60]:
print("\n=== DETAILED EVALUATION (XGBoost) ===")
cm_xgb = confusion_matrix(y_test_processed, y_pred_xgb)
print("Confusion Matrix (XGBoost):")
print(cm_xgb)


=== DETAILED EVALUATION (XGBoost) ===
Confusion Matrix (XGBoost):
[[  1845      1      2    746      0      0     11]
 [    13  11128  15234    289      0      0   1114]
 [     1      2   1194      1      0      1      0]
 [     0      0      0    249      0      0      1]
 [     0      6      0      0     93      0      0]
 [     0      8     10      0      0 468224      4]
 [    40    185   3289     72  11012      0   3617]]


In [64]:
import joblib
import json

# --- EXPORTING MODEL AND PREPROCESSING OBJECTS ---

# 1. Save the trained XGBoost model
model_filename = 'flowguard_xgboost_model_final.pkl'
joblib.dump(xgb_clf, model_filename)
print(f"Model saved to {model_filename}")

# 2. Save the feature list and categorical mappings
preprocessing_data = {
    'numerical_features': numerical_features,
    'categorical_features': categorical_features,
    'ohe_categories': [list(cats) for cats in encoder.categories_],
    'label_encoder_classes': list(label.classes_),
    'feature_names_out': all_feature_names
}

# Save the scaler and encoder objects separately using joblib
joblib.dump(scaler, 'flowguard_scaler.pkl')
print("Scaler saved to flowguard_scaler.pkl")
joblib.dump(encoder, 'flowguard_encoder.pkl')
print("Encoder saved to flowguard_encoder.pkl")
joblib.dump(label, 'flowguard_label_encoder.pkl')
print("Label encoder saved to flowguard_label_encoder.pkl")

preprocessing_filename = 'flowguard_preprocessing.json'
with open(preprocessing_filename, 'w') as f:
    json.dump(preprocessing_data, f)
print(f"Preprocessing data saved to {preprocessing_filename}")


Model saved to flowguard_xgboost_model_final.pkl
Scaler saved to flowguard_scaler.pkl
Encoder saved to flowguard_encoder.pkl
Label encoder saved to flowguard_label_encoder.pkl
Preprocessing data saved to flowguard_preprocessing.json
