In [40]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb

In [41]:
# Load the preprocessed dataset
df = pd.read_csv('preprocessed_data.csv', low_memory=False)
df

Unnamed: 0,Flow Duration,Fwd Packet Length Mean,Bwd Packet Length Mean,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Fwd IAT Mean,Bwd IAT Mean,Packet Length Mean,FIN Flag Count,...,ACK Flag Count,URG Flag Count,CWE Flag Count,Down/Up Ratio,Average Packet Size,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Idle Mean,Label
0,37027,0.000000,0.000000,0.000000,54.014638,3.702700e+04,0.000000e+00,0.000000,0.000000,0.0,...,1.0,1.0,0.0,1.0,0.000000,0.0,32.0,0.0,0.0,Android_Adware
1,36653,0.000000,0.000000,0.000000,54.565793,3.665300e+04,0.000000e+00,0.000000,0.000000,0.0,...,1.0,1.0,0.0,1.0,0.000000,0.0,32.0,0.0,0.0,Android_Adware
2,534099,126.375000,993.666667,24218.356522,37.446241,2.811047e+04,6.876286e+04,44362.727273,615.952381,0.0,...,0.0,0.0,0.0,1.0,646.750000,3.0,20.0,0.0,0.0,Android_Adware
3,9309,0.000000,0.000000,0.000000,322.268772,4.654500e+03,4.654500e+03,0.000000,0.000000,0.0,...,1.0,0.0,0.0,0.0,0.000000,0.0,20.0,0.0,0.0,Android_Adware
4,19890496,53.750000,946.500000,307.131607,0.703854,1.530038e+06,2.841499e+06,82192.800000,407.266667,0.0,...,0.0,0.0,0.0,0.0,436.357143,2.0,20.0,0.0,0.0,Android_Adware
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355625,126711,0.000000,0.000000,0.000000,15.783949,1.267110e+05,0.000000e+00,0.000000,0.000000,0.0,...,1.0,1.0,0.0,1.0,0.000000,0.0,32.0,0.0,0.0,Benign
355626,48012,30.000000,140.000000,3540.781471,41.656253,4.801200e+04,0.000000e+00,0.000000,66.666667,0.0,...,0.0,0.0,0.0,1.0,100.000000,0.0,32.0,0.0,0.0,Benign
355627,20028018,30.818182,791.875000,333.233174,0.948671,1.112668e+06,2.002802e+06,33229.428571,333.700000,0.0,...,0.0,0.0,0.0,0.0,351.263158,2.0,20.0,367528.0,19660490.0,Benign
355628,347926,32.000000,48.000000,229.933951,5.748349,3.479260e+05,0.000000e+00,0.000000,37.333333,0.0,...,0.0,0.0,0.0,1.0,56.000000,0.0,20.0,0.0,0.0,Benign


In [42]:
df.columns

Index(['Flow Duration', 'Fwd Packet Length Mean', 'Bwd Packet Length Mean',
       'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Fwd IAT Mean',
       'Bwd IAT Mean', 'Packet Length Mean', 'FIN Flag Count',
       'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
       'URG Flag Count', 'CWE Flag Count', 'Down/Up Ratio',
       'Average Packet Size', 'act_data_pkt_fwd', 'min_seg_size_forward',
       'Active Mean', 'Idle Mean', 'Label'],
      dtype='object')

In [43]:
# Define features (X) and target variable (y)
X = df.drop('Label', axis=1)
y = df['Label']

In [44]:
# Apply label encoding to the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# XGBoost Model

In [45]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

In [46]:
# Predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

In [47]:
# Evaluate XGBoost model
print("XGBoost Model:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

XGBoost Model:
Accuracy: 0.47272446081601666
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.78      0.60     29513
           1       0.54      0.15      0.24     13432
           2       0.42      0.36      0.39     23404
           3       0.48      0.02      0.03      4777

    accuracy                           0.47     71126
   macro avg       0.48      0.33      0.31     71126
weighted avg       0.48      0.47      0.42     71126



# LightGBM Model

In [48]:
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021696 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3229
[LightGBM] [Info] Number of data points in the train set: 284504, number of used features: 20
[LightGBM] [Info] Start training from score -0.880656
[LightGBM] [Info] Start training from score -1.662412
[LightGBM] [Info] Start training from score -1.110884
[LightGBM] [Info] Start training from score -2.709947


In [49]:
# Predictions on the test set
y_pred_lgb = lgb_model.predict(X_test)

In [50]:
# Evaluate LightGBM model
print("\nLightGBM Model:")
print("Accuracy:", accuracy_score(y_test, y_pred_lgb))
print("Classification Report:\n", classification_report(y_test, y_pred_lgb))


LightGBM Model:
Accuracy: 0.46835193881281106
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.83      0.60     29513
           1       0.67      0.13      0.22     13432
           2       0.42      0.30      0.35     23404
           3       0.50      0.01      0.02      4777

    accuracy                           0.47     71126
   macro avg       0.52      0.32      0.30     71126
weighted avg       0.50      0.47      0.41     71126

