In [1]:
#import 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#knn
from sklearn.neighbors import KNeighborsClassifier
#svm
from sklearn.svm import SVC
#rf
from sklearn.ensemble import RandomForestClassifier
#xgboost
from xgboost import XGBClassifier
#split
from sklearn.model_selection import train_test_split
#metrics
from sklearn.metrics import mean_squared_error
#cross validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
#grid search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
#scaling
from sklearn.preprocessing import StandardScaler


In [2]:
# data
df = pd.read_csv('dataset/total_preprocessed_featureselected.csv')


In [3]:
df['Attack Name'].value_counts()

Attack Name
Benign Traffic              86525
Recon Ping Sweep            47123
Recon OS Scan               42173
Recon Vulnerability Scan    39489
Dictionary Brute Force      18151
DoS SYN Flood               15243
MITM ARP Spoofing           14768
DoS UDP Flood                1848
DoS DNS Flood                1702
DoS ICMP Flood               1405
Recon Host Discovery          424
Name: count, dtype: int64

In [4]:
# x,y split
X = df.drop(columns=['Attack Name'])
y = df['Attack Name']

In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Preprocessing 함수
def preprocess(X, y):
    # 문자열 레이블을 정수로 변환
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # 문자열 -> 정수 변환

    # Train/Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # SMOTE로 클래스 균형 조정
    smote = SMOTE(random_state=0, sampling_strategy={
        label_encoder.transform(['Recon Ping Sweep'])[0]: 60000,
        label_encoder.transform(['Recon OS Scan'])[0]: 60000,
        label_encoder.transform(['Recon Vulnerability Scan'])[0]: 50000,
        label_encoder.transform(['Dictionary Brute Force'])[0]: 30000,
        label_encoder.transform(['DoS SYN Flood'])[0]: 25000,
        label_encoder.transform(['MITM ARP Spoofing'])[0]: 20000,
        label_encoder.transform(['DoS UDP Flood'])[0]: 5000,
        label_encoder.transform(['DoS DNS Flood'])[0]: 5000,
        label_encoder.transform(['DoS ICMP Flood'])[0]: 4000,
        label_encoder.transform(['Recon Host Discovery'])[0]: 2000
    })

    # Oversampling
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    print(f"Resampled Data Shape: {X_train_res.shape}, {y_train_res.shape}")

    # Scaling
    scaler = StandardScaler()
    X_train_res = scaler.fit_transform(X_train_res)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train_res, X_train, X_test, y_train_res, y_train, y_test, label_encoder


In [6]:
X_train_res, X_train, X_test, y_train_res, y_train, y_test,label_encoder=preprocess(X,y)

Resampled Data Shape: (330213, 48), (330213,)


In [7]:
def knn_model(X_train, y_train, X_test, y_test, label_encoder):
    param = {
        'n_neighbors': 16,
        'weights': 'distance',
        'metric': 'manhattan'
    }
    knn = KNeighborsClassifier()
    knn.set_params(**param)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # 라벨 복원
    y_test_str = label_encoder.inverse_transform(y_test)  # 실제 라벨 복원
    y_pred_str = label_encoder.inverse_transform(y_pred)  # 예측 라벨 복원

    # 결과 출력
    print("Classification Report:")
    print(classification_report(y_test_str, y_pred_str))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test_str, y_pred_str))


In [8]:
def rf_model(X_train, y_train, X_test, y_test, label_encoder):

    # RandomForest 초기화 및 하이퍼파라미터 탐색 설정
    
    param = {
        'n_estimators': 300,
        'max_depth': 120,
        'criterion': 'gini'
    }
    rf = RandomForestClassifier(random_state=42)
    rf.set_params(**param)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    y_test_str = label_encoder.inverse_transform(y_test)
    y_pred_str = label_encoder.inverse_transform(y_pred)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test_str, y_pred_str))
    print("\nClassification Report:")
    print(classification_report(y_test_str, y_pred_str))


In [9]:

from lightgbm import LGBMClassifier
def lgbm_model(X_train, y_train, X_test, y_test, label_encoder):
    # LightGBM 초기화
    lgbm = LGBMClassifier(random_state=42)


    param_dist = {
        'n_estimators': 100,
        'max_depth': 20,
        'learning_rate': 0.01,
        'num_leaves': 20,
        'objective': 'multiclass',
        'num_class': 10  
    }

    lgbm.set_params(**param_dist)
    lgbm.fit(X_train, y_train)
    y_pred = lgbm.predict(X_test)
    y_test_str = label_encoder.inverse_transform(y_test)
    y_pred_str = label_encoder.inverse_transform(y_pred)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test_str, y_pred_str))
    print("\nClassification Report:")
    print(classification_report(y_test_str, y_pred_str))
    
    

In [10]:
#증강 모델
knn_model(X_train , y_train , X_test, y_test, label_encoder)

Classification Report:
                          precision    recall  f1-score   support

          Benign Traffic       0.73      0.93      0.82     17312
  Dictionary Brute Force       0.69      0.47      0.56      3603
           DoS DNS Flood       0.47      0.16      0.24       326
          DoS ICMP Flood       0.37      0.10      0.15       294
           DoS SYN Flood       0.98      0.91      0.95      3096
           DoS UDP Flood       0.18      0.06      0.09       341
       MITM ARP Spoofing       0.83      0.57      0.67      2888
    Recon Host Discovery       0.20      0.03      0.05        70
           Recon OS Scan       0.50      0.50      0.50      8365
        Recon Ping Sweep       0.51      0.42      0.46      9513
Recon Vulnerability Scan       0.99      0.98      0.99      7963

                accuracy                           0.71     53771
               macro avg       0.59      0.47      0.50     53771
            weighted avg       0.70      0.71      

In [13]:
rf_model(X_train, y_train, X_test, y_test, label_encoder)

Confusion Matrix:
[[16167   407    22    25    20    36   208     8   104   299    16]
 [ 1687  1750     4     6     1     4    86     2    18    42     3]
 [  204    16    57     7     8     6     8     0     3    17     0]
 [  211    12     5    37     5     6    10     0     2     5     1]
 [  196    10     7     4  2831    23     6     0     8    11     0]
 [  265    18     3     5    11    22     5     1     0    11     0]
 [  952    85     7     5     3     3  1807     0     5    13     8]
 [   56     0     1     0     0     2     3     0     0     8     0]
 [  645    30     2     3     1     4    14     1  3722  3943     0]
 [ 1368    48    11     4     4     3    13     6  3442  4609     5]
 [   89     6     2     0     0     1     4     0     4    11  7846]]

Classification Report:
                          precision    recall  f1-score   support

          Benign Traffic       0.74      0.93      0.83     17312
  Dictionary Brute Force       0.73      0.49      0.58      3603

In [10]:
lgbm_model(X_train , y_train, X_test, y_test, label_encoder)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6927
[LightGBM] [Info] Number of data points in the train set: 215080, number of used features: 48
[LightGBM] [Info] Start training from score -1.133821
[LightGBM] [Info] Start training from score -2.693557
[LightGBM] [Info] Start training from score -5.051829
[LightGBM] [Info] Start training from score -5.265750
[LightGBM] [Info] Start training from score -2.873928
[LightGBM] [Info] Start training from score -4.960889
[LightGBM] [Info] Start training from score -2.896154
[LightGBM] [Info] Start training from score -6.409468
[LightGBM] [Info] Start training from score -1.850313
[LightGBM] [Info] Start training from score -1.743740
[LightGBM] [Info] Start training from score -1.920197
Confusion Matrix:
[[17281     6     0     0     2   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                          precision    recall  f1-score   support

          Benign Traffic       0.68      1.00      0.81     17312
  Dictionary Brute Force       0.99      0.33      0.49      3603
           DoS DNS Flood       1.00      0.00      0.01       326
          DoS ICMP Flood       0.00      0.00      0.00       294
           DoS SYN Flood       1.00      0.91      0.95      3096
           DoS UDP Flood       1.00      0.00      0.01       341
       MITM ARP Spoofing       0.99      0.42      0.59      2888
    Recon Host Discovery       0.00      0.00      0.00        70
           Recon OS Scan       0.56      0.30      0.39      8365
        Recon Ping Sweep       0.53      0.60      0.57      9513
Recon Vulnerability Scan       1.00      0.97      0.98      7963

                accuracy                           0.71     53771
               macro avg       0.70      0.41      0.44     53771
            weighted avg       0.74      0.71      0.69     53771



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
#공격 유형별로 분류하는 것은 어려움이 있을 것으로 판단, benign traffic과 아닌 것으로 나누기
df['Attack Name'] = df['Attack Name'].apply(lambda x: 1 if x == 'Benign Traffic' else 0)

In [12]:
df['Attack Name'].value_counts()

Attack Name
0    182326
1     86525
Name: count, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
X_01 = df.drop(columns=['Attack Name'])
y_01 = df['Attack Name']

In [14]:
#train test split
X_train_01, X_test_01, y_train_01, y_test_01 = train_test_split(X_01, y_01, test_size=0.2, random_state=42)

In [15]:
#scaling
scaler = StandardScaler()
X_train_01 = scaler.fit_transform(X_train_01)
X_test_01 = scaler.transform(X_test_01)


In [None]:
#knn
knn_model(X_train_01, y_train_01, X_test_01, y_test_01, label_encoder)

Classification Report:
                        precision    recall  f1-score   support

        Benign Traffic       0.92      0.87      0.89     36459
Dictionary Brute Force       0.75      0.83      0.79     17312

              accuracy                           0.86     53771
             macro avg       0.83      0.85      0.84     53771
          weighted avg       0.86      0.86      0.86     53771

Confusion Matrix:
[[31747  4712]
 [ 2946 14366]]


In [None]:
#rf
rf_model(X_train_01, y_train_01, X_test_01, y_test_01,label_encoder )

Confusion Matrix:
[[31705  4754]
 [ 2168 15144]]

Classification Report:
                        precision    recall  f1-score   support

        Benign Traffic       0.94      0.87      0.90     36459
Dictionary Brute Force       0.76      0.87      0.81     17312

              accuracy                           0.87     53771
             macro avg       0.85      0.87      0.86     53771
          weighted avg       0.88      0.87      0.87     53771



In [16]:
#lgbm
lgbm_model(X_train_01, y_train_01, X_test_01, y_test_01, label_encoder)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009396 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6927
[LightGBM] [Info] Number of data points in the train set: 215080, number of used features: 48
[LightGBM] [Info] Start training from score -0.388315
[LightGBM] [Info] Start training from score -1.133821
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -34.538776
Confusion Matrix:
[[28969  7490]
 [  737 16575]]

Classification Report:
                        p