In [1]:
#import 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#knn
from sklearn.neighbors import KNeighborsClassifier
#svm
from sklearn.svm import SVC
#rf
from sklearn.ensemble import RandomForestClassifier
#xgboost
from xgboost import XGBClassifier
#split
from sklearn.model_selection import train_test_split
#metrics
from sklearn.metrics import mean_squared_error
#cross validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
#grid search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
#scaling
from sklearn.preprocessing import StandardScaler


In [2]:
# data
df = pd.read_csv('dataset/total_preprocessed_featureselected.csv')


In [3]:
df['Attack Name'].value_counts()

Attack Name
Benign Traffic              86525
Recon Ping Sweep            47123
Recon OS Scan               42173
Recon Vulnerability Scan    39489
Dictionary Brute Force      18151
DoS SYN Flood               15243
MITM ARP Spoofing           14768
DoS UDP Flood                1848
DoS DNS Flood                1702
DoS ICMP Flood               1405
Recon Host Discovery          424
Name: count, dtype: int64

In [4]:
# x,y split
X = df.drop(columns=['Attack Name'])
y = df['Attack Name']

In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Preprocessing 함수
def preprocess(X, y):
    # 문자열 레이블을 정수로 변환
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  # 문자열 -> 정수 변환

    # Train/Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # SMOTE로 클래스 균형 조정
    smote = SMOTE(random_state=0, sampling_strategy={
        label_encoder.transform(['Recon Ping Sweep'])[0]: 60000,
        label_encoder.transform(['Recon OS Scan'])[0]: 60000,
        label_encoder.transform(['Recon Vulnerability Scan'])[0]: 50000,
        label_encoder.transform(['Dictionary Brute Force'])[0]: 30000,
        label_encoder.transform(['DoS SYN Flood'])[0]: 25000,
        label_encoder.transform(['MITM ARP Spoofing'])[0]: 20000,
        label_encoder.transform(['DoS UDP Flood'])[0]: 5000,
        label_encoder.transform(['DoS DNS Flood'])[0]: 5000,
        label_encoder.transform(['DoS ICMP Flood'])[0]: 4000,
        label_encoder.transform(['Recon Host Discovery'])[0]: 2000
    })

    # Oversampling
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    print(f"Resampled Data Shape: {X_train_res.shape}, {y_train_res.shape}")

    # Scaling
    scaler = StandardScaler()
    X_train_res = scaler.fit_transform(X_train_res)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train_res, X_train, X_test, y_train_res, y_train, y_test, label_encoder


In [6]:
X_train_res, X_train, X_test, y_train_res, y_train, y_test,label_encoder=preprocess(X,y)

Resampled Data Shape: (330213, 48), (330213,)


In [7]:
def knn_model(X_train, y_train, X_test, y_test, label_encoder):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import RandomizedSearchCV

    # 모델 초기화 및 하이퍼파라미터 탐색
    knn = KNeighborsClassifier()
    param_dist = {
        'n_neighbors': range(5, 19),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    }
    random_search = RandomizedSearchCV(knn, param_distributions=param_dist, n_iter=10, cv=3, n_jobs=-1, verbose=1)
    random_search.fit(X_train, y_train)

    # 최적 파라미터 출력
    print("Best Parameters:", random_search.best_params_)

    # 예측
    y_pred = random_search.predict(X_test)

    # 라벨 복원
    y_test_str = label_encoder.inverse_transform(y_test)  # 실제 라벨 복원
    y_pred_str = label_encoder.inverse_transform(y_pred)  # 예측 라벨 복원

    # 결과 출력
    print("Classification Report:")
    print(classification_report(y_test_str, y_pred_str))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test_str, y_pred_str))


In [8]:
def rf_model(X_train, y_train, X_test, y_test, label_encoder):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.metrics import classification_report, confusion_matrix

    # RandomForest 초기화 및 하이퍼파라미터 탐색 설정
    rf = RandomForestClassifier(random_state=42)
    param_dist = {
        'n_estimators': range(100, 1000, 100),
        'max_depth': range(10, 100, 10),
        'max_features': ['auto', 'sqrt', 'log2'],
        'criterion': ['gini', 'entropy']
    }

    # RandomizedSearchCV로 하이퍼파라미터 최적화
    random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=10, cv=3, n_jobs=-1, verbose=1)
    random_search.fit(X_train, y_train)

    # 최적 파라미터 출력
    print("Best Parameters:", random_search.best_params_)

    # 테스트 데이터에 대한 예측
    y_pred = random_search.predict(X_test)

    # 라벨 복원 (정수 -> 문자열)
    y_test_str = label_encoder.inverse_transform(y_test)
    y_pred_str = label_encoder.inverse_transform(y_pred)

    # 평가 결과 출력
    print("Confusion Matrix:")
    print(confusion_matrix(y_test_str, y_pred_str))
    print("\nClassification Report:")
    print(classification_report(y_test_str, y_pred_str))


In [9]:
#lightgbm
from lightgbm import LGBMClassifier

def lgbm_model(X_train, y_train, X_test, y_test):
    lgbm = LGBMClassifier(random_state=42)

    param_dist = {
        'n_estimators': range(100, 1000, 100),
        'max_depth': range(10, 100, 10),
        'learning_rate': range(0.001, 0.01, 0.001),
        'num_leaves': range(10, 100, 10)
    }

    random_search = RandomizedSearchCV(lgbm, param_distributions=param_dist, n_iter=10, cv=3, n_jobs=-1, verbose=1)

    random_search.fit(X_train, y_train)
    print(random_search.best_params_)
    # evaluation
    y_pred = random_search.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    


In [15]:
#증강 모델
knn_model(X_train_res, y_train_res, X_test, y_test, label_encoder)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'weights': 'distance', 'n_neighbors': 16, 'metric': 'manhattan'}
Classification Report:
                          precision    recall  f1-score   support

          Benign Traffic       0.72      0.68      0.70     17312
  Dictionary Brute Force       0.46      0.38      0.42      3603
           DoS DNS Flood       0.03      0.13      0.04       326
          DoS ICMP Flood       0.04      0.07      0.05       294
           DoS SYN Flood       0.99      0.91      0.95      3096
           DoS UDP Flood       0.03      0.17      0.05       341
       MITM ARP Spoofing       0.69      0.51      0.59      2888
    Recon Host Discovery       0.00      0.03      0.01        70
           Recon OS Scan       0.48      0.27      0.34      8365
        Recon Ping Sweep       0.49      0.61      0.54      9513
Recon Vulnerability Scan       0.95      0.97      0.96      7963

                accuracy               

In [10]:
rf_model(X_train_res, y_train_res, X_test, y_test, label_encoder)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


: 

In [None]:
lgbm_model(X_train_res, y_train_res, X_test, y_test)

In [None]:
#공격 유형별로 분류하는 것은 어려움이 있을 것으로 판단, benign traffic과 아닌 것으로 나누기
df['Attack Name'] = df['Attack Name'].apply(lambda x: 1 if x == 'Benign Traffic' else 0)

In [None]:
df['Attack Name'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
X_01 = df.drop(columns=['Attack Name'])
y_01 = df['Attack Name']

In [None]:
#train test split
X_train_01, X_test_01, y_train_01, y_test_01 = train_test_split(X_01, y_01, test_size=0.2, random_state=42)

In [None]:
#scaling
scaler = StandardScaler()
X_train_01 = scaler.fit_transform(X_train_01)
X_test_01 = scaler.transform(X_test_01)


In [None]:
#knn
knn_model(X_train_01, y_train_01, X_test_01, y_test_01)

In [None]:
#rf
rf_model(X_train_01, y_train_01, X_test_01, y_test_01)

In [None]:
#lgbm
lgbm_model(X_train_01, y_train_01, X_test_01, y_test_01)