<a href="https://colab.research.google.com/github/lee191/LSTM/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.cluster import KMeans
import eli5
from eli5.sklearn import PermutationImportance

# Load the KDD dataset. Adjust PATH to wherever the CSV actually lives.
PATH = "/Users/iseongjin/Desktop/Python/중간고사/IsolationForest 모델/NSL-KDD/attack_name_data/dos_data.csv"
kdd_data = pd.read_csv(PATH, header=None)

# The file is read without a header row, so the column names are set by hand.
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'label'
]

kdd_data.columns = column_names

# Every column except the trailing class label is a feature (41 columns).
# Copy so the encoding below never writes into a view of kdd_data.
features = kdd_data.iloc[:, :-1].copy()

# The estimators below require numeric input. Map each non-numeric column
# (e.g. protocol_type / service / flag when stored as text) to integer
# category codes, which keeps the feature count and names unchanged.
for column in features.select_dtypes(exclude='number').columns:
    features[column] = features[column].astype('category').cat.codes

# Rows labelled as DoS attacks.
dos_data = kdd_data[kdd_data['label'] == 'dos']

# Supervised target with exactly one entry per row of `features`
# (1 = DoS, 0 = anything else). Fitting against dos_data['label'] breaks as
# soon as the file contains a non-DoS row, because X and y differ in length.
is_dos = (kdd_data['label'] == 'dos').astype(int)

# Feature importance from a random forest (seeded for reproducibility).
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(features, is_dos)
feature_importance_rf = pd.Series(rf_classifier.feature_importances_, index=features.columns)

# Feature ranking from Recursive Feature Elimination (RFE).
# Eliminating down to a single feature yields a full ranking; rank 1 is best.
rfe_selector = RFE(estimator=rf_classifier, n_features_to_select=1, step=1)
rfe_selector.fit(features, is_dos)
feature_importance_rfe = pd.Series(rfe_selector.ranking_, index=features.columns)

# Feature importance derived from K-means clustering.
# K-means has no built-in importance and labels_ holds one entry per sample,
# not per feature. As a heuristic, score each feature by how far apart the
# cluster centroids lie along it. Features are standardized first so large
# scales do not dominate; constant columns give 0/0 = NaN and are set to 0.
standardized = ((features - features.mean()) / features.std(ddof=0)).fillna(0.0)
kmeans_model = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans_model.fit(standardized)
centroids = kmeans_model.cluster_centers_
feature_importance_kmeans = pd.Series(
    centroids.max(axis=0) - centroids.min(axis=0),
    index=features.columns,
)

# Permutation importance via eli5, computed on the already-fitted forest.
# Stored as a Series so it can be printed as text; show_weights() returns an
# HTML object that only renders when displayed in a notebook cell.
perm = PermutationImportance(rf_classifier, random_state=42).fit(features, is_dos)
feature_importance_eli5 = pd.Series(perm.feature_importances_, index=features.columns)

# Print the top 20 features according to each method.
print("Random Forest Feature Importance:")
print(feature_importance_rf.nlargest(20))

print("\nRFE Feature Importance:")
# Lower rank means more important, so take the 20 smallest ranks.
print(feature_importance_rfe.nsmallest(20))

print("\nK-means Feature Importance:")
print(feature_importance_kmeans.nlargest(20))

print("\nEli5 Feature Importance:")
print(feature_importance_eli5.nlargest(20))


FileNotFoundError: ignored