In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE

# Input CSV files (absolute paths on the author's machine)
file_paths = [
    r"C:\Users\gudru\.vscode\sw\MachineLearningCVE\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    r"C:\Users\gudru\.vscode\sw\MachineLearningCVE\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    r"C:\Users\gudru\.vscode\sw\MachineLearningCVE\Friday-WorkingHours-Morning.pcap_ISCX.csv",
    r"C:\Users\gudru\.vscode\sw\MachineLearningCVE\Monday-WorkingHours.pcap_ISCX.csv",
    r"C:\Users\gudru\.vscode\sw\MachineLearningCVE\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    r"C:\Users\gudru\.vscode\sw\MachineLearningCVE\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    r"C:\Users\gudru\.vscode\sw\MachineLearningCVE\Tuesday-WorkingHours.pcap_ISCX.csv",
    r"C:\Users\gudru\.vscode\sw\MachineLearningCVE\Wednesday-workingHours.pcap_ISCX.csv"
]

# Upper bound on the rows taken from each file (lower it if memory is tight).
# NOTE: read_csv(nrows=...) takes the FIRST `nrows` data rows of a file; it is
# not a random sample.
nrows = 50000


def _read_head(csv_path):
    """Load at most ``nrows`` leading data rows of a single CSV file."""
    return pd.read_csv(csv_path, nrows=nrows)


# One frame per file, then stacked into a single frame with a fresh 0..n-1 index
dfs = list(map(_read_head, file_paths))
df = pd.concat(dfs, ignore_index=True)

# Preprocessing
# Normalise the header names FIRST, then drop identifier-like columns by their
# normalised names. Dropping before normalising (with errors='ignore') would
# silently keep any identifier column whose raw header has padding or different
# casing (e.g. ' Source IP'), leaving it in the feature set.
df.columns = df.columns.str.strip().str.lower()
df = df.drop(
    columns=['flow id', 'source ip', 'destination ip', 'timestamp'],
    errors='ignore',
)

# Label conversion (BENIGN -> 0, anything else -> 1) and distribution check.
# The "not numeric" test also covers pandas' dedicated string dtype, which a
# plain `dtype == object` comparison would miss.
if not pd.api.types.is_numeric_dtype(df['label']):
    # Discard unlabeled rows before binarising: str(NaN) is 'nan', which is not
    # 'BENIGN', so a missing label would otherwise be counted as an attack and
    # would no longer be removable by the dropna() further down.
    df = df.dropna(subset=['label'])
    # Strip surrounding whitespace so values such as 'BENIGN ' still map to 0.
    _label_text = df['label'].astype(str).str.strip().str.upper()
    df['label'] = (_label_text != 'BENIGN').astype('int64')
print("\n[라벨 값 분포]")
print(df['label'].value_counts())

# Print the column names (to check whether ID/unique-value columns remain)
print("\n[컬럼명(고유값 남았는지 확인)]")
print(df.columns)

# Missing-value / outlier handling: treat +/-inf as missing, then drop every
# row that contains any missing value
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Integer-encode any remaining string-typed feature columns.
# NOTE(review): the encoder is fitted on the full data set, before the
# train/test split — confirm this is acceptable for the evaluation.
for col in df.select_dtypes(include='object').columns:
    if col != 'label':
        df[col] = LabelEncoder().fit_transform(df[col])

# Separate the target column from the feature matrix
y = df['label']
X = df.drop(columns=['label'])

# Diagnostic: the 10 features with the largest absolute correlation to the label
print("\n[feature-label 상관계수 상위 10]")
corrs = X.corrwith(y).abs().sort_values(ascending=False)
print(corrs.iloc[:10])

# After the descending sort the first entry is the strongest correlation; a
# value this close to 1 means a single feature almost determines the label.
_top_corr = corrs.iloc[0]
if _top_corr > 0.99:
    print("\n⚠️ 경고: label과 1:1로 상관관계 높은 피처가 있음! (데이터 누수/feature-leakage 가능성 높음)")

# 70/30 train/test split, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Oversample with SMOTE — fitted on the training split only, so the test split
# keeps its original class distribution
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Train the model on the resampled training data (fit() returns the estimator)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_res, y_res)

# Predict on the untouched test split and report accuracy / per-class metrics
y_pred = model.predict(X_test)
_accuracy = accuracy_score(y_test, y_pred)
print("\n정확도:", _accuracy)
_report = classification_report(y_test, y_pred)
print("분류 리포트:\n", _report)

# ROC-AUC, computed from the predicted probability of class 1 (second column)
y_pred_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC: {roc_auc:.4f}")



[라벨 값 분포]
label
0    138806
1     61194
Name: count, dtype: int64

[컬럼명(고유값 남았는지 확인)]
Index(['destination port', 'flow duration', 'total fwd packets',
       'total backward packets', 'total length of fwd packets',
       'total length of bwd packets', 'fwd packet length max',
       'fwd packet length min', 'fwd packet length mean',
       'fwd packet length std', 'bwd packet length max',
       'bwd packet length min', 'bwd packet length mean',
       'bwd packet length std', 'flow bytes/s', 'flow packets/s',
       'flow iat mean', 'flow iat std', 'flow iat max', 'flow iat min',
       'fwd iat total', 'fwd iat mean', 'fwd iat std', 'fwd iat max',
       'fwd iat min', 'bwd iat total', 'bwd iat mean', 'bwd iat std',
       'bwd iat max', 'bwd iat min', 'fwd psh flags', 'bwd psh flags',
       'fwd urg flags', 'bwd urg flags', 'fwd header length',
       'bwd header length', 'fwd packets/s', 'bwd packets/s',
       'min packet length', 'max packet length', 'packet length mean',
    

  c /= stddev[:, None]
  c /= stddev[None, :]


bwd packet length mean    0.676727
avg bwd segment size      0.676727
bwd packet length std     0.665013
bwd packet length max     0.656352
packet length std         0.628001
average packet size       0.622754
packet length mean        0.618801
packet length variance    0.594499
max packet length         0.592689
min packet length         0.311112
dtype: float64

정확도: 0.9998999449697333
분류 리포트:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     41609
           1       1.00      1.00      1.00     18358

    accuracy                           1.00     59967
   macro avg       1.00      1.00      1.00     59967
weighted avg       1.00      1.00      1.00     59967

