# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
import warnings
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings('ignore')

# Data Load

In [92]:
# Read the three competition CSVs from the working directory in one pass.
train, test, sample_submission = map(
    pd.read_csv,
    ('./train.csv', './test.csv', './sample_submission.csv'),
)

FileNotFoundError: [Errno 2] No such file or directory: './train.csv'

In [None]:
# Count rows per attack type to inspect how balanced the target classes are.
train['attack_type'].value_counts()

attack_type
Benign             8791
Hulk               1719
Port_Scanning       793
DDoS                471
FTP_Brute_Force      47
GoldenEye            41
Slow_HTTP            34
SSH_Brute_Force      30
Botnet               27
Slowloris            26
Web_Brute_Force      14
Web_XSS               6
Name: count, dtype: int64

# 파생변수 생성

In [None]:


# ✅ Load data (same three files as the Data Load cell)
train, test, sample_submission = map(
    pd.read_csv,
    ('./train.csv', './test.csv', './sample_submission.csv'),
)

# ✅ Split target and features
# The target is taken first, then the identifier and target columns are removed
# from the feature frame. The test IDs are kept aside before dropping them.
y = train['attack_type']
X = train.drop(columns=['ID', 'attack_type'])
test_id, test = test['ID'], test.drop(columns=['ID'])

# ✅ Label-encode the target
# `le` is kept so predictions can be mapped back to class names later.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# ✅ IP preprocessing (keep only the leading dot-separated blocks)
def extract_subnet(ip, level=2):
    """Return the first ``level`` dot-separated parts of ``ip``, re-joined with dots.

    Parameters
    ----------
    ip : str
        Dotted address string, e.g. ``'192.168.10.5'``. Strings without dots
        (such as a ``'Missing'`` placeholder) are returned unchanged.
    level : int, default 2
        Number of leading blocks to keep.

    Returns
    -------
    str
        The truncated address, or ``'unknown'`` when ``ip`` is not a string.
    """
    # Explicit type guard instead of a bare ``except``: non-string inputs still
    # map to 'unknown', but unrelated errors (and KeyboardInterrupt) are no
    # longer silently swallowed.
    if not isinstance(ip, str):
        return 'unknown'
    return '.'.join(ip.split('.')[:level])

# Reduce both IP columns to their first two blocks in train and test alike.
# Missing addresses become the literal 'Missing' before truncation.
for frame in (X, test):
    for col in ('ip_src', 'ip_dst'):
        filled = frame[col].fillna('Missing').astype(str)
        frame[col] = filled.map(lambda ip: extract_subnet(ip, level=2))

# ✅ Label-encode the categorical features
category_features = ['ip_src', 'ip_dst', 'protocol']
encoders = {}
def safe_transform(encoder, series):
    """Encode ``series`` with a fitted label encoder, mapping unseen values to -1.

    Parameters
    ----------
    encoder : fitted encoder exposing ``classes_``
        The integer code of a class is its position in ``encoder.classes_``,
        which is what ``LabelEncoder.transform`` returns.
    series : pandas.Series
        Values to encode.

    Returns
    -------
    pandas.Series
        ``int64`` codes with the same index as ``series``; values that are not
        in ``encoder.classes_`` get -1.
    """
    # Build the class -> code table once and do a vectorised lookup, instead of
    # calling encoder.transform([x]) separately for every row.
    code_by_class = {cls: code for code, cls in enumerate(encoder.classes_)}
    return series.map(code_by_class).fillna(-1).astype('int64')

# Fit one encoder per categorical column on the training data only, then reuse
# it for test via safe_transform. Each fitted encoder is stored in `encoders`.
for col in category_features:
    encoders[col] = enc = LabelEncoder().fit(X[col])
    X[col] = enc.transform(X[col])
    test[col] = safe_transform(enc, test[col])

# ✅ Impute missing numeric values with the training-set column means
numeric_cols = ['port_src','port_dst','duration','pkt_count_fwd','pkt_count_bwd',
                'rate_fwd_pkts','rate_bwd_pkts','rate_fwd_bytes','rate_bwd_bytes',
                'payload_fwd_mean','payload_bwd_mean','tcp_win_fwd_init','tcp_win_bwd_init',
                'tcp_syn_count','tcp_psh_count','tcp_rst_count','iat_avg_packets']
# Compute the means once, from the training features only, and reuse the same
# values for both frames. Previously the means were recomputed from the already
# imputed training frame before filling test.
train_means = X[numeric_cols].mean()
X[numeric_cols] = X[numeric_cols].fillna(train_means)
test[numeric_cols] = test[numeric_cols].fillna(train_means)

# ✅ Create derived features
def add_derived_features(df):
    """Add the engineered flow features to ``df`` in place and return it.

    The same formulas must be applied to train and test, so they live in one
    place instead of being copy-pasted per frame.

    Columns added (in this order): payload_diff, payload_sum, pkt_ratio,
    rate_byte_ratio, rate_pkt_ratio, tcp_win_diff.
    """
    # Forward vs. backward mean payload: difference and total.
    df['payload_diff'] = df['payload_fwd_mean'] - df['payload_bwd_mean']
    df['payload_sum'] = df['payload_fwd_mean'] + df['payload_bwd_mean']
    # Forward/backward ratios; the +1 in each denominator avoids division by zero.
    df['pkt_ratio'] = df['pkt_count_fwd'] / (df['pkt_count_bwd'] + 1)
    df['rate_byte_ratio'] = df['rate_fwd_bytes'] / (df['rate_bwd_bytes'] + 1)
    df['rate_pkt_ratio'] = df['rate_fwd_pkts'] / (df['rate_bwd_pkts'] + 1)
    # Difference between the initial TCP window sizes of the two directions.
    df['tcp_win_diff'] = df['tcp_win_fwd_init'] - df['tcp_win_bwd_init']
    return df


for frame in (X, test):
    add_derived_features(frame)



In [None]:
# Columns removed from the model input. The identical list is applied to train
# and test so both frames keep the same feature set.
drop_cols = [
    'payload_diff',
    'protocol',
    'rate_pkt_ratio',
    'tcp_psh_count',
    'rate_bwd_bytes',
    'rate_fwd_bytes',
    'payload_sum',
    'pkt_ratio',
    'payload_bwd_mean',
    'payload_fwd_mean',
    'rate_fwd_pkts',
    'rate_bwd_pkts',
]

X, test = (frame.drop(columns=drop_cols) for frame in (X, test))

# LGBM (fixed, Optuna-tuned hyperparameters) + Stratified 5-Fold

In [None]:


# ✅ Fixed hyperparameters, passed unchanged to the LGBMClassifier of every fold
# NOTE(review): the section header mentions Optuna; presumably these values come
# from an Optuna search that is not part of this notebook — confirm.
best_params = {
    'n_estimators': 1685,  # upper bound on boosting rounds; the fold loop also applies early stopping
    'learning_rate': 0.06555405498060013,
    'num_leaves': 65,
    'max_depth': 10,
    'min_child_samples': 24,
    # NOTE(review): LightGBM only applies row subsampling when subsample_freq
    # (bagging_freq) is > 0, and it is not set here — confirm whether
    # `subsample` is meant to have an effect.
    'subsample': 0.8484011539017597,
    'colsample_bytree': 0.7337216490210687,
    'reg_alpha': 0.43019239671249737,
    'reg_lambda': 0.9937168583657578,
    'objective': 'multiclass',
    'num_class': len(np.unique(y_encoded)),  # number of distinct encoded target classes
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': -1
}

# ✅ Per-class "balanced" weights, available both as an array aligned with the
# sorted encoded classes and as a {class_id: weight} lookup.
encoded_classes = np.unique(y_encoded)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=encoded_classes,
    y=y_encoded,
)
class_weight_dict = dict(zip(encoded_classes, class_weights))

# ✅ Stratified K-fold setup and probability accumulators
# oof_preds holds one row per training sample, test_preds one row per test
# sample; both have one column per class.
n_classes = len(encoded_classes)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros((len(test), n_classes))
oof_preds = np.zeros((len(X), n_classes))

# ✅ Train one model per fold; collect out-of-fold and averaged test probabilities
# Every encoded class id, passed to classification_report so the per-fold report
# stays aligned with le.classes_ even when a rare class is absent from both the
# validation labels and the predictions (otherwise sklearn raises on the
# target_names length mismatch).
all_labels = np.arange(len(le.classes_))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded)):
    print(f"\n🚀 Fold {fold + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    # Per-row weight = balanced weight of that row's class.
    sample_weights = np.array([class_weight_dict[label] for label in y_train])

    model = LGBMClassifier(**best_params)
    # NOTE(review): the validation fold drives early stopping and is also the
    # fold that gets scored below, so fold/OOF metrics may be slightly optimistic.
    model.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
        callbacks=[
            early_stopping(stopping_rounds=50),
            log_evaluation(period=0)
        ]
    )

    # Store this fold's validation probabilities in the out-of-fold matrix.
    val_pred = model.predict_proba(X_val)
    oof_preds[val_idx] = val_pred

    val_labels = np.argmax(val_pred, axis=1)
    print(classification_report(y_val, val_labels, labels=all_labels, target_names=le.classes_))

    # Average the test probabilities over all folds.
    test_pred = model.predict_proba(test)
    test_preds += test_pred / skf.n_splits

# ✅ Out-of-fold evaluation
# Each training row is scored by the one model that did not train on it.
oof_labels = oof_preds.argmax(axis=1)
print("\n✅ OOF 성능 평가:")
print(classification_report(y_encoded, oof_labels, target_names=le.classes_))

# The dict form of the same report gives the macro-averaged F1 as a number.
oof_report = classification_report(y_encoded, oof_labels, output_dict=True)
macro_f1 = oof_report['macro avg']['f1-score']
print("✅ OOF Macro F1 Score:", macro_f1)

# ✅ Write the submission file
# Take the most probable class per test row from the fold-averaged
# probabilities and map the class ids back to the original label strings.
best_class_idx = test_preds.argmax(axis=1)
final_pred_labels = le.inverse_transform(best_class_idx)
# NOTE(review): labels are assigned positionally — this assumes the rows of
# sample_submission are in the same order as test.csv; confirm.
sample_submission['attack_type'] = final_pred_labels
sample_submission.to_csv('lgbm_kfold_submission_stop.csv', index=False)
print("📁 Saved: lgbm_kfold_submission_stop.csv")


🚀 Fold 1
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1685]	valid_0's multi_logloss: 0.0276587
                 precision    recall  f1-score   support

         Benign       1.00      1.00      1.00      1759
         Botnet       1.00      1.00      1.00         5
           DDoS       1.00      1.00      1.00        95
FTP_Brute_Force       0.89      0.89      0.89         9
      GoldenEye       0.86      0.75      0.80         8
           Hulk       0.99      1.00      1.00       343
  Port_Scanning       1.00      0.99      1.00       159
SSH_Brute_Force       1.00      1.00      1.00         6
      Slow_HTTP       1.00      0.43      0.60         7
      Slowloris       0.50      0.80      0.62         5
Web_Brute_Force       0.50      0.67      0.57         3
        Web_XSS       0.00      0.00      0.00         1

       accuracy                           0.99      2400
      macro avg       0.81      0.79   