## 數據加載與探索

In [12]:
## 載入並探索資料集

import pandas as pd

# 讀取 CSV 檔案
data = pd.read_csv('text-book/kddcup.data.corrected.csv')

print("=== 數據集基本信息 ===")
print(f"數據形狀: {data.shape}")
print(f"特徵數量: {data.shape[1]}")
print(f"樣本數量: {data.shape[0]}")

print("\n=== 數據前5行 ===")
data.head()

=== 數據集基本信息 ===
數據形狀: (4898431, 42)
特徵數量: 42
樣本數量: 4898431

=== 數據前5行 ===


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


### KDD Cup 99 資料集欄位定義補充說明

KDD Cup 99 資料集每一筆「連線」記錄包含多個特徵，主要分為三大類：

#### 1. 基本連線特徵
- `duration`：連線持續時間（秒）
- `protocol_type`：協定類型（如 tcp, udp, icmp）
- `service`：目的地網路服務（如 http, telnet）
- `flag`：連線狀態（正常或錯誤）
- `src_bytes`：來源到目的地的資料位元組數
- `dst_bytes`：目的地到來源的資料位元組數
- `land`：是否來源與目的地相同（1: 是, 0: 否）
- `wrong_fragment`：錯誤片段數
- `urgent`：緊急封包數

#### 2. 內容特徵
- `hot`：熱指標數量（可疑行為）
- `num_failed_logins`：登入失敗次數
- `logged_in`：是否成功登入（1: 是, 0: 否）
- `num_compromised`：被攻陷條件數
- `root_shell`：是否取得 root shell（1: 是, 0: 否）
- `su_attempted`：是否嘗試 su root（1: 是, 0: 否）
- `num_root`：root 存取次數
- `num_file_creations`：檔案建立操作數
- `num_shells`：shell 提示符數
- `num_access_files`：存取控制檔案操作數
- `num_outbound_cmds`：FTP session 中的外部指令數
- `is_hot_login`：是否屬於熱登入（1: 是, 0: 否）
- `is_guest_login`：是否為 guest 登入（1: 是, 0: 否）

#### 3. 流量特徵（基於時間或主機）
- `count`：過去兩秒內與同一主機的連線數
- `serror_rate`：SYN 錯誤連線比例
- `rerror_rate`：REJ 錯誤連線比例
- `same_srv_rate`：同服務連線比例
- `diff_srv_rate`：不同服務連線比例
- `srv_count`：過去兩秒內與同一服務的連線數
- `srv_serror_rate`：同服務 SYN 錯誤比例
- `srv_rerror_rate`：同服務 REJ 錯誤比例
- `srv_diff_host_rate`：同服務不同主機連線比例

---

這些特徵有助於區分正常連線與各類型攻擊（如 DOS、R2L、U2R、Probe），並支援各種入侵偵測模型的訓練與分析。

In [13]:
# 檢查是否有缺失值
print("\n=== 缺失值檢查 ===")
missing_values = data.isnull().sum()
print(missing_values[missing_values > 0])


=== 缺失值檢查 ===
Series([], dtype: int64)


In [14]:
# 檢查是否有非數值型別的特徵
non_numeric_features = data.select_dtypes(exclude=['number']).columns
print("\n=== 非數值型別特徵 ===")
print(non_numeric_features)



=== 非數值型別特徵 ===
Index(['protocol_type', 'service', 'flag', 'label'], dtype='object')


In [15]:
# 其中 label 欄位表示連線是否異常，除了 normal 以外，包含以下類型：
# back,buffer_overflow,ftp_write,guess_passwd,imap,ipsweep,land,loadmodule,multihop,neptune,nmap,
# perl,phf,pod,portsweep,rootkit,satan,smurf,spy,teardrop,warezclient,warezmaster.
print("\n=== 正常與異常連線數量 ===")
print(data['label'].value_counts())


=== 正常與異常連線數量 ===
label
smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: count, dtype: int64


## 數據型態轉換

In [19]:
# 將 label 欄位轉換為二元標籤：normal 為 0，其他為 1
data['label'] = data['label'].apply(lambda x: 0 if x == 'normal' else 1)

# 重新取得非數值型欄位（排除已經被轉換的欄位）
non_numeric_features = [col for col in data.columns if data[col].dtype == 'object' and col != 'label']

# 執行 one-hot 編碼
data = pd.get_dummies(data, columns=non_numeric_features)

print("\n=== 轉換後的數據前5行 ===")
data.head()


=== 轉換後的數據前5行 ===


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,215,45076,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False
1,0,162,4528,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False
2,0,236,1228,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False
3,0,233,2032,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False
4,0,239,486,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False


In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

print("\n=== 標準化後的數據形狀 ===")
print(data_scaled.shape)



=== 標準化後的數據形狀 ===
(4898431, 123)


In [23]:
print("\n=== 轉換後的數據前5行 ===")
data_scaled[:5]


=== 轉換後的數據前5行 ===


array([[-6.68331854e-02, -1.72038228e-03,  6.81884351e-02,
        -2.39084686e-03, -1.51391734e-02, -1.10348462e-03,
        -2.65207600e-02, -4.39091558e-03,  2.44279187e+00,
        -2.09732783e-03, -8.25770840e-03, -4.54646139e-03,
        -3.28458917e-03, -9.57233922e-03, -8.50457842e-03,
        -2.87561127e-02,  0.00000000e+00, -6.38979005e-04,
        -2.89113034e-02, -1.57541507e+00, -1.19624324e+00,
        -4.66042614e-01, -4.65755574e-01, -2.48285775e-01,
        -2.48130352e-01,  5.39733093e-01, -2.56056520e-01,
        -2.01059296e-01, -3.63913926e+00, -1.78651044e+00,
        -1.83302273e+00, -2.82939000e-01, -1.25793664e+00,
        -1.56668488e-01, -4.66404784e-01, -4.65453641e-01,
        -2.50831829e-01, -2.49631966e-01,  0.00000000e+00,
        -1.17143182e+00,  1.27225957e+00, -2.03227620e-01,
        -1.03136755e-02, -5.24981950e-03, -1.48364049e-02,
        -6.38979005e-04, -2.62850181e-02, -1.46214776e-02,
        -1.44387514e-02, -1.46493871e-02, -1.47674150e-0