In [18]:
import glob
import os
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm import tqdm
from imblearn.over_sampling import KMeansSMOTE
from imblearn.under_sampling import RandomUnderSampler

In [22]:
wpa3_dataset_class_list = ['BF', 'Deauth', 'SAE_CV', 'UGD', 'SAE_AF', 'TSC', 'DG', 'KRACK_DG']
WIDS_class_list = ['benign', 'BF', 'Deauth', 'SAE_CV', 'UGD', 'SAE_AF', 'TSC', 'DG', 'KRACK_DG']
use_feat = {
    "frame.len": "Int64",
    "radiotap.length": "Int64",
    "radiotap.dbm_antsignal": "Int64",
    "wlan.duration": "Int64",
    "radiotap.present.tsft": "Int64",
    "radiotap.channel.freq": "Int64",
    "radiotap.channel.flags.cck": "Int64",
    "radiotap.channel.flags.ofdm": "Int64",
    "wlan.fc.type": "Int64",
    "wlan.fc.subtype": "Int64",
    "wlan.fc.ds": str,
    "wlan.fc.frag": "Int64",
    "wlan.fc.retry": "Int64",
    "wlan.fc.pwrmgt": "Int64",
    "wlan.fc.moredata": "Int64",
    "wlan.fc.protected": "Int64",
    "wlan.fixed.status_code": "Int64",
    "wlan.fixed.auth_seq": str,
    "Label": str,
}

root_path = f"/home/sun10/WIDS"

# Preprocess

## 產生 npy file

In [4]:
class GENERATE_FEAT_NPY():
    def __init__(
        self, path: list[str], att: str, use_feat: dict
    ):
        self.path = path
        self.att = att
        self.CPU_CORE = int(30)
        self.use_feat = use_feat
        self.directory = f"{root_path}/data/{self.att}"

    def _clean_row_data(self, row):
        keys = self.use_feat.keys()
        if "wlan.fixed.auth_seq" in keys:
            row["wlan.fixed.auth_seq"] = int(
                str(row["wlan.fixed.auth_seq"]), 16)
        if "wlan.fc.ds" in keys:
            row["wlan.fc.ds"] = int(str(row["wlan.fc.ds"]), 16)

        if "radiotap.rxflags" in keys:
            row["radiotap.rxflags"] = int(str(row["radiotap.rxflags"]), 16)
        if "wlan.tag.length" in keys:
            if str(row["wlan.tag.length"]).find(" ") != -1:
                row["wlan.tag.length"] = float(
                    str(row["wlan.tag.length"]).split(' ')[0])
            else:
                row["wlan.tag.length"] = float(row["wlan.tag.length"])
        return row

    def _deal_file(self, path: str):
        selected = self.use_feat
        selected_columns = list(selected.keys())
        # 從CSV文件中讀取數據
        df = pd.read_csv(path, usecols=selected_columns,
                         dtype=selected)  # , dtype=selected

        df.fillna(0, inplace=True)
        df = df.apply(self._clean_row_data, axis=1)

        df.fillna(0, inplace=True)
        b_pkts = df[df["Label"] == "Normal"].copy()
        m_pkts = df[df["Label"] != "Normal"].copy()

        b_pkts.drop(["Label"], axis=1, inplace=True)
        m_pkts.drop(["Label"], axis=1, inplace=True)
        b_data = b_pkts.values.tolist()
        m_data = m_pkts.values.tolist()

        return b_data, m_data

    def _save_np_files(self, b_flows: list, m_flows: list):
        # list to numpy array
        m_flows = np.asarray(m_flows)
        b_flows = np.asarray(b_flows)

        # make directories
        directory = self.directory

        if not os.path.exists(directory):
            os.makedirs(directory)

        # delete old file
        if os.path.isfile(f"{directory}/malicious_t.npy"):
            os.remove(f"{directory}/malicious_t.npy")
        if os.path.isfile(f"{directory}/benign_t.npy"):
            os.remove(f"{directory}/benign_t.npy")

        print(f'{"*" * 11} Number of {self.att} class {"*" * 11}')
        # save new file
        np.save(f"{directory}/malicious_t", m_flows)
        print(f"* malicious has {len(m_flows)} flows")

        np.save(f"{directory}/benign_t", b_flows)
        print(f"* benign    has {len(b_flows)} flows")

    def run(self):
        b_flows = list([])
        m_flows = list([])

        chunk_size = self.CPU_CORE
        num_chunks = len(self.path) // chunk_size
        remainder = len(self.path) % chunk_size
        # 對每個完整的塊應用 deal_file 函數
        for i in range(num_chunks):
            chunk = self.path[i * chunk_size: (i + 1) * chunk_size]
            pool = Pool(chunk_size)  # 設定處理程序數量
            results = pool.map_async(self._deal_file, chunk)
            pool_result = results.get()
            for r in pool_result:
                tmp_b_flows, tmp_m_flows = r
                b_flows.extend(tmp_b_flows)
                m_flows.extend(tmp_m_flows)
                del tmp_b_flows, tmp_m_flows

        # 如果有剩餘的資料，也將其作為一個單獨的塊處理
        if remainder > 0:
            last_chunk = self.path[num_chunks * chunk_size:]
            pool = Pool(len(last_chunk))  # 設定處理程序數量
            results = pool.map_async(self._deal_file, last_chunk)
            pool_result = results.get()
            for r in pool_result:
                tmp_b_flows, tmp_m_flows = r
                b_flows.extend(tmp_b_flows)
                m_flows.extend(tmp_m_flows)
                del tmp_b_flows, tmp_m_flows

        self._save_np_files(b_flows, m_flows)

In [9]:
flag = 1
for att in wpa3_dataset_class_list:
    csv_path = f"{root_path}/The_WPA3_Dataset_csv/{flag}. {att}/{flag}. {att}_*.csv"
    paths = glob.glob(csv_path)
    npy = GENERATE_FEAT_NPY(paths, att, use_feat)
    npy.run()
    del npy
    flag += 1

*********** Number of BF class ***********
* malicious has 54083 flows
* benign    has 145240 flows
*********** Number of Deauth class ***********
* malicious has 5507 flows
* benign    has 116799 flows
*********** Number of SAE_CV class ***********
* malicious has 1210 flows
* benign    has 116722 flows
*********** Number of UGD class ***********
* malicious has 2344 flows
* benign    has 157855 flows
*********** Number of SAE_AF class ***********
* malicious has 90904 flows
* benign    has 155751 flows
*********** Number of TSC class ***********
* malicious has 819 flows
* benign    has 149259 flows
*********** Number of DG class ***********
* malicious has 11296 flows
* benign    has 127643 flows
*********** Number of KRACK_DG class ***********
* malicious has 7330 flows
* benign    has 160763 flows


## 切割 training and testing set

In [26]:
class SPLIT_SET():
    def __init__(self, data_path: str, classes: list):
        self.data_path = data_path
        self.classes = classes
        self.save_to = f"{root_path}/data"

    def save_np(self, path, data):
        if os.path.isfile(path):
            os.remove(path)
        np.save(path, data)
    
    def concat(self, old_data, new_data):
        if old_data is None and new_data.shape[0] != 0:
            concat = new_data
        else:
            try:
                concat = np.concatenate((old_data, new_data))
            except ValueError:
                concat = old_data
        return concat

    def trainTestSplit(self, data, proportion=0.2):
        """使用攻擊種類的惡意流量切割出train和test set"""
        train_data, test_data = train_test_split(
            data, test_size=proportion, random_state=42
        )  # 分訓練/驗證
        return train_data, test_data

    def get_label(self, id, size):
        return np.ones(size).astype(int) * (id)

    def get_train_benign(self, data_path, ratio):
        """取得所有benign的npy檔案, 並切割出benign的train set"""

        files = glob.glob(f"{data_path}/*/benign_t.npy")  # 取得所有benign的npy檔案
        random_benign = None
        for file in tqdm(files):
            data = np.load(file)
            total_size = data.shape[0]
            sample_size = int(total_size * ratio)
            # 使用numpy.random.choice来随机抽取数据
            random_samples = np.random.choice(
                total_size, size=sample_size, replace=False
            )
            random_data = data[random_samples]
            # 將抽取數據後剩餘的數據儲存至untrain_benign.npy中
            other_samples = np.setdiff1d(np.arange(total_size), random_samples)
            other_path = file[: file.rfind("/")]
            self.save_np(f"{other_path}/untrain_benign.npy",
                         data[other_samples])
            random_benign = self.concat(random_benign, random_data)

        return random_benign

    def get_test_benign(self, data_path, ratio):
        """使用抽取train benign數據後的剩餘數據, 並切割出benign的test set"""

        files = glob.glob(
            f"{data_path}/*/untrain_benign.npy"
        )  # 取得所有剩餘的benign的npy檔案
        random_benign = None
        for file in tqdm(files):
            data = np.load(file)
            total_size = data.shape[0]
            sample_size = int(total_size * ratio)
            # 使用numpy.random.choice来随机抽取数据
            random_samples = np.random.choice(
                total_size, size=sample_size, replace=False
            )
            random_data = data[random_samples]
            # 將抽取數據後剩餘的數據儲存至untest_benign.npy中
            other_samples = np.setdiff1d(np.arange(total_size), random_samples)
            other_path = file[: file.rfind("/")]
            self.save_np(f"{other_path}/untest_benign.npy",
                         data[other_samples])

            random_benign = self.concat(random_benign, random_data)

        return random_benign 
    
    def run(self):
        # 建立儲存資料夾路徑
        # 建立儲存資料夾路徑
        if not os.path.exists(f"{self.save_to}/train/"):
            os.makedirs(f"{self.save_to}/train/")
        if not os.path.exists(f"{self.save_to}/test/"):
            os.makedirs(f"{self.save_to}/test/")

        total_count = list([])
        train_count = list([])
        test_count = list([])


        for index, c in enumerate(self.classes):
            if c == "benign":
                train_data = self.get_train_benign(self.data_path, 0.7)

                self.save_np(f"{self.save_to}/train/{c}.npy", train_data)
                train_size = train_data.shape[0]
                del train_data
                self.save_np(
                    f"{self.save_to}/train/{c}_label.npy",
                    self.get_label(index, train_size),
                )
                total_count.append(train_size)
                train_count.append(train_size)

                test_data = self.get_test_benign(self.data_path, 0.7)
                test_size = test_data.shape[0]
                self.save_np(f"{self.save_to}/test/{c}.npy", test_data)
                del test_data
                self.save_np(
                    f"{self.save_to}/test/{c}_label.npy",
                    self.get_label(index, test_size),
                )
                test_count.append(test_size)
                continue

            else:
                npy_path = f"{self.data_path}/{c}/malicious_t.npy"
                npy_data = np.load(npy_path)
                try:
                    c_num = npy_data.shape[0]
                except AttributeError:
                    print(f" {c} No Data !")
                    c_num = 0

                train_data, test_data = self.trainTestSplit(
                    npy_data, proportion=0.2)
                del npy_data  # 釋放記憶體

                train_size = train_data.shape[0]
                test_size = test_data.shape[0]

                self.save_np(f"{self.save_to}/train/{c}.npy", train_data)
                self.save_np(f"{self.save_to}/test/{c}.npy", test_data)
                del train_data, test_data  # 釋放記憶體

                # creat label .npy
                self.save_np(
                    f"{self.save_to}/train/{c}_label.npy",
                    self.get_label(index, train_size),
                )
                self.save_np(
                    f"{self.save_to}/test/{c}_label.npy",
                    self.get_label(index, test_size),
                )

                total_count.append(c_num)
                train_count.append(train_size)
                test_count.append(test_size)

        print("*" * 15 + " total size " + "*" * 15)
        for index, c in enumerate(self.classes):
            print(f"{c} total: {total_count[index]}")
        print("*" * 15 + " train size " + "*" * 15)
        for index, c in enumerate(self.classes):
            print(f"{c} total: {train_count[index]}")
        print("*" * 15 + " test size " + "*" * 15)
        for index, c in enumerate(self.classes):
            print(f"{c} total: {test_count[index]}")
        

In [29]:
split = SPLIT_SET(f"{root_path}/data", WIDS_class_list)
split.run()

100%|██████████| 8/8 [00:00<00:00, 13.39it/s]
100%|██████████| 8/8 [00:00<00:00, 57.47it/s]


*************** total size ***************
benign total: 791020
BF total: 54083
Deauth total: 5507
SAE_CV total: 1210
UGD total: 2344
SAE_AF total: 90904
TSC total: 819
DG total: 11296
KRACK_DG total: 7330
*************** train size ***************
benign total: 791020
BF total: 43266
Deauth total: 4405
SAE_CV total: 968
UGD total: 1875
SAE_AF total: 72723
TSC total: 655
DG total: 9036
KRACK_DG total: 5864
*************** test size ***************
benign total: 237305
BF total: 10817
Deauth total: 1102
SAE_CV total: 242
UGD total: 469
SAE_AF total: 18181
TSC total: 164
DG total: 2260
KRACK_DG total: 1466


## Sampling

In [30]:
class Sampling:
    SEED = 42

    def __init__(self, classes, data_path=None, test=False) -> None:
        self.classes = classes
        self.path = data_path
        self.save_to = f"{root_path}/sampling"
        self.test = test

    def load_data(self, path=None):
        if path == None:
            path = self.path
        # load data
        x = []
        y = []
        for c in self.classes:
            print(f"Process Class: {c}")
            if not os.path.exists(f"{path}"):
                os.makedirs(f"{path}")
            data_path = f"{path}/{c}.npy"
            label_path = f"{path}/{c}_label.npy"
            x.extend(np.load(data_path))
            y.extend(np.load(label_path))
        # combine all the flow
        x = np.asarray(x)
        y = np.asarray(y)
        return x, y

    def get_oversampling(self, x, y, over_strategy, k, cluster_balance_threshold):
        # output number of each class
        print("*" * 25 + " Before Sampling " + "*" * 25)
        unique, counts = np.unique(y, return_counts=True)
        print(f"y: {dict(zip(unique, counts))}")
        # k-means smote oversampling
        # over_strategy = {8: O}
        oversample = KMeansSMOTE(
            sampling_strategy=over_strategy,
            random_state=self.SEED,
            k_neighbors=k,
            cluster_balance_threshold=cluster_balance_threshold,
        )  # 0.0016
        x, y = oversample.fit_resample(x, y)

        # output number of each class
        print("*" * 25 + " After OverSampling " + "*" * 25)
        unique, counts = np.unique(y, return_counts=True)
        print(f"y: {dict(zip(unique, counts))}")
        return x, y

    def get_undersampling(self, x, y, under_strategy):
        # output number of each class
        print("*" * 25 + " Before Sampling " + "*" * 25)
        unique, counts = np.unique(y, return_counts=True)
        print(f"y: {dict(zip(unique, counts))}")
        # random undersampling
        # under_strategy = {0: U, 1: U, 2: U, 3: U, 4: U, 5: U, 7: U, 8: U}
        # under_strategy = {0: 360, 1: 360}
        undersample = RandomUnderSampler(
            sampling_strategy=under_strategy, random_state=self.SEED
        )
        x, y = undersample.fit_resample(x, y)

        # output number of each class
        print("*" * 25 + " After UnderSampling " + "*" * 25)
        unique, counts = np.unique(y, return_counts=True)
        print(f"y: {dict(zip(unique, counts))}")
        return x, y

    def save_np(self, data, label):
        if not self.test:
            if not os.path.exists(f"{self.save_to}"):
                os.makedirs(f"{self.save_to}")
            # delete old file
            if os.path.isfile(f"{self.save_to}/train_data.npy"):
                os.remove(f"{self.save_to}/train_data.npy")
            if os.path.isfile(f"{self.save_to}/train_label.npy"):
                os.remove(f"{self.save_to}/train_label.npy")

            if not os.path.isfile(f"{self.save_to}/train_data.npy"):
                np.save(f"{self.save_to}/train_data.npy", data)
            else:
                print(f"no delete old file")
            if not os.path.isfile(f"{self.save_to}/train_label.npy"):
                np.save(f"{self.save_to}/train_label.npy", label)
            else:
                print(f"no delete old file")
            print(f"save to: {self.save_to}")
        elif self.test:
            if not os.path.exists(f"{self.save_to}"):
                os.makedirs(f"{self.save_to}")
            # delete old file
            if os.path.isfile(f"{self.save_to}/test_data.npy"):
                os.remove(f"{self.save_to}/test_data.npy")
            if os.path.isfile(f"{self.save_to}/test_label.npy"):
                os.remove(f"{self.save_to}/test_label.npy")

            if not os.path.isfile(f"{self.save_to}/test_data.npy"):
                np.save(f"{self.save_to}/test_data.npy", data)
            else:
                print(f"no delete old file")
            if not os.path.isfile(f"{self.save_to}/test_label.npy"):
                np.save(f"{self.save_to}/test_label.npy", label)
            else:
                print(f"no delete old file")
            print(f"save to: {self.save_to}")

In [32]:
training_set_path = f"{root_path}/data/train"
testing_set_path = f"{root_path}/data/test"

# training set sampling
sampling = Sampling(WIDS_class_list, data_path=training_set_path)

OVERSAMPLING = dict({3: 4000, 4: 4000, 6: 4000})
UNDERSAMPLING = dict({0: 10000, 1: 10000, 5: 10000})
# load data
data, label = sampling.load_data()
if OVERSAMPLING != dict({}):
    # oversampling
    data, label = sampling.get_oversampling(
        data, label, OVERSAMPLING, k=5, cluster_balance_threshold=0.0005
    )
if UNDERSAMPLING != dict({}):
    # undersampling
    data, label = sampling.get_undersampling(
        data, label, under_strategy=UNDERSAMPLING
    )
# save
sampling.save_np(data, label)
del data, label, sampling

Process Class: benign
Process Class: BF
Process Class: Deauth
Process Class: SAE_CV
Process Class: UGD
Process Class: SAE_AF
Process Class: TSC
Process Class: DG
Process Class: KRACK_DG
************************* Before Sampling *************************
y: {0: 791020, 1: 43266, 2: 4405, 3: 968, 4: 1875, 5: 72723, 6: 655, 7: 9036, 8: 5864}




************************* After OverSampling *************************
y: {0: 791020, 1: 43266, 2: 4405, 3: 4000, 4: 4000, 5: 72723, 6: 4000, 7: 9036, 8: 5864}
************************* Before Sampling *************************
y: {0: 791020, 1: 43266, 2: 4405, 3: 4000, 4: 4000, 5: 72723, 6: 4000, 7: 9036, 8: 5864}
************************* After UnderSampling *************************
y: {0: 10000, 1: 10000, 2: 4405, 3: 4000, 4: 4000, 5: 10000, 6: 4000, 7: 9036, 8: 5864}
save to: /home/sun10/IDS/upload/sampling


In [33]:
# testing set sampling
sampling = Sampling(WIDS_class_list, data_path=testing_set_path, test=True)

OVERSAMPLING = dict({})
UNDERSAMPLING = dict({0: 20000})
# load data
data, label = sampling.load_data()
if OVERSAMPLING != dict({}):
    # oversampling
    data, label = sampling.get_oversampling(
        data, label, OVERSAMPLING, k=5, cluster_balance_threshold=0.0005
    )
if UNDERSAMPLING != dict({}):
    # undersampling
    data, label = sampling.get_undersampling(
        data, label, under_strategy=UNDERSAMPLING
    )
# save
sampling.save_np(data, label)
del data, label, sampling

Process Class: benign
Process Class: BF
Process Class: Deauth
Process Class: SAE_CV
Process Class: UGD
Process Class: SAE_AF
Process Class: TSC
Process Class: DG
Process Class: KRACK_DG
************************* Before Sampling *************************
y: {0: 237305, 1: 10817, 2: 1102, 3: 242, 4: 469, 5: 18181, 6: 164, 7: 2260, 8: 1466}
************************* After UnderSampling *************************
y: {0: 20000, 1: 10817, 2: 1102, 3: 242, 4: 469, 5: 18181, 6: 164, 7: 2260, 8: 1466}
save to: /home/sun10/IDS/upload/sampling
