In [41]:
import random
from collections import Counter

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import sklearn.metrics as mt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale, OneHotEncoder, LabelEncoder

# Seed NumPy's global RNG and Python's `random` module.
# NOTE(review): no TensorFlow/Keras seed is set in the visible cells, so the
# neural-network results below are presumably not reproducible — confirm.
np.random.seed(0)
random.seed(0)

## 導入訓練資料

In [42]:
def read_data(path):
    """Load the labelled CSV, split it 70/30 and min-max scale the features.

    The ``Cat`` column is the target; every other column is treated as a
    numeric feature. Infinite values are replaced by the largest float32
    value before scaling.

    The per-column minimum and range are computed on the training split only
    and then applied to both splits, so no statistic of the test rows leaks
    into preprocessing. Scaled test values are clipped to [0, 1]; without the
    clip, an infinity placeholder that appears only in the test rows would
    produce an enormous scaled value.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled features, keeping the original column names and row index.
    y_train, y_test : pandas.Series
        The matching ``Cat`` labels.
    """
    data = pd.read_csv(path)

    y = data["Cat"]
    X = data.drop(columns=["Cat"])

    # Both +inf and -inf are mapped to the float32 maximum (unchanged behaviour).
    X = X.replace([np.inf, -np.inf], np.finfo(np.float32).max)

    # Split BEFORE scaling so the scaler never sees the test rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, test_size=0.3, random_state=0)

    col_min = X_train.min()
    # A constant training column has range 0; divide by 1 so it scales to 0.
    col_range = (X_train.max() - col_min).replace(0, 1)

    X_train = (X_train - col_min) / col_range
    X_test = ((X_test - col_min) / col_range).clip(lower=0.0, upper=1.0)

    return X_train, X_test, y_train, y_test

In [43]:
# CSV that carries the "Cat" label column read by read_data.
# ("../data/IoT Network Intrusion Dataset.csv" was assigned here previously but
# immediately overwritten, so it was never loaded.)
path = "../data/Train_Cat.csv"

X_train, X_test, y_train, y_test = read_data(path)

In [44]:
# Peek at the first few training labels.
y_train.head()

42865               Normal
91446    MITM ARP Spoofing
66374                Mirai
79106                  DoS
13404               Normal
Name: Cat, dtype: object

## 建立具有資料投毒攻擊防禦能力的隨機森林

隨機森林在訓練模型時會從訓練資料集中隨機抽取資料形成子訓練集，這一步驟稱爲Bagging。Bagging可以避免訓練決策樹的時候造成過擬合，減小模型的方差。子訓練集會交由不同的決策樹進行訓練，決策樹會隨機選取資料的特徵作爲決策依據，因此每一棵樹的結構都不同。不同的決策樹會生成不同的預測結果，最後使用投票決定最終的預測結果。  
隨機森林進行Bagging的時候，若訓練資料帶有一定比例的惡意資料，則Bagging可能會將資料分爲帶有惡意資料的子集合和只含正常資料的子集合。帶有惡意資料的子集合所訓練出的決策樹會受到影響，但受影響的決策樹能夠在投票階段被識別出來，從而最小化惡意資料的影響。

使用其他決策方法取代隨機森林，並比較各決策方法的性能與能耗

In [20]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss

class RandomForest():
    """Bagged ensemble of decision trees built on ``BaggingClassifier``.

    Parameters
    ----------
    n_estimator : int
        Number of base estimators (trees) in the ensemble.
    max_samples : int or float
        Samples drawn from X to train each estimator: an int is an absolute
        count, a float is a fraction of the training set.
    max_features : int or float
        Features drawn from X to train each estimator (same int/float rule).
    boostrap : bool
        Whether samples are drawn with replacement; ``False`` gives pasting.
        The misspelt name is kept so existing keyword callers keep working.
    bootstrap_features : bool
        Whether features are drawn with replacement.
    random_state : int
        Seed for the bagging procedure. Each tree is created with
        ``random_state=1``.

    Notes
    -----
    The out-of-bag score is requested only when ``boostrap`` is true, because
    scikit-learn rejects ``oob_score=True`` together with ``bootstrap=False``.
    All CPU cores are used (``n_jobs=-1``).
    """

    def __init__(self, n_estimator=10, max_samples=1.0,
                 max_features=1.0, boostrap=True, bootstrap_features=True, random_state=1):
        base_estimator = DecisionTreeClassifier(random_state=1)
        self.classifier = BaggingClassifier(
            base_estimator,
            n_estimators=n_estimator, max_samples=max_samples,
            max_features=max_features, bootstrap=boostrap, bootstrap_features=bootstrap_features,
            random_state=random_state,
            # OOB estimation only exists when samples are drawn with replacement.
            oob_score=bool(boostrap),
            n_jobs=-1)

    def fit(self, X, y):
        """Fit the underlying bagging classifier on features X and labels y."""
        self.classifier.fit(X, y)

    def predict(self, X):
        """Return the ensemble's predicted label for every row of X."""
        return self.classifier.predict(X)
    

### 使用TensorFlow建立具有Bagging機制的分類器

In [45]:
# Turn warnings off to keep the notebook output clean.
# NOTE(review): this filter ignores every warning category for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

In [46]:
# Load module
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.utils import to_categorical

In [47]:
class MLP_classifier():
    """Small dense network trained on a random subset of the input columns.

    Each instance picks ``number_features // 4`` column positions at random
    (at least one) and only ever looks at those columns, in both ``fit`` and
    ``predict``.

    Parameters
    ----------
    number_features : int
        Total number of feature columns in the frames passed to fit/predict.
    label_mapping : dict
        Maps each class label to its integer index; its length sets the size
        of the softmax output layer.
    learning_rate : float
        Learning rate for the Adam optimizer.
    """

    def __init__(self, number_features, label_mapping, learning_rate=0.003):
        # Reset Keras' global state before building a new network.
        K.clear_session()
        number_label = len(label_mapping)

        # Guard against selecting zero columns when there are fewer than 4 features.
        subset_size = max(1, number_features // 4)
        self.selected_feature = random.sample(range(number_features), subset_size)

        # Network: dropout -> 240 ReLU -> dropout -> 50 ReLU -> dropout -> softmax.
        inputs = layers.Input(shape=(len(self.selected_feature), ))
        dropout_0 = layers.Dropout(0.2)(inputs)  # regularization

        dense_1 = layers.Dense(240, activation='relu')(dropout_0)
        dropout_1 = layers.Dropout(0.2)(dense_1)

        dense_2 = layers.Dense(50, activation='relu')(dropout_1)
        dropout_2 = layers.Dropout(0.2)(dense_2)

        outputs = layers.Dense(number_label, activation="softmax")(dropout_2)

        self.net = Model(inputs, outputs)
        self.label_mapping = label_mapping

        # Compile the model.
        opt = Adam(learning_rate=learning_rate)
        self.net.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

        # Not used by fit/predict; kept so existing attribute access keeps working.
        self.encoder = LabelEncoder()

    def fit(self, X, y, epochs = 10, batch_size=32, verbose=1):
        """Train on the selected columns of X.

        X must support ``.iloc`` column selection. Every label in y must be a
        key of ``label_mapping``; an unknown label raises ``KeyError``.
        Returns whatever the underlying Keras ``fit`` call returns.
        """
        integers_labels = [self.label_mapping[label] for label in y]
        y = to_categorical(integers_labels, num_classes=len(self.label_mapping))

        random_feature = X.iloc[:, self.selected_feature]

        return self.net.fit(random_feature, y, epochs=epochs, batch_size=batch_size, verbose=verbose)

    def predict(self, X):
        """Return an array with the predicted class label for every row of X."""
        random_feature = X.iloc[:, self.selected_feature]
        # verbose=0 is the documented way to silence Keras' progress output.
        probabilities = self.net.predict(random_feature, verbose=0)
        class_indices = np.argmax(probabilities, axis=-1)

        # Index -> label lookup table. Array indexing also works for empty
        # input, where np.vectorize would raise.
        index_to_label = {idx: label for label, idx in self.label_mapping.items()}
        classes = np.array([index_to_label.get(i) for i in range(probabilities.shape[-1])])
        return classes[class_indices]


In [48]:
# Assign each class label an integer id, in order of first appearance in y_train.
l = list(y_train.unique())
lable_mapping = dict(zip(l, range(len(l))))

# One MLP sized from the number of columns in the training frame.
number_features = X_train.shape[1]
model = MLP_classifier(
    number_features=number_features,
    label_mapping=lable_mapping,
)

In [49]:
# Train the single MLP for one epoch, passing verbose=0 through to Keras.
model.fit(X_train, y_train, epochs=1, verbose=0)

<keras.src.callbacks.history.History at 0x1e6936ba510>

In [50]:
# Predict class labels for the held-out split and print the overall accuracy.
predict = model.predict(X_test)
print(accuracy_score(y_test, predict))

0.7198333333333333


In [51]:
# Per-class precision, recall and F1 for the single MLP's test predictions.
print(classification_report(y_test, predict))

                   precision    recall  f1-score   support

              DoS       0.92      0.97      0.94      3721
MITM ARP Spoofing       0.40      0.87      0.55      3771
            Mirai       0.91      0.35      0.51      3794
           Normal       0.81      0.90      0.85     14909
             Scan       0.00      0.00      0.00      3805

         accuracy                           0.72     30000
        macro avg       0.61      0.62      0.57     30000
     weighted avg       0.68      0.72      0.67     30000



In [52]:
class Random_Forest():
    """Bagged ensemble of ``MLP_classifier`` networks combined by majority vote.

    Every network is trained on its own bootstrap sample (one tenth of the
    training rows, drawn with replacement) and the final label for a row is
    the one predicted by the most networks.

    Parameters
    ----------
    number_of_nets : int
        Number of networks in the ensemble.
    learning_rate : float
        Learning rate passed to every ``MLP_classifier``.
    """

    def __init__(self, number_of_nets=10, learning_rate=0.003):
        self.number_of_nets = number_of_nets
        self.models = []
        self.learning_rate = learning_rate
        self.label_mapping = {}

    def fit(self, X, y, epochs = 10, batch_size=32, verbose=1):
        """Train ``number_of_nets`` networks, each on its own bootstrap sample.

        Calling ``fit`` again discards the previously trained networks.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : pandas.Series
            Training labels, positionally aligned with X.
        epochs, batch_size : int
            Passed to every network's ``fit``.
        verbose : int
            A falsy value silences the per-network progress line. The
            networks themselves are always trained with ``verbose=0``.

        Raises
        ------
        ValueError
            If X has no rows.
        """
        training_set_size = len(X)
        if training_set_size == 0:
            raise ValueError("Cannot fit on an empty training set")

        # A tenth of the data per network, but never an empty sample.
        sample_size = max(1, training_set_size // 10)
        number_features = X.shape[1]

        # Label -> integer index, in order of first appearance in y.
        self.label_mapping = {label: idx for idx, label in enumerate(y.unique())}

        # Start from a clean ensemble so a refit does not keep stale networks.
        self.models = []

        for i in range(self.number_of_nets):
            if verbose:
                print(f'Training model {i+1} of {self.number_of_nets}')
            model = MLP_classifier(number_features=number_features, label_mapping=self.label_mapping, learning_rate=self.learning_rate)

            # Bootstrap: row positions drawn with replacement.
            indexes = np.random.choice(training_set_size, sample_size)
            resample_X = X.iloc[indexes]
            resample_y = y.iloc[indexes]

            model.fit(resample_X, resample_y, epochs=epochs, batch_size=batch_size, verbose=0)
            self.models.append(model)

    def predict(self, X):
        """Return a list with the majority-vote label for every row of X.

        Ties go to the label that reaches the top count first in model order.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not self.models:
            raise RuntimeError("Random_Forest.predict called before fit: no trained models")

        predictions = [model.predict(X) for model in self.models]

        # zip(*predictions) yields, per row, the tuple of votes from every model.
        return [Counter(votes).most_common(1)[0][0] for votes in zip(*predictions)]
                

## 訓練隨機森林
使用經過處理的訓練資料對隨機森林進行訓練, 期間使用MLFlow記錄模型的訓練過程

In [60]:
# Turn on MLflow autologging for TensorFlow/Keras, with checkpointing enabled
# and checkpoint_save_best_only switched off.
mlflow.tensorflow.autolog(checkpoint=True, checkpoint_save_best_only=False)

# Ensemble made of a single network.
model = Random_Forest(number_of_nets=1)



In [61]:
# Train the ensemble inside an MLflow run; the run is ended when the `with` block exits.
with mlflow.start_run() as run:
    model.fit(X_train, y_train, epochs=1, batch_size=32)



Training model 1 of 1




In [62]:
# Majority-vote predictions of the ensemble on the held-out split.
predict = model.predict(X_test)

In [63]:
# Overall test accuracy of the ensemble.
print(accuracy_score(y_test, predict))

0.6601333333333333


In [64]:
# Per-class precision, recall and F1 for the ensemble's test predictions.
print(classification_report(y_test, predict))
# End whichever MLflow run is currently active, if any.
mlflow.end_run()

                   precision    recall  f1-score   support

              DoS       0.96      0.95      0.95      3721
MITM ARP Spoofing       0.00      0.00      0.00      3771
            Mirai       0.65      0.40      0.49      3794
           Normal       0.62      0.99      0.76     14909
             Scan       0.00      0.00      0.00      3805

         accuracy                           0.66     30000
        macro avg       0.45      0.47      0.44     30000
     weighted avg       0.51      0.66      0.56     30000



In [17]:
# Turn on MLflow autologging for scikit-learn estimators.
mlflow.sklearn.autolog()

# Fit the bagged decision-tree ensemble inside a named MLflow run.
with mlflow.start_run(run_name="DecisionTreeClassifier") as run:
    rf = RandomForest()
    rf.fit(X_train, y_train)
    # The return value is discarded.
    # NOTE(review): presumably called so sklearn autologging records the metric — confirm.
    mt.accuracy_score(y_test, rf.predict(X_test))
    
# End whichever MLflow run is currently active, if any.
mlflow.end_run()


Exception: Run with UUID ebaa0c186f4d455198c516ac7d62b30f is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [None]:
train_predict = rf.predict(X_train)

accuracy_train = mt.accuracy_score(y_train, train_predict)

# Log into the run that trained `rf`. Calling mlflow.log_metric with no active
# run would silently start a new run and leave it open, which makes the next
# mlflow.start_run() fail with "Run ... is already active".
with mlflow.start_run(run_id=run.info.run_id):
    mlflow.log_metric("accuracy_train", accuracy_train*100)

print("Train Accuracy: {:.4f}%".format(accuracy_train * 100))



Train Accuracy: 82.2515%


In [None]:
# Per-class precision, recall and F1 on the training split.
print(classification_report(y_train, train_predict))

                       precision    recall  f1-score   support

      DoS-Synflooding       1.00      1.00      1.00     41639
    MITM ARP Spoofing       0.99      0.99      0.99     24867
    Mirai-Ackflooding       0.40      0.39      0.40     38601
  Mirai-HTTP Flooding       0.41      0.40      0.41     39152
Mirai-Hostbruteforceg       0.95      0.98      0.96     84951
   Mirai-UDP Flooding       0.82      0.82      0.82    128339
               Normal       1.00      1.00      1.00     27914
        Scan Hostport       0.92      0.69      0.79     15506
         Scan Port OS       0.88      0.97      0.93     37079

             accuracy                           0.82    438048
            macro avg       0.82      0.81      0.81    438048
         weighted avg       0.82      0.82      0.82    438048



使用測試資料集驗證模型

In [None]:
# Predictions of the bagged decision-tree ensemble on the held-out split.
test_predict = rf.predict(X_test)

accuracy_test = mt.accuracy_score(y_test, test_predict)

# Printed as a percentage with four decimals.
print("Test Accuracy: {:.4f}%".format(accuracy_test * 100))



Test Accuracy: 75.8143%


In [None]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test, test_predict))

                       precision    recall  f1-score   support

      DoS-Synflooding       1.00      1.00      1.00     17752
    MITM ARP Spoofing       0.97      0.97      0.97     10510
    Mirai-Ackflooding       0.18      0.17      0.18     16523
  Mirai-HTTP Flooding       0.19      0.19      0.19     16666
Mirai-Hostbruteforceg       0.93      0.98      0.95     36230
   Mirai-UDP Flooding       0.76      0.75      0.76     55215
               Normal       1.00      0.99      1.00     12159
        Scan Hostport       0.86      0.62      0.72      6686
         Scan Port OS       0.86      0.95      0.90     15994

             accuracy                           0.76    187735
            macro avg       0.75      0.74      0.74    187735
         weighted avg       0.75      0.76      0.76    187735

