# Import data

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("../../dataset/Merged01.csv", skipinitialspace=True)
data

Unnamed: 0,Header_Length,Protocol Type,Time_To_Live,Rate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,ece_flag_number,...,Tot sum,Min,Max,AVG,Std,Tot size,IAT,Number,Variance,Label
0,19.92,6,63.36,25893.962218,0.00,0.0,0.00,0.99,0.99,0.0,...,6421,60,481,64.21,42.100000,64.21,0.000039,100,1772.410000,DDOS-PSHACK_FLOOD
1,0.00,47,64.00,3703.841331,0.00,0.0,0.00,0.00,0.00,0.0,...,57320,98,578,573.20,48.000000,573.20,0.000271,100,2304.000000,MIRAI-GREIP_FLOOD
2,7.92,17,65.91,19673.095685,0.00,0.0,0.00,0.00,0.00,0.0,...,6010,60,70,60.10,1.000000,60.10,0.000057,100,1.000000,DOS-UDP_FLOOD
3,20.40,6,110.50,261.664826,0.10,0.0,0.30,0.20,0.40,0.0,...,2223,54,1500,222.30,451.596686,222.30,0.004766,10,203939.566667,DNS_SPOOFING
4,0.32,1,63.96,28944.199848,0.00,0.0,0.00,0.00,0.01,0.0,...,6006,60,66,60.06,0.600000,60.06,0.000035,100,0.360000,DDOS-ICMP_FLOOD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
712306,20.00,6,64.00,33177.535200,0.00,1.0,0.00,0.00,0.00,0.0,...,6000,60,60,60.00,0.000000,60.00,0.000034,100,0.000000,DDOS-SYN_FLOOD
712307,8.00,17,64.00,49038.980475,0.00,0.0,0.00,0.00,0.00,0.0,...,6000,60,60,60.00,0.000000,60.00,0.000020,100,0.000000,DOS-UDP_FLOOD
712308,20.00,6,67.82,10496.518932,0.98,0.0,0.98,0.01,0.02,0.0,...,6061,60,121,60.61,6.100000,60.61,0.000102,100,37.210000,DDOS-RSTFINFLOOD
712309,20.00,6,64.00,23321.123158,0.00,1.0,0.00,0.00,0.00,0.0,...,6000,60,60,60.00,0.000000,60.00,0.000048,100,0.000000,DDOS-SYNONYMOUSIP_FLOOD


# Data Cleaning

In [5]:
category_mapping = {
    "DDOS": ["DDOS-PSHACK_FLOOD", "DDOS-ICMP_FLOOD", "DDOS-TCP_FLOOD", "DDOS-SYN_FLOOD",
             "DDOS-UDP_FLOOD", "DDOS-SYNONYMOUSIP_FLOOD", "DDOS-RSTFINFLOOD", "DDOS-SLOWLORIS",
             "DDOS-ICMP_FRAGMENTATION", "DDOS-ACK_FRAGMENTATION", "DDOS-UDP_FRAGMENTATION",
             "DDOS-HTTP_FLOOD"],
    
    "DOS": ["DOS-UDP_FLOOD", "DOS-TCP_FLOOD", "DOS-SYN_FLOOD", "DOS-HTTP_FLOOD"],
    "MIRAI": ["MIRAI-GREIP_FLOOD", "MIRAI-GREETH_FLOOD", "MIRAI-UDPPLAIN"],
    "RECON": ["RECON-HOSTDISCOVERY", "RECON-PORTSCAN", "RECON-OSSCAN", "RECON-PINGSWEEP"],
    "MITM": ["MITM-ARPSPOOFING"],
    "VULNERABILITY": ["VULNERABILITYSCAN"],
    "BACKDOOR": ["BACKDOOR_MALWARE"],
    "XSS": ["XSS"],
    "SQLINJECTION": ["SQLINJECTION"],
    "BRUTEFORCE": ["DICTIONARYBRUTEFORCE"],
    "COMMANDINJECTION": ["COMMANDINJECTION"],
    "BROWSERHIJACKING": ["BROWSERHIJACKING"],
    "UPLOADATTACK": ["UPLOADING_ATTACK"],
    "DNS": ["DNS_SPOOFING"],
    "BENIGN": ["BENIGN"]
}

def map_label_to_category(label):
    for category, labels in category_mapping.items():
        if label in labels:
            return category
    return "UNKNOWN"  

data["Cat"] = data["Label"].apply(map_label_to_category)



output_file = os.path.join("../../dataset/", "CIC_IoT_Dataset2023.csv")
data.to_csv(output_file, index=False)

print(f"File mới đã được lưu")


File mới đã được lưu


In [94]:
data = data.drop_duplicates()
data = data.drop(columns=['Flow_ID', 'Src_IP', 'Dst_IP', 'Timestamp'])
data

Unnamed: 0,Src_Port,Dst_Port,Protocol,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,TotLen_Fwd_Pkts,TotLen_Bwd_Pkts,Fwd_Pkt_Len_Max,Fwd_Pkt_Len_Min,Fwd_Pkt_Len_Mean,Fwd_Pkt_Len_Std,Bwd_Pkt_Len_Max,Bwd_Pkt_Len_Min,Bwd_Pkt_Len_Mean,Bwd_Pkt_Len_Std,Flow_Byts/s,Flow_Pkts/s,Flow_IAT_Mean,Flow_IAT_Std,Flow_IAT_Max,Flow_IAT_Min,Fwd_IAT_Tot,Fwd_IAT_Mean,Fwd_IAT_Std,Fwd_IAT_Max,Fwd_IAT_Min,Bwd_IAT_Tot,Bwd_IAT_Mean,Bwd_IAT_Std,Bwd_IAT_Max,Bwd_IAT_Min,Fwd_PSH_Flags,Bwd_PSH_Flags,Fwd_URG_Flags,Bwd_URG_Flags,Fwd_Header_Len,Bwd_Header_Len,Fwd_Pkts/s,Bwd_Pkts/s,Pkt_Len_Min,Pkt_Len_Max,Pkt_Len_Mean,Pkt_Len_Std,Pkt_Len_Var,FIN_Flag_Cnt,SYN_Flag_Cnt,RST_Flag_Cnt,PSH_Flag_Cnt,ACK_Flag_Cnt,URG_Flag_Cnt,CWE_Flag_Count,ECE_Flag_Cnt,Down/Up_Ratio,Pkt_Size_Avg,Fwd_Seg_Size_Avg,Bwd_Seg_Size_Avg,Fwd_Byts/b_Avg,Fwd_Pkts/b_Avg,Fwd_Blk_Rate_Avg,Bwd_Byts/b_Avg,Bwd_Pkts/b_Avg,Bwd_Blk_Rate_Avg,Subflow_Fwd_Pkts,Subflow_Fwd_Byts,Subflow_Bwd_Pkts,Subflow_Bwd_Byts,Init_Fwd_Win_Byts,Init_Bwd_Win_Byts,Fwd_Act_Data_Pkts,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label,Cat,Sub_Cat
0,10000,10101,17,75,1,1,982.0,1430.0,982.0,982.0,982.0,0.000000,1430.0,1430.0,1430.000000,0.000000,3.216000e+07,26666.666667,75.0,0.000000,75.0,75.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0,0,0,0,8,8,13333.333333,13333.333333,982.0,1430.0,1280.666667,258.652921,66901.333333,0,0,0,0,0,0,0,0,1.0,1921.000000,982.0,1430.000000,0,0,0,0,0,0,1,982,1,1430,-1,-1,1,0,0.0,0.0,0.0,0.0,75.0,0.000000,75.0,75.0,Anomaly,Mirai,Mirai-Ackflooding
1,2179,554,6,5310,1,2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000e+00,564.971751,2655.0,2261.327486,4254.0,1056.0,0.0,0.0,0.000000,0.0,0.0,5310.0,5310.0,0.000000,5310.0,5310.0,0,0,0,0,20,44,188.323917,376.647834,0.0,0.0,0.000000,0.000000,0.000000,0,1,0,0,0,0,0,0,2.0,0.000000,0.0,0.000000,0,0,0,0,0,0,1,0,2,0,-1,14600,0,0,0.0,0.0,0.0,0.0,2655.0,2261.327486,4254.0,1056.0,Anomaly,DoS,DoS-Synflooding
2,52727,9020,6,141,0,3,0.0,2806.0,0.0,0.0,0.0,0.000000,1388.0,30.0,935.333333,784.041666,1.990071e+07,21276.595745,70.5,0.707107,71.0,70.0,0.0,0.0,0.000000,0.0,0.0,141.0,70.5,0.707107,71.0,70.0,0,0,0,0,0,96,0.000000,21276.595745,30.0,1388.0,1048.500000,679.000000,461041.000000,0,0,0,0,1,0,0,0,0.0,1398.000000,0.0,935.333333,0,0,0,0,0,0,0,0,3,2806,-1,1869,0,0,0.0,0.0,0.0,0.0,70.5,0.707107,71.0,70.0,Anomaly,Scan,Scan Port OS
3,52964,9020,6,151,0,2,0.0,2776.0,0.0,0.0,0.0,0.000000,1388.0,1388.0,1388.000000,0.000000,1.838411e+07,13245.033113,151.0,0.000000,151.0,151.0,0.0,0.0,0.000000,0.0,0.0,151.0,151.0,0.000000,151.0,151.0,0,0,0,0,0,64,0.000000,13245.033113,1388.0,1388.0,1388.000000,0.000000,0.000000,0,0,0,0,1,0,0,0,0.0,2082.000000,0.0,1388.000000,0,0,0,0,0,0,0,0,2,2776,-1,1869,0,0,0.0,0.0,0.0,0.0,151.0,0.000000,151.0,151.0,Anomaly,Mirai,Mirai-Hostbruteforceg
4,36763,1900,17,153,2,1,886.0,420.0,452.0,434.0,443.0,12.727922,420.0,420.0,420.000000,0.000000,8.535948e+06,19607.843137,76.5,0.707107,77.0,76.0,76.0,76.0,0.000000,76.0,76.0,0.0,0.0,0.000000,0.0,0.0,0,0,0,0,16,8,13071.895425,6535.947712,420.0,452.0,431.500000,15.176737,230.333333,0,0,0,0,0,0,0,0,0.0,575.333333,443.0,420.000000,0,0,0,0,0,0,2,886,1,420,-1,-1,2,0,0.0,0.0,0.0,0.0,76.5,0.707107,77.0,76.0,Anomaly,Mirai,Mirai-Hostbruteforceg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625773,60165,8899,17,29,5,1,160.0,32.0,32.0,32.0,32.0,0.000000,32.0,32.0,32.000000,0.000000,6.620690e+06,206896.551724,5.8,3.346640,11.0,3.0,22.0,5.5,3.785939,11.0,3.0,0.0,0.0,0.000000,0.0,0.0,0,0,0,0,40,8,172413.793103,34482.758621,32.0,32.0,32.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0.0,37.333333,32.0,32.000000,0,0,0,0,0,0,5,160,1,32,-1,-1,5,0,0.0,0.0,0.0,0.0,5.8,3.346640,11.0,3.0,Anomaly,Mirai,Mirai-UDP Flooding
625776,8739,19604,6,1092,0,2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000e+00,1831.501832,1092.0,0.000000,1092.0,1092.0,0.0,0.0,0.000000,0.0,0.0,1092.0,1092.0,0.000000,1092.0,1092.0,0,0,0,0,0,40,0.000000,1831.501832,0.0,0.0,0.000000,0.000000,0.000000,0,1,0,0,0,0,0,0,0.0,0.000000,0.0,0.000000,0,0,0,0,0,0,0,0,2,0,-1,0,0,0,0.0,0.0,0.0,0.0,1092.0,0.000000,1092.0,1092.0,Anomaly,DoS,DoS-Synflooding
625778,56112,8043,17,277,1,1,18.0,18.0,18.0,18.0,18.0,0.000000,18.0,18.0,18.000000,0.000000,1.299639e+05,7220.216606,277.0,0.000000,277.0,277.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0,0,0,0,8,8,3610.108303,3610.108303,18.0,18.0,18.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,1.0,27.000000,18.0,18.000000,0,0,0,0,0,0,1,18,1,18,-1,-1,1,0,0.0,0.0,0.0,0.0,277.0,0.000000,277.0,277.0,Anomaly,Mirai,Mirai-UDP Flooding
625779,4570,554,6,1658,0,2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000e+00,1206.272618,1658.0,0.000000,1658.0,1658.0,0.0,0.0,0.000000,0.0,0.0,1658.0,1658.0,0.000000,1658.0,1658.0,0,0,0,0,0,44,0.000000,1206.272618,0.0,0.0,0.000000,0.000000,0.000000,0,1,0,0,0,0,0,0,0.0,0.000000,0.0,0.000000,0,0,0,0,0,0,0,0,2,0,-1,14600,0,0,0.0,0.0,0.0,0.0,1658.0,0.000000,1658.0,1658.0,Anomaly,DoS,DoS-Synflooding


In [95]:
dataLabel = data[['Label', 'Cat', 'Sub_Cat']]
data = data.drop(columns=['Label', 'Cat', 'Sub_Cat'])
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(data.mean(), inplace=True)

In [96]:
# Kiểm tra số lớp duy nhất trong từng cột
num_classes_label = len(dataLabel['Label'].unique())
num_classes_cat = len(dataLabel['Cat'].unique())
num_classes_sub_cat = len(dataLabel['Sub_Cat'].unique())

print(f'Số lớp cho Label: {num_classes_label}')
print(f'Số lớp cho Cat: {num_classes_cat}')
print(f'Số lớp cho Sub_Cat: {num_classes_sub_cat}')

Số lớp cho Label: 2
Số lớp cho Cat: 5
Số lớp cho Sub_Cat: 9


In [97]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

scaler = StandardScaler()
data = scaler.fit_transform(data)
X_train, X_test, y_train, y_test = train_test_split(data, dataLabel, test_size=0.2, random_state=42)

In [98]:
# Tạo LabelEncoder cho mỗi cột nhãn
label_encoders = {
    'Label': LabelEncoder(),
    'Cat': LabelEncoder(),
    'Sub_Cat': LabelEncoder()
}

# Mã hóa nhãn cho tập huấn luyện
y_train_encoded = {
    'Label': label_encoders['Label'].fit_transform(y_train['Label']),
    'Cat': label_encoders['Cat'].fit_transform(y_train['Cat']),
    'Sub_Cat': label_encoders['Sub_Cat'].fit_transform(y_train['Sub_Cat'])
}

# Mã hóa nhãn cho tập kiểm tra
y_test_encoded = {
    'Label': label_encoders['Label'].transform(y_test['Label']),
    'Cat': label_encoders['Cat'].transform(y_test['Cat']),
    'Sub_Cat': label_encoders['Sub_Cat'].transform(y_test['Sub_Cat'])
}

In [99]:
X_train_tensor = torch.Tensor(X_train)
y_train_tensor = {
    'Label': torch.LongTensor(y_train_encoded['Label']),
    'Cat': torch.LongTensor(y_train_encoded['Cat']),
    'Sub_Cat': torch.LongTensor(y_train_encoded['Sub_Cat'])
}

X_test_tensor = torch.Tensor(X_test)
y_test_tensor = {
    'Label': torch.LongTensor(y_test_encoded['Label']),
    'Cat': torch.LongTensor(y_test_encoded['Cat']),
    'Sub_Cat': torch.LongTensor(y_test_encoded['Sub_Cat'])
}

In [100]:
class SimpleModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_label_classes, num_cat_classes, num_sub_cat_classes):
        super(SimpleModel, self).__init__()
        # Định nghĩa các lớp chung và các lớp đầu ra riêng cho mỗi nhiệm vụ
        self.shared_layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU()
        )
        self.label_head = nn.Linear(hidden_size, num_label_classes)
        self.cat_head = nn.Linear(hidden_size, num_cat_classes)
        self.sub_cat_head = nn.Linear(hidden_size, num_sub_cat_classes)

    def forward(self, x):
        shared_rep = self.shared_layers(x)  # Truyền qua các lớp chung
        outputs_label = self.label_head(shared_rep)  # Dự đoán label
        outputs_cat = self.cat_head(shared_rep)  # Dự đoán category
        outputs_sub_cat = self.sub_cat_head(shared_rep)  # Dự đoán sub-category

        # Trả về ba đầu ra
        return outputs_label, outputs_cat, outputs_sub_cat


In [101]:
input_size = 79  # Số đầu vào
hidden_size = 100  # Kích thước ẩn
num_label_classes = 2  # Số lớp cho đầu ra nhãn (label)
num_cat_classes = 5  # Số lớp cho category
num_sub_cat_classes = 9  # Số lớp cho sub-category

model = SimpleModel(input_size=input_size, hidden_size=hidden_size, 
                    num_label_classes=num_label_classes, 
                    num_cat_classes=num_cat_classes, 
                    num_sub_cat_classes=num_sub_cat_classes)

In [102]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor = X_train_tensor.to(device)
y_train_tensor['Label'] = y_train_tensor['Label'].to(device)
y_train_tensor['Cat'] = y_train_tensor['Cat'].to(device)
y_train_tensor['Sub_Cat'] = y_train_tensor['Sub_Cat'].to(device)
model.to(device)  # Chuyển mô hình sang thiết bị

SimpleModel(
  (shared_layers): Sequential(
    (0): Linear(in_features=79, out_features=100, bias=True)
    (1): ReLU()
  )
  (label_head): Linear(in_features=100, out_features=2, bias=True)
  (cat_head): Linear(in_features=100, out_features=5, bias=True)
  (sub_cat_head): Linear(in_features=100, out_features=9, bias=True)
)

In [103]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor['Label'], y_train_tensor['Cat'], y_train_tensor['Sub_Cat'])
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

In [104]:
import torch
import torch.nn as nn
import torch.optim as optim

def train_model(model, train_loader, epochs=100, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()

            X_batch, y_label_batch, y_cat_batch, y_sub_cat_batch = batch

            # Đưa dữ liệu vào cùng thiết bị với mô hình
            X_batch = X_batch.to(device)
            y_label_batch = y_label_batch.to(device)
            y_cat_batch = y_cat_batch.to(device)
            y_sub_cat_batch = y_sub_cat_batch.to(device)

            outputs_label, outputs_cat, outputs_sub_cat = model(X_batch)

            loss_label = criterion(outputs_label, y_label_batch)
            loss_cat = criterion(outputs_cat, y_cat_batch)
            loss_sub_cat = criterion(outputs_sub_cat, y_sub_cat_batch)

            loss = loss_label + loss_cat + loss_sub_cat
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')


In [105]:
train_model(model, train_loader)

Epoch [10/100], Loss: 0.7872
Epoch [20/100], Loss: 0.7695
Epoch [30/100], Loss: 0.6712
Epoch [40/100], Loss: 0.7587
Epoch [50/100], Loss: 0.5635
Epoch [60/100], Loss: 0.6828
Epoch [70/100], Loss: 0.6315
Epoch [80/100], Loss: 0.6564
Epoch [90/100], Loss: 0.6612
Epoch [100/100], Loss: 0.4754


In [107]:
from sklearn.metrics import matthews_corrcoef

num_classes_label = 2  # Ví dụ, số lớp cho 'Label'
num_classes_cat = 5     # Ví dụ, số lớp cho 'Cat'
num_classes_sub_cat = 9  # Ví dụ, số lớp cho 'Sub_Cat'

def evaluate_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        outputs_label, outputs_cat, outputs_sub_cat = model(X_test)

        # Dự đoán cho mỗi đầu ra
        predicted_labels = torch.argmax(outputs_label, dim=1)
        predicted_cat = torch.argmax(outputs_cat, dim=1)
        predicted_sub_cat = torch.argmax(outputs_sub_cat, dim=1)

        # Tính toán MCC cho mỗi đầu ra
        mcc_label = matthews_corrcoef(y_test['Label'].cpu().numpy(), predicted_labels.cpu().numpy())
        mcc_cat = matthews_corrcoef(y_test['Cat'].cpu().numpy(), predicted_cat.cpu().numpy())
        mcc_sub_cat = matthews_corrcoef(y_test['Sub_Cat'].cpu().numpy(), predicted_sub_cat.cpu().numpy())

        print(f'MCC for Label: {mcc_label:.4f}')
        print(f'MCC for Cat: {mcc_cat:.4f}')
        print(f'MCC for Sub_Cat: {mcc_sub_cat:.4f}')


# Đánh giá mô hình
evaluate_model(model, X_test_tensor, y_test_tensor)

MCC for Label: 0.9752
MCC for Cat: 0.8726
MCC for Sub_Cat: 0.7283
