In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim
import networkx as nx
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split
from torch_geometric.nn import GCNConv, global_mean_pool

In [2]:
import os
import pandas as pd

# Folders that contain the per-capture CSV files
list_folder_malware = ['./CSVS/SMSmalware-CSVs/SMSmalware/Biige', './CSVS/SMSmalware-CSVs/SMSmalware/Fakeinst', './CSVS/SMSmalware-CSVs/SMSmalware/FakeNotify',
                       './CSVS/SMSmalware-CSVs/SMSmalware/Mazarbot', './CSVS/SMSmalware-CSVs/SMSmalware/Jifake', './CSVS/SMSmalware-CSVs/SMSmalware/Nandrobox',
                      './CSVS/SMSmalware-CSVs/SMSmalware/Plankton', './CSVS/SMSmalware-CSVs/SMSmalware/Zsone']

list_folder_benign = ['./CSVS/Benign-CSVs/Benign/Benign2017', './CSVS/Benign-CSVs/Benign/Benign2016']


def load_labeled_csvs(folders, label):
    """Read every ``.csv`` file found directly inside ``folders``.

    Each file becomes one DataFrame: spaces are stripped from the column
    names, rows are sorted by the ``Timestamp`` column and a constant
    ``Label`` column is added.

    Args:
        folders: iterable of directory paths to scan (non-recursive).
        label: value written to the ``Label`` column of every frame.

    Returns:
        list of DataFrames, one per CSV file, in folder order and then
        sorted file-name order.
    """
    frames = []
    for folder in folders:
        # os.listdir order is filesystem-dependent; sort it so list_df (and
        # therefore the seeded train/test split) is the same on every run.
        for file_name in sorted(os.listdir(folder)):
            if not file_name.endswith('.csv'):
                continue
            frame = pd.read_csv(os.path.join(folder, file_name), header=0)
            frame.columns = frame.columns.str.replace(' ', '')
            frame = frame.sort_values(by='Timestamp')
            frame['Label'] = label
            frames.append(frame)
    return frames


# One DataFrame per CSV file: benign captures are labelled 1, malware captures 0
list_df = load_labeled_csvs(list_folder_benign, label=1) + load_labeled_csvs(list_folder_malware, label=0)

# Concatenate once instead of growing all_data inside the loop (which re-copies
# all previously loaded rows for every file).
all_data = pd.concat(list_df, ignore_index=True) if list_df else pd.DataFrame()

# Summarise the combined DataFrame; info() prints by itself and returns None,
# so it is not wrapped in print().
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057303 entries, 0 to 1057302
Data columns (total 85 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   FlowID                   1057299 non-null  object 
 1   SourceIP                 1057303 non-null  object 
 2   SourcePort               1057303 non-null  float64
 3   DestinationIP            1057303 non-null  object 
 4   DestinationPort          1057303 non-null  float64
 5   Protocol                 1057303 non-null  float64
 6   Timestamp                1057303 non-null  object 
 7   FlowDuration             1057303 non-null  float64
 8   TotalFwdPackets          1057303 non-null  float64
 9   TotalBackwardPackets     1057303 non-null  float64
 10  TotalLengthofFwdPackets  1057303 non-null  float64
 11  TotalLengthofBwdPackets  1057303 non-null  float64
 12  FwdPacketLengthMax       1057303 non-null  float64
 13  FwdPacketLengthMin       1057303 non-null 

In [3]:
# Number of per-file DataFrames collected in list_df (one is appended per CSV file)
len(list_df)

1281

In [4]:
def convert_ip_label(df):
    """Replace the IP values of ``df`` with integer codes, in place.

    A single code table is built from the union of the ``SourceIP`` and
    ``DestinationIP`` values, so the same address receives the same code in
    both columns. Codes are assigned in sorted order of the addresses, which
    makes the encoding identical on every run (the order of a ``set`` of
    strings changes between interpreter sessions).

    Args:
        df: DataFrame with ``SourceIP`` and ``DestinationIP`` columns. It is
            modified in place.

    Returns:
        The same ``df`` object, with both IP columns converted to ``int``.
    """
    ip_columns = ['SourceIP', 'DestinationIP']

    # Union of the addresses seen in either column
    all_ips = set()
    for column in ip_columns:
        all_ips.update(df[column])

    # Deterministic address -> code table
    ip_to_code = {ip: code for code, ip in enumerate(sorted(all_ips))}

    # Map each column in one pass; unlike value-by-value in-place replacement,
    # a freshly written code can never be mistaken for a not-yet-replaced value.
    for column in ip_columns:
        df[column] = df[column].map(ip_to_code).astype(int)
    return df

In [5]:
# Encode the IP columns of every per-file frame; convert_ip_label mutates its
# argument, so the returned value does not need to be kept.
for capture_frame in list_df:
    convert_ip_label(capture_frame)

In [6]:
# Columns removed from every frame before the graphs are built
columns_drop = [
    'FlowID', 'SourcePort', 'DestinationPort', 'Timestamp',
    'PacketLengthStd', 'CWEFlagCount', 'Down/UpRatio',
    'FwdAvgPackets/Bulk', 'FwdAvgBulkRate',
    'BwdAvgBytes/Bulk', 'BwdAvgPackets/Bulk', 'BwdAvgBulkRate',
    'FwdURGFlags', 'BwdURGFlags',
    'RSTFlagCount', 'ECEFlagCount', 'BwdPSHFlags', 'FwdAvgBytes/Bulk',
]

for capture_frame in list_df:
    # NOTE(review): rows are dropped before the columns are, so a row whose
    # only NaN sits in a column listed above is discarded as well - confirm
    # this is intended.
    capture_frame.dropna(inplace=True)
    capture_frame.drop(columns=columns_drop, inplace=True)

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Columns that identify graph nodes or carry the class target. They are not
# features, so they are left out of the rescaling.
NON_FEATURE_COLUMNS = ['SourceIP', 'DestinationIP', 'Label']

# One scaler object, re-fitted on each frame (every capture is scaled on its own)
scaler = MaxAbsScaler()
for df in list_df:
    feature_columns = [column for column in df.columns if column not in NON_FEATURE_COLUMNS]
    # fit_transform raises on a frame with no rows (possible after dropna)
    if df.empty or not feature_columns:
        continue
    # Fit on this frame's feature columns and write the scaled values back
    df[feature_columns] = scaler.fit_transform(df[feature_columns])

In [7]:
# # Function to divide each cell's value by 10 until the largest number in the column is <= 10
# def divide_until_largest_less_than_10(df):
#     for column in df.columns[2:]:
#         largest_number = df[column].max()
#         while largest_number > 10:
#             df[column] /= 10
#             largest_number = df[column].max()
#     return df

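# # NOTE: disabled experiment. If re-enabled, do not pop/append on list_df while
# # iterating over it (elements get skipped); build a new list instead.
# # Iterate over each DataFrame in the list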
# for df in list_df:
#     df_copy = df.copy()  # Create a copy to avoid modifying the original DataFrame
#     df_modified = divide_until_largest_less_than_10(df_copy)
#     list_df.pop(0)
#     list_df.append(df_modified)

In [8]:
# Preview the first rows of the first per-file DataFrame
list_df[0].head()

Unnamed: 0,SourceIP,DestinationIP,Protocol,FlowDuration,TotalFwdPackets,TotalBackwardPackets,TotalLengthofFwdPackets,TotalLengthofBwdPackets,FwdPacketLengthMax,FwdPacketLengthMin,...,min_seg_size_forward,ActiveMean,ActiveStd,ActiveMax,ActiveMin,IdleMean,IdleStd,IdleMax,IdleMin,Label
34,34,16,0.6,0.001419,0.04,0.06,2.771,0.00692,1.368,0.0,...,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
37,34,23,0.6,0.003804,0.08,0.09,0.966,0.05841,0.717,0.0,...,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
38,34,23,0.6,0.002588,0.1,0.07,1.09,0.04683,0.834,0.0,...,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
43,34,20,0.6,0.026859,0.12,0.16,1.1,0.1358,0.302,0.0,...,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
16,25,43,0.0,0.749121,0.03,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.745603,0.40177,4.029697,3.461509,1


In [9]:
import networkx as nx
import matplotlib.pyplot as plt

list_graph = []
list_label = []
IP_Column_Drop = ['SourceIP', 'DestinationIP']

for df in list_df:
    # Every column from position 2 up to (not including) the last one is
    # attached to the edges as an attribute.
    edge_attribute_names = df.columns[2:-1].tolist()
    flow_graph = nx.from_pandas_edgelist(
        df,
        source='SourceIP',
        target='DestinationIP',
        edge_attr=edge_attribute_names,
        create_using=nx.MultiDiGraph(),
    )
    list_graph.append(flow_graph)

    # Graph label: the frame's single Label value, or 0 when the frame does
    # not contain exactly one distinct label.
    distinct_labels = df['Label'].unique()
    list_label.append(distinct_labels[0] if len(distinct_labels) == 1 else 0)
    


In [10]:
def graph_to_pyg_data(graph, label):
    """Convert a graph with edge attributes into a ``Data`` sample.

    Args:
        graph: graph object exposing ``number_of_nodes()``,
            ``number_of_edges()``, ``nodes`` and ``edges``; iterating
            ``graph.edges`` must yield tuples whose first two items are the
            source and target node, and ``graph.edges[edge]`` must return the
            edge's attribute dict.
        label: class label of the whole graph.

    Returns:
        ``Data`` with
        ``B_in``  - (num_nodes, num_edges) float matrix, 1 where the node is
        the target of the edge;
        ``B_out`` - (num_nodes, num_edges) float matrix, 1 where the node is
        the source of the edge;
        ``X``     - one row of attribute values per edge, in edge order;
        ``Y``     - the label as a ``torch.long`` tensor.
    """
    num_nodes = graph.number_of_nodes()
    num_edges = graph.number_of_edges()
    B_in = torch.zeros((num_nodes, num_edges), dtype=torch.float32)
    B_out = torch.zeros((num_nodes, num_edges), dtype=torch.float32)

    # Row index of every node, in graph.nodes iteration order
    node_row = {node: row for row, node in enumerate(graph.nodes)}
    # Materialise the edges once so columns of B_* and rows of X share one order
    edge_list = list(graph.edges)

    # Single pass over the edges instead of comparing every node with every edge
    for col, edge in enumerate(edge_list):
        B_out[node_row[edge[0]], col] = 1
        B_in[node_row[edge[1]], col] = 1

    Y = torch.tensor(label, dtype=torch.long)
    X = torch.tensor([list(graph.edges[edge].values()) for edge in edge_list], dtype=torch.float)
    return Data(B_in = B_in, B_out=B_out, X=X, Y=Y)

In [11]:
# Pair each graph with its label, then convert every pair into a Data sample
list_graph_labeled = list(zip(list_graph, list_label))
list_data = [
    graph_to_pyg_data(flow_graph, graph_label)
    for flow_graph, graph_label in list_graph_labeled
]

In [12]:
# Shape of the edge-feature matrix X of the first sample (one row per edge)
list_data[0].X.shape

torch.Size([220, 64])

In [13]:
class GNN(nn.Module):
    """Two-round edge/node message-passing network for graph classification.

    Edge states and node states are updated alternately through the matrices
    ``B_in`` and ``B_out``; the final node states are averaged and projected
    to ``output_dim`` raw scores.
    """

    def __init__(self, edge_feature_dim, hidden_dim, output_dim):
        """Create the five linear layers.

        Args:
            edge_feature_dim: number of input features per edge.
            hidden_dim: width of every hidden edge/node state.
            output_dim: number of output scores per graph.
        """
        super().__init__()
        self.layer1_e = nn.Linear(edge_feature_dim, hidden_dim)
        self.layer1_h = nn.Linear(2 * hidden_dim, hidden_dim)
        self.layer2_e = nn.Linear(3 * hidden_dim, hidden_dim)
        self.layer2_h = nn.Linear(3 * hidden_dim, hidden_dim)
        self.layer_pool = nn.Linear(hidden_dim, output_dim)

    def forward(self, B_in, B_out, edge_feature_matrix):
        """Compute the graph-level scores.

        Args:
            B_in: matrix multiplied with edge states to aggregate them per
                node (assumed (num_nodes, num_edges), as produced by
                graph_to_pyg_data - TODO confirm for other callers).
            B_out: second aggregation matrix with the same layout as ``B_in``.
            edge_feature_matrix: one row of input features per edge.

        Returns:
            Tensor of shape (1, output_dim) holding unnormalised scores; no
            softmax or sigmoid is applied here.
        """
        # Initial edge embedding
        edge_state_0 = self.layer1_e(edge_feature_matrix).relu()

        # Node states from the edge states gathered through B_in and B_out
        node_input_0 = torch.cat((B_in @ edge_state_0, B_out @ edge_state_0), dim=1)
        node_state_0 = self.layer1_h(node_input_0).relu()

        # Edge update: node states sent back along the transposed matrices,
        # concatenated with the previous edge state
        edge_input_1 = torch.cat(
            (B_in.t() @ node_state_0, B_out.t() @ node_state_0, edge_state_0),
            dim=1,
        )
        edge_state_1 = self.layer2_e(edge_input_1).relu()

        # Node update: refreshed edge states plus the previous node state
        node_input_1 = torch.cat(
            (B_in @ edge_state_1, B_out @ edge_state_1, node_state_0),
            dim=1,
        )
        node_state_1 = self.layer2_h(node_input_1).relu()

        # Mean over the node axis gives a single graph vector
        graph_vector = node_state_1.mean(dim=0, keepdim=True, dtype=torch.float)

        return self.layer_pool(graph_vector)


In [14]:
# Split the samples into a training set and a test set
train_data, test_data = train_test_split(
    list_data,
    test_size=0.2,
    random_state=42,
)

# NOTE(review): the training cell in this notebook iterates train_data and
# test_data directly; confirm whether these loaders are still needed.
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)

In [15]:
# Train the model
def train(model, train_loader, criterion, optimizer):
    """Run one training pass over ``train_loader``, one sample per update.

    Args:
        model: module called as ``model(B_in, B_out, X)``.
        train_loader: iterable of samples exposing ``B_in``, ``B_out``, ``X``
            and ``Y`` attributes.
        criterion: loss called as ``criterion(output, target)``.
        optimizer: optimizer stepping the parameters of ``model``.

    Returns:
        Mean loss over the samples that produced an update, or ``nan`` when
        no update was made. Earlier callers that ignore the return value are
        unaffected.
    """
    model.train()
    total_loss = 0.0
    num_updates = 0
    for data in train_loader:
        optimizer.zero_grad()
        output = model(data.B_in, data.B_out, data.X)
        # Best effort: a sample whose output contains NaN is skipped so it
        # cannot poison the weights.
        if torch.isnan(output).any():
            continue
        # Y is unsqueezed to add a leading dimension before it is passed as target
        loss = criterion(output, torch.unsqueeze(data.Y, dim=0))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_updates += 1
    return total_loss / num_updates if num_updates else float('nan')

# Evaluate the model
def evaluate(model, loader):
    """Return the accuracy of ``model`` over ``loader``.

    Args:
        model: module called as ``model(B_in, B_out, X)``; its output is
            reduced with ``argmax(dim=1)`` to get the prediction.
        loader: iterable of samples exposing ``B_in``, ``B_out``, ``X`` and
            ``Y`` attributes. Each item counts as one sample.

    Returns:
        Fraction of items whose prediction equals ``Y``; ``0.0`` when
        ``loader`` yields nothing.
    """
    model.eval()

    correct = 0
    total_samples = 0
    # No gradients are needed for scoring; disabling autograd avoids building
    # a computation graph for every sample.
    with torch.no_grad():
        for data in loader:
            output = model(data.B_in, data.B_out, data.X)
            pred = output.argmax(dim=1)
            correct += int((pred == data.Y).sum())
            total_samples += 1
    if total_samples == 0:
        return 0.0
    return correct / total_samples

In [16]:
# Seed PyTorch before the model is created so the weight initialisation, and
# with it the reported accuracies, can be reproduced.
torch.manual_seed(42)

criterion = nn.CrossEntropyLoss()
# GNN(edge_feature_dim, hidden_dim, output_dim)
model = GNN(64, 64, 2)
lr = 0.001
optimizer = optim.Adam(model.parameters(), lr=lr)

epochs = 50
for epoch in range(epochs):
    # train/evaluate iterate the sample lists directly, one graph at a time
    train(model, train_data, criterion, optimizer)
    train_accuracy = evaluate(model, train_data)
    test_accuracy = evaluate(model, test_data)
    print(f'Epoch {epoch + 1}/{epochs}, Train Acc: {train_accuracy * 100: .2f}%, Test Acc: {test_accuracy * 100:.2f}%')

Epoch 1/50, Train Acc:  95.41%, Test Acc: 95.33%
Epoch 2/50, Train Acc:  97.36%, Test Acc: 96.89%
Epoch 3/50, Train Acc:  95.90%, Test Acc: 96.50%
Epoch 4/50, Train Acc:  97.56%, Test Acc: 96.50%
Epoch 5/50, Train Acc:  97.56%, Test Acc: 96.50%
Epoch 6/50, Train Acc:  94.24%, Test Acc: 95.72%
Epoch 7/50, Train Acc:  93.26%, Test Acc: 94.55%
Epoch 8/50, Train Acc:  97.17%, Test Acc: 96.89%
Epoch 9/50, Train Acc:  96.58%, Test Acc: 96.50%
Epoch 10/50, Train Acc:  93.65%, Test Acc: 94.55%
Epoch 11/50, Train Acc:  95.31%, Test Acc: 96.50%
Epoch 12/50, Train Acc:  97.17%, Test Acc: 96.89%
Epoch 13/50, Train Acc:  97.07%, Test Acc: 96.89%
Epoch 14/50, Train Acc:  96.19%, Test Acc: 96.50%
Epoch 15/50, Train Acc:  94.63%, Test Acc: 96.50%
Epoch 16/50, Train Acc:  97.36%, Test Acc: 96.89%
Epoch 17/50, Train Acc:  98.63%, Test Acc: 99.22%
Epoch 18/50, Train Acc:  98.83%, Test Acc: 98.83%
Epoch 19/50, Train Acc:  97.75%, Test Acc: 98.05%
Epoch 20/50, Train Acc:  93.65%, Test Acc: 94.55%
Epoch 21/