In [None]:
import sys
sys.path.append('../')
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
import os
from time import time
import math
#import matplotlib.pyplot as plt
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score, roc_auc_score
from Utils.pytorchtools import EarlyStopping  #location: /utils/
from tqdm import tqdm  #用于在循环或迭代过程中显示进度条
from transformers import BertTokenizer, BertModel
#from torch_scatter import scatter_max, scatter_add
import networkx as nx
#from torch_geometric.nn import GCNConv   # noted as used further below; commented out here because of a server error
#import dgl
#import cugraph
#import cudf

In [None]:
"""无序列化"""
class Graph:
    """Plain container for one parsed graph sample.

    The constructor stores its arguments unchanged:
        num_nodes:  number of nodes in the graph.
        edge_pairs: collection of edges; its length is reported by ``__str__``.
        features:   per-node feature collection; its length is reported by ``__str__``.
        label:      graph-level label.
        node_degs:  per-node degree values (described in the original comment as
                    the number of adjacent nodes).
    """

    def __init__(self, num_nodes, edge_pairs, features, label, node_degs):
        self.num_nodes, self.edge_pairs = num_nodes, edge_pairs
        self.features, self.label = features, label
        self.node_degs = node_degs

    def __str__(self):
        # One-line summary: node count, edge count, number of feature rows.
        template = 'nodes: %d  edge_pairs: %d features: %d'
        counts = (self.num_nodes, len(self.edge_pairs), len(self.features))
        return template % counts

In [None]:
"""序列化"""
class Graph_sort:
    """Plain container for a graph that carries only node features.

    The constructor stores its arguments unchanged:
        num_nodes: number of nodes in the graph.
        features:  per-node feature collection; its length is reported by ``__str__``.
    """

    def __init__(self, num_nodes, features):
        self.num_nodes, self.features = num_nodes, features

    def __str__(self):
        # One-line summary: node count and number of feature rows.
        template = 'nodes: %d  features: %d'
        return template % (self.num_nodes, len(self.features))

In [None]:
# Read one graph description file and build a Graph object from it.
def getGraph(filename):
    """Parse a whitespace-separated graph file into a ``Graph``.

    File layout, as consumed here:
      * line 1: ``<num_nodes> <label>`` (two integers);
      * one line per node ``j`` (``num_nodes`` lines in total):
        ``<k> <n_1> ... <n_k> <a_1> ... <a_m>`` where ``k`` is the number of
        neighbour indices that follow and the remaining tokens are the
        node's float attributes.

    Args:
        filename: path of the graph file.

    Returns:
        ``Graph(num_nodes, edge_pairs, node_features, label, node_degs)`` with
        ``edge_pairs`` a list of ``[j, neighbour]`` pairs, ``node_features`` a
        list of 1-D numpy arrays and ``node_degs`` a list of ``k + 1`` values.

    Raises:
        ValueError: if the header does not hold exactly two integers, a token
            cannot be parsed, or the file ends before ``num_nodes`` node lines
            were read.
    """
    node_features = []
    edge_pairs = []
    node_degs = []

    # The context manager guarantees the handle is released even when parsing fails.
    with open(filename) as handle:
        num_nodes, label = (int(token) for token in handle.readline().split())

        for node in range(num_nodes):
            tokens = handle.readline().split()
            if not tokens:
                raise ValueError(
                    '%s: expected %d node lines, file ended at node %d'
                    % (filename, num_nodes, node))

            neighbor_count = int(tokens[0])
            neighbors = [int(token) for token in tokens[1:neighbor_count + 1]]
            attributes = np.array([float(token) for token in tokens[neighbor_count + 1:]])

            node_features.append(attributes)
            # Stored degree is the neighbour count plus one.
            node_degs.append(neighbor_count + 1)
            edge_pairs.extend([node, neighbor] for neighbor in neighbors)

    return Graph(num_nodes, edge_pairs, node_features, label, node_degs)

In [None]:
"""无序列化"""
def getGraphList(files):
    """Load every path in ``files`` with ``getGraph``.

    Returns:
        A list holding one parsed graph per input path, in input order.
    """
    return [getGraph(path) for path in files]

In [None]:
def merge_Graph(graph_list, files_n, edge_folder):
    """Merge a batch of graphs into one block-diagonal graph plus its features.

    Node indices of graph ``g`` are shifted by the total node count of the
    graphs before it, so all graphs share one adjacency matrix.

    Args:
        graph_list: sequence of graph objects exposing ``num_nodes``,
            ``edge_pairs`` (pairs of node indices local to the graph) and
            ``features`` (equal-length per-node feature rows).
        files_n: paths parallel to ``graph_list``. Only the last ``/``-separated
            component, with every ``.txt`` removed, is used to locate the
            matching edge-feature file.
        edge_folder: directory holding ``<name>.txt`` edge-feature files. The
            first line of each file is skipped; every other non-blank line is
            ``<index> <f_1> ... <f_m>`` and contributes one feature row (the
            index column is parsed but not used).

    Returns:
        Tuple ``(n2n, node_features, node_degs, graph_sizes,
        merged_edge_features, in_degrees_inv)``:
          * ``n2n``: sparse float ``[N, N]`` adjacency with a 1 per edge;
          * ``node_features``: float tensor of all node feature rows;
          * ``node_degs``: sparse ``[N, N]`` diagonal matrix of ``in_degrees_inv``;
          * ``graph_sizes``: list with the node count of each graph;
          * ``merged_edge_features``: stacked edge-file rows, in file order;
          * ``in_degrees_inv``: ``1 / in-degree`` per node, 0 where the
            in-degree is 0.

    Raises:
        ValueError: if ``graph_list`` is empty.
    """
    if len(graph_list) == 0:
        raise ValueError('graph_list must contain at least one graph')

    node_rows = []       # feature rows of every node, in batch order
    graph_sizes = []     # node count of each graph
    edge_rows = []       # one tensor per edge-file data line
    sources, targets = [], []   # global (offset) endpoints of every edge
    offset = 0           # number of nodes contributed by earlier graphs

    for idx, graph in enumerate(graph_list):
        node_rows.extend(graph.features)
        graph_sizes.append(graph.num_nodes)

        for pair in graph.edge_pairs:
            sources.append(int(pair[0]) + offset)
            targets.append(int(pair[1]) + offset)
        offset += graph.num_nodes

        # Locate and read the edge-feature file that belongs to this graph.
        base_name = files_n[idx].split('/')[-1].replace('.txt', '')
        edge_file_path = os.path.join(edge_folder, base_name + '.txt')
        with open(edge_file_path, 'r') as edge_file:
            next(edge_file, None)  # header line carries no feature row
            for line in edge_file:
                tokens = line.split()
                if not tokens:
                    continue  # tolerate blank lines (e.g. at the end of the file)
                values = [float(token) for token in tokens]
                edge_rows.append(torch.tensor(values[1:]))

    total_num_nodes = offset
    shape = torch.Size([total_num_nodes, total_num_nodes])

    # Batched adjacency matrix: one entry of 1.0 per listed edge.
    n2n_idxes = torch.tensor([sources, targets], dtype=torch.long)
    n2n_vals = torch.ones(len(sources), dtype=torch.float32)
    n2n = torch.sparse_coo_tensor(n2n_idxes, n2n_vals, shape)

    # Converting through a single ndarray avoids the slow list-of-arrays path.
    node_features = torch.as_tensor(np.asarray(node_rows), dtype=torch.float32)
    merged_edge_features = torch.stack(edge_rows)

    # In-degree of a node = number of edges whose second endpoint is that node.
    in_degrees = torch.zeros(total_num_nodes, dtype=torch.float32)
    in_degrees.index_add_(0, n2n_idxes[1], n2n_vals)
    in_degrees_inv = torch.pow(in_degrees, -1)
    in_degrees_inv[torch.isinf(in_degrees_inv)] = 0  # nodes without incoming edges

    # Sparse diagonal matrix holding the inverse in-degrees.
    diagonal = torch.arange(total_num_nodes)
    node_degs = torch.sparse_coo_tensor(
        torch.stack([diagonal, diagonal], dim=0), in_degrees_inv, shape)

    return n2n, node_features, node_degs, graph_sizes, merged_edge_features, in_degrees_inv
    

In [None]:
"""序列化图的合并"""
def merge_Graph_sort(graph_list):
    """Concatenate the node features of several feature-only graphs.

    Args:
        graph_list: graphs exposing ``features`` (per-node feature rows) and
            ``num_nodes``.

    Returns:
        ``(node_features, graph_sizes)``: a float tensor with the feature rows
        of all graphs in order, and the list of per-graph node counts.
    """
    graphs = list(graph_list)
    stacked_rows = [row for graph in graphs for row in graph.features]
    sizes = [graph.num_nodes for graph in graphs]
    # Turn the collected feature rows into one tensor.
    return torch.FloatTensor(stacked_rows), sizes
    

In [None]:
class GraphConvolution(nn.Module):
    """Graph convolution that propagates node features and fuses them with edge features.

    ``forward`` performs, in order:
      1. build ``(A + I)^T`` and give every stored entry ``(r, c)`` the weight
         ``d[r]^-1/2 * d[c]^-1/2``, where ``d`` are the row sums of ``(A + I)^T``;
      2. project node features with ``weight`` to get ``H0`` and iterate
         ``H <- alpha * A_norm @ H + (1 - alpha) * H0`` ``num_iterations`` times;
      3. project ``edge_features`` with ``edge_weight`` and fuse both with a
         bilinear layer;
      4. left-multiply by ``degs``, apply ``activation`` (if any), add ``bias``
         (if any).

    Args:
        in_features: width of the incoming node AND edge feature rows (both are
            multiplied by an ``[in_features, out_features]`` matrix).
        out_features: width of the produced rows.
        alpha: mixing weight between propagated and initial features.
        num_iterations: number of propagation steps.
        activation: optional callable applied before the bias is added.
        bias: whether to learn an additive bias.
    """

    def __init__(self, in_features, out_features, alpha=0.5, num_iterations=2, activation=None, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.num_iterations = num_iterations
        self.activation = activation
        # Allocated uninitialised; reset_parameters() fills them below.
        self.weight = Parameter(torch.empty(in_features, out_features))
        self.edge_weight = Parameter(torch.empty(in_features, out_features))

        self.biLinear = nn.Bilinear(out_features, out_features, out_features)

        if bias:
            self.bias = Parameter(torch.empty(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise weight, edge_weight and bias uniformly in ``[-1/sqrt(out), 1/sqrt(out)]``."""
        stdv = 1. / math.sqrt(self.weight.size(1))
        nn.init.uniform_(self.weight, -stdv, stdv)
        nn.init.uniform_(self.edge_weight, -stdv, stdv)
        if self.bias is not None:
            nn.init.uniform_(self.bias, -stdv, stdv)

    @staticmethod
    def _normalized_adjacency(adj):
        """Return the sparse matrix ``D^-1/2 (A + I)^T D^-1/2`` for sparse ``adj``.

        The self-loops are appended as index pairs instead of materialising a
        dense ``N x N`` identity, and the normalisation is computed for the
        whole batch in one pass. As long as no edge connects two different
        graphs of the batch this equals normalising each graph separately.
        Stored edge values only enter the degree; each entry's final weight
        is the product of the two inverse square-root degrees.
        """
        num_nodes = adj.size(0)
        device = adj.device
        sources, targets = adj._indices()
        values = adj._values()
        loops = torch.arange(num_nodes, device=device)

        # Transposed orientation: rows are edge targets, columns are edge sources.
        rows = torch.cat([targets, loops])
        cols = torch.cat([sources, loops])
        weights = torch.cat([values, torch.ones(num_nodes, dtype=values.dtype, device=device)])

        degree = torch.zeros(num_nodes, dtype=values.dtype, device=device)
        degree.index_add_(0, rows, weights)
        deg_inv_sqrt = torch.pow(degree, -0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0

        norm_values = deg_inv_sqrt[rows] * deg_inv_sqrt[cols]
        return torch.sparse_coo_tensor(torch.stack([rows, cols], dim=0), norm_values, adj.size())

    def forward(self, features, adj, degs, edge_features, graph_sizes):
        """Apply the layer to a batch of graphs.

        Args:
            features: ``[N, in_features]`` node features.
            adj: sparse ``[N, N]`` adjacency of the whole batch.
            degs: matrix (sparse or dense) left-multiplied onto the fused features.
            edge_features: ``[N, in_features]`` rows, one per node row of ``features``.
            graph_sizes: per-graph node counts; kept for interface
                compatibility, the normalisation no longer needs it.

        Returns:
            ``(output, transformed_edge_features)``, both ``[N, out_features]``.

        Raises:
            ValueError: if ``edge_features`` and ``features`` differ in row count.
        """
        adj_norm = self._normalized_adjacency(adj)

        # Node feature transformation; kept as the "initial" term of every step.
        support = torch.mm(features, self.weight)
        hidden = support
        for _ in range(self.num_iterations):
            hidden = self.alpha * torch.mm(adj_norm, hidden) + (1 - self.alpha) * support

        # Transform edge features
        transformed_edge_features = torch.mm(edge_features, self.edge_weight)

        # The bilinear fusion pairs row i of both inputs, so row counts must agree.
        if transformed_edge_features.size(0) != features.size(0):
            raise ValueError("Edge features size must match the number of nodes")

        updated_features = self.biLinear(hidden, transformed_edge_features)

        # Apply degree matrix
        output = torch.mm(degs, updated_features)

        if self.activation is not None:
            output = self.activation(output)

        # Note: the bias is added after the activation.
        if self.bias is not None:
            output = output + self.bias

        return output, transformed_edge_features

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

In [None]:
class SelfAttentionPool(nn.Module):
    """Attention-weighted readout producing one ``pooled_dim`` vector per graph.

    For every graph in the batch the node rows are projected to queries, keys
    and values. Each node's score is the mean of its scaled dot-product
    attention row, multiplied by the node's entry in ``in_degrees`` and
    soft-maxed over the graph's nodes. The value rows are summed with those
    weights, then layer-normalised and passed through ReLU.

    ``linear_agg`` and ``weight_layer`` are created but not used by ``forward``.
    """

    def __init__(self, feature_dim, pooled_dim):
        super().__init__()
        self.feature_dim = feature_dim
        self.pooled_dim = pooled_dim

        # Query / key / value projections of the self-attention.
        self.query_layer = nn.Linear(feature_dim, pooled_dim)
        self.key_layer = nn.Linear(feature_dim, pooled_dim)
        self.value_layer = nn.Linear(feature_dim, pooled_dim)

        # Originally meant for aggregating the features of dropped nodes.
        self.linear_agg = nn.Linear(feature_dim, pooled_dim)
        self.norm_layer = nn.LayerNorm(pooled_dim)  # normalisation layer

        # Originally meant for weighting dropped nodes.
        self.weight_layer = nn.Linear(feature_dim, 1)

    def _pool_single_graph(self, nodes, node_weights):
        """Pool the node rows of one graph into a ``[1, pooled_dim]`` tensor."""
        q = self.query_layer(nodes)
        k = self.key_layer(nodes)
        v = self.value_layer(nodes)

        # Scaled dot-product attention, reduced to one score per node.
        scores = torch.matmul(q, k.transpose(0, 1)) / (self.pooled_dim ** 0.5)
        scores = scores.mean(dim=1)

        # Modulate by the per-node weights, then normalise over the graph.
        scores = F.softmax(scores * node_weights, dim=0)

        pooled = torch.sum(scores.unsqueeze(1) * v, dim=0, keepdim=True)
        return F.relu(self.norm_layer(pooled))

    def forward(self, x, graph_sizes, in_degrees):
        """Pool a batch of graphs.

        Args:
            x: node feature rows of all graphs, concatenated in batch order.
            graph_sizes: number of node rows belonging to each graph.
            in_degrees: one weight per node row, multiplied into the attention
                scores. NOTE(review): the training code in this notebook passes
                the inverse in-degree vector here — confirm that is intended.

        Returns:
            Tensor with one pooled row per graph.
        """
        pooled_rows = []
        offset = 0
        for n_nodes in graph_sizes:
            stop = offset + n_nodes
            pooled_rows.append(
                self._pool_single_graph(x[offset:stop], in_degrees[offset:stop]))
            offset = stop

        return torch.cat(pooled_rows, dim=0)

In [None]:
class AugGcn(nn.Module):
    """Three stacked ``GraphConvolution`` layers with batch normalisation in between.

    Layer widths are ``feature_dim -> hid_dim -> hid_dim -> out_dim``; every
    convolution uses ``F.leaky_relu`` as activation. The input features and the
    outputs of the first two convolutions are batch-normalised. The transformed
    edge features returned by one convolution are fed into the next one.
    """

    def __init__(self, feature_dim, hid_dim, out_dim):
        super().__init__()
        self.feature_dim, self.hid_dim, self.out_dim = feature_dim, hid_dim, out_dim

        self.bn = nn.BatchNorm1d(feature_dim)

        self.conv1 = GraphConvolution(feature_dim, hid_dim, activation=F.leaky_relu)
        self.bn1 = nn.BatchNorm1d(hid_dim)

        self.conv2 = GraphConvolution(hid_dim, hid_dim, activation=F.leaky_relu)
        self.bn2 = nn.BatchNorm1d(hid_dim)

        self.conv3 = GraphConvolution(hid_dim, out_dim, activation=F.leaky_relu)

    def forward(self, features, graph_adj, deg, edge_f, graph_sizes, in_degree):
        """Encode all nodes of the batch.

        Args:
            features: node feature rows of the whole batch.
            graph_adj, deg, graph_sizes: forwarded unchanged to every convolution.
            edge_f: edge feature rows consumed by the first convolution.
            in_degree: accepted but not used by this module.

        Returns:
            The node output of the third convolution.
        """
        hidden = self.bn(features)

        hidden, edge_state = self.conv1(hidden, graph_adj, deg, edge_f, graph_sizes)
        hidden = self.bn1(hidden)

        hidden, edge_state = self.conv2(hidden, graph_adj, deg, edge_state, graph_sizes)
        hidden = self.bn2(hidden)

        node_out, _ = self.conv3(hidden, graph_adj, deg, edge_state, graph_sizes)
        return node_out

In [None]:
"""分类模型"""
class Classifier(nn.Module):
    """Graph-level classifier: GCN encoder, attention readout, MLP head.

    Pipeline of ``forward``: ``AugGcn`` node encoder -> ``SelfAttentionPool``
    readout (one 128-wide row per graph) -> BatchNorm/Linear(128, 256)/ReLU/
    Dropout -> BatchNorm/Linear(256, 64)/ReLU/Dropout -> BatchNorm/
    Linear(64, classNum).

    Args:
        classNum: number of output units per graph.
        dropout_rate: dropout probability used after both hidden layers.
        nfeat: input node feature width.
        nhid: hidden width of the encoder.
        out_dim: output width of the encoder (input width of the readout).
    """

    def __init__(self, classNum, dropout_rate, nfeat, nhid, out_dim):
        super().__init__()
        self.classNum = classNum
        self.dropout_rate = dropout_rate

        self.g_s = AugGcn(nfeat, nhid, out_dim)

        # forward() reads self.sa_pool; it was never created, which made every
        # forward pass fail with AttributeError. It pools to 128 features
        # because that is the width the head below (self.bn) expects.
        self.sa_pool = SelfAttentionPool(out_dim, 128)

        self.bn = nn.BatchNorm1d(128)
        self.denseLayer1 = nn.Linear(128, 256)
        self.dropout1 = nn.Dropout(p=self.dropout_rate)

        self.bn2 = nn.BatchNorm1d(256)
        self.denseLayer2 = nn.Linear(256, 64)
        self.dropout2 = nn.Dropout(p=self.dropout_rate)

        self.bn4 = nn.BatchNorm1d(64)
        self.outputLayer = nn.Linear(64, classNum)

    def forward(self, features, graphs, degs, graph_sizes, edge_f, in_degree):
        """Score every graph of the batch.

        Args:
            features: node feature rows of the whole batch.
            graphs: batched adjacency passed to the encoder.
            degs: degree matrix passed to the encoder.
            graph_sizes: number of nodes of each graph.
            edge_f: edge feature rows passed to the encoder.
            in_degree: per-node weights passed to the encoder and the readout.

        Returns:
            The flattened output of the last linear layer (raw scores, no
            sigmoid applied).
        """
        node_embeddings = self.g_s(features, graphs, degs, edge_f, graph_sizes, in_degree)

        res = self.sa_pool(node_embeddings, graph_sizes, in_degree)

        res = self.bn(res)
        res = F.relu(self.denseLayer1(res))
        res = self.dropout1(res)

        res = self.bn2(res)
        res = F.relu(self.denseLayer2(res))
        res = self.dropout2(res)

        res = self.bn4(res)
        output = self.outputLayer(res)

        return output.flatten()
     

In [None]:
# Build the labelled list of graph files and make an initial 80/20 split.
path = '../Data/Graph_normal/'         # graphs without vulnerabilities -> label 0.0
pathA = '../Data/Graph_vulnerable/'    # graphs with vulnerabilities    -> label 1.0

files = os.listdir(path)
filesA = os.listdir(pathA)

# Normal graphs first, vulnerable graphs second; labels follow the same order.
graph_filename = [path + name for name in files] + [pathA + name for name in filesA]
all_label = [0.0] * len(files) + [1.0] * len(filesA)
length = len(files) + len(filesA)

from sklearn.utils import shuffle
graph_filename, all_label = shuffle(graph_filename, all_label)

k = 0.8    # fraction of the samples used for training
k1 = 0.9   # not referenced by the code visible in this notebook

# First k*length shuffled samples train the model, the remainder is the test set.
train_graph_data, test_graph_data = graph_filename[:int(k*length)], graph_filename[int(k*length):]
train_label, test_label = all_label[:int(k*length)], all_label[int(k*length):]

len(train_graph_data), len(test_graph_data)

In [None]:
def evaluate(predict, t_label, thresh):
    """Threshold ``predict`` and score it against ``t_label``.

    Args:
        predict: tensor of scores; entries ``>= thresh`` become 1.0, others 0.0.
        t_label: tensor of ground-truth labels.
        thresh: decision threshold.

    Returns:
        ``(precision, recall, f1, accuracy)``; precision, recall and F1 are
        computed with ``zero_division=0``. No AUC is computed here.
    """
    y_true = t_label.data.cpu()
    y_pred = (predict >= thresh).float().data.cpu()

    return (
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
        accuracy_score(y_true, y_pred),
    )

def test_evaluate(predict,t_label,thresh):  
    pre = (predict >= thresh).float()
#     print(predict,t_label)
    accuracy = accuracy_score(t_label.data.cpu(),pre.data.cpu())
    precision = precision_score(t_label.data.cpu(),pre.data.cpu(), zero_division = 0)
    recall = recall_score(t_label.data.cpu(),pre.data.cpu(), zero_division = 0)
    f1 = f1_score(t_label.data.cpu(),pre.data.cpu(), zero_division = 0)
    # fpr,tpr,_ = roc_curve(t_label.data.cpu(),predict.data.cpu())
    auc = roc_auc_score(t_label.data.cpu(),predict.data.cpu())

    return precision, recall, f1, accuracy, auc 

In [None]:
def test():
    """Evaluate the global ``model`` on the global test split.

    Builds one batch from ``test_graph_data`` / ``test_label``, runs the model
    in eval mode without gradients, and scores ``sigmoid(output)`` against the
    labels with ``test_evaluate`` using the global ``threshold``.

    Tensors are moved to the device the model's parameters live on, so the
    function also works when no GPU is available (the model itself is only
    moved to CUDA when ``torch.cuda.is_available()``).

    Returns:
        ``(precision, recall, f1, accuracy, auc)``.
    """
    device = next(model.parameters()).device
    model.eval()

    with torch.no_grad():
        graphs, features, node_degs, graph_sizes, edge_f, in_degree = merge_Graph(
            getGraphList(test_graph_data), test_graph_data, '../Data/Graph_edge/')

        graphs = graphs.to(device)
        node_degs = node_degs.to(device)
        features = features.to(device)
        edge_f = edge_f.to(device)
        in_degree = in_degree.to(device)
        labels = torch.FloatTensor(test_label).to(device)

        # Calls Classifier.forward; the model returns raw scores (logits).
        output = model(features, graphs, node_degs, graph_sizes, edge_f, in_degree)
        precesion, recall, f1, acc, auc = test_evaluate(torch.sigmoid(output), labels, threshold)

    return precesion, recall, f1, acc, auc

def train(epoch):
    """Run one training epoch over the global training split.

    Uses the notebook-level ``model``, ``loss``, ``optimizer``, ``scheduler``,
    ``batch_size`` and ``threshold``. The data is consumed in consecutive
    mini-batches of ``batch_size`` files; ``scheduler.step()`` is called once
    per batch.

    Checkpointing: whenever a batch reaches a lower loss than every earlier
    batch of this epoch, ``model.state_dict()`` is written to
    ``checkpoint.pt`` (the file ``start_training`` loads afterwards). The
    previous code called the non-existent ``model.save_to_state_dict`` and
    never wrote the file.
    NOTE(review): the best loss is tracked per epoch, so the file ends up
    holding the best batch of the last epoch — confirm this is the intended
    checkpoint policy.

    Training accuracy is computed from ``sigmoid(output)`` so that
    ``threshold`` is applied to probabilities, consistently with ``test()``.

    Args:
        epoch: zero-based epoch index, used only for the progress message.

    Raises:
        ValueError: if ``train_graph_data`` is empty.
    """
    if len(train_graph_data) == 0:
        raise ValueError('train_graph_data is empty; nothing to train on')

    model.train()
    # Follow the model's device instead of assuming CUDA is available.
    device = next(model.parameters()).device
    batch_accuracies = []
    best_loss = float('inf')

    # assumes train_label is parallel to train_graph_data (same length)
    for offset in range(0, len(train_graph_data), batch_size):
        graph_files = train_graph_data[offset:offset + batch_size]
        labels = train_label[offset:offset + batch_size]

        graphs, features, node_degs, graph_sizes, edge_f, in_degree = merge_Graph(
            getGraphList(graph_files), graph_files, '../Data/Graph_edge/')

        graphs = graphs.to(device)
        node_degs = node_degs.to(device)
        features = features.to(device)
        edge_f = edge_f.to(device)
        in_degree = in_degree.to(device)
        labels = torch.FloatTensor(labels).to(device)

        # Calls Classifier.forward; the model returns raw scores (logits).
        output = model(features, graphs, node_degs, graph_sizes, edge_f, in_degree)

        batch_loss_tensor = loss(output, labels)
        optimizer.zero_grad()
        batch_loss_tensor.backward()
        batch_loss = batch_loss_tensor.item()

        # Save before the optimizer step: these are the weights that produced batch_loss.
        if batch_loss < best_loss:
            best_loss = batch_loss
            torch.save(model.state_dict(), 'checkpoint.pt')

        optimizer.step()
        scheduler.step()

        _, _, _, acc = evaluate(torch.sigmoid(output), labels, threshold)
        batch_accuracies.append(acc)

    accuracy = np.array(batch_accuracies).mean()
    # Reports the loss of the last batch and the mean batch accuracy.
    print('epoch %d, train_loss : %f trainacc: %f ' % (epoch+1, batch_loss, accuracy))

In [None]:
# Hyper-parameters shared by the training and evaluation functions.
sortk = 200        # not referenced by the code visible in this notebook
batch_size = 32    # number of graphs per mini-batch
# Decision threshold for the binary classification. The best value still has
# to be searched for (0.45 was tried; a higher value such as 0.9 was suggested).
threshold = 0.6

# classNum: number of output units, nfeat: input feature width,
# nhid: hidden width, out_dim: encoder output width.
model = Classifier(classNum=1, dropout_rate=0.5, nfeat=100, nhid=64, out_dim=128)
if torch.cuda.is_available():
    model.cuda()

# BCEWithLogitsLoss combines a sigmoid layer and binary cross-entropy in one class.
loss = nn.BCEWithLogitsLoss()
# NOTE(review): CyclicLR is expected to drive the learning rate between
# base_lr and max_lr, which would make lr=0.1 irrelevant — confirm.
optimizer = optim.Adam(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.CyclicLR(
    optimizer, base_lr=0.0001, max_lr=0.0004, cycle_momentum=False)

model.parameters

In [None]:
# Result accumulators; each name is bound to its own empty list.
(train_loss, test_acc, train_acc, test_loss, l_rs,
 Precesion, Recall, F1_score, AUC) = ([] for _ in range(9))

# Directory the result files are written to.
#wf = './BFS_EA_RGCN(SG)/'
wf = '../Data/Record/'
early_stop_flag = False

In [None]:
def start_training(num_epochs=100):
    """Train the global ``model`` and restore the weights saved in ``checkpoint.pt``.

    Before every epoch the global training split is reshuffled (files and
    labels together) and rebound, then ``train(epoch)`` is called. After the
    last epoch the state dict stored in ``checkpoint.pt`` is loaded into
    ``model``.

    Args:
        num_epochs: number of epochs to run; defaults to the previously
            hard-coded 100.
    """
    # Only the names that are rebound need the global declaration.
    global train_graph_data, train_label
    for epoch in range(num_epochs):
        train_graph_data, train_label = shuffle(train_graph_data, train_label)
        train(epoch)
    model.load_state_dict(torch.load('checkpoint.pt'))

In [None]:
# Repeat the whole experiment 30 times, each with a fresh split and a fresh model.
for i in tqdm(range(30)):
    s_time = time()
    graph_filename, all_label = shuffle(graph_filename, all_label)

    # First 80% of the reshuffled samples for training, remaining 20% for testing.
    train_graph_data, test_graph_data = graph_filename[:int(k*length)], graph_filename[int(k*length):]
    train_label, test_label = all_label[:int(k*length)], all_label[int(k*length):]

    model = Classifier(classNum=1, dropout_rate=0.5, nfeat=100, nhid=64, out_dim=128)
    if torch.cuda.is_available():
        model.cuda()

    loss = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.1)
    scheduler = optim.lr_scheduler.CyclicLR(
        optimizer, base_lr=0.0001, max_lr=0.0004, cycle_momentum=False)

    start_training()

    pre, recall, f1, acc, auc = test()
    e_time = time()

    # Append this run's metrics to the per-run log, then echo them with the run time.
    with open(wf+'Mytest_result-2.txt', 'a') as f:
        print('exp-%d acc: %f  precesion: %f  recall: %f  f1_score: %f auc: %f\n' % (i,acc, pre, recall, f1,auc),file = f)
    print('exp-%d acc: %f  precesion: %f  recall: %f  f1_score: %f auc: %f time: %f' % (i,acc, pre, recall, f1,auc, e_time - s_time))

    test_acc.append(acc)
    Precesion.append(pre)
    Recall.append(recall)
    F1_score.append(f1)
    AUC.append(auc)

# Mean and variance of every metric over all runs go to the summary file.
with open(wf+'Mytest_result.txt', 'a') as f:
    print('ave_acc: %f  ave_precesion: %f  ave_recall: %f  ave_f1_score: %f ave_auc: %f\n'
          % tuple(np.array(runs).mean() for runs in (test_acc, Precesion, Recall, F1_score, AUC)), file=f)
    print('var_acc: %f  var_precesion: %f  var_recall: %f  var_f1_score: %f var_auc: %f\n'
          % tuple(np.array(runs).var() for runs in (test_acc, Precesion, Recall, F1_score, AUC)), file=f)

In [None]:
# Save the model object currently bound to `model` into the record directory
# as 'GcnP.pkl'. This stores the whole module object, not just a state_dict.
# NOTE(review): loading such a file presumably needs the model classes to be
# importable/defined at load time — confirm, or consider saving
# model.state_dict() instead.
torch.save(model, wf+'GcnP.pkl')