In [55]:
import torch
import numpy as np
import torch.nn.functional as F
import scipy.sparse as sp
from torch_geometric.nn import GCNConv,GATConv,SAGEConv
from torch_geometric.datasets import Planetoid

In [56]:
# 把标签转换成onehot
def encode_onehot(labels):                                   
    classes = set(labels)
    classes_dict = {c: np.identity(len(classes))[i, :] for i, c in enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)
    return labels_onehot

In [66]:
# 测试onehot函数
labels = ("input","output","entityFunctionalStatus","catalystActivity","regulatedBy")
a = encode_onehot(labels)
print (a)

[[0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 0 1]
 [0 0 0 1 0]
 [1 0 0 0 0]]


In [72]:
def normalize(mx):                                          # 归一化
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten() ## 幂
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv) # 对角化
    mx = r_mat_inv.dot(mx) # 乘积
    return mx

In [79]:
# 测试normailize 函数
mx1 =  np.arange(0, 30, 2).reshape(3,5).astype(np.float32)
print (mx1)
rowsum = np.array(mx1.sum(1))
print (rowsum)
r_inv = np.power(rowsum, -1).flatten()
print (r_inv)
print (normalize(mx1))
#a = torch.rand(2,3,4)
#print (a)

[[ 0.  2.  4.  6.  8.]
 [10. 12. 14. 16. 18.]
 [20. 22. 24. 26. 28.]]
[ 20.  70. 120.]
[0.05       0.01428571 0.00833333]
[[0.         0.1        0.2        0.3        0.4       ]
 [0.14285715 0.17142858 0.2        0.22857143 0.25714287]
 [0.16666667 0.18333334 0.20000002 0.21666668 0.23333335]]


In [87]:
def  load_data(nodes="/public/home/liujunwu/workdir/scripts/GNN_Reactome/test_data/Cora/Cora/cora.content",edges="/public/home/liujunwu/workdir/scripts/GNN_Reactome/test_data/Cora/Cora/cora.cites"):
    idx_features_labels = np.genfromtxt("{}".format(nodes),dtype=np.dtype(str))# 读取节点特征和标签
    print (idx_features_labels[:,-1])
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32) # 读取节点特征
    dict = {int(element):i for i,element in enumerate(idx_features_labels[:, 0:1].reshape(-1))}    # 建立字典
    labels = encode_onehot(idx_features_labels[:, -1])                       # 节点标签用onehot方式表示
    e = np.genfromtxt("{}".format(edges), dtype=np.int32)    # 读取边信息
    edges = []
    for i, x in enumerate(e):
        edges.append([dict[e[i][0]], dict[e[i][1]]])                         # 若A->B有边 则B->A 也有边   ### 后续这里要修改？需要是有向图                 
        edges.append([dict[e[i][1]], dict[e[i][0]]])                         # 给的数据是没有从0开始需要转换
    features = normalize(features)                                           # 特征值归一化       
    features = torch.tensor(np.array(features.todense()), dtype=torch.float32)
    labels = torch.LongTensor(np.where(labels)[1])                       
    edges = torch.tensor(edges, dtype=torch.int64).T
    return features, edges, labels
features, edges, labels = load_data()                         
idx_train = range(2000)                                       # 其中2000个点是训练数据                   
idx_test = range(2000, 2700)                                  # 700个测试数据
idx_train = torch.LongTensor(idx_train) # 用于表示包含整数（64整型数据）的张量
idx_test = torch.LongTensor(idx_test)
#print (labels.tolist()) 

['Neural_Networks' 'Rule_Learning' 'Reinforcement_Learning' ...
 'Genetic_Algorithms' 'Case_Based' 'Neural_Networks']


In [93]:
class GCN(torch.nn.Module):
    def __init__(self,feature,hidden,classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(feature, hidden)  #输入=节点特征维度，hidden是中间隐藏神经元个数
        self.conv2 = GCNConv(hidden, classes)
    def forward(self, features, edges):
        features = self.conv1(features, edges)
        features = F.relu(features)
        features = F.dropout(features, training=self.training)
        features = self.conv2(features, edges)
        return F.log_softmax(features, dim=1)

In [94]:
# 设置参数和模型定义
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(1433, 64, 7)                                               # 节点特征维度，隐藏神经元个数，标签数目
optimizer = torch.optim.Adam(model.parameters(),lr=0.01,weight_decay=5e-4)               # 梯度优化算法

In [97]:
#模型训练
model.train()
for epoch in range(300):
    optimizer.zero_grad()
    out = model(features, edges)    #模型的输入有节点特征还有边特征,使用的是全部数据
    loss = F.nll_loss(out[idx_train], labels[idx_train])    #损失仅仅计算的是训练集的损失
    loss.backward()
    optimizer.step()
    if (epoch > 250):
        print(f"epoch:{epoch+1}, loss:{loss.item()}")
    else:
        pass



epoch:252, loss:0.3696596324443817
epoch:253, loss:0.364935964345932
epoch:254, loss:0.3683640658855438
epoch:255, loss:0.3657892048358917
epoch:256, loss:0.3638060390949249
epoch:257, loss:0.3660072982311249
epoch:258, loss:0.36475828289985657
epoch:259, loss:0.36472803354263306
epoch:260, loss:0.36476242542266846
epoch:261, loss:0.36531245708465576
epoch:262, loss:0.3770420253276825
epoch:263, loss:0.3715515434741974
epoch:264, loss:0.37264105677604675
epoch:265, loss:0.36958810687065125
epoch:266, loss:0.36638206243515015
epoch:267, loss:0.3630881905555725
epoch:268, loss:0.3794698417186737
epoch:269, loss:0.369625061750412
epoch:270, loss:0.3701733648777008
epoch:271, loss:0.3751410245895386
epoch:272, loss:0.36737310886383057
epoch:273, loss:0.36584511399269104
epoch:274, loss:0.378248929977417
epoch:275, loss:0.37657520174980164
epoch:276, loss:0.3594202995300293
epoch:277, loss:0.3686261773109436
epoch:278, loss:0.3610154688358307
epoch:279, loss:0.3707183599472046
epoch:280, lo

In [99]:
#测试：
model.eval()
_,pred = model(features,edges).max(dim=1)

correct = pred[idx_test].eq(labels[idx_test]).sum()            # 计算预测与标签相等个数
acc = int(correct) / int(len(idx_test))                        # 计算正确率
print(acc)
# 另一种写法
'''
test_predict = model(data.x, data.edge_index)[data.test_mask]
max_index = torch.argmax(test_predict, dim=1)
test_true = data.y[data.test_mask]
correct = 0
for i in range(len(max_index)):
    if max_index[i] == test_true[i]:
        correct += 1
print('测试集准确率为：{}%'.format(correct*100/len(test_true)
'''

0.8557142857142858


"\ntest_predict = model(data.x, data.edge_index)[data.test_mask]\nmax_index = torch.argmax(test_predict, dim=1)\ntest_true = data.y[data.test_mask]\ncorrect = 0\nfor i in range(len(max_index)):\n    if max_index[i] == test_true[i]:\n        correct += 1\nprint('测试集准确率为：{}%'.format(correct*100/len(test_true)\n"