In [2]:
import torch
import numpy as np
import pandas as pd
import copy
import torch.nn.functional as F
import scipy.sparse as sp
from torch_geometric.nn import GCNConv,GATConv,SAGEConv
from torch_geometric.datasets import Planetoid

In [3]:
# 把标签转换成onehot
def encode_onehot(labels):                                   
    classes = set(labels)
    classes_dict = {c: np.identity(len(classes))[i, :] for i, c in enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)
    return labels_onehot

In [4]:
# 测试onehot函数
labels = ("input","output","entityFunctionalStatus","catalystActivity","regulatedBy")
a = encode_onehot(labels)
print (a)

[[0 0 0 1 0]
 [0 0 1 0 0]
 [0 0 0 0 1]
 [1 0 0 0 0]
 [0 1 0 0 0]]


In [5]:
def normalize(mx):                                          # 归一化
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten() ## 幂
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv) # 对角化
    mx = r_mat_inv.dot(mx) # 乘积
    mx = mx * 10000
    return mx

In [79]:
# 测试normailize 函数
mx1 =  np.arange(0, 30, 2).reshape(3,5).astype(np.float32)
print (mx1)
rowsum = np.array(mx1.sum(1))
print (rowsum)
r_inv = np.power(rowsum, -1).flatten()
print (r_inv)
print (normalize(mx1))
#a = torch.rand(2,3,4)
#print (a)

[[ 0.  2.  4.  6.  8.]
 [10. 12. 14. 16. 18.]
 [20. 22. 24. 26. 28.]]
[ 20.  70. 120.]
[0.05       0.01428571 0.00833333]
[[0.         0.1        0.2        0.3        0.4       ]
 [0.14285715 0.17142858 0.2        0.22857143 0.25714287]
 [0.16666667 0.18333334 0.20000002 0.21666668 0.23333335]]


In [21]:
GTEx_exp = pd.read_csv("/public/home/liujunwu/workdir/scripts/GNN_Reactome/GTEx_exp/GTEx_exp.csv.test",sep=',',header=0)
print (GTEx_exp.iloc[0:3,0:3])
sample_labels = GTEx_exp["Tissue"]
sample_labels_onehot = encode_onehot(sample_labels)
print (sample_labels_onehot)


                     Sample          Tissue  DDX11L1
0  GTEX-1117F-0226-SM-5GZZ7  Adipose Tissue      0.0
1  GTEX-1117F-0426-SM-5EGHI          Muscle      0.0
2  GTEX-1117F-0526-SM-5EGHJ    Blood Vessel      0.0
[[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]


In [230]:
#print (GTEx_exp.iloc[0:3,-2:-1])
# log2表达量归一化
normlizeGTEx = pd.DataFrame(normalize(GTEx_exp.iloc[:,2:GTEx_exp.shape[1]]),columns=GTEx_exp.iloc[:,2:GTEx_exp.shape[1]].columns)
#print (pd.DataFrame(normlizeGTEx).shape)
print (normlizeGTEx.iloc[0:3,[0,1,-4]])


   DDX11L1    WASH7P     MT-TE
0      0.0  0.443207  0.553907
1      0.0  0.443591  1.069985
2      0.0  0.442149  0.668421


In [231]:
print (normlizeGTEx.shape)
del GTEx_exp

(99, 54393)


In [63]:
# 整理蛋白和基因名的对应关系
reaction_protein = pd.read_csv("/public/home/liujunwu/workdir/scripts/GNN_Reactome/reaction_file/reaction_related.protein",sep='\t',header=None)
reaction_protein_list = reaction_protein.loc[:,0].tolist()
#reaction_protein.columns = ["Protein"] ## 9233 proteins, 其表达量作为节点reaction的特征
P2Symbol = pd.read_csv("/public/ref/UniProt/HUMAN_9606_id.GeneName",sep='\t',header=None)
P2Symbol.columns = ["Protein","Genecard","GeneSymbol"]
P2Symbol_dict = P2Symbol.set_index(['Protein'])['GeneSymbol'].to_dict()
reaction_gene_list = []
for p in reaction_protein_list:
    if (p in P2Symbol_dict.keys()):
        reaction_gene_list.append(P2Symbol_dict[p])
    else:
        pass
        #reaction_gene_list.append(p)
reaction_nodes = pd.read_csv("/public/home/liujunwu/workdir/scripts/GNN_Reactome/reaction_file/reactome_reaction.nodes.txt",sep='\t',header=None)
print (reaction_nodes.shape)
reaction_nodes.columns = ["Reaction","Type"]
zero_matrix = pd.DataFrame(np.zeros((reaction_nodes.shape[0],len(reaction_gene_list))),columns = reaction_gene_list)
print (zero_matrix.shape)
#print (reaction_nodes.iloc[0:3,0:3])
#print (zero_matrix.iloc[0:3,0:3])
#nodes_init_matrix = pd.concat([reaction_nodes,zero_matrix],axis=1) ## 初始化节点特征 （可以不用带）
#print (nodes_init_matrix.iloc[0:10,0:3])
#reaction_edges = "/public/home/liujunwu/workdir/scripts/GNN_Reactome/reaction_file/reactome_reaction.edges.txt"
## 根据表达量生成 样本的nodes_feature 
#print (nodes_init_matrix.loc[0,"Reaction"])

(13590, 2)
(13590, 9029)


In [64]:
print (nodes_init_matrix.shape)
print (reaction_nodes.iloc[0:5,0:2])
reaction_nodes_dict = dict(zip(reaction_nodes["Reaction"],reaction_nodes.index))
print (reaction_nodes_dict["R-HSA-5336466"])

(13589, 9235)
        Reaction             Type
0  R-HSA-5336466  ProteinReaction
1  R-HSA-5216072     EwasReaction
2  R-HSA-5226964  ProteinReaction
3  R-HSA-5215980     EwasReaction
4  R-HSA-5339528  ProteinReaction
0


In [40]:
## 获取每个样本的节点特征矩阵
protein_reaction_file= '/public/ref/Msigdb/Reactome/useInfo/Human.ProteinRoleReaction.txt'
prf = pd.read_csv(protein_reaction_file,sep='\t',header=None)
prf.columns = ["Protein",'Realation','Reaction']
prf["Gene"] = prf['Protein'].apply(lambda x : P2Symbol_dict[x] if x in P2Symbol_dict.keys() else x)
reaction_gene_dict = {}
print (prf.loc[prf["Reaction"] == 'R-HSA-5226964'])
print (type(prf.loc[2,"Reaction"]))
for i in range(prf.shape[0]):
    reaction_gene_dict.setdefault(prf.loc[i,"Reaction"],[]).append(prf.loc[i,"Gene"]) ## Reaction包含哪些基因
    
# 错误代码    
reaction_gene_dict[prf.loc[i,"Reaction"]] = list(set(reaction_gene_dict[prf.loc[i,"Reaction"]])) # 去重

print (len(reaction_gene_dict))
#for i, (k, v) in enumerate(reaction_gene_dict.items()):
#    if i in range(0, 3):
#        print(k, v)
print (reaction_gene_dict["R-HSA-5336453"])
print (reaction_gene_dict["R-HSA-5226964"])

       Protein         Realation       Reaction  Gene
194943  Q9HCJ1  catalystActivity  R-HSA-5226964  ANKH
<class 'str'>
13702
['NIPAL2', 'NIPAL3', 'NIPA2', 'NIPAL4', 'NIPA1', 'NIPAL1']
['ANKH']


In [34]:
import datetime
common_gene_list = set(reaction_gene_list) & set(GTEx_exp.columns)
nodes_features = list()
#print (type(GTEx_exp))
print (reaction_gene_dict["R-HSA-5226964"])
#print ("ANKH" in normlizeGTEx.columns)
print(datetime.datetime.now().strftime('%Y-%m-%d  %H:%M:%S'))
for index in range(GTEx_exp.shape[0]):
    print (index)
    sample_node_feature = copy.deepcopy(zero_matrix)
    for j in range(sample_node_feature.shape[0]):
        reaction_name = reaction_nodes.loc[j,"Reaction"]
        if (reaction_nodes.loc[j,"Type"] != "ProteinReaction"):continue
        hsa_gene_list = reaction_gene_dict[reaction_name] 
        for gene in hsa_gene_list:
            if (gene in common_gene_list):
                sample_node_feature.at[j,gene] = GTEx_exp.at[index,gene]
            else:
                continue
        #break
    sample_node_feature = normalize(sp.csr_matrix(sample_node_feature,dtype=np.float32))    
    nodes_features.append(torch.tensor(np.array(sample_node_feature.todense()), dtype=torch.float32))
    print(datetime.datetime.now().strftime('%Y-%m-%d  %H:%M:%S'))
    #break
print (sample_node_feature.shape)
print (len(nodes_features))
print ("store all sample node features done")
print(datetime.datetime.now().strftime('%Y-%m-%d  %H:%M:%S'))

['ANKH']
2023-09-11  15:00:35
0


  r_inv = np.power(rowsum, -1).flatten() ## 幂


2023-09-11  15:01:01
1
2023-09-11  15:01:28
2
2023-09-11  15:01:55
3
2023-09-11  15:02:21
4
2023-09-11  15:02:47
5
2023-09-11  15:03:13
6
2023-09-11  15:03:40
7
2023-09-11  15:04:06
8
2023-09-11  15:04:33
9
2023-09-11  15:04:59
10
2023-09-11  15:05:25
11
2023-09-11  15:05:52
12
2023-09-11  15:06:18
13
2023-09-11  15:06:45
14
2023-09-11  15:07:11
15
2023-09-11  15:07:37
16
2023-09-11  15:08:03
17
2023-09-11  15:08:30
18
2023-09-11  15:08:56
19
2023-09-11  15:09:22
(13589, 9029)
20
store all sample node features done
2023-09-11  15:09:22


In [20]:
print (sample_node_feature.shape)
a =sample_node_feature.iloc[:,2:sample_node_feature.shape[1]]
a["sum"]=a.apply(lambda x:sum(x),axis=1)
print (a["sum"])

(13589, 9029)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a["sum"]=a.apply(lambda x:sum(x),axis=1)


0          0.000
1          4.852
2          0.000
3          3.792
4          0.000
          ...   
13584      0.137
13585    267.447
13586      0.000
13587     17.128
13588      0.000
Name: sum, Length: 13589, dtype: float64


In [253]:
# 清理内存
#del GTEx_exp
#del zero_matrix
#del normlizeGTEx
#del features
#del nodes_features

In [79]:
# sample_labels_onehot 样本标签
# nodes_features 各样本节点特征
# sample_edges 各样本边的特征
edges_file = "/public/home/liujunwu/workdir/scripts/GNN_Reactome/reaction_file/reactome_reaction.edges.txt"
#sample_edges = np.genfromtxt("{}".format(edges_file), dtype=np.float32)    # 读取边信息
edges_pd = pd.read_csv(edges_file,header=None,sep='\t')
edges_pd.columns = ["Edge1","Edge2","Type","TopLevel"]
#prf["Gene"] = prf['Protein'].apply(lambda x : P2Symbol_dict[x] if x in P2Symbol_dict.keys() else x)
#reaction_nodes_dict: store the nodes: index
edges_pd["Edge1_index"] = edges_pd["Edge1"].apply(lambda x : reaction_nodes_dict[x])
edges_pd["Edge2_index"] = edges_pd["Edge2"].apply(lambda x : reaction_nodes_dict[x])
#edges_trans.loc[df["Edge2"]] = edges_trans["Edge2"].apply(lambda x : reaction_nodes_dict[x])
edges_trans = edges_pd[["Edge1_index","Edge2_index"]]
edges_trans = torch.tensor(np.array(edges_trans), dtype=torch.int64).T
#print (edges_trans[20:30])

In [80]:
sample_labels_onehot = torch.LongTensor(np.where(sample_labels_onehot)[1])                       

In [84]:
print (sample_labels_onehot)
print (len(set(sample_labels_onehot)))
# 设置参数和模型定义
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(len(nodes_features), 64, len(sample_labels_onehot))                                               # 节点特征维度，隐藏神经元个数，标签数目
optimizer = torch.optim.Adam(model.parameters(),lr=0.01,weight_decay=5e-4)      

tensor([ 1, 13,  9,  9, 14,  1,  0, 12,  6, 11,  3,  5, 10,  4,  2,  8,  7, 15,
        15, 15])
20


In [89]:
idx_train = range(15)                                       # 其中2000个点是训练数据                   
idx_test = range(15, 20)                                  # 700个测试数据
idx_train = torch.LongTensor(idx_train) # 用于表示包含整数（64整型数据）的张量
idx_test = torch.LongTensor(idx_test)
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    out = model(torch.tensor(np.array(nodes_features.todense()), dtype=torch.float32), edges_trans)    #模型的输入有节点特征还有边特征,使用的是全部数据
    loss = F.nll_loss(out[idx_train], labels[idx_train])    #损失仅仅计算的是训练集的损失
    loss.backward()
    optimizer.step()
    if (epoch > 250):
        print(f"epoch:{epoch+1}, loss:{loss.item()}")
    else:
        pass


AttributeError: 'list' object has no attribute 'todense'

In [27]:
def  load_data(nodes="/public/home/liujunwu/workdir/scripts/GNN_Reactome/test_data/Cora/Cora/cora.content",edges="/public/home/liujunwu/workdir/scripts/GNN_Reactome/test_data/Cora/Cora/cora.cites"):
    idx_features_labels = np.genfromtxt("{}".format(nodes),dtype=np.dtype(str))# 读取节点特征和标签
    #print (idx_features_labels[:,-1])
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32) # 读取节点特征
    dict = {int(element):i for i,element in enumerate(idx_features_labels[:, 0:1].reshape(-1))}    # 建立字典
    labels = encode_onehot(idx_features_labels[:, -1])                       # 节点标签用onehot方式表示
    e = np.genfromtxt("{}".format(edges), dtype=np.int32)    # 读取边信息
    print (e)
    edges = []
    for i, x in enumerate(e):
        edges.append([dict[e[i][0]], dict[e[i][1]]])                         # 若A->B有边 则B->A 也有边   ### 后续这里要修改？需要是有向图                 
        edges.append([dict[e[i][1]], dict[e[i][0]]])                         # 给的数据是没有从0开始需要转换
    features = normalize(features)                                           # 特征值归一化       
    features = torch.tensor(np.array(features.todense()), dtype=torch.float32)
    labels = torch.LongTensor(np.where(labels)[1])                       
    edges = torch.tensor(edges, dtype=torch.int64).T
    return features, edges, labels
features, edges, labels = load_data()
print (type(features.numpy()))
print (features[0:3])
#idx_train = range(2000)                                       # 其中2000个点是训练数据                   
#idx_test = range(2000, 2700)                                  # 700个测试数据
#idx_train = torch.LongTensor(idx_train) # 用于表示包含整数（64整型数据）的张量
#idx_test = torch.LongTensor(idx_test)
#print (idx_test.tolist()) 

[[     35    1033]
 [     35  103482]
 [     35  103515]
 ...
 [ 853118 1140289]
 [ 853155  853118]
 [ 954315 1155073]]
<class 'numpy.ndarray'>
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [222]:
def getsizeof(p_object, default=None): # real signature unknown; restored from __doc__
    """
    getsizeof(object [, default]) -> int
    
    Return the size of object in bytes.
    """
    return 0
def binary_conversion(var: int):
    """
    二进制单位转换
    :param var: 需要计算的变量，bytes值
    :return: 单位转换后的变量，kb 或 mb
    """
    assert isinstance(var, int)
    if var <= 1024:
        return f'占用 {round(var / 1024, 2)} KB内存'
    else:
        return f'占用 {round(var / (1024 ** 2), 2)} MB内存'
from sys import getsizeof as getsize
print (binary_conversion(getsize(nodes_features)))
#keys = dir()
#for variable in keys:
#    print(variable, binary_conversion(getsize(eval(variable))), '\n')

占用 0.18 KB内存


In [50]:
print (edges.T.numpy())
print (len(features.numpy()[1]))

[[ 163  402]
 [ 402  163]
 [ 163  659]
 ...
 [1887 1902]
 [ 837 1686]
 [1686  837]]
1433


In [36]:
class GCN(torch.nn.Module):
    def __init__(self,feature,hidden,classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(feature, hidden)  #输入=节点特征维度，hidden是中间隐藏神经元个数
        self.conv2 = GCNConv(hidden, classes)
    def forward(self, features, edges):
        features = self.conv1(features, edges)
        features = F.relu(features)
        features = F.dropout(features, training=self.training)
        features = self.conv2(features, edges)
        return F.log_softmax(features, dim=1)

In [37]:
# 设置参数和模型定义
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(1433, 64, 7)                                               # 节点特征维度，隐藏神经元个数，标签数目
optimizer = torch.optim.Adam(model.parameters(),lr=0.01,weight_decay=5e-4)               # 梯度优化算法

In [97]:
#模型训练
model.train()
for epoch in range(300):
    optimizer.zero_grad()
    out = model(features, edges)    #模型的输入有节点特征还有边特征,使用的是全部数据
    loss = F.nll_loss(out[idx_train], labels[idx_train])    #损失仅仅计算的是训练集的损失
    loss.backward()
    optimizer.step()
    if (epoch > 250):
        print(f"epoch:{epoch+1}, loss:{loss.item()}")
    else:
        pass



epoch:252, loss:0.3696596324443817
epoch:253, loss:0.364935964345932
epoch:254, loss:0.3683640658855438
epoch:255, loss:0.3657892048358917
epoch:256, loss:0.3638060390949249
epoch:257, loss:0.3660072982311249
epoch:258, loss:0.36475828289985657
epoch:259, loss:0.36472803354263306
epoch:260, loss:0.36476242542266846
epoch:261, loss:0.36531245708465576
epoch:262, loss:0.3770420253276825
epoch:263, loss:0.3715515434741974
epoch:264, loss:0.37264105677604675
epoch:265, loss:0.36958810687065125
epoch:266, loss:0.36638206243515015
epoch:267, loss:0.3630881905555725
epoch:268, loss:0.3794698417186737
epoch:269, loss:0.369625061750412
epoch:270, loss:0.3701733648777008
epoch:271, loss:0.3751410245895386
epoch:272, loss:0.36737310886383057
epoch:273, loss:0.36584511399269104
epoch:274, loss:0.378248929977417
epoch:275, loss:0.37657520174980164
epoch:276, loss:0.3594202995300293
epoch:277, loss:0.3686261773109436
epoch:278, loss:0.3610154688358307
epoch:279, loss:0.3707183599472046
epoch:280, lo

In [99]:
#测试：
model.eval()
_,pred = model(features,edges).max(dim=1)

correct = pred[idx_test].eq(labels[idx_test]).sum()            # 计算预测与标签相等个数
acc = int(correct) / int(len(idx_test))                        # 计算正确率
print(acc)
# 另一种写法
'''
test_predict = model(data.x, data.edge_index)[data.test_mask]
max_index = torch.argmax(test_predict, dim=1)
test_true = data.y[data.test_mask]
correct = 0
for i in range(len(max_index)):
    if max_index[i] == test_true[i]:
        correct += 1
print('测试集准确率为：{}%'.format(correct*100/len(test_true)
'''

0.8557142857142858


"\ntest_predict = model(data.x, data.edge_index)[data.test_mask]\nmax_index = torch.argmax(test_predict, dim=1)\ntest_true = data.y[data.test_mask]\ncorrect = 0\nfor i in range(len(max_index)):\n    if max_index[i] == test_true[i]:\n        correct += 1\nprint('测试集准确率为：{}%'.format(correct*100/len(test_true)\n"

In [90]:
x = [[-1],[0],[1]]
print (type(x))

<class 'list'>
