In [1]:
import torch
import numpy as np
import pandas as pd
import copy
import torch.nn.functional as F
import scipy.sparse as sp
from torch_geometric.nn import GCNConv,GATConv,SAGEConv
from torch_geometric.datasets import Planetoid
from torch.nn import Linear
from torch_geometric.data import Data
from torch_geometric.nn import global_mean_pool
from torch_geometric.loader import DataLoader
import datetime
from sklearn.preprocessing import StandardScaler

In [None]:
def Scale(X):
    """Standardise every row of ``X`` with ``StandardScaler``.

    The scaler is fitted on the transpose of ``X``, so each original row is
    treated as one feature column and is centred and scaled on its own.

    Args:
        X: DataFrame to scale; its index and columns are carried over.

    Returns:
        A new DataFrame holding the scaled values under the labels of ``X``.
    """
    row_labels, column_labels = X.index, X.columns
    # Rows of X are the columns of X.T, which is the axis the scaler works on.
    fitted = StandardScaler().fit(X.T)
    result = pd.DataFrame(fitted.transform(X.T)).T
    result.index = row_labels
    result.columns = column_labels
    return result

In [2]:
def ProssGSEA(csv):
    """Load a long-format GSEA report and reshape it into a Name x Term table.

    Args:
        csv: Path (or buffer) of a comma-separated report containing at least
            the columns ``Name``, ``Term`` and ``NES``.

    Returns:
        DataFrame with one row per ``Name`` and one column per ``Term`` whose
        cells are the ``NES`` values. The index name is cleared and columns
        that are NaN in every row are removed.
    """
    report = pd.read_csv(csv, sep=",")
    wide = report.pivot(index="Name", columns=["Term"], values=["NES"])
    # values=["NES"] produces two-level columns; keep only the Term level.
    wide.columns = wide.columns.droplevel(0)
    wide.index.name = None
    # Drop the columns that contain nothing but NaN.
    return wide.dropna(axis=1, how='all')

In [3]:
from torch.nn import Linear
from torch_geometric.nn import GraphConv

class GNN(torch.nn.Module):
    """Three ``GraphConv`` layers, mean-pool readout and a linear classifier.

    Args:
        num_node_features: Size of each input node feature vector.
        hidden_channels: Width of every graph convolution output.
        num_classes: Number of logits produced per graph.
    """

    def __init__(self, num_node_features,hidden_channels,num_classes):
        super().__init__()
        # Seed the global torch RNG before the layers are created so their
        # initial weights are the same on every construction.
        torch.manual_seed(12345)
        self.conv1 = GraphConv(num_node_features, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels,num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class logits per graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity passed to every convolution.
            batch: Assignment of each node to its graph, used for pooling.
        """
        # 1. Node embeddings: ReLU after the first two convolutions only.
        hidden = F.relu(self.conv1(x, edge_index))
        hidden = F.relu(self.conv2(hidden, edge_index))
        hidden = self.conv3(hidden, edge_index)

        # 2. Readout: average node embeddings per graph -> [batch_size, hidden_channels]
        pooled = global_mean_pool(hidden, batch)

        # 3. Classifier: dropout is only applied while the module is in training mode.
        pooled = F.dropout(pooled, p=0.5, training=self.training)
        return self.lin(pooled)

In [4]:

def train_tuning():
    """Run one optimisation pass over every batch of ``TCGA_train_loader``.

    Works on the notebook globals ``model_tuning``, ``optimizer_tuning`` and
    ``criterion_tuning``; returns nothing.
    """
    model_tuning.train()  # switch the model to training mode
    for batch in TCGA_train_loader:
        optimizer_tuning.zero_grad()
        logits = model_tuning(batch.x, batch.edge_index, batch.batch)
        batch_loss = criterion_tuning(logits, batch.y)
        batch_loss.backward()
        optimizer_tuning.step()
def test_tuning(loader):
    """Return the accuracy of the global ``model_tuning`` on ``loader``.

    Args:
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``
            and ``y``; it must also have a ``dataset`` attribute whose length
            is the number of evaluated graphs.

    Returns:
        Fraction of graphs whose arg-max prediction equals the label.
    """
    model_tuning.eval()
    correct = 0
    # Evaluation never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        for data in loader:  # iterate over the evaluation set batch by batch
            out = model_tuning(data.x, data.edge_index, data.batch)  # one forward pass
            pred = out.argmax(dim=1)  # class with the highest score
            correct += int((pred == data.y).sum())  # compare with the true labels
    return correct / len(loader.dataset)


In [5]:
# Create the fine-tuning model and load the pre-trained parameters into it.
# The head is built with 31 outputs first (presumably the class count of the
# pre-training task) so that the checkpoint's state dict can be loaded as is.
model_tuning = GNN(num_node_features=1,hidden_channels=64,num_classes=31)
# map_location="cpu" lets a checkpoint that was saved from GPU tensors load on a
# CPU-only machine; the model and all data in this notebook stay on the CPU.
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
checkpoint = torch.load('/public/home/liujunwu/workdir/scripts/GNN_Reactome/GTEx_exp/GTEx.nomalizeSample.axis1.pth.tar', map_location="cpu")
model_tuning.load_state_dict(checkpoint["model"])
# Replace the classifier with a freshly initialised 20-output head.
model_tuning.lin = Linear(64,20)
print (model_tuning)
# Freeze every parameter except those of the new linear head.
for name, param in model_tuning.named_parameters():
    param.requires_grad = name in ('lin.weight', 'lin.bias')

GNN(
  (conv1): GraphConv(1, 64)
  (conv2): GraphConv(64, 64)
  (conv3): GraphConv(64, 64)
  (lin): Linear(in_features=64, out_features=20, bias=True)
)


In [6]:
# Load the edge tensor; it is passed as `edge_index` to every sample graph built later.
# NOTE(review): the printed indices start at 1 - confirm that node ids line up
# with the rows of each sample's feature matrix.
edges = torch.load("/public/home/liujunwu/workdir/scripts/GNN_Reactome/GTEx_exp/pathway_edges.pt")
print (edges)

tensor([[   1,    1,    1,  ..., 2579, 2580, 2580],
        [   3,  359, 1240,  ..., 2177,  362, 2581]])


In [7]:
## Read the tissue label that belongs to every TCGA sample
TCGA_sample_label = pd.read_csv('/public/home/liujunwu/workdir/scripts/GNN_Reactome/TCGA_exp/GSEA/TCGA_sample.label',header=None,sep=",")
TCGA_sample_label.columns = ['Sample','short']
TCGA_tissue_label = pd.read_csv('/public/home/liujunwu/workdir/scripts/GNN_Reactome/TCGA_exp/GSEA/TCGA_tissue.label',header=None,sep='\t')
TCGA_tissue_label.columns = ['short','long']
# Attach the long tissue name to each sample through the shared short code.
TCGA_merge = pd.merge(TCGA_sample_label, TCGA_tissue_label, how='left', on='short')

# sample name -> long tissue name
TCGA_sample_label_dict = dict(zip(TCGA_merge["Sample"], TCGA_merge["long"]))
# sorted unique tissue names
TCGAsample_full_labels = sorted(set(TCGA_merge["long"]))
# tissue name -> class index
TCGAsample_full_labels_dict = {
    tissue: class_id for class_id, tissue in enumerate(TCGAsample_full_labels)
}
# class index -> tissue name
TCGAsample_full_labels_rev_dict = dict(enumerate(TCGAsample_full_labels))
print (TCGAsample_full_labels_dict)

# Row k of the tensor is the one-element label [k] used as `y` of a graph.
TCGA_list_labels = [[class_id] for class_id in range(len(TCGAsample_full_labels))]
TCGA_list_labels_tensor = torch.tensor(TCGA_list_labels,dtype=torch.int64)

{'Bile duct': 0, 'Bladder': 1, 'Brain': 2, 'Breast': 3, 'Cervix Uteri': 4, 'Colorectal': 5, 'Esophagus': 6, 'Head and neck': 7, 'Kidney': 8, 'Liver': 9, 'Lung': 10, 'Muscle': 11, 'Nerve': 12, 'Pancreas': 13, 'Prostate': 14, 'Skin': 15, 'Stomach': 16, 'Thymus': 17, 'Thyroid': 18, 'Uterus': 19}


In [8]:
import glob
import re
import os
import math
from sklearn import preprocessing

TCGA_GSEA_dir = "/public/home/liujunwu/workdir/scripts/GNN_Reactome/TCGA_exp/GSEA/split/"
TCGA_GSEA_list = glob.glob(TCGA_GSEA_dir + "*/gseapy.gene_set.ssgsea.report.csv")
TCGA_tuning_sets = []
TCGA_test_sets = []
# Fixed seed so the tuning/test split is identical on every Restart & Run All.
SPLIT_SEED = 12345


def _scores_to_graphs(score_frame):
    """Build one PyG ``Data`` graph per row (sample) of ``score_frame``.

    Each row is standardised across its own values, reshaped to a single
    feature column of node features, and paired with the shared ``edges``
    tensor and the class label looked up from the sample name.

    Raises:
        KeyError: if a sample name or its tissue is missing from the label tables.
    """
    # Row-wise standardisation: every sample is scaled on its own.
    scaled = preprocessing.scale(score_frame.values, axis=1)
    graphs = []
    for sample_name, row in zip(score_frame.index, scaled):
        class_id = TCGAsample_full_labels_dict[TCGA_sample_label_dict[sample_name]]
        graphs.append(
            Data(
                x=torch.tensor(row.reshape(-1, 1), dtype=torch.float32),
                y=TCGA_list_labels_tensor[class_id],
                edge_index=edges,
            )
        )
    return graphs


for report_csv in TCGA_GSEA_list:
    # The parent directory name identifies the TCGA project of this report.
    project = os.path.basename(os.path.dirname(report_csv))
    TCGA_tissue_score = ProssGSEA(report_csv)
    print (project,TCGA_tissue_score.shape[0])

    # Half of the samples (rounded up) are used for fine-tuning.
    tuning_sample = TCGA_tissue_score.sample(
        math.ceil(TCGA_tissue_score.shape[0]/2), random_state=SPLIT_SEED
    )
    TCGA_tuning_sets.extend(_scores_to_graphs(tuning_sample))

    # The remaining samples form the test half. Drop by index label rather
    # than de-duplicating on values, so that two different samples with
    # identical score rows are not both removed.
    test_sample = TCGA_tissue_score.drop(index=tuning_sample.index)
    if test_sample.shape[0] == 0:
        continue
    TCGA_test_sets.extend(_scores_to_graphs(test_sample))

print (len(TCGA_tuning_sets))
print (len(TCGA_test_sets))


TCGA-UCEC 35
TCGA-SARC 2
TCGA-HNSC 44
TCGA-PAAD 4
TCGA-KICH 24
TCGA-SKCM 1
TCGA-BRCA 113
TCGA-ESCA 11
TCGA-READ 10
TCGA-CESC 3
TCGA-STAD 32
TCGA-CHOL 9
TCGA-THYM 2
TCGA-PCPG 3
TCGA-COAD 41
TCGA-BLCA 19
TCGA-KIRC 72
TCGA-LIHC 50
TCGA-GBM 5
TCGA-LUSC 49
TCGA-LUAD 59
TCGA-THCA 58
TCGA-KIRP 32
TCGA-PRAD 52
371
359


In [9]:
# Mini-batch loaders over the tuning graphs and the held-out graphs.
TCGA_train_loader = DataLoader(TCGA_tuning_sets, batch_size=10, shuffle=True)
TCGA_test_loader = DataLoader(TCGA_test_sets, batch_size=10, shuffle=True)
criterion_tuning = torch.nn.CrossEntropyLoss()
optimizer_tuning = torch.optim.Adam(model_tuning.parameters(), lr=1e-4)

time_format = '%Y-%m-%d  %H:%M:%S'
print(datetime.datetime.now().strftime(time_format))

for epoch in range(1, 401):
    train_tuning()
    # Timestamp after each training pass, then accuracy on the training loader.
    print(datetime.datetime.now().strftime(time_format))
    train_acc = test_tuning(TCGA_train_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}')

2023-09-24  19:57:01
2023-09-24  19:57:02
Epoch: 001, Train Acc: 0.0782
2023-09-24  19:57:03
Epoch: 002, Train Acc: 0.0782
2023-09-24  19:57:04
Epoch: 003, Train Acc: 0.0701
2023-09-24  19:57:05
Epoch: 004, Train Acc: 0.1159
2023-09-24  19:57:06
Epoch: 005, Train Acc: 0.1456
2023-09-24  19:57:08
Epoch: 006, Train Acc: 0.1617
2023-09-24  19:57:09
Epoch: 007, Train Acc: 0.1563
2023-09-24  19:57:10
Epoch: 008, Train Acc: 0.1617
2023-09-24  19:57:11
Epoch: 009, Train Acc: 0.1698
2023-09-24  19:57:12
Epoch: 010, Train Acc: 0.1563
2023-09-24  19:57:13
Epoch: 011, Train Acc: 0.1779
2023-09-24  19:57:14
Epoch: 012, Train Acc: 0.2237
2023-09-24  19:57:15
Epoch: 013, Train Acc: 0.2668
2023-09-24  19:57:17
Epoch: 014, Train Acc: 0.2776
2023-09-24  19:57:18
Epoch: 015, Train Acc: 0.2938
2023-09-24  19:57:19
Epoch: 016, Train Acc: 0.3261
2023-09-24  19:57:20
Epoch: 017, Train Acc: 0.3396
2023-09-24  19:57:21
Epoch: 018, Train Acc: 0.3235
2023-09-24  19:57:22
Epoch: 019, Train Acc: 0.3261
2023-09-24

Epoch: 161, Train Acc: 0.6334
2023-09-24  20:00:06
Epoch: 162, Train Acc: 0.6577
2023-09-24  20:00:07
Epoch: 163, Train Acc: 0.6739
2023-09-24  20:00:08
Epoch: 164, Train Acc: 0.6658
2023-09-24  20:00:09
Epoch: 165, Train Acc: 0.6469
2023-09-24  20:00:10
Epoch: 166, Train Acc: 0.6523
2023-09-24  20:00:11
Epoch: 167, Train Acc: 0.6550
2023-09-24  20:00:12
Epoch: 168, Train Acc: 0.6334
2023-09-24  20:00:14
Epoch: 169, Train Acc: 0.6550
2023-09-24  20:00:15
Epoch: 170, Train Acc: 0.6550
2023-09-24  20:00:16
Epoch: 171, Train Acc: 0.6685
2023-09-24  20:00:17
Epoch: 172, Train Acc: 0.6712
2023-09-24  20:00:18
Epoch: 173, Train Acc: 0.6712
2023-09-24  20:00:19
Epoch: 174, Train Acc: 0.6685
2023-09-24  20:00:21
Epoch: 175, Train Acc: 0.6604
2023-09-24  20:00:22
Epoch: 176, Train Acc: 0.6712
2023-09-24  20:00:23
Epoch: 177, Train Acc: 0.6685
2023-09-24  20:00:24
Epoch: 178, Train Acc: 0.6739
2023-09-24  20:00:25
Epoch: 179, Train Acc: 0.6658
2023-09-24  20:00:26
Epoch: 180, Train Acc: 0.6577
2

Epoch: 322, Train Acc: 0.7278
2023-09-24  20:03:11
Epoch: 323, Train Acc: 0.7332
2023-09-24  20:03:12
Epoch: 324, Train Acc: 0.7358
2023-09-24  20:03:13
Epoch: 325, Train Acc: 0.7278
2023-09-24  20:03:14
Epoch: 326, Train Acc: 0.7251
2023-09-24  20:03:16
Epoch: 327, Train Acc: 0.7197
2023-09-24  20:03:17
Epoch: 328, Train Acc: 0.7466
2023-09-24  20:03:18
Epoch: 329, Train Acc: 0.7358
2023-09-24  20:03:19
Epoch: 330, Train Acc: 0.7385
2023-09-24  20:03:20
Epoch: 331, Train Acc: 0.7251
2023-09-24  20:03:21
Epoch: 332, Train Acc: 0.7332
2023-09-24  20:03:23
Epoch: 333, Train Acc: 0.7251
2023-09-24  20:03:24
Epoch: 334, Train Acc: 0.7305
2023-09-24  20:03:25
Epoch: 335, Train Acc: 0.7412
2023-09-24  20:03:26
Epoch: 336, Train Acc: 0.7466
2023-09-24  20:03:27
Epoch: 337, Train Acc: 0.7466
2023-09-24  20:03:28
Epoch: 338, Train Acc: 0.7358
2023-09-24  20:03:29
Epoch: 339, Train Acc: 0.7143
2023-09-24  20:03:30
Epoch: 340, Train Acc: 0.7224
2023-09-24  20:03:32
Epoch: 341, Train Acc: 0.7385
2

In [10]:
def test_tuning_subclass(loader):
    """Print per-class accuracy of ``model_tuning`` and return overall accuracy.

    For every class index in ``TCGAsample_full_labels_rev_dict`` one line is
    printed: the class name, the number of graphs with that label in
    ``loader`` and the fraction predicted correctly (``0`` when the class has
    no graphs).

    Args:
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``
            and ``y``; it must also have a ``dataset`` attribute whose length
            is the number of evaluated graphs.

    Returns:
        Fraction of all graphs whose arg-max prediction equals the label.
    """
    model_tuning.eval()
    # One counter pair per known class, taken from the label table instead of
    # a hard-coded count, so classes missing from `loader` still get a row.
    class_ids = range(len(TCGAsample_full_labels_rev_dict))
    total_label = {class_id: 0 for class_id in class_ids}
    label_correct = {class_id: 0 for class_id in class_ids}
    total = 0
    # Evaluation never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        for data in loader:  # iterate over the evaluation set batch by batch
            out = model_tuning(data.x, data.edge_index, data.batch)  # one forward pass
            pred = out.argmax(dim=1)  # class with the highest score
            total += int((pred == data.y).sum())
            for truth, guess in zip(data.y.reshape(-1).tolist(), pred.tolist()):
                total_label[truth] += 1
                if guess == truth:
                    label_correct[truth] += 1
    for j in total_label:
        ratio = label_correct[j] / total_label[j] if total_label[j] != 0 else 0
        print (TCGAsample_full_labels_rev_dict[j],total_label[j],ratio)
    return total / len(loader.dataset)

In [11]:
# Print the per-class breakdown for the held-out loader and keep the overall accuracy.
test_acc = test_tuning_subclass(TCGA_test_loader)
# Overall accuracy, formatted to four decimals.
print(f'Test Acc: {test_acc:.4f}')

Bile duct 4 0.0
Bladder 9 0.0
Brain 2 0.5
Breast 56 0.9107142857142857
Cervix Uteri 1 0.0
Colorectal 25 0.8
Esophagus 5 0.0
Head and neck 22 0.3181818181818182
Kidney 64 0.859375
Liver 25 1.0
Lung 53 0.9245283018867925
Muscle 1 0.0
Nerve 1 0.0
Pancreas 2 0.0
Prostate 26 0.6153846153846154
Skin 0 0
Stomach 16 0.1875
Thymus 1 0.0
Thyroid 29 0.8620689655172413
Uterus 17 0.35294117647058826
Test Acc: 0.7187


In [None]:
# Debug check: Python type of the first test graph's label after `.item()`.
print (type(TCGA_test_sets[0].y.item()))

In [None]:
# Scratch check of `Scale` on a single project's report: show the same corner
# of the table before and after scaling.
test_csv = "/public/home/liujunwu/workdir/scripts/GNN_Reactome/TCGA_exp/GSEA/split/TCGA-PCPG/gseapy.gene_set.ssgsea.report.csv"
test_score = ProssGSEA(test_csv)
print (test_score.iloc[0:4,0:3])
# Rebinds `test_score` to the scaled table, so re-running only this line scales again.
test_score = Scale(test_score)
print (test_score.iloc[0:4,0:3])

In [None]:
# Mean and standard deviation of the first column of the scaled table.
# NOTE(review): `Scale` standardises each row, while this inspects a column, so
# the values need not be 0 and 1; use `test_score.iloc[0, :]` to check the scaling.
print (np.mean(test_score.iloc[:,0]))
print (np.std(test_score.iloc[:,0]))