In [197]:
import os
import gc
import time
import argparse
import numpy as np
import pandas as pd
import networkx as nx
from datetime import datetime

from MAIN.utils import *
from MAIN.train import *
import MAIN.preprocess_functions
from MAIN.GNN_MME import GCN_MME , GSage_MME , GAT_MME

from Modules.PNetTorch.MAIN.reactome import ReactomeNetwork
from Modules.PNetTorch.MAIN.Pnet import MaskedLinear , PNET
from Modules.PNetTorch.MAIN.utils import numpy_array_to_one_hot, get_gpu_memory
from Modules.PNetTorch.MAIN.interpret import interpret , evaluate_interpret_save , visualize_importances

import dgl
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings("ignore")

print("Finished Library Import \n")

Finished Library Import 



In [198]:
# Define the merge operation setup
def merge_dfs(left_df, right_df):
    # Merging on 'key' and expanding with 'how=outer' to include all records
    return pd.merge(left_df, right_df, left_index=True, right_index=True, how='outer')
no_cuda=True
output="data(li)delete10\\result\\"
# Map model names to class objects
model_mapping = {
    "GCN": GCN_MME,
    "GSage": GSage_MME,
    'GAT': GAT_MME
}

# Start the timer
start_time = time.time()

# Check if output directory exists, if not create it
if not os.path.exists(output) : 
    os.makedirs(output, exist_ok=True)
    
# Specify the device to use
device = torch.device('cpu' if no_cuda else 'cuda') # Get GPU device name, else use CPU
print("Using %s device" % device)
#get_gpu_memory()


Using cpu device


In [199]:
input="data(li)delete10"
modalities=['exp', 'cna','mut']
target='diagnosis'
index_col='sample_id'
label_file="labels.csv"
# Load data and metadata
datModalities , meta = data_parsing(input , modalities , target , index_col,label_file)
for mod, expr in datModalities.items():
    print(f"Modality: {mod}, Shape: {expr.shape}")
interpret_feat=False
pnet=False
if interpret_feat : 
    features = {}
    for i , mod in enumerate(datModalities) : 
        features[i] = list(datModalities[mod].columns)

if pnet : 
    # List of genes of interest in PNet (keep to less than 1000 for big models)
    genes = pd.read_csv(f'{input}/../ext_data/genelist.txt', header=0)

    # Build network to obtain gene and pathway relationships
    net = ReactomeNetwork(genes_of_interest=np.unique(list(genes['genes'].values)) , n_levels=5)
meta = meta.loc[sorted(meta.index)]
# 将 diagnosis 列转换为分类数据类型
diagnosis_series = meta['diagnosis'].astype('category')

# 获取独热编码
label = F.one_hot(torch.Tensor(diagnosis_series.cat.codes).to(torch.int64))


MME_input_shapes = [ datModalities[mod].shape[1] for mod in datModalities]
h = reduce(merge_dfs , list(datModalities.values()))
h = h.loc[sorted(h.index)]
# 读取 CSV 文件
csv_file = 'data(li)delete10\\snf_graph.csv'
df = pd.read_csv(csv_file, na_values=['NA'])

# 创建一个有向图
G = nx.DiGraph()

# 添加边到图中
for index, row in df.iterrows():
    G.add_edge(row['from'], row['to'], 
               from_color=row['from_frame.color'], 
               from_name=row['from_name'], 
               from_class=row['from_class'], 
               from_vertex_color=row['from_vertex.frame.color'],
               to_color=row['to_frame.color'], 
               to_name=row['to_name'], 
               to_class=row['to_class'], 
               to_vertex_color=row['to_vertex.frame.color'])

# 将 NetworkX 图转换为 DGL 图
g = dgl.from_networkx(G)

# 打印图的一些基本信息
print("Number of nodes:", g.number_of_nodes())
print("Number of edges:", g.number_of_edges())

# 这里可以为节点和边添加特征，如果需要的话
# 例如，可以为节点添加一个特征张量
g.ndata['feat'] = torch.Tensor(h.to_numpy())
#node_features = torch.zeros(g.number_of_nodes(), 3)  # 假设每个节点有3个特征
#g.ndata['feat'] = node_features

# 如果需要为边添加特征
#edge_features = torch.zeros(g.number_of_edges(), 4)  # 假设每条边有4个特征
#g.edata['feat'] = edge_features
g.ndata['label'] = torch.tensor(meta['diagnosis'], dtype=torch.int64)
# 输出图的结构
print(g)
g = dgl.add_self_loop(g)
no_shuffle=False
n_splits=3
# Generate K Fold splits
if no_shuffle : 
    skf = StratifiedKFold(n_splits=n_splits , shuffle=False) 
else :
    skf = StratifiedKFold(n_splits=n_splits , shuffle=True) 

print(skf)

output_metrics = []
test_logits = []
test_labels = []
latent_dim=[32,16]
model="GCN"
decoder_dim=64
h_feats= [64]
epochs=1000
lr=0.001
patience=100
output="data(li)delete10\\result\\"
interpret_feat=False

Modality: exp, Shape: (36, 20789)
Modality: cna, Shape: (36, 16286)
Modality: mut, Shape: (36, 10086)
Number of nodes: 36
Number of edges: 294
Graph(num_nodes=36, num_edges=294,
      ndata_schemes={'feat': Scheme(shape=(47161,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})
StratifiedKFold(n_splits=3, random_state=None, shuffle=True)


In [200]:
test_logits = []
test_labels = []
output_metrics = []
test_indices = [] 
for i, (train_index, test_index) in enumerate(skf.split(meta.index, meta['diagnosis'])):
    test_indices.append(test_index)
    model_name = 'GCN_MME'
    # Initialize model
    if pnet : 
        model = model_mapping[model](MME_input_shapes , latent_dim , decoder_dim , h_feats,  len(meta.unique()), PNet=net).to(device)
    else :
        #model = model_mapping[model](MME_input_shapes , latent_dim , decoder_dim , h_feats,  len(meta.unique())).to(device)
        #model = model_mapping[model](MME_input_shapes , latent_dim , decoder_dim , h_feats,  len(meta['diagnosis'].unique())).to(device)#改
        model=GCN_MME(MME_input_shapes , [16 , 32,32] , 32 , [16]  , len(meta['diagnosis'].unique())).to(device)
    print(model)
    print(g)
    
    g = g.to(device)

    # Train the model
    loss_plot = train(g, train_index, device ,  model , label , epochs , lr , patience)
    plt.title(f'Loss for split {i}')
    save_path = output + '/loss_plots/'
    os.makedirs(save_path, exist_ok=True)
    plt.savefig(f'{save_path}loss_split_{i}.png' , dpi = 200)
    plt.clf()
    
    sampler = NeighborSampler(
        [15 for i in range(len(model.gnnlayers))],  # fanout for each layer
        prefetch_node_feats=['feat'],
        prefetch_labels=['label'],
    )
    test_dataloader = DataLoader(
        g,
        torch.Tensor(test_index).to(torch.int64).to(device),
        sampler,
        device=device,
        batch_size=1024,
        shuffle=True,
        drop_last=False,
        num_workers=0,
        use_uva=False,
    )
    print(test_dataloader)
    # Evaluate the model
    test_output_metrics = evaluate(model , g , test_dataloader)

    print(
        "Fold : {:01d} | Test Accuracy = {:.4f} | F1 = {:.4f} ".format(
        i+1 , test_output_metrics[1] , test_output_metrics[2] )
    )
    
    # 这里假设 test_output_metrics[-2] 和 test_output_metrics[-1] 是张量
    test_logits.extend(test_output_metrics[-2].detach().cpu().numpy().tolist())  # 将 Tensor 转换为列表
    test_labels.extend(test_output_metrics[-1].detach().cpu().numpy().tolist())   # 将 Tens
    
    if interpret_feat : 
        prev_dim = 0
        for i_int , (pnet , dim) in enumerate(zip(model.encoder_dims , model.input_dims)) : 

            pnet.features = features[i_int]

            x = g.ndata['feat'][torch.Tensor(test_index).to(device).to(torch.int) , prev_dim:dim+prev_dim]

            if i_int == 0 :
                model_importances_cv = interpret(pnet , x , savedir='None' , plot=False)
                for layer in model_importances_cv.keys() : 
                    model_importances_cv[layer] = model_importances_cv[layer].fillna(0)
                model_importances_cv['Features'] = (model_importances_cv['Features'] - model_importances_cv['Features'].mean().mean())/model_importances_cv['Features'].mean().std()
                model_importances_cv['Features'] = model_importances_cv['Features'].abs().mean(axis=0)
            else : 
                model_importances_tmp = interpret(pnet , x , savedir='None', plot=False)
                model_importances_tmp['Features'] = (model_importances_tmp['Features'] - model_importances_tmp['Features'].mean().mean())/model_importances_tmp['Features'].mean().std()
                model_importances_tmp['Features'] = model_importances_tmp['Features'].abs().mean(axis=0)
                for layer in model_importances_cv.keys() : 
                    model_importances_tmp[layer] = model_importances_tmp[layer].fillna(0)
                    if layer == 'Features' : 
                        model_importances_cv[layer] = pd.concat([model_importances_cv[layer] , model_importances_tmp[layer]])
                    else : 
                        model_importances_cv[layer] += model_importances_tmp[layer]

            prev_dim += dim

        model_importances_cv = {k: (v.divide(i_int+1) if k != 'Features' else v) for k, v in model_importances_cv.items()}
        if i == 0 : 
            model_importances = model_importances_cv
        else : 
            for layer in model_importances.keys() :
                if layer == 'Features' : 
                    model_importances[layer] +=  model_importances_cv[layer]
                else : 
                    model_importances[layer] = pd.concat([model_importances[layer] , model_importances_cv[layer]] , axis=0).reset_index(drop=True)
    
    # Save the output metrics and best performing model
    output_metrics.append(test_output_metrics)
    if i == 0 : 
        best_model = model
        best_idx = i
    elif output_metrics[best_idx][1] < test_output_metrics[1] : 
        best_model = model
        best_idx   = i
    # 保存最佳模型的测试集结果
    if output_metrics:  # 确保 output_metrics 不为空
        best_test_metrics = output_metrics[best_idx]
        best_test_accuracy = best_test_metrics[1]
        best_test_f1 = best_test_metrics[2]
        best_test_logits = test_logits  # 或者从 output_metrics 中提取
        best_test_labels = test_labels  # 或者从 output_metrics 中提取

        # 将结果保存到文件
        import pandas as pd

        results_df = pd.DataFrame({
            'Test Accuracy': [best_test_accuracy],
            'Test F1': [best_test_f1],
            'Test Logits': [best_test_logits],
            'Test Labels': [best_test_labels],
            'Best Model Index': [test_index]  # 添加最佳模型索引
        })
    #get_gpu_memory()
    del model
    gc.collect()
    #torch.cuda.empty_cache()
    print('Clearing gpu memory')
    #get_gpu_memory()

GCN_MME(
  (encoder_dims): ModuleList(
    (0): Encoder(
      (encoder): ModuleList(
        (0): Linear(in_features=20789, out_features=500, bias=True)
        (1): Linear(in_features=500, out_features=16, bias=True)
      )
      (norm): ModuleList(
        (0): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (decoder): Sequential(
        (0): Linear(in_features=16, out_features=32, bias=True)
      )
      (drop): Dropout(p=0.5, inplace=False)
    )
    (1): Encoder(
      (encoder): ModuleList(
        (0): Linear(in_features=16286, out_features=500, bias=True)
        (1): Linear(in_features=500, out_features=32, bias=True)
      )
      (norm): ModuleList(
        (0): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)


<Figure size 600x400 with 0 Axes>

<Figure size 600x400 with 0 Axes>

<Figure size 600x400 with 0 Axes>

In [201]:
logits_expanded = pd.DataFrame(results_df['Test Logits'].tolist(), index=results_df.index)
labels_expanded = pd.DataFrame(results_df['Test Labels'].tolist(), index=results_df.index)
best_model_index_expanded = pd.DataFrame(results_df['Best Model Index'].tolist(), index=results_df.index)

# 将所有数据合并到一个新的 DataFrame 中
final_df = pd.concat(
    [results_df[['Test Accuracy', 'Test F1']], logits_expanded, labels_expanded, best_model_index_expanded],
    axis=1
)
print(best_model_index_expanded )
print(labels_expanded)
# 保存 DataFrame 到 CSV 文件
best_model_index_expanded.to_csv('D:\\比较方法\\比较方法\\MOGDx-main(5)\\data(li)delete10\\a.csv', index=False)
labels_expanded.to_csv('D:\\比较方法\\比较方法\\MOGDx-main(5)\\data(li)delete10\\b.csv', index=False)
print(f"最佳模型的测试结果已保存到 'best_model_test_results.csv'。")
#test_logits = torch.stack(test_logits)
#test_labels = torch.stack(test_labels)

if interpret_feat : 
    model_importances = {k: (v.divide(i+1)) for k, v in model_importances.items()}
    with open(f'{output}/model_importance.pkl', 'wb') as file:
        pickle.dump(model_importances, file)
# Save the output metrics to a file   
accuracy = []
F1 = []
output_file = output + '/' + "test_metrics.txt"
with open(output_file , 'w') as f :
    i = 0
    for metric in output_metrics :
        i += 1
        f.write("Fold %i \n" % i)
        f.write(f"acc = %2.3f , avg_prc = %2.3f , avg_recall = %2.3f , avg_f1 = %2.3f" % 
                (metric[1] , metric[3] , metric[4] , metric[2]))
        f.write('\n')
        accuracy.append(metric[1])
        F1.append(metric[2])
        
    f.write('-------------------------\n')
    f.write("%i Fold Cross Validation Accuracy = %2.2f \u00B1 %2.2f \n" %(n_splits , np.mean(accuracy)*100 , np.std(accuracy)*100))
    f.write("%i Fold Cross Validation F1 = %2.2f \u00B1 %2.2f \n" %(n_splits , np.mean(F1)*100 , np.std(F1)*100))
    f.write('-------------------------\n')

print("%i Fold Cross Validation Accuracy = %2.2f \u00B1 %2.2f" %(5 , np.mean(accuracy)*100 , np.std(accuracy)*100))
print("%i Fold Cross Validation F1 = %2.2f \u00B1 %2.2f" %(5 , np.mean(F1)*100 , np.std(F1)*100))

   0   1   2   3   4   5   6   7   8   9   10  11
0   4   6   8  10  11  12  14  26  30  31  33  34
   0   1   2   3   4   5   6   7   8   9   ...  26  27  28  29  30  31  32  \
0   1   0   0   0   0   0   0   0   1   0  ...   0   1   0   0   0   0   0   

   33  34  35  
0   1   0   0  

[1 rows x 36 columns]
最佳模型的测试结果已保存到 'best_model_test_results.csv'。
5 Fold Cross Validation Accuracy = 72.22 ± 3.93
5 Fold Cross Validation F1 = 25.66 ± 1.37


In [202]:
# Get the current date
current_date = datetime.now()

# Extract month and day as string names
month = current_date.strftime('%B')[:3]  # Full month name
day = current_date.day

save_path = output + '/Models/'
os.makedirs(save_path, exist_ok=True)
torch.save({
    'model_state_dict': best_model.state_dict(),
    # You can add more information to save, such as training history, hyperparameters, etc.
}, f'{save_path}GCN_MME_model_{month}{day}' )
torch.save(best_model,"./data(li)delete10/result/Models/best_model.model")
input="data(li2)\\外部测试"
modalities=['exp', 'cna','mut']
target='diagnosis'
index_col='sample_id'
label_file="labels.csv"
# Load data and metadata
datModalities , meta = data_parsing(input , modalities , target , index_col,label_file)
for mod, expr in datModalities.items():
    print(f"Modality: {mod}, Shape: {expr.shape}")
    meta = meta.loc[sorted(meta.index)]
# 将 diagnosis 列转换为分类数据类型
diagnosis_series = meta['diagnosis'].astype('category')

# 获取独热编码
label = F.one_hot(torch.Tensor(diagnosis_series.cat.codes).to(torch.int64))
MME_input_shapes = [ datModalities[mod].shape[1] for mod in datModalities]
h = reduce(merge_dfs , list(datModalities.values()))
h = h.loc[sorted(h.index)]
# 读取 CSV 文件
csv_file = 'data(li2)\\外部测试\\snf_graph.csv'
df = pd.read_csv(csv_file, na_values=['NA'])

# 创建一个有向图
G = nx.DiGraph()

# 添加边到图中
for index, row in df.iterrows():
    G.add_edge(row['from'], row['to'], 
               from_color=row['from_frame.color'], 
               from_name=row['from_name'], 
               from_class=row['from_class'], 
               from_vertex_color=row['from_vertex.frame.color'],
               to_color=row['to_frame.color'], 
               to_name=row['to_name'], 
               to_class=row['to_class'], 
               to_vertex_color=row['to_vertex.frame.color'])

# 将 NetworkX 图转换为 DGL 图
g = dgl.from_networkx(G)

# 打印图的一些基本信息
print("Number of nodes:", g.number_of_nodes())
print("Number of edges:", g.number_of_edges())

# 这里可以为节点和边添加特征，如果需要的话
# 例如，可以为节点添加一个特征张量
g.ndata['feat'] = torch.Tensor(h.to_numpy())
#node_features = torch.zeros(g.number_of_nodes(), 3)  # 假设每个节点有3个特征
#g.ndata['feat'] = node_features

# 如果需要为边添加特征
#edge_features = torch.zeros(g.number_of_edges(), 4)  # 假设每条边有4个特征
#g.edata['feat'] = edge_features
g.ndata['label'] = torch.tensor(meta['diagnosis'], dtype=torch.int64)
# 输出图的结构
print(g)
g = dgl.add_self_loop(g)
best_model_path = 'data(li)delete10\\result\\Models\\GCN_MME_model_Sep2'  # 替换为你的模型路径
# 加载最佳模型
best_model = GCN_MME(MME_input_shapes , [16 , 32,32] , 32 , [16]  , len(meta['diagnosis'].unique())).to(device)
checkpoint = torch.load(best_model_path )
best_model.load_state_dict(checkpoint['model_state_dict'])
best_model.eval()  # 设置模型为评估模式
test_logits = []
test_labels = []
test_dataloader = DataLoader(
    g,
    torch.arange(g.num_nodes()).to(device),
    sampler,
    device=device,
    batch_size=1024,
    shuffle=True,
    drop_last=False,
    num_workers=0,
    use_uva=False,
)
print(test_dataloader)
# Evaluate the model
test_output_metrics = evaluate(best_model , g , test_dataloader)

print(
    "Fold : {:01d} | Test Accuracy = {:.4f} | F1 = {:.4f} ".format(
    i+1 , test_output_metrics[1] , test_output_metrics[2] )
)

# 这里假设 test_output_metrics[-2] 和 test_output_metrics[-1] 是张量
test_logits.extend(test_output_metrics[-2].detach().cpu().numpy().tolist())  # 将 Tensor 转换为列表
test_labels.extend(test_output_metrics[-1].detach().cpu().numpy().tolist())   # 将 Tens
print(test_labels)
df = pd.DataFrame(test_labels, columns=['Predicted_Label'])

# 保存为 CSV 文件
df.to_csv('D:\\比较方法\\比较方法\\MOGDx-main(5)\\data(li)delete10\\predict.csv', index=False)    

Modality: exp, Shape: (16, 20789)
Modality: cna, Shape: (16, 16286)
Modality: mut, Shape: (16, 10086)
Number of nodes: 16
Number of edges: 120
Graph(num_nodes=16, num_edges=120,
      ndata_schemes={'feat': Scheme(shape=(47161,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={})
<dgl.dataloading.dataloader.DataLoader object at 0x000001A8C8B09450>
Fold : 4 | Test Accuracy = 0.4375 | F1 = 0.4738 
[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1]


In [203]:
# Stop the timer
end_time = time.time()

# Calculate and print the elapsed time
elapsed_time = (end_time - start_time)/60
print(f"Elapsed time: {elapsed_time} minutes")

Elapsed time: 4.640977084636688 minutes
