In [21]:
import argparse

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sched import scheduler
from data.data_loader import load_ft
from model.HGCN import HGCN
from model.TMO import TMO
from utils.hypergraph_utils import gen_trte_inc_mat


In [22]:
class AverageMeter(object):
    """
    Computes and stores the average and current value
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


In [23]:

# 4. 修改train_epoch函数，添加设备检查
def train_epoch(data_list, g_list, label, model, optimizer, scheduler_dict, epoch, idx_tr=[]):
    # 添加设备验证
    print(f"Model device: {next(model.parameters()).device}")
    for i, data in enumerate(data_list):
        print(f"Data {i} device: {data.device}")
    for i, g in enumerate(g_list):
        print(f"G {i} device: {g.device}")
    print(f"Label device: {label.device}")
    
    # 强制所有输入到CPU（双重保险）
    data_list = [data.cpu() for data in data_list]
    g_list = [g.cpu() for g in g_list]
    label = label.cpu()
    model = model.cpu()
    
    scheduler_dict.step()
    model.train()
    loss_meter = AverageMeter()
    criterion = torch.nn.CrossEntropyLoss()

    optimizer.zero_grad()
    if len(data_list) >= 2:
        evidence_a, loss = model(data_list, g_list, label, epoch, idx_tr)
    else:
        ci = model(data_list[0], g_list[0])
        loss = torch.mean(criterion(ci[idx_tr], label[idx_tr]))

    loss.backward()
    optimizer.step()
    loss_meter.update(loss.item())
    return loss_meter.avg


In [24]:
def test_epoch(data_list, label, g_list, te_idx, model, epoch, idx_list_all):
    """
    :param data_list: The omics features 
    :param label: Sample labels
    :param g_list: The laplace incidence matrix
    :param te_idx: The index of test set
    :param model: The HyperTMO model
    :param epoch: Current training epoch
    :param idx_list_all: The index of dataset
    """
    model.eval()
    loss_meter = AverageMeter()
    with torch.no_grad():
        if len(data_list) >= 2:
            evidence_a, loss = model(data_list, g_list, label, epoch, idx_list_all)
            loss_meter.update(loss.item())
        else:
            evidence_a = model(data_list[0], g_list[0])
    c = evidence_a[te_idx, :]
    prob = F.softmax(c, dim=1).data.cpu().numpy()
    return prob


In [25]:
def train_model(data_tensor_list, model, g_list, labels_tensor, criterion, optimizer, scheduler, num_epochs, print_freq, 
                  idx_dict, num_class):
    """
    :param data_tensor_list: The omics features
    :param model: The HGCN model
    :param g_list: The laplace incidence matrix
    :param labels_tensor: Sample labels
    :param optimizer: Training optimizer, Adam optimizer
    :param criterion: Cross-entropy criterion
    :param num_epochs: The epochs
    :param print_freq: Print frequency
    :param idx_dict: The index of train set and test set
    :param num_class: Number of classes
    """
    best_acc = 0.0
    best_f1 = 0.0
    best_auc =0.0
    best_macro = 0.0
    best_test_indices = None
    best_true_labels = None
    best_predictions = None
    for epoch in range(num_epochs):
        if epoch % print_freq == 0:
            print('-' * 10)
            print(f'Epoch {epoch}/{num_epochs - 1}')

        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            running_corrects = 0
            idx = idx_dict['tr'] if phase == 'train' else idx_dict['te']
            optimizer.zero_grad()
            with torch.set_grad_enabled(phase == 'train'):
                outputs = model(data_tensor_list[0], g_list[0])
                _, preds = torch.max(outputs, 1)
                if phase == 'train':
                    loss = torch.mean(criterion(outputs[idx], labels_tensor[idx]))
                    loss.backward()
                    optimizer.step()
                if epoch % 200 == 0:
                    print()
            running_loss += loss.item() * data_tensor_list[0].size(0)
            running_corrects += torch.sum(preds[idx] == labels_tensor.data[idx])
            epoch_loss = running_loss / len(idx)
            epoch_acc = running_corrects.double() / len(idx)
            if epoch % print_freq == 0:
                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
            if phase == 'val' and epoch_acc > best_acc:          
                best_acc = epoch_acc
                best_epoch = epoch
                # 保存最佳模型
                torch.save(model, "./data(li)delete10/last_model.model")
                # 记录最佳模型时的测试集索引、真实结果和预测结果
                best_test_indices = idx_dict['te']
                best_true_labels = labels_tensor[idx_dict['te']].cpu().numpy()
                best_predictions = preds[idx_dict['te']].cpu().numpy()
            if f1_score(labels_tensor[idx_dict["te"]].cpu(), preds[idx_dict['te']].cpu(), average='weighted') > best_f1:
                best_f1 = f1_score(labels_tensor[idx_dict["te"]].cpu(), preds[idx_dict['te']].cpu(), average='weighted')
            if (f1_score(labels_tensor[idx_dict["te"]].cpu(), preds[idx_dict['te']].cpu(), average='macro') > best_macro and num_class > 2):
                best_macro = f1_score(labels_tensor[idx_dict["te"]].cpu(), preds[idx_dict['te']].cpu(), average='macro')
            if (num_class == 2 and roc_auc_score(labels_tensor[idx_dict["te"]].cpu(), F.softmax(outputs[idx_dict["te"]], dim=1).data.cpu().numpy()[:, 1]) > best_auc  ):
                best_auc = roc_auc_score(labels_tensor[idx_dict["te"]].cpu(), F.softmax(outputs[idx_dict["te"]], dim=1).data.cpu().numpy()[:, 1])
        if epoch % print_freq == 0:
            print(f'Best val Acc: {best_acc:4f}')
            print('-' * 20)
    print(f'Best val Acc: {best_acc:4f} in {best_epoch}')
    print(f'Best val f1: {best_f1:4f}')
    return best_acc.cpu(), best_f1, best_macro, best_auc,best_test_indices,best_true_labels,best_predictions,best_epoch

In [26]:
parser = argparse.ArgumentParser()
#parser.add_argument('--file_dir', '-fd', type=str, default='训练', help='The dataset file folder.')
parser.add_argument('--seed', '-s', type=int, default=20, help='Random seed, default=20.')
parser.add_argument('--num_epoch', '-ne', type=int, default=500, help='Training epochs, default: 40000.')
parser.add_argument('--lr_e', '-lr', type=float, default=0.001, help='Learning rate, default: 0.001.')
parser.add_argument('--dim_he_list', '-dh', nargs = '+', type=int, default=[400, 200, 200], help='Hidden layer dimension of HGCN.')
parser.add_argument('--num_class', '-nc', type=int, default=4, help='Number of classes.')
parser.add_argument('--k_neigs', '-kn', type=int, default=4, help='Number of vertices in hyperedge.')
#args = parser.parse_args()
args, unknown = parser.parse_known_args()
print(args)

Namespace(seed=20, num_epoch=500, lr_e=0.001, dim_he_list=[400, 200, 200], num_class=4, k_neigs=4)


In [27]:





data_folder = 'data(li)delete10'
omics_list = ['cna','exp','mut']
test_inverval = 50
num_omics = len(omics_list)
cuda = False  # 强制使用CPU
idx_dict = {}
file_dir='训练'
data_tensor_list, labels_tensor = load_ft(data_folder, omics_list, file_dir)
# 确保所有数据在CPU上
data_tensor_list = [x.cpu() for x in data_tensor_list]
labels_tensor = labels_tensor.cpu()
dim_list = [x.shape[1] for x in data_tensor_list]
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=args.seed)

acc_res, F1_res, AUC_res = [],[],[]# 初始化结果存储

for idx_train, idx_test in skf.split(pd.DataFrame(data=data_tensor_list[0].cpu()), 
                                      pd.DataFrame(labels_tensor.cpu())):
    # 在训练集中随机删除10%的样本
    num_train_samples = len(idx_train)
    num_samples_to_delete = int(0.1 * num_train_samples)
    
    samples_to_delete = np.random.choice(idx_train, size=num_samples_to_delete, replace=False)
    idx_train_reduced = np.setdiff1d(idx_train, samples_to_delete)
    
    # 修改g_list创建，移除.cuda()
    g_list = []
    g = gen_trte_inc_mat(data_tensor_list, args.k_neigs)
    for i in range(len(data_tensor_list)):
        g_tensor = torch.Tensor(g[i]).cpu()  # 明确使用CPU
        g_list.append(g_tensor)
    idx_list_all = list(range(g_list[0].shape[0]))  # 定义 idx_list_all
    # 模型也保持在CPU上
    if num_omics >= 2:
      model_dict = TMO(dim_list, args.num_class, num_omics, args.dim_he_list)
    else:
      model_dict = HGCN(dim_list[0], args.num_class, args.dim_he_list)
    # 2. 确保模型在CPU上
    model_dict = model_dict.cpu()
    print("\nTraining on CPU...")
    optim_dict = torch.optim.Adam(model_dict.parameters(), lr=args.lr_e, weight_decay=0.0005)

    scheduler_dict = torch.optim.lr_scheduler.MultiStepLR(optim_dict, milestones=[100], gamma=0.9)
    
    best_acc = 0.0
    best_f1 = 0.0
    best_macro = 0.0
    best_auc = 0.0
    idx_dict["tr"] = idx_train_reduced  # 使用删减后的训练集
    idx_dict["te"] = idx_test           # 测试集保持不变

    if num_omics >= 2:
        for epoch in range(args.num_epoch + 1):
            train_epoch(data_tensor_list, g_list, labels_tensor,
                        model_dict, optim_dict, scheduler_dict, epoch=epoch, idx_tr=idx_dict["tr"])
            te_prob = test_epoch(data_tensor_list, labels_tensor, g_list, idx_dict["te"], 
                                 model_dict, epoch, idx_list_all)

            # 计算当前模型在测试集上的性能
            current_acc = accuracy_score(labels_tensor[idx_dict["te"]].cpu(), te_prob.argmax(1))
            current_f1 = f1_score(labels_tensor[idx_dict["te"]].cpu(), te_prob.argmax(1), average='weighted')
            current_macro = f1_score(labels_tensor[idx_dict["te"]].cpu(), te_prob.argmax(1), average='macro') if args.num_class > 2 else 0
            current_auc = roc_auc_score(labels_tensor[idx_dict["te"]].cpu(), te_prob[:, 1]) if args.num_class == 2 else 0

            # 更新最佳模型及其相关信息
            if current_acc > best_acc:
                best_acc = current_acc
                best_epoch = epoch
                best_test_indices = idx_dict["te"]
                best_true_labels = labels_tensor[idx_dict["te"]].cpu().numpy()
                best_predictions = te_prob.argmax(1) 
                torch.save(model_dict.state_dict(), "./data(li)delete10/best_model.model")

            # 更新其他最佳指标
            best_f1 = max(best_f1, current_f1)
            best_macro = max(best_macro, current_macro)
            best_auc = max(best_auc, current_auc)

            if epoch % test_inverval == 0:
                print("\nTest: Epoch {:d}".format(epoch))
                print("Test ACC: {:.3f}".format(current_acc))
                print("Test F1: {:.3f}".format(current_f1))
                if args.num_class == 2:
                    print("Test AUC: {:.3f}".format(current_auc))
                print("Best Test ACC: {:.3f}".format(best_acc))

        # 保存每次交叉验证的结果
        F1_res.append(best_f1)
        acc_res.append(best_acc)
        AUC_res.append(best_auc)

    else:
        # 处理只有一个组学的情况（同样需要修改训练索引）
        criterion = torch.nn.CrossEntropyLoss()
        best_acc, best_f1, best_macro, best_auc, best_test_indices, best_true_labels, best_predictions, best_epoch = train_model(
            model_dict, criterion, optim_dict, scheduler_dict, args.num_epoch, 50, data_tensor_list, g_list, labels_tensor, idx_dict, args.num_class)

        F1_res.append(best_f1)
        acc_res.append(best_acc)
        AUC_res.append(best_macro)

# 输出交叉验证的最终性能
print('3-fold performance: Acc(%.4f ± %.4f)  F1(%.4f ± %.4f)  AUC/F1_mac(%.4f ± %.4f)'
      % (np.mean(acc_res), np.std(acc_res), np.mean(F1_res), np.std(F1_res),
         np.mean(AUC_res), np.std(AUC_res)))
print('Finished!')

Constructing hypergraph incidence matrix! 
(It may take several minutes! Please wait patiently!)
Constructing hypergraph incidence matrix! 
(It may take several minutes! Please wait patiently!)
Constructing hypergraph incidence matrix! 
(It may take several minutes! Please wait patiently!)

Training on CPU...
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu


  index = torch.tensor(nearest_idx).long()



Test: Epoch 0
Test ACC: 0.846
Test F1: 0.776
Best Test ACC: 0.846
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: 

  index = torch.tensor(nearest_idx).long()



Training on CPU...
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu

Test: Epoch 0
Test ACC: 0.846
Test F1: 0.776
Best Test ACC: 0.846
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 devi

  index = torch.tensor(nearest_idx).long()



Test: Epoch 0
Test ACC: 0.769
Test F1: 0.669
Best Test ACC: 0.769
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: cpu
G 1 device: cpu
G 2 device: cpu
Label device: cpu
Model device: cpu
Data 0 device: cpu
Data 1 device: cpu
Data 2 device: cpu
G 0 device: 

In [28]:
best_test_indices = pd.DataFrame(best_test_indices, columns=['best_test_indices'])
best_true_labels = pd.DataFrame(best_true_labels, columns=['best_true_labels'])
best_predictions = pd.DataFrame(best_predictions, columns=['best_predictions'])
merged_data = pd.concat([best_test_indices, best_true_labels, best_predictions], axis=1)
# 导出为 CSV 文件
merged_data.to_csv('data(li)delete10/训练/prediction.csv', index=False)

In [29]:
data_folder='data(li2)'
file_dir='外部测试'
# 假设你已经加载了外部数据
external_data_tensor_list, external_labels_tensor = load_ft(data_folder, omics_list, file_dir)
if num_omics >= 2:
    model_dict = TMO(dim_list, args.num_class, num_omics, args.dim_he_list)
else:
    model_dict = HGCN(dim_list[0], args.num_class, args.dim_he_list)

# 将模型移动到 GPU（如果使用 GPU）
if cuda:
    model_dict.cuda()

# 加载模型的状态字典
model_dict.load_state_dict(torch.load('data(li)delete10\\best_model.model'))

# 切换到评估模式
model_dict.eval()
# 创建 Laplace 关联矩阵（假设 g_list 和 idx_list_all 是必须的）
g_list = []  # 需要根据你的数据生成 Laplace 关联矩阵
g = gen_trte_inc_mat(external_data_tensor_list, args.k_neigs)
for i in range(len(external_data_tensor_list)):
    g_list.append(torch.Tensor(g[i]).cpu())

idx_list_all = list(range(g_list[0].shape[0]))  # 所有数据的索引


# 使用训练好的模型进行测试
te_prob = test_epoch(external_data_tensor_list, external_labels_tensor, g_list, idx_list_all, 
                     model_dict, epoch=0, idx_list_all=idx_list_all)
te_prob = torch.from_numpy(te_prob).cpu()
# 获取预测类别
_, preds = torch.max(te_prob, 1)
# 计算并输出准确率和F1分数
_, preds = torch.max(te_prob, 1)  # 获取预测类别
external_acc = accuracy_score(external_labels_tensor.cpu(), preds.cpu())
external_f1 = f1_score(external_labels_tensor.cpu(), preds.cpu(), average='weighted')

print(f'External Test Acc: {external_acc:.3f}')
print(f'External Test F1: {external_f1:.3f}')

Constructing hypergraph incidence matrix! 
(It may take several minutes! Please wait patiently!)
Constructing hypergraph incidence matrix! 
(It may take several minutes! Please wait patiently!)
Constructing hypergraph incidence matrix! 
(It may take several minutes! Please wait patiently!)
External Test Acc: 0.750
External Test F1: 0.643


  model_dict.load_state_dict(torch.load('data(li)delete10\\best_model.model'))
  index = torch.tensor(nearest_idx).long()


In [30]:
# 将预测结果写入文件
preds_np = preds.cpu().numpy()  # 转换为 NumPy 数组
labels_np = external_labels_tensor.cpu().numpy()  # 转换为 NumPy 数组

# 创建 DataFrame
results_df = pd.DataFrame({
    'True Labels': labels_np,
    'Predicted Labels': preds_np
})

# 写入 CSV 文件
results_df.to_csv('data(li)delete10/外部测试/predictions.csv', index=False)
print('预测结果已写入 predictions.csv')

预测结果已写入 predictions.csv
