## Parameter setting

In [None]:
import os
import time
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.autograd import Variable
from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, average_precision_score

from scipy.stats import pearsonr
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from utils import *
from model import *
from trainer import *
import torch.utils.data as Dataset
import argparse
import sys



# Training settings
# Keep only the program name so argparse falls back to the defaults declared
# below (presumably needed because the notebook launcher passes its own
# command-line arguments -- confirm if this is ever run as a plain script).
sys.argv = sys.argv[:1]
parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=0, help='Random seed.')
parser.add_argument('--epochs', type=int, default=150, help='Number of epochs to train.')
parser.add_argument('--lr', type=float, default=0.001, help='Initial learning rate.')
parser.add_argument('--batch', type=int, default=256, help='Number of batch size.')
parser.add_argument('--n_heads', type=int, default=3, help='Number of head attentions.')
parser.add_argument('--alpha', type=float, default=0.2, help='Alpha for the leaky_relu.')
parser.add_argument('--patience', type=int, default=10, help='')
parser.add_argument('--nOutputGAT_p', type=int, default=343, help='')
parser.add_argument('--nOutputGAT_c', type=int, default=86, help='')
parser.add_argument('--ntype', type=int, default=2, help='Number of predicted sample type.')
parser.add_argument('--d_k', type=int, default=32, help='')
parser.add_argument('--datapath', type=str, default="", help='')
parser.add_argument('--step_size', type=int, default=30, help='')
parser.add_argument('--gamma', type=float, default=0.5, help='')
parser.add_argument('--gain', type=float, default=1.414, help='')
parser.add_argument('--task', type=str, default="", help='Name of the task.')
args = parser.parse_args()

# Seed the NumPy and torch generators unconditionally so that --seed also
# takes effect on CPU-only machines; previously only the CUDA generator was
# seeded, and only when a GPU was present.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    torch.cuda.manual_seed(args.seed)
else:
    device = torch.device("cpu")

# --- Task-specific overrides of the parsed defaults -------------------------
activation_function = nn.ReLU()
args.task = "cancerdetection"
args.epochs = 30
args.batch = 256
args.lr = 1e-4
args.dropout_rate = 0.5

# Model hyper-parameters.
args.ntype = 2
args.d_k = 32
# The parser declares this option as --n_heads and the model construction
# reads args.n_heads, so the override has to target that attribute.
args.n_heads = 3
# Legacy alias: the original cell wrote the value to args.nheads.
args.nheads = args.n_heads
args.nOutputGAT_p = 343
args.nOutputGAT_c = 86
args.adp_hidden_dims = [100]
args.adp_code_dim = 50
args.adc_hidden_dims = [50]
args.adc_code_dim = 30
args.gain = 1.414
args.code_dropout = True
args.alpha = 0.2
pgraph_k = 2
cgraph_k = 2

# Input data: base names of the feature matrix and label set, and the
# directory they are read from.
fm = "Pancancer_disease_exp"
ss = "Pancancer_disease_label"
args.datapath = "/home/wjs/hdd/DualPRE/data/cancerdetection/"

# Early stopping (patience, early_stop_delta) and learning-rate schedule
# (step_size, gamma) settings.
args.patience = 10
args.gamma = 0.5
args.step_size = 30
args.early_stop_delta = 0.0001

# Run control.
num_folds = 1
save = True
drop_last = False
cv_results = []



## Training

In [None]:


# Train one DualPRE model per fold and collect a summary of every fold in
# cv_results.
for cv in range(1, num_folds + 1):
    print(f"Starting {cv}th fold of cross-validation")

    feature_train, y_train, feature_valid, y_valid = data_prepare_train(
        path=args.datapath,
        feature_matrix=fm,
        sampleset=ss
    )
    if save:
        # Persist the validation split of this fold. .cpu() is a no-op for
        # CPU tensors and keeps the export working if the tensors are on a GPU.
        os.makedirs("datasplit", exist_ok=True)
        np.savetxt(os.path.join("datasplit", f'feature_valid_{args.task}_cv{cv}.csv'), feature_valid.T.detach().cpu().numpy(), fmt='%.6f', delimiter=',')
        np.savetxt(os.path.join("datasplit", f'y_valid_{args.task}_cv{cv}.csv'), y_valid.detach().cpu().numpy(), fmt='%.6f', delimiter=',')

    # Inverse-frequency class weights, normalised to sum to 1. Counts are
    # clamped to 1 so that a class missing from y_train cannot produce an
    # infinite weight (and NaN weights after normalisation).
    counts = torch.bincount(y_train.long(), minlength=args.ntype)
    weights = 1 / counts.float().clamp(min=1)
    weights = weights / weights.sum()

    loss_func = nn.CrossEntropyLoss(weight=weights)
    loss_func.to(device)

    adp = Autodecoder(input_dim=args.nOutputGAT_p,
                      hidden_dims=args.adp_hidden_dims,
                      code_dim=args.adp_code_dim,
                      code_activation=True, dropout=args.code_dropout, dropout_rate=args.dropout_rate)

    adc = Autodecoder(input_dim=args.nOutputGAT_c,
                      hidden_dims=args.adc_hidden_dims,
                      code_dim=args.adc_code_dim,
                      code_activation=True, dropout=args.code_dropout, dropout_rate=args.dropout_rate)

    model = DualPRE(device,
                    Autodecoder_p=adp,
                    Autodecoder_c=adc,
                    npath=343,
                    ncell=86,
                    hidden=[args.adp_code_dim, args.adc_code_dim],
                    nOutputGAT_p=args.nOutputGAT_p,
                    nOutputGAT_c=args.nOutputGAT_c,
                    pgraph_k=pgraph_k,
                    cgraph_k=cgraph_k,
                    d_k=args.d_k,
                    nheads=args.n_heads,
                    ntype=args.ntype,
                    alpha=args.alpha,
                    activation_function=activation_function,
                    dropout_rate=args.dropout_rate).to(device)

    # Per-module learning rates: 1e-4 for the attention modules, 1e-3 for the
    # autodecoders and fully connected layers. NOTE: args.lr is not used by
    # the optimizer; in this loop it only appears in the checkpoint file name.
    param_groups = [
        {'params': module.parameters(), 'lr': group_lr}
        for module, group_lr in (
            (model.psample_MultiHead1, 1e-4),
            (model.psample_attentions1, 1e-4),
            (model.csample_MultiHead1, 1e-4),
            (model.csample_attentions1, 1e-4),
            (model.Autodecoder_p, 1e-3),
            (model.Autodecoder_c, 1e-3),
            (model.FClayer1, 1e-3),
            (model.FClayer2, 1e-3),
            (model.FClayer3, 1e-3),
        )
    ]
    optimizer = optim.Adam(param_groups, weight_decay=1e-5)

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)

    # valid_loss starts with a 0 sentinel, so the first epoch is compared
    # against 0 in the early-stopping check below.
    valid_loss = [0]
    bad_counter = 0
    t_total = time.time()
    # Handed to train() as best_value; presumably updated in place to track
    # the best metrics so far -- confirm in trainer.
    bv = [0, 1]
    # Defined up front so the summary below does not raise NameError when
    # args.epochs is 0.
    best_epoch, f1 = None, None

    if save:
        # Create the checkpoint directory once, before the first train() call
        # (which also receives save=True and may write into it -- confirm in
        # trainer).
        os.makedirs("./model", exist_ok=True)

    for epoch in range(args.epochs):
        print(f"Epoch {epoch+1}/{args.epochs} in fold {cv}")
        best_epoch, loss, f1 = train(epoch, model, device, optimizer, loss_func, feature_train, y_train, feature_valid, y_valid, args.batch, args.task, args.ntype, cv, best_value=bv, save=save, drop_last=drop_last)
        valid_loss.append(loss)

        # Early stopping: an epoch is "bad" when the loss returned by train()
        # moved by at most early_stop_delta compared with the previous entry;
        # any larger change (in either direction) resets the counter.
        if abs(valid_loss[-2] - valid_loss[-1]) <= args.early_stop_delta:
            bad_counter += 1
        else:
            bad_counter = 0
        if bad_counter >= args.patience:
            print(f"Early stopping triggered at epoch {epoch+1} in fold {cv}")
            break

        scheduler.step()

        if save:
            # Overwritten every epoch, so the file holds the most recent model.
            torch.save(model, f"./model/last_model_{args.task}_cv{cv}_B{args.batch}_L{args.lr}.model")

    print(f"Optimization Finished for fold {cv}. Total time: {time.time() - t_total:.4f}s")
    # NOTE: nothing is reloaded here; the message only reports best_epoch.
    print(f"Loading best epoch {best_epoch} for fold {cv}")

    # f1 is the value returned by the last train() call of this fold.
    # NOTE(review): confirm in trainer that this is the best validation F1
    # and not the F1 of the final epoch.
    cv_results.append({
        "fold": cv,
        "best_epoch": best_epoch,
        "best_validation_f1": f1
    })

print("Cross-validation results:", cv_results)

## External test

In [None]:


# Evaluate the best checkpoint of fold `cv` on the external GDC CPTAC-3
# cohort and write the predictions to pred/pred_ex_cancerdetection.csv.
cv = 1
feature_ex, y_ex, num_ex = data_prepare_external(
    path=args.datapath,
    feature_matrix="GDC CPTAC-3_disease_exp",
    sampleset="GDC CPTAC-3_disease_label",
    internal=False,
)

# The checkpoint name uses args.batch instead of a hard-coded 256, matching
# the batch size handed to train(); presumably train() embeds that value in
# the file name -- confirm in trainer.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
# SECURITY NOTE: torch.load unpickles the whole model object; only load
# checkpoints from a trusted source. Recent PyTorch releases may additionally
# require weights_only=False to load a fully pickled model.
model = torch.load(f'./model/best_model_{args.task}_cv{cv}_B{args.batch}.model', map_location=device)
pred, output_dict = compute_test(model, device, feature_ex, y_ex, args.batch, args.ntype, drop_last)

# Create the output directory first; np.savetxt does not create it.
os.makedirs("pred", exist_ok=True)
output_file1 = os.path.join("pred", 'pred_ex_cancerdetection.csv')
# detach() makes the conversion work even if pred still tracks gradients.
np.savetxt(output_file1, pred.detach().cpu().numpy(), fmt='%.6f', delimiter=',')