In [5]:
from sklearn.metrics import auc, roc_auc_score
from sklearn.metrics import precision_recall_curve
import pandas as pd
import numpy as np
import torch
import random
import scipy.sparse as sp
import os
def cal_auc(output, labels):
    outputTest = output.cpu().detach().numpy()
    outputTest = np.exp(outputTest)
    outputTest = outputTest[:,1]
    labelsTest = labels.cpu().numpy()
    AUROC = roc_auc_score(labelsTest, outputTest)
    precision, recall, _thresholds = precision_recall_curve(labelsTest, outputTest)
    AUPRC = auc(recall, precision)
    return AUROC,AUPRC

def _generate_G_from_H_weight(H, W):
    n_edge = H.shape[1]
    DV = np.sum(H * W, axis=1)  # the degree of the node
    DE = np.sum(H, axis=0)  # the degree of the hyperedge
    invDE = np.mat(np.diag(1/DE))
    DV2 = np.mat(np.diag(np.power(DV, -0.5)))
    W = np.mat(np.diag(W))
    H = np.mat(H)
    HT = H.T
    G = DV2 * H * W * invDE * HT * DV2
    return G
def generate_P_from_H_weight(H: np.ndarray, W: np.ndarray):
    """
    Generate the RWR-based propagation matrix P = D_u^{-1} * H * D_e^{-1} * W^T

    Args:
        H (np.ndarray): Incidence matrix of shape (n_nodes, n_edges)
        W (np.ndarray): Hyperedge weight vector of shape (n_nodes, n_edges)

    Returns:
        np.ndarray: Propagation matrix P of shape (n_nodes, n_edges)
    """
    H = np.array(H)
    W = np.array(W)
    assert not np.isnan(H).any(), "H has NaNs"
    assert not np.isinf(H).any(), "H has Infs"
    assert not np.isnan(W).any(), "W has NaNs"
    assert not np.isinf(W).any(), "W has Infs"
    # Compute hyperedge degrees
    DE = np.sum(H, axis=0)  # (m,)
    # invDE = np.diag(1.0 / DE)
    invDE = np.diag(np.where(DE != 0, 1.0 / DE, 0.0))
    # Compute node degrees
    DU = np.sum(H * W, axis=1)  # (n,)
    # invDU = np.diag(1.0 / DU)
    invDU = np.diag(np.where(DU != 0, 1.0 / DU, 0.0))

    # Final P = D_u^{-1} * H * D_e^{-1} * W^T
    P = invDU @ H @ invDE @ W.T
    assert not np.isnan(P).any(), "P has NaNs"
    assert not np.isinf(P).any(), "P has Infs"
    return P
def getData(positiveGenePath, negativeGenePath, geneList):
    positiveGene = pd.read_csv(positiveGenePath, header = None)
    positiveGene = list(positiveGene[0].values)
    positiveGene = list(set(geneList)&set(positiveGene))
    positiveGene.sort()
    negativeGene = pd.read_csv(negativeGenePath, header = None)     
    negativeGene = negativeGene[0]
    negativeGene = list(set(negativeGene)&set(geneList))
    negativeGene.sort()

    labelFrame = pd.DataFrame(data = [0]*len(geneList), index = geneList)
    labelFrame.loc[positiveGene,:] = 1
    positiveIndex = np.where(labelFrame == 1)[0]
    labelFrame.loc[negativeGene,:] = -1
    negativeIndex = np.where(labelFrame == -1)[0]
    labelFrame = pd.DataFrame(data = [0]*len(geneList), index = geneList)
    labelFrame.loc[positiveGene,:] = 1
    
    positiveIndex = list(positiveIndex)
    negativeIndex = list(negativeIndex)
    sampleIndex = positiveIndex + negativeIndex
    sampleIndex = np.array(sampleIndex)
    label = pd.DataFrame(data = [1]*len(positiveIndex) + [0]*len(negativeIndex))
    label = label.values.ravel()
    return  sampleIndex, label, labelFrame

def processingIncidenceMatrix(geneList):
    ids = ['c2','c5']
    incidenceMatrix = pd.DataFrame(index= geneList)
    for id in ids:
        geneSetNameList = pd.read_csv('./Data/'+id+'Name.txt',sep='\t',header=None)
        geneSetNameList = list(geneSetNameList[0].values)
        z=0
        idList = list()
        for name in geneSetNameList:
            idList.append(z)
            # if(id=='c2'):
            #     q = name.split('_')
            #     if('CANCER' in q or 'TUMOR' in q or 'NEOPLASM' in q):
            #         print(name)
            #     else:
            #         idList.append(z)
            # elif(name[:2]=='HP'):
            #     q = name.split('_')
            #     if('CANCER' in q or 'TUMOR' in q or 'NEOPLASM' in q):
            #         print(name)
            #     else:
            #         idList.append(z)
            # else:
            #     idList.append(z)
            z=z+1
        genesetData = sp.load_npz('./Data/'+id+'_GenesetsMatrix.npz')
        incidenceMatrixTemp = pd.DataFrame(data = genesetData.A,index= geneList)
        incidenceMatrixTemp = incidenceMatrixTemp.iloc[:,idList]

        incidenceMatrix = pd.concat([incidenceMatrix,incidenceMatrixTemp],axis=1)

    incidenceMatrix.columns = np.arange(incidenceMatrix.shape[1])
    return incidenceMatrix
def getWeightedIncidenceMatrix(incidenceMatrix):
    # Step 2: Load HumanNet
    HUMANNET = pd.read_csv("./Data/humannet_claim.tsv", sep='\t', header=None)
    HUMANNET.columns = ["Gene1", "Gene2", "Score"] if HUMANNET.shape[1] == 3 else ["Gene1", "Gene2"]

    # Step 3: Compute degrees
    gene_to_degree = {}
    for _, row in HUMANNET.iterrows():
        gene_to_degree[row["Gene1"]] = gene_to_degree.get(row["Gene1"], 0) + 1
        gene_to_degree[row["Gene2"]] = gene_to_degree.get(row["Gene2"], 0) + 1

   # Step 4: Apply weights
    weighted_matrix = incidenceMatrix.copy()
    
    for gene in weighted_matrix.index:
        # Convert to string if it's not already
        if isinstance(gene, list):
            gene = gene[0]  # or: gene = "_".join(gene)
        degree = gene_to_degree.get(str(gene), 0)
        weighted_matrix.loc[gene] = weighted_matrix.loc[gene] * degree

    # Step 5: Save to file
    # weighted_matrix.to_csv(output_path)
    # print(f"Saved weighted incidence matrix to {output_path}")
    return weighted_matrix

In [6]:
import torch
import torch.nn as nn
import math
from torch.nn.parameter import Parameter
import torch.nn.functional as F
from torch.nn.modules.module import Module
class RWR_HGNN_Prop(nn.Module):
    def __init__(self, alpha):
        super(RWR_HGNN_Prop, self).__init__()
        self.alpha = alpha

    def forward(self, H0, H_l, P):
        return self.alpha * H0 + (1 - self.alpha) * P.T.matmul(H_l)

class RWR_HGNN_Layer(nn.Module):
    def __init__(self, nhid, alpha, dropout=0.5):
        super(RWR_HGNN_Layer, self).__init__()
        self.propagate = RWR_HGNN_Prop(alpha)
        self.linear = nn.Linear(nhid, nhid)
        self.activation = nn.ReLU()
        self.dropout = dropout

    def forward(self, H0, x, P):
        h = self.propagate(H0, x, P)
        # Possibly to prevent oversmoothing
        h = h + x
        h = self.activation(h)
        h = F.dropout(h, p=self.dropout, training=self.training)
        return h
class DISHyperNet_RWR(nn.Module):
    def __init__(self, in_ch, n_hid, n_class, alpha=0.2, num_layers=3, dropout=0.5):
        super(DISHyperNet_RWR, self).__init__()
        self.dropout = dropout
        self.alpha = alpha
        self.fc = nn.Linear(in_ch, n_hid)
        self.layers = nn.ModuleList()
        for i in range(num_layers):
            self.layers.append(
                RWR_HGNN_Layer(n_hid, alpha, dropout=dropout)
            )
        self.outLayer = nn.Linear(n_hid, n_class)

    def forward(self, x, P):
        H0 = F.relu(self.fc(x))
        H0 = F.dropout(H0, self.dropout, training=self.training)
        x = H0
        for layer in self.layers:
            x = layer(H0, x, P)
        out = self.outLayer(x)
        return F.log_softmax(out, dim=1)


In [7]:
import pandas as pd
import os
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import random

def train_test(trainIndex, testIndex, labelFrame, incidenceMatrix, geneList, lr, epochs, dropout, n_hid, weight_decay):
    print("Starting train_test function...")

    print("Selecting training labels...")
    trainFrame = labelFrame.iloc[trainIndex]

    print("Finding positive training genes...")
    trainPositiveGene = list(trainFrame.where(trainFrame == 1).dropna().index)
    print(f"Number of positive genes in training set: {len(trainPositiveGene)}")

    print("Computing sum over selected rows in incidence matrix...")
    positiveMatrixSum = incidenceMatrix.loc[trainPositiveGene].sum()

    print("Selecting hyperedges with at least 3 positive gene hits...")
    selHyperedgeIndex = np.where(positiveMatrixSum >= 3)[0]
    print(f"Selected {len(selHyperedgeIndex)} hyperedges")

    selHyperedge = incidenceMatrix.iloc[:, selHyperedgeIndex]
    # RIGHT NOW THIS IS |E| x 1, NEEDS TO BE |V| x |E|
    hyperedgeWeight = positiveMatrixSum[selHyperedgeIndex].values
    print(hyperedgeWeight.shape)
    selHyperedgeWeightSum = incidenceMatrix.iloc[:, selHyperedgeIndex].values.sum(0)

    print("Computing normalized hyperedge weights...")
    hyperedgeWeight = hyperedgeWeight / selHyperedgeWeightSum

    print("Constructing incidence matrix H...")
    H = np.array(selHyperedge).astype('float')

    print("Computing DV vector and applying regularization to isolated nodes...")
    DV = np.sum(H * hyperedgeWeight, axis=1)
    for i in range(DV.shape[0]):
        if DV[i] == 0:
            t = random.randint(0, H.shape[1] - 1)
            H[i][t] = 0.0001

    print("Generating graph G from H and hyperedge weights...")
    print("COMPUTING P FIRST")
    W_matrix = getWeightedIncidenceMatrix(incidenceMatrix.iloc[:, selHyperedgeIndex])
    # W_matrix = np.array(selHyperedge).astype('float')
    print(W_matrix.shape)  
    # assert not np.isnan(W_matrix).any(), "W_matrix has NaNs"
    # assert not np.isinf(W_matrix).any(), "W_matrix has Infs"
    print(H.shape)
    assert H.shape == W_matrix.shape, f"Mismatch: H {H.shape} vs W {W_matrix.shape}"
    P = generate_P_from_H_weight(H, W_matrix)
    print(P.shape)
    # G = _generate_G_from_H_weight(H, hyperedgeWeight)
    # print(G.shape)
    N = H.shape[0]

    print("Preparing adjacency matrix and features...")
    # adj = torch.Tensor(G).float()
    adj = torch.Tensor(P).float()
    features = torch.eye(N).float()
    theLabels = torch.from_numpy(labelFrame.values.reshape(-1,))

    print("Initializing model...")
    model = DISHyperNet_RWR(in_ch=N, n_hid=n_hid, n_class=2, dropout=dropout)

    print("Setting up optimizer and learning rate scheduler...")
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    schedular = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 200, 300, 400], gamma=0.5)

    if torch.cuda.is_available():
        model.cuda()
        features = features.cuda()
        adj = adj.cuda()
        theLabels = theLabels.cuda()
    for epoch in range(epochs):
        print("EPOCH IS", epoch)
        model.train()
        optimizer.zero_grad() 
        output = model(features, adj)
        loss_train = F.nll_loss(output[trainIndex], theLabels[trainIndex]) 
        loss_train.backward()
        optimizer.step()
        schedular.step()
    model.eval()
    with torch.no_grad():
        output = model(features, adj)
        loss_test = F.nll_loss(output[testIndex], theLabels[testIndex])
        AUROC_val, AUPRC_val = cal_auc(output[testIndex], theLabels[testIndex])
        outputFrame = pd.DataFrame(data = output.exp().cpu().detach().numpy(), index = geneList)
    return AUROC_val, AUPRC_val, outputFrame

def trainPred(geneList, incidenceMatrix, positiveGenePath, negativeGenePath, lr, epochs, dropout, n_hid, weight_decay):
    aurocList = list()
    auprcList = list()
    evaluationRes = pd.DataFrame(index = geneList)
    for i in range(5):
        print(f"\n===== Outer Loop Iteration {i+1}/5 =====")
        print("🔄 Loading data...")
        sampleIndex, label, labelFrame = getData(positiveGenePath, negativeGenePath, geneList)

        print("🧮 Reshaping data for StratifiedKFold...")
        sk_X = sampleIndex.reshape([-1, 1])
        sfolder = StratifiedKFold(n_splits=5, random_state=i, shuffle=True)

        for fold_num, (train_index, test_index) in enumerate(sfolder.split(sk_X, label), start=1):
            print(f"\n📂 Fold {fold_num}/5 — splitting train/test sets...")
            trainIndex = sampleIndex[train_index]
            testIndex = sampleIndex[test_index]
            _, __ = label[train_index], label[test_index]

            print("🚀 Starting training and evaluation...")
            AUROC_val, AUPRC_val, outputFrame = train_test(
                trainIndex, testIndex, labelFrame, incidenceMatrix,
                geneList, lr, epochs, dropout, n_hid, weight_decay
            )

            print(f"✅ Finished training — AUROC: {AUROC_val.item():.4f}, AUPRC: {AUPRC_val.item():.4f}")
            aurocList.append(AUROC_val.item())
            auprcList.append(AUPRC_val.item())

            print("📊 Appending results to evaluation dataframe...")
            evaluationRes = pd.concat([evaluationRes, outputFrame[1]], axis=1)
    return aurocList, auprcList, evaluationRes

In [8]:
import pandas as pd
import sys, os, random
import numpy as np
import scipy.sparse as sp
import torch

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
 

if __name__ == "__main__":    
    _, outputPath = sys.argv
    lr = 1e-3
    dropout = 0.3
    weight_decay = 5e-6
    epochs = 500
    n_hid = 256
    
    positiveGenePath = r'./Data/BreastCancerOTGenes.txt'
    negativeGenePath = r'./Data/BreastCancerFalsegenes.txt'
    print("READING GENE LIST")


    geneList = pd.read_csv(r'./Data/geneList.csv', header=None)
    geneList = list(geneList[0].values)
    print("PROCESSING MATRIX")
    incidenceMatrix = processingIncidenceMatrix(geneList)
    print("TRAINING")

    aurocList, auprcList, evaluationRes = trainPred(geneList, incidenceMatrix, positiveGenePath,
                                          negativeGenePath, lr, epochs, dropout, n_hid, weight_decay) 
    predRes = evaluationRes.sum(1).sort_values(ascending = False) / 25
    predRes.to_csv("RWRCANCER.tsv",sep='\t', header = False)
    print(np.mean(aurocList)) # 0.936
    print(np.mean(auprcList)) # 0.894


READING GENE LIST
PROCESSING MATRIX
TRAINING

===== Outer Loop Iteration 1/5 =====
🔄 Loading data...
🧮 Reshaping data for StratifiedKFold...

📂 Fold 1/5 — splitting train/test sets...
🚀 Starting training and evaluation...
Starting train_test function...
Selecting training labels...
Finding positive training genes...
Number of positive genes in training set: 9708
Computing sum over selected rows in incidence matrix...
Selecting hyperedges with at least 3 positive gene hits...
Selected 20745 hyperedges
(20745,)
Computing normalized hyperedge weights...
Constructing incidence matrix H...
Computing DV vector and applying regularization to isolated nodes...
Generating graph G from H and hyperedge weights...
COMPUTING P FIRST
(17442, 20745)
(17442, 20745)




(17442, 17442)
Preparing adjacency matrix and features...
Initializing model...
Setting up optimizer and learning rate scheduler...
EPOCH IS 0
EPOCH IS 1
EPOCH IS 2
EPOCH IS 3
EPOCH IS 4
EPOCH IS 5
EPOCH IS 6
EPOCH IS 7
EPOCH IS 8
EPOCH IS 9
EPOCH IS 10
EPOCH IS 11
EPOCH IS 12
EPOCH IS 13
EPOCH IS 14
EPOCH IS 15
EPOCH IS 16
EPOCH IS 17
EPOCH IS 18
EPOCH IS 19
EPOCH IS 20
EPOCH IS 21
EPOCH IS 22
EPOCH IS 23
EPOCH IS 24
EPOCH IS 25
EPOCH IS 26
EPOCH IS 27
EPOCH IS 28
EPOCH IS 29
EPOCH IS 30
EPOCH IS 31
EPOCH IS 32
EPOCH IS 33
EPOCH IS 34
EPOCH IS 35
EPOCH IS 36
EPOCH IS 37
EPOCH IS 38
EPOCH IS 39
EPOCH IS 40
EPOCH IS 41
EPOCH IS 42
EPOCH IS 43
EPOCH IS 44
EPOCH IS 45
EPOCH IS 46
EPOCH IS 47
EPOCH IS 48
EPOCH IS 49
EPOCH IS 50
EPOCH IS 51
EPOCH IS 52
EPOCH IS 53
EPOCH IS 54
EPOCH IS 55
EPOCH IS 56
EPOCH IS 57
EPOCH IS 58
EPOCH IS 59
EPOCH IS 60
EPOCH IS 61
EPOCH IS 62
EPOCH IS 63
EPOCH IS 64
EPOCH IS 65
EPOCH IS 66
EPOCH IS 67
EPOCH IS 68
EPOCH IS 69
EPOCH IS 70
EPOCH IS 71
EPOCH IS 72
EP