In [45]:
import pandas as pd

if 'NCBI_INFO' not in globals():
    print("Reading gene2refseq.gz...")
    NCBI_INFO = pd.read_csv("../Data/ncbi/gene2refseq.gz", sep='\t', compression='gzip')
else:
    print("NCBI_INFO already loaded.")


NCBI_INFO already loaded.


In [46]:
import numpy as np
import json
from scipy.sparse import load_npz
import os
from tqdm import tqdm

DISEASE = "FULL"

DGIDB_DIRECTORY = f"../Gen_Hypergraph/output/DGIDB_{DISEASE}/"
DGIDB_CONVERGED_VECTOR_PATH = f"./output/DGIDB_{DISEASE}/DGIDB_vector.npy"
MSIGDB_DIRECTORY = "../Gen_Hypergraph/output/MSigDB_FULL/" 
OUTPUT_FOLDER = f"./output/DGIDB_{DISEASE}/"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
restart_prob = 0.2  # Restart probability (theta)
num_iterations = 10  # Number of iterations

In [47]:
# Open the JSON file and load its content into a dictionary
with open(DGIDB_DIRECTORY + "gene_to_index.json", "r") as file:
    dgidb = json.load(file)
with open(MSIGDB_DIRECTORY + "gene_to_index.json", "r") as file:
    msigdb = json.load(file)

In [48]:
import numpy as np
# Jump probability for matching genes
w = 1

# Number of genes (assuming they are both of same size or matchable)
num_genes_dgidb = len(dgidb)
num_genes_msigdb = len(msigdb)

# Initialize the inter-layer matrix (D) with zeros
D = np.zeros((num_genes_dgidb, num_genes_msigdb))
i = 0
# Build the inter-layer matrix (D)
for gene_dgidb, idx_dgidb in dgidb.items():
    # If the gene exists in both gene-to-index mappings
    if gene_dgidb in msigdb:      
        idx_msigdb = msigdb[gene_dgidb]
        D[idx_dgidb, idx_msigdb] = w  # Set jump probability
        i += 1
rows_with_high_sum = np.where(D.sum(axis=1) > 0)[0]

In [49]:
# Load matrices
MSIGDB_weighted_matrix = load_npz(MSIGDB_DIRECTORY + "hypergraph_incidence_matrix_weighted.npz")
MSIGDB_binary_matrix = load_npz(MSIGDB_DIRECTORY + "hypergraph_incidence_matrix_binary.npz")
DGIDB_binary_matrix = load_npz(DGIDB_DIRECTORY + "hypergraph_incidence_matrix_binary.npz")
DGIDB_weighted_matrix = load_npz(DGIDB_DIRECTORY + "hypergraph_incidence_matrix_weighted.npz")

num_genes_DGIDB = DGIDB_binary_matrix.shape[0]  # Number of genes in DGIDB
num_genes_MSIGDB = MSIGDB_binary_matrix.shape[0]  # Number of genes in MSIGDB

# Initialize probability vectors
v0 = np.zeros(num_genes_DGIDB + num_genes_MSIGDB)  # Combined vector for DGIDB and MSIGDB
teleport = np.zeros(num_genes_DGIDB + num_genes_MSIGDB)  # Restart probability vector
print(num_genes_DGIDB + num_genes_MSIGDB)
# Initialize probability vectors
v0[:] = 1.0 / (num_genes_DGIDB + num_genes_MSIGDB)  # Starting on DGIDB
teleport[:] = 1.0 / (num_genes_DGIDB + num_genes_MSIGDB)

def get_hyper_randomwalk(DGIDB_binary_matrix, DGIDB_weighted_matrix, MSIGDB_weighted_matrix, MSIGDB_binary_matrix, D, restart_prob, num_iterations):
    """Performs a hypergraph-based random walk with restart considering inter-layer transitions."""
    vi = v0.copy()  # Start with uniform probability (combined for both DGIDB and MSIGDB)
    distance_list = []

    for k in range(num_iterations):
        print(f"{k+1} iteration")

        # Store previous probability vector
        vj = vi.copy()

        # Initialize new probability vector
        vi_new = np.zeros_like(vi)

        # Handle transitions for DGIDB (first part of the vector)
        for gene in range(num_genes_DGIDB):
            # Intra-hypergraph transitions in DGIDB (moving within DGIDB)
            connected_drugs = DGIDB_binary_matrix[gene, :].nonzero()[1]  # Nonzero columns in DGIDB binary matrix

            if len(connected_drugs) == 0:
                continue  # Skip if no drugs are found

            # Collect probabilities from neighbors in DGIDB
            prob_sum = 0
            for drug in connected_drugs:
                # Find genes connected to the selected drug (weighted transition in DGIDB)
                connected_genes = DGIDB_weighted_matrix[:, drug].toarray().flatten()
                neighbor_genes = np.where(connected_genes > 0)[0]  # Get genes with nonzero weight

                if len(neighbor_genes) == 0:
                    continue

                # Normalize weights within DGIDB
                weights = connected_genes[neighbor_genes]
                weight_sum = np.sum(weights)
                if weight_sum > 0:
                    weights /= weight_sum  # Normalize to sum to 1

                # Transition probability contribution
                prob_sum += np.sum(weights * vj[neighbor_genes])

            # Add inter-hypergraph jump probabilities (from DGIDB to MSIGDB)
            vi_new[gene] = prob_sum + np.sum(D[gene, :] * vj[num_genes_DGIDB:])  # Jump to MSIGDB part
        print("vi_new", vi_new)
        # Handle transitions for MSIGDB (second part of the vector)
        for gene in tqdm(range(num_genes_MSIGDB), desc="MSIGDB Gene Processing"):
            # Intra-hypergraph transitions in MSIGDB (moving within MSIGDB)
            connected_pathways = MSIGDB_binary_matrix[gene, :].nonzero()[1]  # Nonzero columns in MSIGDB binary matrix

            if len(connected_pathways) == 0:
                continue  # Skip if no pathways are found

            # Collect probabilities from neighbors in MSIGDB
            prob_sum = 0
            for pathway in connected_pathways:
                # Find genes connected to the selected pathway (weighted transition in MSIGDB)
                connected_genes = MSIGDB_weighted_matrix[:, pathway].toarray().flatten()
                neighbor_genes = np.where(connected_genes > 0)[0]  # Get genes with nonzero weight

                if len(neighbor_genes) == 0:
                    continue

                # Normalize weights within MSIGDB
                weights = connected_genes[neighbor_genes]
                weight_sum = np.sum(weights)
                if weight_sum > 0:
                    weights /= weight_sum  # Normalize to sum to 1

                # Transition probability contribution
                prob_sum += np.sum(weights * vj[num_genes_DGIDB + neighbor_genes])

            # Add inter-hypergraph jump probabilities (from MSIGDB to DGIDB)
            vi_new[num_genes_DGIDB + gene] = prob_sum + np.sum(D[:, gene] * vj[:num_genes_DGIDB])  # Jump to DGIDB part
        print("vi_new", vi_new)
        # Normalize vi_new to avoid overflow
        vi_new /= np.sum(vi_new) if np.sum(vi_new) > 0 else 1

        # Apply restart probability
        vi = restart_prob * vi_new + (1 - restart_prob) * teleport

        # Calculate distance (convergence criterion)
        distance = np.sum(np.abs(vj - vi))
        distance_list.append(distance)

    # Sort importance scores in descending order
    return {"Importance": vi, "Distance": distance_list}

26755


In [50]:
# import torch
# from tqdm import tqdm

# # Use GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Using device:", device)

# # Convert sparse matrices to dense (or use PyTorch sparse support if memory limited)
# MSIGDB_weighted = torch.tensor(MSIGDB_weighted_matrix.toarray(), dtype=torch.float32, device=device)
# MSIGDB_binary = torch.tensor(MSIGDB_binary_matrix.toarray(), dtype=torch.float32, device=device)
# DGIDB_binary = torch.tensor(DGIDB_binary_matrix.toarray(), dtype=torch.float32, device=device)
# # DGIDB_vector = torch.tensor(DGIDB_vector, dtype=torch.float32, device=device)
# D = torch.tensor(D, dtype=torch.float32, device=device)

# num_genes_MSIGDB = MSIGDB_binary.shape[0]
# v0 = torch.full((num_genes_MSIGDB,), 1.0 / num_genes_MSIGDB, dtype=torch.float32, device=device)
# teleport = v0.clone()
# P = MSIGDB_weighted @ MSIGDB_weighted.T

In [51]:
import torch
from tqdm import tqdm

def get_hyper_randomwalk_cuda(
    DGIDB_binary_matrix, DGIDB_weighted_matrix,
    MSIGDB_weighted_matrix, MSIGDB_binary_matrix,
    D_np, restart_prob, num_iterations
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # Convert matrices to dense PyTorch tensors
    DGIDB_binary = torch.tensor(DGIDB_binary_matrix.toarray(), dtype=torch.float32, device=device)
    DGIDB_weighted = torch.tensor(DGIDB_weighted_matrix.toarray(), dtype=torch.float32, device=device)
    MSIGDB_binary = torch.tensor(MSIGDB_binary_matrix.toarray(), dtype=torch.float32, device=device)
    MSIGDB_weighted = torch.tensor(MSIGDB_weighted_matrix.toarray(), dtype=torch.float32, device=device)
    D = torch.tensor(D_np, dtype=torch.float32, device=device)

    num_genes_DGIDB = DGIDB_binary.shape[0]
    num_genes_MSIGDB = MSIGDB_binary.shape[0]

    total_genes = num_genes_DGIDB + num_genes_MSIGDB

    # Initial uniform distribution over all nodes
    v0 = torch.full((total_genes,), 1.0 / total_genes, dtype=torch.float32, device=device)
    vi = v0.clone()
    teleport = v0.clone()
    distance_list = []

    for k in range(num_iterations):
        print(f"Iteration {k + 1}")
        vj = vi.clone()
        vi_new = torch.zeros_like(vi)

        # DGIDB side
        for gene in range(num_genes_DGIDB):
            connected_drugs = torch.nonzero(DGIDB_binary[gene], as_tuple=False).flatten()
            if connected_drugs.numel() == 0:
                continue

            prob_sum = 0.0
            for drug in connected_drugs:
                connected_genes = DGIDB_weighted[:, drug]
                neighbor_genes = torch.nonzero(connected_genes > 0, as_tuple=False).flatten()

                if neighbor_genes.numel() == 0:
                    continue

                weights = connected_genes[neighbor_genes]
                weight_sum = weights.sum()
                if weight_sum > 0:
                    weights = weights / weight_sum
                    prob_sum += torch.sum(weights * vj[neighbor_genes])

            # Add inter-layer transitions
            vi_new[gene] = prob_sum + torch.sum(D[gene, :] * vj[num_genes_DGIDB:])

        # MSIGDB side
        for gene in tqdm(range(num_genes_MSIGDB), desc="MSIGDB Gene Processing"):
            connected_pathways = torch.nonzero(MSIGDB_binary[gene], as_tuple=False).flatten()
            if connected_pathways.numel() == 0:
                continue

            prob_sum = 0.0
            for pathway in connected_pathways:
                connected_genes = MSIGDB_weighted[:, pathway]
                neighbor_genes = torch.nonzero(connected_genes > 0, as_tuple=False).flatten()

                if neighbor_genes.numel() == 0:
                    continue

                weights = connected_genes[neighbor_genes]
                weight_sum = weights.sum()
                if weight_sum > 0:
                    weights = weights / weight_sum
                    prob_sum += torch.sum(weights * vj[num_genes_DGIDB + neighbor_genes])

            vi_new[num_genes_DGIDB + gene] = prob_sum + torch.sum(D[:, gene] * vj[:num_genes_DGIDB])

        # Normalize
        vi_new_sum = vi_new.sum()
        if vi_new_sum > 0:
            vi_new /= vi_new_sum

        # Apply restart
        vi = restart_prob * vi_new + (1 - restart_prob) * teleport

        # Convergence metric
        distance = torch.sum(torch.abs(vj - vi)).item()
        distance_list.append(distance)

    return {
        "Importance": vi.detach().cpu().numpy(),
        "Distance": distance_list
    }


In [52]:
result = get_hyper_randomwalk_cuda(DGIDB_binary_matrix, DGIDB_weighted_matrix, MSIGDB_weighted_matrix, MSIGDB_binary_matrix, D, restart_prob, num_iterations)

Using device: cuda
Iteration 1


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [02:20<00:00, 156.63it/s]


Iteration 2


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [02:14<00:00, 163.94it/s]


Iteration 3


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [02:28<00:00, 148.28it/s]


Iteration 4


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [02:11<00:00, 166.53it/s]


Iteration 5


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [02:29<00:00, 146.68it/s]


Iteration 6


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [02:10<00:00, 168.02it/s]


Iteration 7


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [02:23<00:00, 153.38it/s]


Iteration 8


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [02:18<00:00, 159.24it/s]


Iteration 9


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [02:18<00:00, 158.58it/s]


Iteration 10


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [02:26<00:00, 150.21it/s]


In [53]:
import pandas as pd

DGIDB_vector = result["Importance"][:num_genes_DGIDB]
MSIGDB_vector = result["Importance"][num_genes_DGIDB:]

DGIDB_Importance_df = pd.DataFrame({"Index": np.arange(len(DGIDB_vector)), "Score": DGIDB_vector})
MSIGDB_Importance_df = pd.DataFrame({"Index": np.arange(len(MSIGDB_vector)), "Score": MSIGDB_vector})

DGIDB_gene_to_index_df = pd.DataFrame({
    "ncbi_gene_id": list(dgidb.keys()),
    "Index": list(dgidb.values()),
})

MSIGDB_gene_to_index_df = pd.DataFrame({
    "ncbi_gene_id": list(msigdb.keys()),
    "Index": list(msigdb.values()),
})

DGIDB_df = pd.merge(DGIDB_Importance_df, DGIDB_gene_to_index_df, on="Index")
MSIGDB_df = pd.merge(MSIGDB_Importance_df, MSIGDB_gene_to_index_df, on="Index")


In [54]:
# Assuming DGIDB_df and MSIGDB_df both have 'gene' and 'Importance' columns
combined_df = pd.concat([DGIDB_df, MSIGDB_df])

# Group by 'gene' and sum the 'Importance' scores
final_df = combined_df.groupby("ncbi_gene_id", as_index=False).sum()

# Sort by 'Importance' in descending order
final_df = final_df.sort_values(by="Score", ascending=False)

# Reset the index
final_df = final_df.reset_index(drop=True)

In [55]:
human_gene2refseq = NCBI_INFO[NCBI_INFO['#tax_id'] == 9606]
id_to_gene_claim = pd.Series(human_gene2refseq.Symbol.values, index=human_gene2refseq.GeneID).to_dict()

def get_gene_claim_name(ncbi_gene_id):
    ncbi_gene_id = int(ncbi_gene_id)
    result = id_to_gene_claim[ncbi_gene_id]
    if result:
        return result
    else:
        return "Gene name not found"

In [56]:
final_df["claim_name"] = final_df["ncbi_gene_id"].apply(get_gene_claim_name)
final_df.to_csv(OUTPUT_FOLDER + "multilayer_rwr_results.csv", index=False)