In [1]:
import numpy as np
import json
from scipy.sparse import load_npz
import pandas as pd
import os
from tqdm import tqdm

DGIDB_DIRECTORY = "../Gen_Hypergraph/output/DGIDB_BIPOLAR/"
DGIDB_CONVERGED_VECTOR_PATH = "./output/DGIDB_BIPOLAR/DGIDB_vector.npy"
MSIGDB_DIRECTORY = "../Gen_Hypergraph/output/MSigDB_FULL/"
OUTPUT_FOLDER = "./output/DGIDB_BIPOLAR/"
NCBI_INFO = pd.read_csv("../Data/ncbi/gene2refseq.gz", sep='\t', compression='gzip')
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
restart_prob = 0.2  # Restart probability (theta)
num_iterations = 10  # Number of iterations

  interactivity=interactivity, compiler=compiler, result=result)


MemoryError: Unable to allocate 9.06 GiB for an array with shape (14, 86818407) and data type object

In [None]:
# Open the JSON file and load its content into a dictionary
with open(DGIDB_DIRECTORY + "gene_to_index.json", "r") as file:
    dgidb = json.load(file)
with open(MSIGDB_DIRECTORY + "gene_to_index.json", "r") as file:
    msigdb = json.load(file)

In [None]:
import numpy as np
# Jump probability for matching genes
w = 1

# Number of genes (assuming they are both of same size or matchable)
num_genes_dgidb = len(dgidb)
num_genes_msigdb = len(msigdb)

# Initialize the inter-layer matrix (D) with zeros
D = np.zeros((num_genes_dgidb, num_genes_msigdb))
i = 0
# Build the inter-layer matrix (D)
for gene_dgidb, idx_dgidb in dgidb.items():
    # If the gene exists in both gene-to-index mappings
    if gene_dgidb in msigdb:      
        idx_msigdb = msigdb[gene_dgidb]
        D[idx_dgidb, idx_msigdb] = w  # Set jump probability
        i += 1
rows_with_high_sum = np.where(D.sum(axis=1) > 0)[0]

In [None]:
# Load matrices
MSIGDB_weighted_matrix = load_npz(MSIGDB_DIRECTORY + "hypergraph_incidence_matrix_weighted.npz")
MSIGDB_binary_matrix = load_npz(MSIGDB_DIRECTORY + "hypergraph_incidence_matrix_binary.npz")
DGIDB_binary_matrix = load_npz(DGIDB_DIRECTORY + "hypergraph_incidence_matrix_binary.npz")
DGIDB_weighted_matrix = load_npz(DGIDB_DIRECTORY + "hypergraph_incidence_matrix_weighted.npz")

num_genes_DGIDB = DGIDB_binary_matrix.shape[0]  # Number of genes in DGIDB
num_genes_MSIGDB = MSIGDB_binary_matrix.shape[0]  # Number of genes in MSIGDB

# Initialize probability vectors
v0 = np.zeros(num_genes_DGIDB + num_genes_MSIGDB)  # Combined vector for DGIDB and MSIGDB
teleport = np.zeros(num_genes_DGIDB + num_genes_MSIGDB)  # Restart probability vector
print(num_genes_DGIDB + num_genes_MSIGDB)
# Initialize probability vectors
v0[:] = 1.0 / (num_genes_DGIDB + num_genes_MSIGDB)  # Starting on DGIDB
teleport[:] = 1.0 / (num_genes_DGIDB + num_genes_MSIGDB)

def get_hyper_randomwalk(DGIDB_binary_matrix, DGIDB_weighted_matrix, MSIGDB_weighted_matrix, MSIGDB_binary_matrix, D, restart_prob, num_iterations):
    """Performs a hypergraph-based random walk with restart considering inter-layer transitions."""
    vi = v0.copy()  # Start with uniform probability (combined for both DGIDB and MSIGDB)
    distance_list = []

    for k in range(num_iterations):
        print(f"{k+1} iteration")

        # Store previous probability vector
        vj = vi.copy()

        # Initialize new probability vector
        vi_new = np.zeros_like(vi)

        # Handle transitions for DGIDB (first part of the vector)
        for gene in range(num_genes_DGIDB):
            # Intra-hypergraph transitions in DGIDB (moving within DGIDB)
            connected_drugs = DGIDB_binary_matrix[gene, :].nonzero()[1]  # Nonzero columns in DGIDB binary matrix

            if len(connected_drugs) == 0:
                continue  # Skip if no drugs are found

            # Collect probabilities from neighbors in DGIDB
            prob_sum = 0
            for drug in connected_drugs:
                # Find genes connected to the selected drug (weighted transition in DGIDB)
                connected_genes = DGIDB_weighted_matrix[:, drug].toarray().flatten()
                neighbor_genes = np.where(connected_genes > 0)[0]  # Get genes with nonzero weight

                if len(neighbor_genes) == 0:
                    continue

                # Normalize weights within DGIDB
                weights = connected_genes[neighbor_genes]
                weight_sum = np.sum(weights)
                if weight_sum > 0:
                    weights /= weight_sum  # Normalize to sum to 1

                # Transition probability contribution
                prob_sum += np.sum(weights * vj[neighbor_genes])

            # Add inter-hypergraph jump probabilities (from DGIDB to MSIGDB)
            vi_new[gene] = prob_sum + np.sum(D[gene, :] * vj[num_genes_DGIDB:])  # Jump to MSIGDB part
        print("vi_new", vi_new)
        # Handle transitions for MSIGDB (second part of the vector)
        for gene in tqdm(range(num_genes_MSIGDB), desc="MSIGDB Gene Processing"):
            # Intra-hypergraph transitions in MSIGDB (moving within MSIGDB)
            connected_pathways = MSIGDB_binary_matrix[gene, :].nonzero()[1]  # Nonzero columns in MSIGDB binary matrix

            if len(connected_pathways) == 0:
                continue  # Skip if no pathways are found

            # Collect probabilities from neighbors in MSIGDB
            prob_sum = 0
            for pathway in connected_pathways:
                # Find genes connected to the selected pathway (weighted transition in MSIGDB)
                connected_genes = MSIGDB_weighted_matrix[:, pathway].toarray().flatten()
                neighbor_genes = np.where(connected_genes > 0)[0]  # Get genes with nonzero weight

                if len(neighbor_genes) == 0:
                    continue

                # Normalize weights within MSIGDB
                weights = connected_genes[neighbor_genes]
                weight_sum = np.sum(weights)
                if weight_sum > 0:
                    weights /= weight_sum  # Normalize to sum to 1

                # Transition probability contribution
                prob_sum += np.sum(weights * vj[num_genes_DGIDB + neighbor_genes])

            # Add inter-hypergraph jump probabilities (from MSIGDB to DGIDB)
            vi_new[num_genes_DGIDB + gene] = prob_sum + np.sum(D[:, gene] * vj[:num_genes_DGIDB])  # Jump to DGIDB part
        print("vi_new", vi_new)
        # Normalize vi_new to avoid overflow
        vi_new /= np.sum(vi_new) if np.sum(vi_new) > 0 else 1

        # Apply restart probability
        vi = restart_prob * vi_new + (1 - restart_prob) * teleport

        # Calculate distance (convergence criterion)
        distance = np.sum(np.abs(vj - vi))
        distance_list.append(distance)

    # Sort importance scores in descending order
    return {"Importance": vi, "Distance": distance_list}

In [None]:
result = get_hyper_randomwalk(DGIDB_binary_matrix, DGIDB_weighted_matrix, MSIGDB_weighted_matrix, MSIGDB_binary_matrix, D, restart_prob, num_iterations)

In [None]:
import pandas as pd

DGIDB_vector = result["Importance"][:num_genes_DGIDB]
MSIGDB_vector = result["Importance"][num_genes_DGIDB:]

DGIDB_Importance_df = pd.DataFrame({"Index": np.arange(len(DGIDB_vector)), "Score": DGIDB_vector})
MSIGDB_Importance_df = pd.DataFrame({"Index": np.arange(len(MSIGDB_vector)), "Score": MSIGDB_vector})

DGIDB_gene_to_index_df = pd.DataFrame({
    "ncbi_gene_id": list(dgidb.keys()),
    "Index": list(dgidb.values()),
})

MSIGDB_gene_to_index_df = pd.DataFrame({
    "ncbi_gene_id": list(msigdb.keys()),
    "Index": list(msigdb.values()),
})

DGIDB_df = pd.merge(DGIDB_Importance_df, DGIDB_gene_to_index_df, on="Index")
MSIGDB_df = pd.merge(MSIGDB_Importance_df, MSIGDB_gene_to_index_df, on="Index")


In [None]:
# Assuming DGIDB_df and MSIGDB_df both have 'gene' and 'Importance' columns
combined_df = pd.concat([DGIDB_df, MSIGDB_df])

# Group by 'gene' and sum the 'Importance' scores
final_df = combined_df.groupby("ncbi_gene_id", as_index=False).sum()

# Sort by 'Importance' in descending order
final_df = final_df.sort_values(by="Score", ascending=False)

# Reset the index
final_df = final_df.reset_index(drop=True)

In [None]:
human_gene2refseq = NCBI_INFO[NCBI_INFO['#tax_id'] == 9606]
id_to_gene_claim = pd.Series(human_gene2refseq.Symbol.values, index=human_gene2refseq.GeneID).to_dict()

def get_gene_claim_name(ncbi_gene_id):
    ncbi_gene_id = int(ncbi_gene_id)
    result = id_to_gene_claim[ncbi_gene_id]
    if result:
        return result
    else:
        return "Gene name not found"

In [None]:
final_df["claim_name"] = final_df["ncbi_gene_id"].apply(get_gene_claim_name)
results_df.to_csv(OUTPUT_FOLDER + "multilayer_rwr_results.csv", index=False)