In [1]:
import numpy as np
import json
from scipy.sparse import load_npz
import pandas as pd
import os
from tqdm import tqdm

DGIDB_DIRECTORY = "../Gen_Hypergraph/output/DGIDB_BIPOLAR/"
DGIDB_CONVERGED_VECTOR_PATH = "./output/DGIDB_BIPOLAR/DGIDB_vector.npy"
MSIGDB_DIRECTORY = "../Gen_Hypergraph/output/MSigDB_FULL/"
OUTPUT_FOLDER = "./output/DGIDB_BIPOLAR/"
NCBI_INFO = pd.read_csv("../Data/ncbi/gene2refseq.gz", sep='\t', compression='gzip')
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
restart_prob = 0.2  # Restart probability (theta)
num_iterations = 10  # Number of iterationsh


  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# Open the JSON file and load its content into a dictionary
with open(DGIDB_DIRECTORY + "gene_to_index.json", "r") as file:
    dgidb = json.load(file)
with open(MSIGDB_DIRECTORY + "gene_to_index.json", "r") as file:
    msigdb = json.load(file)

In [3]:
import numpy as np
# Jump probability for matching genes
w = 1

# Number of genes (assuming they are both of same size or matchable)
num_genes_dgidb = len(dgidb)
num_genes_msigdb = len(msigdb)

# Initialize the inter-layer matrix (D) with zeros
D = np.zeros((num_genes_dgidb, num_genes_msigdb))
i = 0
# Build the inter-layer matrix (D)
for gene_dgidb, idx_dgidb in dgidb.items():
    # If the gene exists in both gene-to-index mappings
    if gene_dgidb in msigdb:      
        idx_msigdb = msigdb[gene_dgidb]
        D[idx_dgidb, idx_msigdb] = w  # Set jump probability
        i += 1
rows_with_high_sum = np.where(D.sum(axis=1) > 0)[0]

In [4]:
# Load matrices
MSIGDB_weighted_matrix = load_npz(MSIGDB_DIRECTORY + "hypergraph_incidence_matrix_weighted.npz")
MSIGDB_binary_matrix = load_npz(MSIGDB_DIRECTORY + "hypergraph_incidence_matrix_binary.npz")
DGIDB_binary_matrix = load_npz(DGIDB_DIRECTORY + "hypergraph_incidence_matrix_binary.npz")
DGIDB_vector =  np.load(DGIDB_CONVERGED_VECTOR_PATH)

num_genes_MSIGDB = MSIGDB_binary_matrix.shape[0]  # Number of genes in MSIGDB
# Initialize probability vectors
v0 = np.zeros(num_genes_MSIGDB)  # Combined vector for DGIDB and MSIGDB
teleport = np.zeros(num_genes_MSIGDB)  # Restart probability vector
# Initialize probability vectors
v0[:] = 1.0 / (num_genes_MSIGDB)  # Starting on DGIDB
teleport[:] = 1.0 / (num_genes_MSIGDB)

def get_hyper_randomwalk(DGIDB_binary_matrix, DGIDB_vector, MSIGDB_weighted_matrix, MSIGDB_binary_matrix, D, restart_prob, num_iterations):
    """Performs a hypergraph-based random walk with restart considering inter-layer transitions."""
    vi = v0.copy()  # Start with uniform probability (combined for both DGIDB and MSIGDB)
    distance_list = []
    # Initialize the complete vector with zeros
    dgidb_vector_complete = np.zeros(num_genes_MSIGDB)

    for gene in tqdm(range(num_genes_MSIGDB), desc="DGIDB full vector calculation"):
        # Intra-hypergraph transitions in MSIGDB (moving within MSIGDB)
        connected_pathways = MSIGDB_binary_matrix[gene, :].nonzero()[1]  # Nonzero columns in MSIGDB binary matrix

        if len(connected_pathways) == 0:
            continue  # Skip if no pathways are found

        for pathway in connected_pathways:
            # Find genes connected to the selected pathway (weighted transition in MSIGDB)
            connected_genes = MSIGDB_weighted_matrix[:, pathway].toarray().flatten()
            neighbor_genes = np.where(connected_genes > 0)[0]  # Get genes with nonzero weight

            # Check if the current gene has a DGIDB connection
            dgidb_gene = np.where(D[:, gene] > 0)[0]  # Find DGIDB neighbors of the current MSIGDB gene
            if len(dgidb_gene) == 1:
                dgidb_drugs = DGIDB_binary_matrix[dgidb_gene[0], :].nonzero()[1]
                neighbor_genes_set = set()  # To avoid duplicates
            
                for drug in dgidb_drugs:
                    # Get genes connected through the same drug (edge)
                    connected_genes = DGIDB_binary_matrix[:, drug].toarray().flatten()
                    neighbor_genes = np.where(connected_genes > 0)[0]
                    # Add unique neighbors to the set
                    neighbor_genes_set.update(neighbor_genes)

                # Sum contributions from unique DGIDB neighbors
                if len(neighbor_genes_set) > 0:
                    neighbor_genes_list = list(neighbor_genes_set)
                    dgidb_contribution = np.sum(DGIDB_vector[neighbor_genes_list])  # Sum unique contributions
                    dgidb_vector_complete[gene] += dgidb_contribution  # Store in the complete vector

    # Normalize dgidb_vector_complete to avoid overflow
    dgidb_vector_complete /= np.sum(dgidb_vector_complete) if np.sum(dgidb_vector_complete) > 0 else 1
    


    for k in range(num_iterations):
        print(f"{k+1} iteration")

        # Store previous probability vector
        vj = vi.copy()

        # Initialize new probability vector
        vi_new = np.zeros_like(vi)
        # Handle transitions for MSIGDB (second part of the vector)
        for gene in tqdm(range(num_genes_MSIGDB), desc="MSIGDB Gene Processing"):
            # Intra-hypergraph transitions in MSIGDB (moving within MSIGDB)
            connected_pathways = MSIGDB_binary_matrix[gene, :].nonzero()[1]  # Nonzero columns in MSIGDB binary matrix

            if len(connected_pathways) == 0:
                continue  # Skip if no pathways are found

            # Collect probabilities from neighbors in MSIGDB
            prob_sum = 0
            for pathway in connected_pathways:
                # Find genes connected to the selected pathway (weighted transition in MSIGDB)
                connected_genes = MSIGDB_weighted_matrix[:, pathway].toarray().flatten()
                neighbor_genes = np.where(connected_genes > 0)[0]  # Get genes with nonzero weight

                if len(neighbor_genes) == 0:
                    continue

                # Normalize weights within MSIGDB
                weights = connected_genes[neighbor_genes]
                weight_sum = np.sum(weights)
                if weight_sum > 0:
                    weights /= weight_sum  # Normalize to sum to 1

                # Transition probability contribution
                prob_sum += np.sum(weights * vj[neighbor_genes])


            # Add inter-hypergraph jump probabilities (from MSIGDB to DGIDB)
            vi_new[gene] = prob_sum
        print("vi_new", vi_new)
        # Normalize vi_new to avoid overflow
        vi_new /= np.sum(vi_new) if np.sum(vi_new) > 0 else 1

        # Apply restart probability
        vi = restart_prob * vi_new + (1 - restart_prob) * teleport + dgidb_vector_complete

        # Calculate distance (convergence criterion)
        distance = np.sum(np.abs(vj - vi))
        distance_list.append(distance)

    # Sort importance scores in descending order
    importance_scores = np.argsort(vi)[::-1]
    importance_values = vi[importance_scores]

    # Return importance scores and distance values
    return {"Importance": list(zip(importance_scores, importance_values)), "Distance": distance_list}

In [5]:
result = get_hyper_randomwalk(DGIDB_binary_matrix, DGIDB_vector, MSIGDB_weighted_matrix, MSIGDB_binary_matrix, D, restart_prob, num_iterations)
# Print results
print("Top Indices by Importance:")
for index, score in result["Importance"][:10]:
    print(f"Index {index}: {score:.6f}")

print("\nDistance per Iteration:")
print(result["Distance"])

DGIDB full vector calculation: 100%|██████████| 21981/21981 [12:14<00:00, 29.94it/s] 


1 iteration


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [17:05<00:00, 21.43it/s] 


vi_new [0.00081889 0.00086438 0.00300259 ... 0.0014558  0.0015013  0.00113735]
2 iteration


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [18:32<00:00, 19.75it/s] 


vi_new [0.00436917 0.01265123 0.01446418 ... 0.00543762 0.00585295 0.00491162]
3 iteration


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [18:53<00:00, 19.39it/s]  


vi_new [0.00442037 0.0128603  0.01465456 ... 0.00545228 0.00590244 0.00498851]
4 iteration


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [1:08:59<00:00,  5.31it/s] 


vi_new [0.00442296 0.01285729 0.01465804 ... 0.00545235 0.00590327 0.00499068]
5 iteration


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [24:44<00:00, 14.81it/s]  


vi_new [0.00442324 0.01285657 0.01465803 ... 0.00545237 0.00590323 0.00499074]
6 iteration


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [29:58<00:00, 12.22it/s]  


vi_new [0.00442328 0.01285648 0.01465802 ... 0.00545237 0.00590322 0.00499074]
7 iteration


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [29:34<00:00, 12.39it/s]  


vi_new [0.00442328 0.01285647 0.01465801 ... 0.00545237 0.00590322 0.00499074]
8 iteration


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [32:36<00:00, 11.24it/s]  


vi_new [0.00442328 0.01285646 0.01465801 ... 0.00545237 0.00590322 0.00499074]
9 iteration


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [37:29<00:00,  9.77it/s]   


vi_new [0.00442328 0.01285646 0.01465801 ... 0.00545237 0.00590322 0.00499074]
10 iteration


MSIGDB Gene Processing: 100%|██████████| 21981/21981 [24:27<00:00, 14.98it/s]  


vi_new [0.00442328 0.01285646 0.01465801 ... 0.00545237 0.00590322 0.00499074]
Top Indices by Importance:
Index 17056: 0.023415
Index 17461: 0.021858
Index 17490: 0.017392
Index 4555: 0.013329
Index 5636: 0.013108
Index 13094: 0.012905
Index 9127: 0.012025
Index 3023: 0.011670
Index 15885: 0.010955
Index 7417: 0.010756

Distance per Iteration:
[1.1637569640659267, 0.06925456361656208, 0.003885809671152212, 0.0004162002277098982, 5.112898988552348e-05, 6.552055973745666e-06, 8.53180614994132e-07, 1.1187416928834732e-07, 1.4716902981644006e-08, 1.9389203174735364e-09]


In [6]:
# Load the JSON data from the file
with open(MSIGDB_DIRECTORY + 'gene_to_index.json', 'r') as file:
    gene_to_index = json.load(file)

# Invert the dictionary to map indices back to genes
index_to_gene = {v: k for k, v in gene_to_index.items()}
human_gene2refseq = NCBI_INFO[NCBI_INFO['#tax_id'] == 9606]
id_to_gene_claim = pd.Series(human_gene2refseq.Symbol.values, index=human_gene2refseq.GeneID).to_dict()

def get_gene_claim_name(ncbi_gene_id):
    ncbi_gene_id = int(ncbi_gene_id)
    result = id_to_gene_claim[ncbi_gene_id]
    if result:
        return result
    else:
        return "Gene name not found"

In [7]:
results_df = pd.DataFrame(result["Importance"], columns=['Index', 'Score'])
results_df["ncbi_gene_id"] = results_df["Index"].apply(index_to_gene.get)
results_df["claim_name"] = results_df["ncbi_gene_id"].apply(get_gene_claim_name)
results_df.to_csv(OUTPUT_FOLDER + "unidirectional_multilayer_rwr_results_ORIGINAL.csv", index=False)