In [39]:
import numpy as np
import json
from scipy.sparse import load_npz
import pandas as pd
import os
# --- Run configuration -------------------------------------------------------
# Dataset sub-folder; the trailing slash is part of the value because the
# paths below are built by plain string concatenation.
FOLDER = "DGIDB_NEOPLASM_BREAST/"
HYPERGRAPH_DIRECTORY = f"../Gen_Hypergraph/output/{FOLDER}"
OUTPUT_FOLDER = f"./output/{FOLDER}"

# Drug-to-gene table (tab separated), used later to look up gene names.
DGIDB = pd.read_csv("../Data/DGIDB/DrugToGene.tsv", sep="\t")

# Create the output folder up front so later cells can write into it.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Random-walk parameters handed to get_hyper_randomwalk.
restart_prob = 0.2  # theta
num_iterations = 10  # number of update steps

In [40]:
# Incidence matrices of the hypergraph: one row per gene, one column per drug.
binary_matrix = load_npz(f"{HYPERGRAPH_DIRECTORY}hypergraph_incidence_matrix_binary.npz")
weighted_matrix = load_npz(f"{HYPERGRAPH_DIRECTORY}hypergraph_incidence_matrix_weighted.npz")

# Rows are genes, columns are drugs.
num_genes, num_drugs = binary_matrix.shape

# Uniform distribution over genes, used both as the starting vector (v0)
# and as the restart / teleport vector.
v0 = np.ones(num_genes) / num_genes
teleport = v0.copy()

def get_hyper_randomwalk(binary_matrix, weighted_matrix, restart_prob, num_iterations):
    """Score genes with an iterative random walk over a gene-by-drug hypergraph.

    Each iteration computes, for every drug (column), the weight-normalised
    average of the current gene scores over the genes that have a positive
    weight for that drug.  A gene's new raw score is the sum of those per-drug
    values over all drugs it is connected to in ``binary_matrix``.  The raw
    scores are normalised to sum to 1 (when their sum is positive) and then
    blended with a uniform teleport vector.

    Parameters
    ----------
    binary_matrix : scipy.sparse matrix, shape (genes, drugs)
        Incidence matrix; only its non-zero pattern is used.
    weighted_matrix : scipy.sparse matrix, shape (genes, drugs)
        Incidence weights; only strictly positive entries contribute.
        Any numeric dtype is accepted (values are converted to float64).
    restart_prob : float
        Blend factor.  The propagated vector is weighted by ``restart_prob``
        and the uniform teleport vector by ``1 - restart_prob``.
    num_iterations : int
        Number of update steps; no early stopping is performed.

    Returns
    -------
    dict
        ``"Importance"``: list of ``(gene_index, score)`` pairs sorted by
        descending score; ``"Distance"``: list with the L1 change of the score
        vector at each iteration; ``"unsorted"``: the final score vector in
        gene-index order.

    Raises
    ------
    ValueError
        If the two matrices do not have the same shape.
    """
    if binary_matrix.shape != weighted_matrix.shape:
        raise ValueError(
            "binary_matrix and weighted_matrix must have the same shape, got "
            f"{binary_matrix.shape} and {weighted_matrix.shape}"
        )

    # Derive sizes and the uniform start/teleport vector from the arguments
    # rather than from notebook-level globals, so the function is self-contained.
    gene_count = binary_matrix.shape[0]
    uniform = np.ones(gene_count) / max(gene_count, 1)

    # Gene -> drug membership as a 0/1 float matrix (private copy of the input).
    membership = binary_matrix.tocsr().astype(np.float64)
    membership.sum_duplicates()
    membership.data = (membership.data != 0).astype(np.float64)
    membership.eliminate_zeros()

    # Keep only strictly positive weights.  Working on a float64 copy avoids the
    # in-place division error that integer-typed weights would otherwise cause.
    positive_weights = weighted_matrix.tocsc().astype(np.float64)
    positive_weights.sum_duplicates()
    positive_weights.data[~(positive_weights.data > 0)] = 0.0
    positive_weights.eliminate_zeros()

    # Loop-invariant pieces: per-drug weight totals and the drug-by-gene view.
    drug_weight_totals = np.asarray(positive_weights.sum(axis=0)).ravel()
    drug_by_gene = positive_weights.T.tocsr()

    vi = uniform.copy()
    distance_list = []

    for k in range(num_iterations):
        print(f"{k+1} iteration")

        # Score vector from the previous step
        vj = vi

        # Weight-normalised score carried by each drug; drugs without any
        # positive weight contribute nothing.
        drug_mass = drug_by_gene @ vj
        drug_score = np.divide(
            drug_mass,
            drug_weight_totals,
            out=np.zeros_like(drug_mass),
            where=drug_weight_totals > 0,
        )

        # Each gene collects the scores of the drugs it belongs to.
        vi_new = membership @ drug_score

        # Normalise to a distribution unless nothing was propagated at all.
        total = vi_new.sum()
        if total > 0:
            vi_new = vi_new / total

        # NOTE(review): the propagated vector is weighted by restart_prob and the
        # teleport vector by (1 - restart_prob).  With the conventional meaning of
        # "restart probability" the two factors would be swapped; kept as-is so
        # existing results stay reproducible -- confirm the intended convention.
        vi = restart_prob * vi_new + (1 - restart_prob) * uniform

        # L1 change between consecutive score vectors
        distance_list.append(float(np.sum(np.abs(vj - vi))))

    # Gene indices ordered by descending score, paired with their scores
    importance_scores = np.argsort(vi)[::-1]
    importance_values = vi[importance_scores]

    return {
        "Importance": list(zip(importance_scores, importance_values)),
        "Distance": distance_list,
        "unsorted": vi,
    }

In [41]:

# Execute the walk with the notebook-level matrices and parameters
result = get_hyper_randomwalk(
    binary_matrix,
    weighted_matrix,
    restart_prob,
    num_iterations,
)

# Show the ten highest-scoring gene indices
print("Top Indices by Importance:")
top_ranked = result["Importance"][:10]
for gene_index, gene_score in top_ranked:
    print(f"Index {gene_index}: {gene_score:.6f}")

# Show how much the score vector changed at each iteration
print("\nDistance per Iteration:")
print(result["Distance"])


1 iteration
2 iteration
3 iteration
4 iteration
5 iteration
6 iteration
7 iteration
8 iteration
9 iteration
10 iteration
Top Indices by Importance:
Index 364: 0.002598
Index 56: 0.002443
Index 216: 0.001937
Index 183: 0.001711
Index 821: 0.001706
Index 53: 0.001445
Index 251: 0.001445
Index 27: 0.001422
Index 300: 0.001408
Index 658: 0.001357

Distance per Iteration:
[0.35341432760787606, 0.019550141197298383, 0.005287998534421963, 0.0016093878942900525, 0.0005455201090555145, 0.00020375258754663355, 7.840489800299392e-05, 3.066825866072801e-05, 1.2113100382427587e-05, 4.807507626255548e-06]


In [42]:
# Read the gene -> matrix-row mapping written alongside the hypergraph
with open(f"{HYPERGRAPH_DIRECTORY}gene_to_index.json", "r") as file:
    gene_to_index = json.load(file)

# Reverse lookup: matrix row -> gene identifier
index_to_gene = dict(zip(gene_to_index.values(), gene_to_index.keys()))
def get_gene_claim_name(ncbi_gene_id):
    """Return the ``gene_name`` of the first DGIDB row matching an NCBI gene id.

    Parameters
    ----------
    ncbi_gene_id : int, str, or anything ``int()`` accepts
        Identifier compared against the ``ncbi_gene_id`` column of ``DGIDB``.

    Returns
    -------
    The ``gene_name`` value of the first matching row, or the string
    ``"Gene name not found"`` when there is no match or the identifier cannot
    be converted to an integer (for example ``None`` or NaN produced by a
    failed index lookup).
    """
    try:
        gene_id = int(ncbi_gene_id)
    except (TypeError, ValueError, OverflowError):
        # Missing or malformed ids are reported with the same sentinel as
        # unknown ids instead of aborting a whole DataFrame.apply pass.
        return "Gene name not found"

    matches = DGIDB.loc[DGIDB["ncbi_gene_id"] == gene_id, "gene_name"]
    if matches.empty:
        return "Gene name not found"
    return matches.iloc[0]

In [43]:
# One row per gene, already ordered by descending score
results_df = pd.DataFrame(result["Importance"], columns=["Index", "Score"])

# Translate matrix row -> NCBI gene id -> gene name
gene_ids = results_df["Index"].apply(index_to_gene.get)
results_df = results_df.assign(
    ncbi_gene_id=gene_ids,
    claim_name=gene_ids.apply(get_gene_claim_name),
)

# Persist the ranked table
results_df.to_csv(f"{OUTPUT_FOLDER}single_layer_rwr_results.csv", index=False)

In [44]:
# Store the final score vector in gene-index order for later reuse
np.save(f"{OUTPUT_FOLDER}DGIDB_vector.npy", result["unsorted"])
