In [1]:
import numpy as np
import random
from scipy.sparse import load_npz

# Load matrices
binary_matrix = load_npz("../../Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/hypergraph_incidence_matrix_binary.npz")
weighted_matrix = load_npz("../../Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/hypergraph_incidence_matrix_weighted.npz")

# Parameters
restart_prob = 0.2  # Restart probability (theta)
num_iterations = 10  # Number of iterations
num_genes = binary_matrix.shape[0]  # Number of genes
num_drugs = binary_matrix.shape[1]  # Number of drugs

# Initialize probability vectors
v0 = np.ones(num_genes) / num_genes  # Initial uniform probability
teleport = np.ones(num_genes) / num_genes  # Restart probability vector

def get_hyper_randomwalk(binary_matrix, weighted_matrix, restart_prob, num_iterations):
    """Performs a hypergraph-based random walk with restart with proper normalization."""
    vi = v0.copy()  # Start with uniform probability
    distance_list = []

    for k in range(num_iterations):
        print(f"{k+1} iteration")

        # Store previous probability vector
        vj = vi.copy()

        # Initialize new probability vector
        vi_new = np.zeros(num_genes)

        for gene in range(num_genes):
            # Find drugs (hyperedges) connected to the current gene
            connected_drugs = binary_matrix[gene, :].nonzero()[1]  # Nonzero columns

            if len(connected_drugs) == 0:
                continue  # Skip if no drugs are found

            # Collect probabilities from neighbors
            prob_sum = 0
            for drug in connected_drugs:
                # Find genes connected to the selected drug (via weights)
                connected_genes = weighted_matrix[:, drug].toarray().flatten()
                neighbor_genes = np.where(connected_genes > 0)[0]  # Get genes with nonzero weight

                if len(neighbor_genes) == 0:
                    continue

                # Normalize weights
                weights = connected_genes[neighbor_genes]
                weight_sum = np.sum(weights)
                if weight_sum > 0:
                    weights /= weight_sum  # Normalize to sum to 1

                # Transition probability contribution
                prob_sum += np.sum(weights * vj[neighbor_genes])  

            vi_new[gene] = prob_sum  # Update probability for the gene

        # Normalize vi_new to avoid overflow
        vi_new /= np.sum(vi_new) if np.sum(vi_new) > 0 else 1

        # Apply restart probability
        vi = restart_prob * vi_new + (1 - restart_prob) * teleport

        # Calculate distance
        distance = np.sum(np.abs(vj - vi))
        distance_list.append(distance)

    # Sort importance scores in descending order
    importance_scores = np.argsort(vi)[::-1]
    importance_values = vi[importance_scores]

    # Return importance scores and distance values
    return {"Importance": list(zip(importance_scores, importance_values)), "Distance": distance_list}

# Run the random walk
result = get_hyper_randomwalk(binary_matrix, weighted_matrix, restart_prob, num_iterations)

# Print results
print("Top Genes by Importance:")
for gene, score in result["Importance"][:10]:
    print(f"Gene {gene}: {score:.6f}")

print("\nDistance per Iteration:")
print(result["Distance"])


1 iteration
2 iteration
3 iteration
4 iteration
5 iteration
6 iteration
7 iteration
8 iteration
9 iteration
10 iteration
Top Genes by Importance:
Gene 63: 0.003661
Gene 76: 0.003643
Gene 0: 0.003290
Gene 115: 0.002824
Gene 37: 0.002619
Gene 111: 0.002590
Gene 1076: 0.002590
Gene 1031: 0.002590
Gene 318: 0.002590
Gene 1029: 0.002590

Distance per Iteration:
[0.3737745604963806, 0.021338884202819652, 0.0067845337283496026, 0.00229106878154604, 0.0008357860517593382, 0.00033490177351925686, 0.0001453670667538425, 6.531599680891057e-05, 3.025454651723171e-05, 1.4441154384770016e-05]


In [2]:
import json

# Load the JSON data from the file
with open('../../Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/gene_to_index.json', 'r') as file:
    gene_to_index = json.load(file)

# Invert the dictionary to map indices back to genes
index_to_gene = {v: k for k, v in gene_to_index.items()}

def get_gene_by_index(index):
    return index_to_gene.get(index, "Index not found")


In [3]:
import pandas as pd
dgidb = pd.read_csv("../../Data/DGIDB/converted/human/dgidb_ncbi_v3.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
print(dgidb[(dgidb['ncbi_gene_id']) == 367])

      gene_claim_name gene_concept_id gene_name interaction_source_db_name  \
155                AR        hgnc:644        AR                        DTC   
178                AR        hgnc:644        AR                     ChEMBL   
196                AR        hgnc:644        AR                        DTC   
311                AR        hgnc:644        AR                        DTC   
352                AR        hgnc:644        AR                        DTC   
...               ...             ...       ...                        ...   
69386              AR        hgnc:644        AR                        DTC   
69470              AR        hgnc:644        AR                        DTC   
69478              AR        hgnc:644        AR                        DTC   
69520              AR        hgnc:644        AR                        DTC   
69535              AR        hgnc:644        AR                        DTC   

      interaction_source_db_version interaction_type  interacti

In [4]:
# Function to get the gene_claim_name from ncbi_gene_id
def get_gene_claim_name(ncbi_gene_id):
    ncbi_gene_id = int(ncbi_gene_id)
    # result = dgidb[dgidb['ncbi_gene_id'] == ncbi_gene_id]
    result = dgidb[(dgidb['ncbi_gene_id']) == ncbi_gene_id]
    if not result.empty:
        return result['gene_claim_name'].values[0]
    else:
        return "Gene name not found"
for gene, score in result["Importance"][:10]:
    print(gene)
    ncbi_gene = get_gene_by_index(gene)
    claim_name = get_gene_claim_name(ncbi_gene)
    print(f"Gene {ncbi_gene}, Claim Name: {claim_name} : {score:.6f}")

63
Gene 1576, Claim Name: CYP3A4 : 0.003661
76
Gene 1544, Claim Name: CYP1A2 : 0.003643
0
Gene 1565, Claim Name: CYP2D6 : 0.003290
115
Gene 1813, Claim Name: NCBIGENE:1813 : 0.002824
37
Gene 1559, Claim Name: CYP2C9 : 0.002619
111
Gene 6, Claim Name: NCBIGENE:6 : 0.002590
1076
Gene 215, Claim Name: NCBIGENE:215 : 0.002590
1031
Gene 12, Claim Name: NCBIGENE:12 : 0.002590
318
Gene 262, Claim Name: NCBIGENE:262 : 0.002590
1029
Gene 8, Claim Name: NCBIGENE:8 : 0.002590


In [None]:
import pandas as pd

# Assuming 'result' is a dictionary with "Importance" as a key containing a list of tuples (gene, score)
top_genes = result["Importance"]

# Create a DataFrame from the top genes
df_top_genes = pd.DataFrame(top_genes, columns=['Gene', 'Score'])

# Save the DataFrame to a TSV file
df_top_genes.to_csv('importance_scores.tsv', sep='\t', index=False)

print("Top genes by importance have been saved to 'importance_scores.tsv'")


Top genes by importance have been saved to 'top_genes_by_importance.tsv'
