In [1]:
import numpy as np
import random
from scipy.sparse import load_npz

# Load matrices
# binary_matrix = load_npz("../../Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/hypergraph_incidence_matrix_binary.npz")
# weighted_matrix = load_npz("../../Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/hypergraph_incidence_matrix_weighted.npz")
binary_matrix = load_npz("../../Data/hypergraphs/DGIDB_HumanNet/neighbor/bipolar_k=10_h=1/hypergraph_incidence_matrix_binary.npz")
weighted_matrix = load_npz("../../Data/hypergraphs/DGIDB_HumanNet/neighbor/bipolar_k=10_h=1/hypergraph_incidence_matrix_weighted.npz")
# Parameters
restart_prob = 0.2  # Restart probability (theta)
num_iterations = 10  # Number of iterations
num_genes = binary_matrix.shape[0]  # Number of genes
num_drugs = binary_matrix.shape[1]  # Number of drugs

# Initialize probability vectors
v0 = np.ones(num_genes) / num_genes  # Initial uniform probability
teleport = np.ones(num_genes) / num_genes  # Restart probability vector

def get_hyper_randomwalk(binary_matrix, weighted_matrix, restart_prob, num_iterations):
    """Performs a hypergraph-based random walk with restart with proper normalization."""
    vi = v0.copy()  # Start with uniform probability
    distance_list = []

    for k in range(num_iterations):
        print(f"{k+1} iteration")

        # Store previous probability vector
        vj = vi.copy()

        # Initialize new probability vector
        vi_new = np.zeros(num_genes)

        for gene in range(num_genes):
            # Find drugs (hyperedges) connected to the current gene
            connected_drugs = binary_matrix[gene, :].nonzero()[1]  # Nonzero columns

            if len(connected_drugs) == 0:
                continue  # Skip if no drugs are found

            # Collect probabilities from neighbors
            prob_sum = 0
            for drug in connected_drugs:
                # Find genes connected to the selected drug (via weights)
                connected_genes = weighted_matrix[:, drug].toarray().flatten()
                neighbor_genes = np.where(connected_genes > 0)[0]  # Get genes with nonzero weight

                if len(neighbor_genes) == 0:
                    continue

                # Normalize weights
                weights = connected_genes[neighbor_genes]
                weight_sum = np.sum(weights)
                if weight_sum > 0:
                    weights /= weight_sum  # Normalize to sum to 1

                # Transition probability contribution
                prob_sum += np.sum(weights * vj[neighbor_genes])  

            vi_new[gene] = prob_sum  # Update probability for the gene

        # Normalize vi_new to avoid overflow
        vi_new /= np.sum(vi_new) if np.sum(vi_new) > 0 else 1

        # Apply restart probability
        vi = restart_prob * vi_new + (1 - restart_prob) * teleport

        # Calculate distance
        distance = np.sum(np.abs(vj - vi))
        distance_list.append(distance)

    # Sort importance scores in descending order
    importance_scores = np.argsort(vi)[::-1]
    importance_values = vi[importance_scores]

    # Return importance scores and distance values
    return {"Importance": list(zip(importance_scores, importance_values)), "Distance": distance_list}

# Run the random walk
result = get_hyper_randomwalk(binary_matrix, weighted_matrix, restart_prob, num_iterations)

# Print results
print("Top Genes by Importance:")
for gene, score in result["Importance"][:10]:
    print(f"Gene {gene}: {score:.6f}")

print("\nDistance per Iteration:")
print(result["Distance"])


1 iteration
2 iteration
3 iteration
4 iteration
5 iteration
6 iteration
7 iteration
8 iteration
9 iteration
10 iteration
Top Genes by Importance:
Gene 85: 0.027567
Gene 0: 0.013777
Gene 97: 0.013641
Gene 189: 0.009792
Gene 47: 0.009143
Gene 200: 0.003215
Gene 17: 0.002779
Gene 528: 0.002384
Gene 162: 0.002104
Gene 184: 0.001750

Distance per Iteration:
[0.26320896710474495, 0.09055028722027039, 0.026738994035294218, 0.007051998697644948, 0.0019876219121969722, 0.0007312516986020385, 0.000302037993200335, 0.00013486144865043512, 6.167794887390732e-05, 2.8557755697946214e-05]


TRACEROUTE

In [None]:
import numpy as np
import random
from scipy.sparse import load_npz

# Load matrices
binary_matrix = load_npz("../../Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/hypergraph_incidence_matrix_binary.npz")
weighted_matrix = load_npz("../../Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/hypergraph_incidence_matrix_weighted.npz")

# Parameters
restart_prob = 0.2  # Restart probability (theta)
num_iterations = 10  # Number of iterations
num_genes = binary_matrix.shape[0]  # Number of genes
num_drugs = binary_matrix.shape[1]  # Number of drugs

# Initialize probability vectors
v0 = np.ones(num_genes) / num_genes  # Initial uniform probability for genes
teleport = np.ones(num_genes) / num_genes  # Restart probability vector

# Initialize importance vector for drugs

def get_hyperedge_importance(binary_matrix, weighted_matrix, restart_prob, num_iterations):
    """Performs a hypergraph-based random walk with restart and calculates edge importance."""
    vi = v0.copy()  # Start with uniform probability for genes
    distance_list = []
    drug_importance = np.zeros(num_drugs)  # To accumulate importance for drugs
    for k in range(num_iterations):
        print(f"{k+1} iteration")

        # Store previous probability vector
        vj = vi.copy()

        # Initialize new probability vectors
        vi_new = np.zeros(num_genes)
        drug_prob = np.zeros(num_drugs)  # To accumulate probability for drugs

        for gene in range(num_genes):
            # Find drugs (hyperedges) connected to the current gene
            connected_drugs = binary_matrix[gene, :].nonzero()[1]  # Nonzero columns

            if len(connected_drugs) == 0:
                continue  # Skip if no drugs are found

            # Collect probabilities from neighbors
            for drug in connected_drugs:
                # Find genes connected to the selected drug (via weights)
                connected_genes = weighted_matrix[:, drug].toarray().flatten()
                neighbor_genes = np.where(connected_genes > 0)[0]  # Get genes with nonzero weight

                if len(neighbor_genes) == 0:
                    continue

                # Normalize weights
                weights = connected_genes[neighbor_genes]
                weight_sum = np.sum(weights)
                if weight_sum > 0:
                    weights /= weight_sum  # Normalize to sum to 1

                # Transition probability contribution to drug
                prob_contribution = np.sum(weights * vj[neighbor_genes])
                drug_prob[drug] += prob_contribution  # Accumulate for the drug

        # Normalize drug probabilities
        drug_prob /= np.sum(drug_prob) if np.sum(drug_prob) > 0 else 1
        drug_importance += drug_prob  # Accumulate overall importance
        # Apply restart probability to drug
        vi_new = binary_matrix @ drug_prob  # Transition back to gene space

        # Normalize vi_new to avoid overflow
        vi_new /= np.sum(vi_new) if np.sum(vi_new) > 0 else 1

        # Apply restart probability to genes
        vi = restart_prob * vi_new + (1 - restart_prob) * teleport

        # Calculate distance
        distance = np.sum(np.abs(vj - vi))
        distance_list.append(distance)

    # Sort drug importance scores in descending order
    importance_scores = np.argsort(drug_importance)[::-1]
    importance_values = drug_importance[importance_scores]

    # Return importance scores and distance values
    return {"Importance": list(zip(importance_scores, importance_values)), "Distance": distance_list}

# Run the random walk to calculate edge importance
result = get_hyperedge_importance(binary_matrix, weighted_matrix, restart_prob, num_iterations)

# Print results
print("Top Drugs by Importance:")
for drug, score in result["Importance"][:10]:
    print(f"Drug {drug}: {score:.6f}")

print("\nDistance per Iteration:")
print(result["Distance"])


In [2]:
import json

# Load the JSON data from the file
with open('../../Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/gene_to_index.json', 'r') as file:
    gene_to_index = json.load(file)

# Invert the dictionary to map indices back to genes
index_to_gene = {v: k for k, v in gene_to_index.items()}

def get_gene_by_index(index):
    return index_to_gene.get(index, "Index not found")


In [3]:
import pandas as pd
dgidb = pd.read_csv("../../Data/DGIDB/converted/human/dgidb_ncbi_v2.csv")

In [4]:
print(dgidb[(dgidb['ncbi_gene_id']) == 367])

         gene_claim_name gene_concept_id gene_name interaction_source_db_name  \
195                   AR        hgnc:644        AR                        DTC   
224                   AR        hgnc:644        AR                     ChEMBL   
244                   AR        hgnc:644        AR                        DTC   
372                   AR        hgnc:644        AR                        DTC   
423                   AR        hgnc:644        AR                        DTC   
...                  ...             ...       ...                        ...   
87807                 AR        hgnc:644        AR                        DTC   
87849                 AR        hgnc:644        AR                        DTC   
87863                 AR        hgnc:644        AR                        DTC   
88142     UNIPROT:P10275        hgnc:644        AR           TdgClinicalTrial   
88237  ANDROGEN RECEPTOR        hgnc:644        AR                        TTD   

      interaction_source_db

In [6]:
# Function to get the gene_claim_name from ncbi_gene_id
def get_gene_claim_name(ncbi_gene_id):
    ncbi_gene_id = int(ncbi_gene_id)
    # result = dgidb[dgidb['ncbi_gene_id'] == ncbi_gene_id]
    result = dgidb[(dgidb['ncbi_gene_id']) == ncbi_gene_id]
    if not result.empty:
        return result['gene_name'].values[0]
    else:
        return "Gene name not found"
for gene, score in result["Importance"][:10]:
    print(gene)
    ncbi_gene = get_gene_by_index(gene)
    claim_name = get_gene_claim_name(ncbi_gene)
    print(f"Gene {ncbi_gene}, Claim Name: {claim_name} : {score:.6f}")

85
Gene 1576, Claim Name: CYP3A4 : 0.027567
0
Gene 1565, Claim Name: CYP2D6 : 0.013777
97
Gene 1544, Claim Name: CYP1A2 : 0.013641
189
Gene 1557, Claim Name: CYP2C19 : 0.009792
47
Gene 1559, Claim Name: CYP2C9 : 0.009143
200
Gene 3028, Claim Name: HSD17B10 : 0.003215
17
Gene 10919, Claim Name: EHMT2 : 0.002779
528
Gene 216, Claim Name: ALDH1A1 : 0.002384
162
Gene 367, Claim Name: AR : 0.002104
184
Gene 4780, Claim Name: NFE2L2 : 0.001750


In [7]:
import pandas as pd

# Assuming 'result' is a dictionary with "Importance" as a key containing a list of tuples (gene, score)
top_genes = result["Importance"]

# Create a DataFrame from the top genes
df_top_genes = pd.DataFrame(top_genes, columns=['Gene', 'Score'])

# Save the DataFrame to a TSV file
df_top_genes.to_csv('importance_scores.tsv', sep='\t', index=False)

print("Top genes by importance have been saved to 'importance_scores.tsv'")


Top genes by importance have been saved to 'importance_scores.tsv'
