In [1]:
import numpy as np
import random
from scipy.sparse import load_npz

# Load matrices
binary_matrix = load_npz("../Data/hypergraphs/DGIDB_HumanNet/hypergraph_incidence_matrix_binary.npz")
weighted_matrix = load_npz("../Data/hypergraphs/DGIDB_HumanNet/hypergraph_incidence_matrix_weighted.npz")

# Parameters
restart_prob = 0.2  # 20% chance to restart each hop
num_steps = 100  # Number of steps in the random walk
num_genes = binary_matrix.shape[0]  # Number of genes (rows in the matrix)
num_drugs = binary_matrix.shape[1]  # Number of drugs (columns in the matrix)

# Select a random starting gene (vertex)
def random_gene_selection():
    return random.randint(0, num_genes - 1)

# Perform random walk with restart
def random_walk_with_restart(binary_matrix, weighted_matrix, num_steps, restart_prob):
    # Select initial gene
    start_gene = random_gene_selection()
    current_gene = start_gene
    path = [current_gene]  # Store only genes in the path

    for _ in range(num_steps):
        # Restart with probability 0.2
        if random.random() < restart_prob:
            current_gene = start_gene  # Reset to initial gene
        
        # Find drugs (hyperedges) connected to the current gene
        connected_drugs = binary_matrix[current_gene, :].nonzero()[1]  # Get nonzero columns (drugs)
        if len(connected_drugs) == 0:
            continue  # Skip if no drugs are found

        # Randomly select a drug (hyperedge)
        selected_drug = random.choice(connected_drugs)

        # Find genes connected to the selected drug (via weights)
        connected_genes = weighted_matrix[:, selected_drug].toarray().flatten()  # Drug-to-gene weights
        next_genes = np.where(connected_genes > 0)[0]  # Find genes with nonzero weight

        if len(next_genes) == 0:
            continue  # Skip if no valid next genes

        # Select next gene based on weights
        weights = connected_genes[next_genes]
        next_gene = random.choices(next_genes, weights=weights)[0]

        # Update current gene
        current_gene = next_gene
        path.append([current_gene, selected_drug])

    return path

# Perform the random walk
walk_path = random_walk_with_restart(binary_matrix, weighted_matrix, num_steps, restart_prob)

# Print the walk path
print(f"Random Walk Path (length {len(walk_path)}):")
print(walk_path)


Random Walk Path (length 101):
[3410, [1684, 838], [207, 1692], [3, 478], [3, 6371], [3, 2586], [3, 1797], [2, 14339], [2, 8452], [3, 1272], [683, 1050], [2, 3087], [111, 847], [111, 4533], [111, 4813], [13, 6360], [13, 13160], [13, 4642], [13, 3525], [2, 838], [30, 8355], [1403, 838], [186, 838], [863, 838], [60, 3398], [60, 6324], [1096, 838], [111, 1416], [111, 13295], [1712, 838], [423, 838], [423, 4307], [168, 838], [50, 309], [50, 9169], [30, 14306], [3000, 838], [4961, 838], [3000, 838], [111, 838], [111, 2377], [111, 13551], [111, 14285], [30, 117], [30, 13231], [30, 8541], [30, 13667], [198, 6213], [77, 622], [3, 1157], [3, 3485], [1674, 838], [784, 838], [784, 7042], [784, 15513], [1507, 813], [1507, 813], [1507, 813], [1507, 813], [1407, 838], [412, 18331], [60, 11070], [60, 8150], [60, 2404], [2, 886], [430, 838], [30, 239], [30, 16618], [1479, 838], [872, 2529], [830, 838], [830, 10983], [0, 197], [932, 838], [932, 16419], [111, 838], [30, 2685], [30, 1723], [574, 838], [7

In [21]:
import numpy as np
import random
from scipy.sparse import load_npz

# Load matrices
binary_matrix = load_npz("../Data/hypergraphs/DGIDB_HumanNet/human/hypergraph_incidence_matrix_binary.npz")
weighted_matrix = load_npz("../Data/hypergraphs/DGIDB_HumanNet/human/hypergraph_incidence_matrix_weighted.npz")

# Parameters
restart_prob = 0.2  # Restart probability (theta)
num_iterations = 10  # Number of iterations
num_genes = binary_matrix.shape[0]  # Number of genes
num_drugs = binary_matrix.shape[1]  # Number of drugs

# Initialize probability vectors
v0 = np.ones(num_genes) / num_genes  # Initial uniform probability
teleport = np.ones(num_genes) / num_genes  # Restart probability vector

def get_hyper_randomwalk(binary_matrix, weighted_matrix, restart_prob, num_iterations):
    """Performs a hypergraph-based random walk with restart with proper normalization."""
    vi = v0.copy()  # Start with uniform probability
    distance_list = []

    for k in range(num_iterations):
        print(f"{k+1} iteration")

        # Store previous probability vector
        vj = vi.copy()

        # Initialize new probability vector
        vi_new = np.zeros(num_genes)

        for gene in range(num_genes):
            # Find drugs (hyperedges) connected to the current gene
            connected_drugs = binary_matrix[gene, :].nonzero()[1]  # Nonzero columns

            if len(connected_drugs) == 0:
                continue  # Skip if no drugs are found

            # Collect probabilities from neighbors
            prob_sum = 0
            for drug in connected_drugs:
                # Find genes connected to the selected drug (via weights)
                connected_genes = weighted_matrix[:, drug].toarray().flatten()
                neighbor_genes = np.where(connected_genes > 0)[0]  # Get genes with nonzero weight

                if len(neighbor_genes) == 0:
                    continue

                # Normalize weights
                weights = connected_genes[neighbor_genes]
                weight_sum = np.sum(weights)
                if weight_sum > 0:
                    weights /= weight_sum  # Normalize to sum to 1

                # Transition probability contribution
                prob_sum += np.sum(weights * vj[neighbor_genes])  

            vi_new[gene] = prob_sum  # Update probability for the gene

        # Normalize vi_new to avoid overflow
        vi_new /= np.sum(vi_new) if np.sum(vi_new) > 0 else 1

        # Apply restart probability
        vi = restart_prob * vi_new + (1 - restart_prob) * teleport

        # Calculate distance
        distance = np.sum(np.abs(vj - vi))
        distance_list.append(distance)

    # Sort importance scores in descending order
    importance_scores = np.argsort(vi)[::-1]
    importance_values = vi[importance_scores]

    # Return importance scores and distance values
    return {"Importance": list(zip(importance_scores, importance_values)), "Distance": distance_list}

# Run the random walk
result = get_hyper_randomwalk(binary_matrix, weighted_matrix, restart_prob, num_iterations)

# Print results
print("Top Genes by Importance:")
for gene, score in result["Importance"][:10]:
    print(f"Gene {gene}: {score:.6f}")

print("\nDistance per Iteration:")
print(result["Distance"])


1 iteration
2 iteration
3 iteration
4 iteration
5 iteration
6 iteration
7 iteration
8 iteration
9 iteration
10 iteration
Top Genes by Importance:
Gene 135: 0.028428
Gene 152: 0.016031
Gene 14: 0.006491
Gene 63: 0.006399
Gene 239: 0.003673
Gene 88: 0.003294
Gene 76: 0.003217
Gene 0: 0.003171
Gene 473: 0.002997
Gene 156: 0.002711

Distance per Iteration:
[0.23142034632750544, 0.09536370029313398, 0.0410156124472083, 0.018559649457316738, 0.013260046311327973, 0.00983804012819986, 0.007125118479572831, 0.005042331511836427, 0.003505226050412345, 0.0024091235526530956]


In [22]:
import json

# Load the JSON data from the file
with open('../Data/hypergraphs/DGIDB_HumanNet/human/gene_to_index.json', 'r') as file:
    gene_to_index = json.load(file)

# Invert the dictionary to map indices back to genes
index_to_gene = {v: k for k, v in gene_to_index.items()}

def get_gene_by_index(index):
    return index_to_gene.get(index, "Index not found")


In [24]:
import pandas as pd
dgidb = pd.read_csv("../Data/DGIDB/converted/human/dgidb_ncbi_v3.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [27]:
print(dgidb[(dgidb['ncbi_gene_id']) == 367])

      gene_claim_name gene_concept_id gene_name interaction_source_db_name  \
155                AR        hgnc:644        AR                        DTC   
178                AR        hgnc:644        AR                     ChEMBL   
196                AR        hgnc:644        AR                        DTC   
311                AR        hgnc:644        AR                        DTC   
352                AR        hgnc:644        AR                        DTC   
...               ...             ...       ...                        ...   
69386              AR        hgnc:644        AR                        DTC   
69470              AR        hgnc:644        AR                        DTC   
69478              AR        hgnc:644        AR                        DTC   
69520              AR        hgnc:644        AR                        DTC   
69535              AR        hgnc:644        AR                        DTC   

      interaction_source_db_version interaction_type  interacti

In [40]:
# Function to get the gene_claim_name from ncbi_gene_id
def get_gene_claim_name(ncbi_gene_id):
    ncbi_gene_id = int(ncbi_gene_id)
    # result = dgidb[dgidb['ncbi_gene_id'] == ncbi_gene_id]
    result = dgidb[(dgidb['ncbi_gene_id']) == ncbi_gene_id]
    if not result.empty:
        return result['gene_claim_name'].values[0]
    else:
        return "Gene name not found"
for gene, score in result["Importance"][:10]:
    print(gene)
    ncbi_gene = get_gene_by_index(gene)
    claim_name = get_gene_claim_name(ncbi_gene)
    print(f"Gene {ncbi_gene}, Claim Name: {claim_name} : {score:.6f}")

135
Gene 367, Claim Name: AR : 0.028428
152
Gene 4780, Claim Name: NFE2L2 : 0.016031
14
Gene 10919, Claim Name: EHMT2 : 0.006491
63
Gene 1576, Claim Name: CYP3A4 : 0.006399
239
Gene 51053, Claim Name: GMNN : 0.003673
88
Gene 7421, Claim Name: VDR : 0.003294
76
Gene 1544, Claim Name: CYP1A2 : 0.003217
0
Gene 1565, Claim Name: CYP2D6 : 0.003171
473
Gene 216, Claim Name: ALDH1A1 : 0.002997
156
Gene 1557, Claim Name: CYP2C19 : 0.002711


In [44]:
for gene, score in result["Importance"]:
    if gene == 315:
        print(f"Gene {gene}: {score:.6f}")



Gene 315: 0.001781
