### Creates a subhypergraph from the original hypergraph

In [39]:
import pandas as pd
import os
from scipy.sparse import dok_matrix, save_npz, load_npz
# Parameters
# K   : how many rows from the top of the score table are used as seed genes.
# HOP : how many gene -> drug -> gene expansion rounds are run from the seeds.
K = 10
HOP = 1
OUTPUT_PATH = f"./Data/hypergraphs/DGIDB_HumanNet/neighbor/bipolar_k={K}_h={HOP}_NO_CYP/"
# exist_ok=True creates the directory when it is missing and is a no-op when it
# already exists, without a separate (race-prone) existence check.
os.makedirs(OUTPUT_PATH, exist_ok=True)
# loading data only really need the scores
# binary_bipolar_matrix = load_npz("./Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/hypergraph_incidence_matrix_binary.npz")
# weighted_bipolar_matrix = load_npz("./Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/hypergraph_incidence_matrix_weighted.npz")
# The bipolar gene to index and the full gene to index are assumed to be IDENTICAL
bipolar_scores = pd.read_csv("./Random_Walk_v1/PPI_Weighted/complete_data/importance_scores_BIPOLAR_complete.tsv", sep="\t")
# Take the "index" value of the first K rows, by position.
# NOTE(review): this presumes the TSV is already sorted by importance - confirm.
top_k_indices = bipolar_scores["index"].iloc[:K].to_list()
print("top k indices", top_k_indices)
# Full matrices probably don't change
binary_full_matrix = load_npz("./Data/hypergraphs/DGIDB_HumanNet/human/undirected/full/hypergraph_incidence_matrix_binary.npz")
weighted_full_matrix = load_npz("./Data/hypergraphs/DGIDB_HumanNet/human/undirected/full/hypergraph_incidence_matrix_weighted.npz")
print(binary_full_matrix.shape)


top k indices [97, 85, 0, 31, 47, 366, 1333, 1299, 463, 518]
(4774, 17908)


In [43]:
# REMOVE CYP and insert the next one
# Walks the score table from the top, printing each row's claim_name and
# collecting its "index" value until K rows have been collected.
# NOTE(review): the header above and the "NO_CYP" suffix of OUTPUT_PATH suggest
# CYP genes should be skipped, but the filter below is commented out, so CYP
# rows ARE currently selected. Confirm which behaviour is intended.
top_k_indices = []
for position in range(len(bipolar_scores)):
    # Check the quota before appending so that K <= 0 selects nothing
    # instead of running through the whole table.
    if len(top_k_indices) >= K:
        break
    record = bipolar_scores.iloc[position]  # materialise the row once
    # if record["claim_name"].startswith("CYP"):
    #     continue
    print(record["claim_name"])
    top_k_indices.append(record["index"])
print(top_k_indices)

CYP1A2
CYP3A4
CYP2D6
DRD2
CYP2C9
AMD1
ABCD1
SERPINA3
HTR2C
HTR2A
[97, 85, 0, 31, 47, 366, 1333, 1299, 463, 518]


In [41]:
# Collect the neighbourhood of the seed genes in the full incidence matrix
# (indexed below as rows = genes, columns = drugs): every drug incident to a
# frontier gene, then every gene incident to one of those drugs, for HOP rounds.
queue = top_k_indices.copy()  # not read in this cell; kept so the name stays defined

all_genes = set(top_k_indices)
all_drugs = set()

# Frontier: genes discovered in the previous round that still need expanding.
current_genes = set(top_k_indices)

# BFS for n Hops
for hop in range(HOP):
    # Step 1: Gene → Drug
    next_drugs = set()
    for gene in current_genes:
        next_drugs.update(binary_full_matrix[gene, :].nonzero()[1])
    # Drugs already in all_drugs were expanded in an earlier round, so their
    # genes are already in all_genes; expanding them again adds nothing.
    next_drugs -= all_drugs

    # Step 2: Drug → Gene
    next_genes = set()
    for drug in next_drugs:
        next_genes.update(binary_full_matrix[:, drug].nonzero()[0])

    # Update the sets for the next hop; only genes not seen before form the
    # next frontier (must be computed before all_genes absorbs next_genes).
    all_drugs.update(next_drugs)
    current_genes = next_genes - all_genes
    all_genes.update(next_genes)

    # Nothing new was reached, so further hops cannot change the result.
    if not current_genes:
        break

print(len(all_genes))
print(len(all_drugs))

2266
601


In [42]:
# Build the filtered incidence matrices. They keep the shape of the full
# matrices, so gene (row) and drug (column) indices remain unchanged; only
# entries whose gene is in all_genes and whose drug is in all_drugs are copied.
binary_incidence_matrix = dok_matrix(binary_full_matrix.shape, dtype=float)
weighted_incidence_matrix = dok_matrix(weighted_full_matrix.shape, dtype=float)

for gene_idx in all_genes:
    # Column indices of the non-zero entries in this gene's row.
    gene_drugs = binary_full_matrix[gene_idx].nonzero()[1]
    for drug_idx in gene_drugs:
        if drug_idx not in all_drugs:
            continue
        cell = (gene_idx, drug_idx)
        binary_incidence_matrix[cell] = binary_full_matrix[cell]
        weighted_incidence_matrix[cell] = weighted_full_matrix[cell]

# save_npz is given CSR matrices, so convert the DOK builders first.
binary_csr_matrix = binary_incidence_matrix.tocsr()
weighted_csr_matrix = weighted_incidence_matrix.tocsr()

for file_name, matrix in (
    ("hypergraph_incidence_matrix_binary.npz", binary_csr_matrix),
    ("hypergraph_incidence_matrix_weighted.npz", weighted_csr_matrix),
):
    save_npz(OUTPUT_PATH + file_name, matrix)

