In [None]:
import json
import json

# Read the two gene -> index lookup tables (one per hypergraph) from disk
with open("./Data/hypergraphs/DGIDB_HumanNet/human/undirected/temp/gene_to_index.json", "r") as dgidb_handle:
    dgidb = json.load(dgidb_handle)
with open("./Data/hypergraphs/MSIGDB_HumanNet/human/gene_to_index.json", "r") as msigdb_handle:
    msigdb = json.load(msigdb_handle)

# Gene identifiers present in each table, as sets so they can be compared
dgidb_keys = set(dgidb)
msigdb_keys = set(msigdb)

# Identifiers that appear in both tables
unique_matches = dgidb_keys & msigdb_keys
unique_match_count = len(unique_matches)

# Identifiers that appear in exactly one of the two tables
dgidb_only = dgidb_keys.difference(msigdb_keys)
msigdb_only = msigdb_keys.difference(dgidb_keys)
unique_non_matches = dgidb_only | msigdb_only
unique_non_match_count = len(unique_non_matches)

# Report both counts
print(f"Number of unique matches: {unique_match_count}")
print(f"Number of unique non-matches: {unique_non_match_count}")


In [None]:
import numpy as np
# Weight written into D for every gene that appears in both mappings
w = 0.5

# Size of each gene -> index mapping; these become the two dimensions of D
num_genes_dgidb = len(dgidb)
num_genes_msigdb = len(msigdb)

# Dense inter-layer matrix: rows follow DGIDB indices, columns follow MSIGDB indices
D = np.zeros((num_genes_dgidb, num_genes_msigdb))

# `i` counts how many shared genes were written into D
i = 0
for gene_dgidb in dgidb.keys() & msigdb.keys():
    idx_dgidb, idx_msigdb = dgidb[gene_dgidb], msigdb[gene_dgidb]
    D[idx_dgidb, idx_msigdb] = w
    i += 1
print("Inter-Layer Matrix D:\n", D)

In [None]:
from scipy.sparse import load_npz
from tqdm import tqdm

# Load the binary and weighted incidence matrices of both hypergraphs
DGIDB_binary_matrix = load_npz("./Data/hypergraphs/DGIDB_HumanNet/human/undirected/temp/hypergraph_incidence_matrix_binary.npz")
DGIDB_weighted_matrix = load_npz("./Data/hypergraphs/DGIDB_HumanNet/human/undirected/temp/hypergraph_incidence_matrix_weighted.npz")
MSIGDB_weighted_matrix = load_npz("./Data/hypergraphs/MSIGDB_HumanNet/human/hypergraph_weighted_incidence_matrix.npz")
MSIGDB_binary_matrix = load_npz("./Data/hypergraphs/MSIGDB_HumanNet/human/hypergraph_binary_incidence_matrix.npz")

# Parameters (previously assigned twice in this cell with identical values)
restart_prob = 0.2  # Restart probability (theta)
num_iterations = 10  # Number of iterations
num_genes_DGIDB = DGIDB_binary_matrix.shape[0]  # Number of rows (genes) in the DGIDB matrix
num_genes_MSIGDB = MSIGDB_binary_matrix.shape[0]  # Number of rows (genes) in the MSIGDB matrix

# Combined vectors: the first num_genes_DGIDB entries belong to DGIDB,
# the remaining num_genes_MSIGDB entries belong to MSIGDB.
v0 = np.zeros(num_genes_DGIDB + num_genes_MSIGDB)  # Starting vector
teleport = np.zeros(num_genes_DGIDB + num_genes_MSIGDB)  # Restart vector

# Only the DGIDB entries receive mass, so the walk starts (and restarts) on DGIDB.
# NOTE(review): each entry is 1 / (num_genes_DGIDB + num_genes_MSIGDB), so both vectors
# sum to num_genes_DGIDB / (num_genes_DGIDB + num_genes_MSIGDB) rather than 1 -- confirm
# whether 1 / num_genes_DGIDB was intended.
v0[:num_genes_DGIDB] = 1.0 / (num_genes_DGIDB + num_genes_MSIGDB)
teleport[:num_genes_DGIDB] = 1.0 / (num_genes_DGIDB + num_genes_MSIGDB)

def _prepare_hypergraph_layer(binary_matrix, weighted_matrix):
    """Precompute the sparse operators used for one layer's intra-hypergraph step.

    Returns a tuple ``(membership, edge_average, has_edges)``:

    * ``membership`` -- genes x hyperedges CSR matrix holding 1.0 wherever
      ``binary_matrix`` has a nonzero entry.
    * ``edge_average`` -- hyperedges x genes matrix; each row holds the strictly
      positive entries of the matching ``weighted_matrix`` column, rescaled to sum to 1.
    * ``has_edges`` -- boolean array, True for genes with at least one hyperedge.

    Raises:
        ValueError: if the two matrices do not have the same shape.
    """
    from scipy import sparse  # local import so the helper does not rely on cell-level imports

    if binary_matrix.shape != weighted_matrix.shape:
        raise ValueError(
            f"binary and weighted incidence matrices must share a shape, "
            f"got {binary_matrix.shape} and {weighted_matrix.shape}"
        )

    # 0/1 pattern of the binary incidence matrix (explicitly stored zeros are dropped)
    membership = sparse.csr_matrix(binary_matrix, dtype=float, copy=True)
    membership.sum_duplicates()
    membership.eliminate_zeros()
    membership.data[:] = 1.0

    # Keep only strictly positive weights, then make every hyperedge column sum to 1
    weights = sparse.csc_matrix(weighted_matrix, dtype=float, copy=True)
    weights.sum_duplicates()
    weights.data[weights.data < 0] = 0.0
    weights.eliminate_zeros()
    column_totals = np.asarray(weights.sum(axis=0)).ravel()
    # Every column that still stores an entry has a positive total, so this never divides by 0
    weights.data /= np.repeat(column_totals, np.diff(weights.indptr))

    has_edges = np.diff(membership.indptr) > 0
    return membership, weights.T.tocsr(), has_edges


def get_hyper_randomwalk(DGIDB_binary_matrix, DGIDB_weighted_matrix, MSIGDB_weighted_matrix, MSIGDB_binary_matrix, D, restart_prob, num_iterations, v0=None, teleport=None):
    """Run a two-layer hypergraph random walk with restart.

    The state is one combined vector: entries ``[0, n_dgidb)`` are DGIDB genes and
    entries ``[n_dgidb, n_dgidb + n_msigdb)`` are MSIGDB genes, where the sizes are
    the row counts of the two binary incidence matrices.

    Per iteration, the new score of a gene is the sum over its hyperedges of the
    weight-normalized average of the previous scores of that hyperedge's genes,
    plus the inter-layer term taken from ``D``. Genes that belong to no hyperedge
    receive 0 (including the inter-layer term), as in the original loop version.
    The result is normalized to sum to 1 and then mixed with ``teleport``.

    Args:
        DGIDB_binary_matrix: sparse genes x drugs incidence matrix (nonzero = member).
        DGIDB_weighted_matrix: sparse genes x drugs weights, same shape as the binary one.
        MSIGDB_weighted_matrix: sparse genes x pathways weights.
        MSIGDB_binary_matrix: sparse genes x pathways incidence matrix.
        D: dense ``(n_dgidb, n_msigdb)`` inter-layer matrix.
        restart_prob: probability of jumping to ``teleport`` instead of walking.
        num_iterations: number of update steps.
        v0: optional start vector of length ``n_dgidb + n_msigdb``. Defaults to
            ``1 / (n_dgidb + n_msigdb)`` on every DGIDB entry and 0 elsewhere.
        teleport: optional restart vector of the same length; same default as ``v0``.

    Returns:
        dict with ``"Importance"`` (list of ``(combined_index, score)`` sorted by
        descending score) and ``"Distance"`` (L1 change of the vector per iteration).

    Raises:
        ValueError: if ``D``, ``v0`` or ``teleport`` do not match the layer sizes, or
            a layer's binary and weighted matrices differ in shape.
    """
    # Derive sizes from the arguments instead of reading notebook globals
    n_dgidb = DGIDB_binary_matrix.shape[0]
    n_msigdb = MSIGDB_binary_matrix.shape[0]
    total = n_dgidb + n_msigdb
    if total == 0:
        return {"Importance": [], "Distance": []}

    D = np.asarray(D, dtype=float)
    if D.shape != (n_dgidb, n_msigdb):
        raise ValueError(f"D must have shape {(n_dgidb, n_msigdb)}, got {D.shape}")

    def _as_state_vector(vector, label):
        # Default: uniform mass on the DGIDB entries only
        if vector is None:
            default = np.zeros(total)
            default[:n_dgidb] = 1.0 / total
            return default
        vector = np.asarray(vector, dtype=float)
        if vector.shape != (total,):
            raise ValueError(f"{label} must have shape {(total,)}, got {vector.shape}")
        return vector

    v0 = _as_state_vector(v0, "v0")
    teleport = _as_state_vector(teleport, "teleport")

    # The operators do not change between iterations, so build them once
    dgidb_membership, dgidb_edge_average, dgidb_has_edges = _prepare_hypergraph_layer(
        DGIDB_binary_matrix, DGIDB_weighted_matrix
    )
    msigdb_membership, msigdb_edge_average, msigdb_has_edges = _prepare_hypergraph_layer(
        MSIGDB_binary_matrix, MSIGDB_weighted_matrix
    )

    vi = v0.copy()
    distance_list = []

    for k in range(num_iterations):
        print(f"{k+1} iteration")

        vj = vi  # previous vector; `vi` is rebound below, never mutated in place
        prev_dgidb = vj[:n_dgidb]
        prev_msigdb = vj[n_dgidb:]

        vi_new = np.zeros(total)

        # DGIDB entries: intra-layer hyperedge step plus mass arriving from MSIGDB via D
        dgidb_scores = dgidb_membership @ (dgidb_edge_average @ prev_dgidb) + D @ prev_msigdb
        vi_new[:n_dgidb] = np.where(dgidb_has_edges, dgidb_scores, 0.0)

        # MSIGDB entries: intra-layer hyperedge step plus mass arriving from DGIDB via D^T
        msigdb_scores = msigdb_membership @ (msigdb_edge_average @ prev_msigdb) + D.T @ prev_dgidb
        vi_new[n_dgidb:] = np.where(msigdb_has_edges, msigdb_scores, 0.0)

        # Rescale the walk result to sum to 1 (left untouched when it is all zeros)
        walk_total = vi_new.sum()
        if walk_total > 0:
            vi_new /= walk_total

        # restart_prob is the chance of restarting, so it weights `teleport`;
        # the earlier version applied it to the walk term instead.
        vi = (1 - restart_prob) * vi_new + restart_prob * teleport

        # L1 change between consecutive vectors, recorded as a convergence trace
        distance_list.append(np.sum(np.abs(vj - vi)))

    # Combined-vector indices ordered from highest to lowest score
    importance_scores = np.argsort(vi)[::-1]
    importance_values = vi[importance_scores]

    return {"Importance": list(zip(importance_scores, importance_values)), "Distance": distance_list}

# Execute the two-layer walk with the matrices and parameters prepared above
result = get_hyper_randomwalk(
    DGIDB_binary_matrix,
    DGIDB_weighted_matrix,
    MSIGDB_weighted_matrix,
    MSIGDB_binary_matrix,
    D,
    restart_prob,
    num_iterations,
)

# Show the ten highest-scoring entries (the number printed is the position in the combined vector)
print("Top Genes by Importance:")
top_entries = result["Importance"][:10]
for gene, score in top_entries:
    print(f"Gene {gene}: {score:.6f}")

# Show how much the vector changed at each iteration
print("\nDistance per Iteration:")
print(result["Distance"])

In [None]:
import pandas as pd
# Load the DGIdb CSV into a DataFrame.
# NOTE: this rebinds `dgidb`, which the first cell bound to the gene -> index dict read
# from gene_to_index.json. The cell that builds the inter-layer matrix D iterates
# `dgidb.items()`, so re-run the first cell before re-running that one.
dgidb = pd.read_csv("./Data/DGIDB/converted/human/dgidb_ncbi_v2.csv")

In [None]:
import json
# Gene -> index table for the DGIDB hypergraph
with open("./Data/hypergraphs/DGIDB_HumanNet/human/undirected/temp/gene_to_index.json", "r") as dgidb_handle:
    dgidb2seq = json.load(dgidb_handle)

# Gene -> index table for the MSIGDB hypergraph
with open("./Data/hypergraphs/MSIGDB_HumanNet/human/gene_to_index.json", "r") as msigdb_handle:
    msigdb2seq = json.load(msigdb_handle)

def get_gene_by_index_msigdb(index):
    """Return the gene key that `msigdb2seq` maps to `index`, or None if there is none.

    Args:
        index: position to look up; compared against the values of `msigdb2seq`.

    Returns:
        The matching key of `msigdb2seq`, or None when no key has that index.
    """
    gene_to_index = msigdb2seq
    # The leftover debug `print(msigdb)` was removed: it dumped a whole dict on every
    # call and depended on a global defined in a different cell.
    # Invert the dictionary to map indices back to genes
    index_to_gene = {v: k for k, v in gene_to_index.items()}
    return index_to_gene.get(index, None)

def get_gene_by_index_dgidb(index):
    """Return the gene key that `dgidb2seq` maps to `index`, or None if no key has that index."""
    # Reverse lookup table: index -> gene key
    gene_for_index = dict((position, gene) for gene, position in dgidb2seq.items())
    if index in gene_for_index:
        return gene_for_index[index]
    return None


In [None]:
# Show the rows of the DGIdb table whose ncbi_gene_id equals 367
print(dgidb.loc[dgidb['ncbi_gene_id'] == 367])

In [None]:
# Location of the gzip-compressed NCBI gene2refseq file
NCBI_PATH = "./Data/ncbi/gene2refseq.gz"
# Read the whole tab-separated file into a DataFrame
NCBI_INFO = pd.read_csv(NCBI_PATH, sep='\t', compression='gzip')

In [None]:
import pandas as pd

# Keep only the rows of NCBI_INFO whose tax_id is 9606 (Homo sapiens)
is_human = NCBI_INFO['#tax_id'] == 9606
human_gene2refseq = NCBI_INFO[is_human]

# Two lookup dicts built from the same pair of columns:
# GeneID -> Symbol and Symbol -> GeneID
gene_ids = human_gene2refseq['GeneID']
symbols = human_gene2refseq['Symbol']
id_to_gene_claim = pd.Series(symbols.values, index=gene_ids).to_dict()
gene_claim_to_id = pd.Series(gene_ids.values, index=symbols).to_dict()

# Example: NCBI Gene ID -> gene claim name
ncbi_gene_id = 8242  # Replace with your desired NCBI Gene ID
gene_claim_name = id_to_gene_claim.get(ncbi_gene_id, "Gene claim name not found")
print(f"Gene claim name for NCBI Gene ID {ncbi_gene_id}: {gene_claim_name}")

# Example: gene claim name -> NCBI Gene ID
gene_claim_name = "HTR2C"  # Replace with your desired gene claim name
ncbi_gene_id = gene_claim_to_id.get(gene_claim_name, "NCBI gene ID not found")
print(f"NCBI Gene ID for Gene Claim Name {gene_claim_name}: {ncbi_gene_id}")


In [None]:
# # Function to get the gene_claim_name from ncbi_gene_id
# def get_gene_claim_name(ncbi_gene_id):
#     ncbi_gene_id = int(ncbi_gene_id)
#     # result = dgidb[dgidb['ncbi_gene_id'] == ncbi_gene_id]
#     result = dgidb[(dgidb['ncbi_gene_id']) == ncbi_gene_id]
#     if not result.empty:
#         return result['gene_name'].values[0]
#     else:
#         return "Gene name not found"
def get_gene_claim_name_via_NCBI(ncbi_gene_id):
    """Look up `ncbi_gene_id` in `id_to_gene_claim`; return a fixed placeholder string when absent."""
    # Placeholder text is reproduced exactly because callers print it and write it to file
    not_found = "NCBI GENE CLIAIM NAME NOT FOUND"
    if ncbi_gene_id in id_to_gene_claim:
        return id_to_gene_claim[ncbi_gene_id]
    return not_found
    
print(result["Importance"][:10])
for gene, score in result["Importance"][:10]:
    print(gene)
    # `gene` is a position in the combined walk vector: positions below
    # num_genes_DGIDB are DGIDB rows, the rest are MSIGDB rows shifted by
    # num_genes_DGIDB. The previous fallback looked MSIGDB genes up with the
    # un-shifted position and therefore resolved the wrong gene.
    combined_index = int(gene)
    if combined_index < num_genes_DGIDB:
        ncbi_gene = get_gene_by_index_dgidb(combined_index)
    else:
        ncbi_gene = get_gene_by_index_msigdb(combined_index - num_genes_DGIDB)
    if ncbi_gene is None:
        # Avoid int(None): report the unmapped position and move on
        print(f"Index {combined_index}: no gene ID found in the index mappings")
        continue
    claim_name = get_gene_claim_name_via_NCBI(int(ncbi_gene))
    print(f"Gene {ncbi_gene}, Claim Name: {claim_name} : {score:.6f}")

In [4]:
# Collect the data for the TSV file.
# Requires `result`, `num_genes_DGIDB` and the lookup helpers from the earlier cells.
data = []
for gene, score in result["Importance"][:10]:
    # `gene` is a position in the combined walk vector: positions below
    # num_genes_DGIDB are DGIDB rows, the rest are MSIGDB rows shifted by
    # num_genes_DGIDB, so the shift must be removed before the MSIGDB lookup.
    combined_index = int(gene)
    if combined_index < num_genes_DGIDB:
        ncbi_gene = get_gene_by_index_dgidb(combined_index)
    else:
        ncbi_gene = get_gene_by_index_msigdb(combined_index - num_genes_DGIDB)
    if ncbi_gene is None:
        # Avoid int(None): report the unmapped position and leave it out of the file
        print(f"Index {combined_index}: no gene ID found in the index mappings")
        continue
    claim_name = get_gene_claim_name_via_NCBI(int(ncbi_gene))
    data.append({"Gene_Claim_Name": claim_name, "Importance_Score": score, "NCBI_Gene_ID": ncbi_gene})

# Create a pandas DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a TSV file
df.to_csv('gene_claims_importance.tsv', sep='\t', index=False)

print("Data successfully written to gene_claims_importance.tsv")

NameError: name 'result' is not defined