In [1]:
import pandas as pd
from scipy.sparse import dok_matrix, save_npz
from tqdm import tqdm
tqdm.pandas()  # Enable tqdm for pandas
import json
MSIGDB_PATH = "Data/MSigDB/c2.all.v2024.1.Hs.json"
with open(MSIGDB_PATH, 'r') as file:
    MSIGDB = json.load(file)
DDDB_PATH = "./Data/DDDB/NIHMS851432-supplement-1.csv"
DDDB = pd.read_csv(DDDB_PATH)
NCBI_PATH = "./Data/ncbi/gene2refseq.gz"
NCBI_INFO = pd.read_csv(NCBI_PATH, sep='\t', compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
pathway_to_genes = {pathway: data["geneSymbols"] for pathway, data in MSIGDB.items()}
print(pathway_to_genes["IBRAHIM_NRF2_UP"])

['ABCB1', 'ABCC4', 'ABCD3', 'ABHD4', 'ABRAXAS2', 'ACAP2', 'ACBD3', 'ACER3', 'ACKR3', 'ACSL1', 'ACTR3', 'ADI1', 'AFG3L2', 'AFTPH', 'AGFG1', 'AHCYL1', 'AHSA1', 'AIDA', 'AIFM2', 'AIRIM', 'AK4', 'AKR1B10', 'AKR1C1', 'AKR1C2', 'AKR1C3', 'ALDH1L2', 'ALDH2', 'AMFR', 'ANKRD42', 'ANXA5', 'ANXA7', 'AP3S2', 'APCDD1L-DT', 'ARCN1', 'ARFGEF1', 'ARL5A', 'ARL8B', 'ARMT1', 'ARPC2', 'ARRDC4', 'ASAP2', 'ASF1A', 'ASNS', 'ASPH', 'ATMIN', 'ATP10D', 'ATP11B', 'ATP1A3', 'ATXN10', 'AVEN', 'B3GALNT2', 'B4GALNT1', 'BACH1', 'BAG2', 'BAG3', 'BCAP31', 'BCL2L13', 'BLOC1S5-TXNDC5', 'BLTP3B', 'BMP6', 'BPNT2', 'BRAP', 'BRD7', 'BTBD10', 'C22orf23', 'CALU', 'CANX', 'CARS1', 'CBFB', 'CBR1', 'CCDC47', 'CCDC59', 'CCSAP', 'CCT5', 'CCT6A', 'CCT7', 'CD44', 'CDC123', 'CEBPG', 'CEP20', 'CEP290', 'CEP85', 'CFAP97', 'CLCC1', 'CLIC4', 'CLINT1', 'CLIP1', 'CLIP4', 'CLU', 'CMAS', 'CMPK1', 'CORO1C', 'CSDE1', 'CSGALNACT2', 'CSNK1A1', 'CSNK1G3', 'CSNK2A2', 'CTBS', 'CTNNBL1', 'CTSL', 'CUL1', 'CUL2', 'CUL3', 'DCTN4', 'DDX21', 'DENR', 'DESI

In [6]:
# Filter for Homo sapiens genes (tax_id = 9606)
human_gene2refseq = NCBI_INFO[NCBI_INFO['#tax_id'] == 9606]

# Create a mapping from gene claim names (Symbol) to NCBI gene IDs (GeneID)
gene_claim_to_id = pd.Series(human_gene2refseq.GeneID.values, index=human_gene2refseq.Symbol).to_dict()
gene_claim_name = "A1BG"
ncbi_gene_id = gene_claim_to_id.get(gene_claim_name, "Gene name not found")
print(f"NCBI Gene ID for {gene_claim_name}: {ncbi_gene_id}")

NCBI Gene ID for A1BG: 1


In [4]:
def get_ncbi_gene_id(gene_claim_name):
    return gene_claim_to_id.get(gene_claim_name, None)

In [5]:
import json

# Initialize the new dictionary
pathway_to_ncbi_ids = {}

# Iterate through each pathway and gene symbols
for pathway, gene_symbols in pathway_to_genes.items():
    ncbi_ids = [get_ncbi_gene_id(gene) for gene in gene_symbols]
    # Filter out any None values if gene symbol was not found in the mapping
    ncbi_ids = [ncbi_id for ncbi_id in ncbi_ids if ncbi_id is not None]
    pathway_to_ncbi_ids[pathway] = ncbi_ids

# Save the new dictionary to a JSON file
with open('pathway_to_ncbi_ids.json', 'w') as json_file:
    json.dump(pathway_to_ncbi_ids, json_file, indent=4)

print("JSON file 'pathway_to_ncbi_ids.json' has been created successfully!")


JSON file 'pathway_to_ncbi_ids.json' has been created successfully!


In [7]:
from scipy.sparse import dok_matrix, save_npz
import pandas as pd
import numpy as np
with open("./Data/MSigDB/converted/v3_pathway.json", "r") as f:
    pathway_data = json.load(f)
# Get unique genes and pathways
# Ensure all gene IDs are strings
pathway_data = {pathway: [str(gene) for gene in genes] for pathway, genes in pathway_data.items()}

# Get unique genes and pathways
genes = sorted(set(gene for genes in pathway_data.values() for gene in genes))
pathways = list(pathway_data.keys())

# Create mappings
gene_to_index = {gene: i for i, gene in enumerate(genes)}
pathway_to_index = {pathway: j for j, pathway in enumerate(pathways)}

# Initialize DOK matrix
binary_incidence_matrix = dok_matrix((len(genes), len(pathways)), dtype=np.int8)

# Populate the matrix
for pathway, gene_list in pathway_data.items():
    j = pathway_to_index[pathway]  # Column index for pathway
    for gene in gene_list:
        i = gene_to_index[gene]  # Row index for gene
        binary_incidence_matrix[i, j] = 1  # Binary presence

# Convert to CSR format for efficiency
binary_csr_matrix = binary_incidence_matrix.tocsr()

# Save the matrix
save_npz("hypergraph_incidence_matrix_msigdb_using_dict_with_missing_genes.npz", binary_csr_matrix)

# Print confirmation
print("Binary incidence matrix saved as 'hypergraph_incidence_matrix_msigdb_using_dict_with_missing_genes.npz'.")

Binary incidence matrix saved as 'hypergraph_incidence_matrix_msigdb_using_dict_with_missing_genes.npz'.


In [8]:
print(binary_incidence_matrix)

  (11579, 0)	1
  (15659, 0)	1
  (8820, 0)	1
  (1485, 0)	1
  (20736, 0)	1
  (15975, 1)	1
  (8820, 1)	1
  (15982, 1)	1
  (11873, 1)	1
  (15042, 1)	1
  (3648, 2)	1
  (4332, 2)	1
  (4584, 2)	1
  (10093, 2)	1
  (10953, 2)	1
  (7845, 2)	1
  (3698, 2)	1
  (18103, 2)	1
  (20519, 2)	1
  (5235, 2)	1
  (2784, 2)	1
  (15994, 2)	1
  (8440, 2)	1
  (5246, 2)	1
  (13637, 2)	1
  :	:
  (7170, 7410)	1
  (18262, 7410)	1
  (985, 7410)	1
  (13642, 7410)	1
  (17807, 7410)	1
  (17808, 7410)	1
  (17811, 7410)	1
  (17844, 7410)	1
  (17845, 7410)	1
  (17846, 7410)	1
  (13956, 7410)	1
  (19724, 7410)	1
  (18289, 7410)	1
  (11670, 7410)	1
  (17662, 7410)	1
  (8880, 7410)	1
  (2230, 7410)	1
  (17871, 7410)	1
  (17872, 7410)	1
  (17873, 7410)	1
  (17874, 7410)	1
  (17875, 7410)	1
  (7769, 7410)	1
  (17897, 7410)	1
  (18869, 7410)	1


In [9]:
import json

# Create pathway_to_index mapping
pathway_to_index = {pathway: idx for idx, pathway in enumerate(pathway_to_genes)}

# Save gene_to_index as a separate JSON file
with open("gene_to_index.json", "w") as gene_json_file:
    json.dump(gene_to_index, gene_json_file, indent=4)

# Save pathway_to_index as a separate JSON file
with open("pathway_to_index.json", "w") as pathway_json_file:
    json.dump(pathway_to_index, pathway_json_file, indent=4)

print("gene_to_index and pathway_to_index have been saved as separate JSON files.")


gene_to_index and pathway_to_index have been saved as separate JSON files.


In [11]:
HUMANNET_PATH = "./Data/HumanNet/HumanNet-GSP.tsv"
HUMANNET = pd.read_csv(HUMANNET_PATH, sep="\t")

In [12]:
from scipy.sparse import dok_matrix, save_npz
import pandas as pd

# Calculate gene degrees in HumanNet
genes_in_humannet = pd.unique(HUMANNET[['Gene1', 'Gene2']].values.ravel())
gene_to_degree = {gene: 0 for gene in genes_in_humannet}

for _, row in HUMANNET.iterrows():
    gene_to_degree[row["Gene1"]] += 1
    gene_to_degree[row["Gene2"]] += 1

# Initialize a sparse incidence matrix
weighted_incidence_matrix = dok_matrix((len(genes), len(pathways)), dtype=float)

# Populate the incidence matrix with degree as weight
for pathway, gene_list in pathway_data.items():
    j = pathway_to_index[pathway]  # Column index for pathway
    
    # Assign degree as weight to all involved genes
    for gene in gene_list:
        if gene in gene_to_index:
            i = gene_to_index[gene]  # Row index for gene
            degree = gene_to_degree.get(gene, 0.01)  # Fallback to 0.01 if gene is missing
            weighted_incidence_matrix[i, j] = degree

# Convert to CSR format for efficiency
weighted_csr_matrix = weighted_incidence_matrix.tocsr()

# Save the matrix as .npz file
save_npz("hypergraph_weighted_incidence_matrix.npz", weighted_csr_matrix)

# Print confirmation
print("Weighted incidence matrix saved as 'hypergraph_weighted_incidence_matrix.npz'.")


Weighted incidence matrix saved as 'hypergraph_weighted_incidence_matrix.npz'.


In [13]:
print(weighted_incidence_matrix)

  (11579, 0)	0.01
  (15659, 0)	0.01
  (8820, 0)	0.01
  (1485, 0)	0.01
  (20736, 0)	0.01
  (15975, 1)	0.01
  (8820, 1)	0.01
  (15982, 1)	0.01
  (11873, 1)	0.01
  (15042, 1)	0.01
  (3648, 2)	0.01
  (4332, 2)	0.01
  (4584, 2)	0.01
  (10093, 2)	0.01
  (10953, 2)	0.01
  (7845, 2)	0.01
  (3698, 2)	0.01
  (18103, 2)	0.01
  (20519, 2)	0.01
  (5235, 2)	0.01
  (2784, 2)	0.01
  (15994, 2)	0.01
  (8440, 2)	0.01
  (5246, 2)	0.01
  (13637, 2)	0.01
  :	:
  (7170, 7410)	0.01
  (18262, 7410)	0.01
  (985, 7410)	0.01
  (13642, 7410)	0.01
  (17807, 7410)	0.01
  (17808, 7410)	0.01
  (17811, 7410)	0.01
  (17844, 7410)	0.01
  (17845, 7410)	0.01
  (17846, 7410)	0.01
  (13956, 7410)	0.01
  (19724, 7410)	0.01
  (18289, 7410)	0.01
  (11670, 7410)	0.01
  (17662, 7410)	0.01
  (8880, 7410)	0.01
  (2230, 7410)	0.01
  (17871, 7410)	0.01
  (17872, 7410)	0.01
  (17873, 7410)	0.01
  (17874, 7410)	0.01
  (17875, 7410)	0.01
  (7769, 7410)	0.01
  (17897, 7410)	0.01
  (18869, 7410)	0.01
