In [49]:
import pandas as pd
import ast
import os
from scipy.sparse import dok_matrix, save_npz
import json
import numpy as np
# Folder that receives the files written by this notebook; created up front
# (no error if it already exists) so later cells can write into it.
OUTPUT_FOLDER = "./output/MSigDB_FULL/"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Both inputs are tab-separated; pd.read_table uses "\t" as its default separator.
MSIGDB = pd.read_table("../Data/MSigDB/PathwayToGene.tsv")
HUMANNET = pd.read_table("../Data/HumanNet/HumanNet-GSP.tsv")

In [50]:
def _parse_gene_ids(raw_value):
    """Return the gene IDs of one ``ncbi_gene_ids`` cell as a list of strings.

    A string value is parsed with ``ast.literal_eval``. Any other value is
    taken to be an already-parsed collection and is only normalised to
    strings, so re-running this cell on the same frame does not raise
    (``ast.literal_eval`` rejects list objects with ``ValueError``).
    """
    parsed = ast.literal_eval(raw_value) if isinstance(raw_value, str) else raw_value
    return [str(gene_id) for gene_id in parsed]


# Convert stringified lists to actual lists of string gene IDs (safe to re-run)
MSIGDB["ncbi_gene_ids"] = MSIGDB["ncbi_gene_ids"].apply(_parse_gene_ids)

In [51]:

# Gather every gene ID that occurs in at least one row's gene list, then sort
# the distinct IDs to get a stable ordering.
_gene_pool = set()
for pathway_genes in MSIGDB["ncbi_gene_ids"]:
    _gene_pool.update(pathway_genes)
all_genes = sorted(_gene_pool)

# Values of the "pathway" column, in table order (one entry per row)
all_pathways = MSIGDB["pathway"].tolist()

In [53]:

# Create mappings: each gene / pathway is mapped to its position in the
# corresponding list. If a value occurs more than once in a list, the dict
# keeps the position of its last occurrence.
gene_to_index = {gene: i for i, gene in enumerate(all_genes)}
pathway_to_index = {pathway: j for j, pathway in enumerate(all_pathways)}

# Define file paths. os.path.join is used instead of string concatenation so
# the result does not depend on OUTPUT_FOLDER ending with a path separator.
gene_to_index_path = os.path.join(OUTPUT_FOLDER, "gene_to_index.json")
pathway_to_index_path = os.path.join(OUTPUT_FOLDER, "pathway_to_index.json")

# Save gene_to_index mapping
with open(gene_to_index_path, 'w') as gene_file:
    json.dump(gene_to_index, gene_file, indent=4)

# Save pathway_to_index mapping
with open(pathway_to_index_path, 'w') as pathway_file:
    json.dump(pathway_to_index, pathway_file, indent=4)
print(f"Mappings saved to {gene_to_index_path} and {pathway_to_index_path}.")


Mappings saved to ./output/MSigDB_FULL/gene_to_index.json and ./output/MSigDB_FULL/pathway_to_index.json.


In [54]:
# Calculate gene degrees in HumanNet.
# Every row contributes one count to its Gene1 value and one to its Gene2
# value, so a gene's degree is the number of times it appears across both
# columns. Counting the flattened endpoint array with value_counts yields the
# same totals as incrementing a dict row by row, without a Python-level loop
# over the whole frame.
humannet_endpoints = HUMANNET[['Gene1', 'Gene2']].values.ravel()
genes_in_humannet = pd.unique(humannet_endpoints)

# Maps gene -> number of appearances; keys keep the value type read from the
# file (they are not converted to strings here).
gene_to_degree = pd.Series(humannet_endpoints).value_counts().to_dict()


In [56]:
# Weight stored for a gene that has no entry in gene_to_degree
DEFAULT_GENE_WEIGHT = 0.01

# Initialize DOK matrices: one row per gene, one column per pathway
n_genes, n_pathways = len(all_genes), len(all_pathways)
binary_incidence_matrix = dok_matrix((n_genes, n_pathways), dtype=np.int8)
weighted_incidence_matrix = dok_matrix((n_genes, n_pathways), dtype=float)

# The gene IDs iterated below are expected to be strings (they are the keys of
# gene_to_index). NOTE(review): if the keys of gene_to_degree are not strings
# (e.g. integer IDs parsed from the HumanNet file), a direct .get() would miss
# every gene and silently assign the default weight everywhere. Normalising
# the keys to str makes the lookup type-consistent -- confirm against the data.
degree_by_gene_id = {str(gene): degree for gene, degree in gene_to_degree.items()}

# Populate the matrices
for pathway, pathway_genes in zip(MSIGDB["pathway"], MSIGDB["ncbi_gene_ids"]):
    j = pathway_to_index[pathway]  # Column index for pathway
    for gene in pathway_genes:
        i = gene_to_index[gene]  # Row index for gene
        binary_incidence_matrix[i, j] = 1  # Binary presence
        weighted_incidence_matrix[i, j] = degree_by_gene_id.get(gene, DEFAULT_GENE_WEIGHT)

# Convert the DOK matrices to CSR format.
# The variable names are kept unchanged for any later cells; note that
# `csr_matrix` is also the name of a class in scipy.sparse.
csr_matrix = weighted_incidence_matrix.tocsr()
binary_csr_matrix = binary_incidence_matrix.tocsr()

# Save the matrices as .npz files
weighted_matrix_path = os.path.join(OUTPUT_FOLDER, "hypergraph_incidence_matrix_weighted.npz")
binary_matrix_path = os.path.join(OUTPUT_FOLDER, "hypergraph_incidence_matrix_binary.npz")
save_npz(weighted_matrix_path, csr_matrix)
save_npz(binary_matrix_path, binary_csr_matrix)

# Print confirmation, reporting exactly the paths that were written
print(f"Weighted incidence matrix saved as {weighted_matrix_path}.")
print(f"Binary incidence matrix saved as {binary_matrix_path}.")

Weighted incidence matrix saved as ./output/MSigDB_FULL//hypergraph_incidence_matrix_weighted.npz'.
Binary incidence matrix saved as ./output/MSigDB_FULL//hypergraph_incidence_matrix_binary.npz'.
