In [1]:
import pandas as pd
from scipy.sparse import dok_matrix, save_npz
from tqdm import tqdm
tqdm.pandas()  # Enable tqdm for pandas
import json
# Locations of the three input datasets read by this notebook.
MSIGDB_PATH = "Data/MSigDB/c2.all.v2024.1.Hs.json"
DDDB_PATH = "./Data/DDDB/NIHMS851432-supplement-1.csv"
NCBI_PATH = "./Data/ncbi/gene2refseq.gz"

# MSigDB collection: parsed from JSON (used below as a dict keyed by pathway name).
with open(MSIGDB_PATH, 'r') as msigdb_handle:
    MSIGDB = json.load(msigdb_handle)

# DDDB supplement table (CSV) and the NCBI gene2refseq table
# (gzip-compressed, tab-separated).
DDDB = pd.read_csv(DDDB_PATH)
NCBI_INFO = pd.read_csv(NCBI_PATH, sep='\t', compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Map every MSigDB pathway name to the list stored under its "geneSymbols" key.
pathway_to_genes = {}
for pathway_name, pathway_record in MSIGDB.items():
    pathway_to_genes[pathway_name] = pathway_record["geneSymbols"]

# Spot-check one pathway.
print(pathway_to_genes["IBRAHIM_NRF2_UP"])

['ABCB1', 'ABCC4', 'ABCD3', 'ABHD4', 'ABRAXAS2', 'ACAP2', 'ACBD3', 'ACER3', 'ACKR3', 'ACSL1', 'ACTR3', 'ADI1', 'AFG3L2', 'AFTPH', 'AGFG1', 'AHCYL1', 'AHSA1', 'AIDA', 'AIFM2', 'AIRIM', 'AK4', 'AKR1B10', 'AKR1C1', 'AKR1C2', 'AKR1C3', 'ALDH1L2', 'ALDH2', 'AMFR', 'ANKRD42', 'ANXA5', 'ANXA7', 'AP3S2', 'APCDD1L-DT', 'ARCN1', 'ARFGEF1', 'ARL5A', 'ARL8B', 'ARMT1', 'ARPC2', 'ARRDC4', 'ASAP2', 'ASF1A', 'ASNS', 'ASPH', 'ATMIN', 'ATP10D', 'ATP11B', 'ATP1A3', 'ATXN10', 'AVEN', 'B3GALNT2', 'B4GALNT1', 'BACH1', 'BAG2', 'BAG3', 'BCAP31', 'BCL2L13', 'BLOC1S5-TXNDC5', 'BLTP3B', 'BMP6', 'BPNT2', 'BRAP', 'BRD7', 'BTBD10', 'C22orf23', 'CALU', 'CANX', 'CARS1', 'CBFB', 'CBR1', 'CCDC47', 'CCDC59', 'CCSAP', 'CCT5', 'CCT6A', 'CCT7', 'CD44', 'CDC123', 'CEBPG', 'CEP20', 'CEP290', 'CEP85', 'CFAP97', 'CLCC1', 'CLIC4', 'CLINT1', 'CLIP1', 'CLIP4', 'CLU', 'CMAS', 'CMPK1', 'CORO1C', 'CSDE1', 'CSGALNACT2', 'CSNK1A1', 'CSNK1G3', 'CSNK2A2', 'CTBS', 'CTNNBL1', 'CTSL', 'CUL1', 'CUL2', 'CUL3', 'DCTN4', 'DDX21', 'DENR', 'DESI

In [6]:
# Keep only the rows whose '#tax_id' is 9606 (Homo sapiens).
is_human_row = NCBI_INFO['#tax_id'] == 9606
human_gene2refseq = NCBI_INFO[is_human_row]

# Symbol -> GeneID lookup. When a symbol occurs on several rows, the
# dict conversion keeps the GeneID of the last such row.
symbol_indexed_gene_ids = pd.Series(
    data=human_gene2refseq["GeneID"].values,
    index=human_gene2refseq["Symbol"],
)
gene_claim_to_id = symbol_indexed_gene_ids.to_dict()

# Sanity check with a single well-known symbol.
gene_claim_name = "TP53"
if gene_claim_name in gene_claim_to_id:
    ncbi_gene_id = gene_claim_to_id[gene_claim_name]
else:
    ncbi_gene_id = "Gene name not found"
print(f"NCBI Gene ID for {gene_claim_name}: {ncbi_gene_id}")

NCBI Gene ID for TP53: 7157


In [4]:
def get_ncbi_gene_id(gene_claim_name):
    """Look up the NCBI gene id recorded for a gene symbol.

    Reads the module-level ``gene_claim_to_id`` mapping.

    Args:
        gene_claim_name: Gene symbol to look up.

    Returns:
        The value stored for ``gene_claim_name``, or ``None`` when the
        symbol has no entry in the mapping.
    """
    if gene_claim_name in gene_claim_to_id:
        return gene_claim_to_id[gene_claim_name]
    return None

In [7]:
import json

# Translate each pathway's gene symbols into NCBI gene ids, dropping every
# symbol for which get_ncbi_gene_id returns None (no entry in the mapping).
pathway_to_ncbi_ids = {
    pathway: [
        gene_id
        for gene_id in map(get_ncbi_gene_id, gene_symbols)
        if gene_id is not None
    ]
    for pathway, gene_symbols in pathway_to_genes.items()
}

# Persist the translated mapping so later cells can reload it from disk.
with open('pathway_to_ncbi_ids.json', 'w') as json_file:
    json.dump(pathway_to_ncbi_ids, json_file, indent=4)

print("JSON file 'pathway_to_ncbi_ids.json' has been created successfully!")


JSON file 'pathway_to_ncbi_ids.json' has been created successfully!


In [8]:
# Collect the NDF-RT codes of every DDDB row whose SNOMED code is 13746004.
# NOTE(review): 13746004 is presumably the SNOMED concept for bipolar
# disorder (later cells call these "bipolar" drugs) -- confirm.
has_target_snomed = DDDB["SNOMED"] == 13746004
ndfrt_values = DDDB.loc[has_target_snomed, "NDF-RT"].to_numpy()
print(ndfrt_values)


['N0000146152' 'N0000147738' 'N0000148257' 'N0000148465' 'N0000148690'
 'N0000022085' 'N0000146214' 'N0000146226' 'N0000146247' 'N0000146289'
 'N0000147028' 'N0000147608' 'N0000147892' 'N0000148350' 'N0000148390'
 'N0000148786']


In [9]:
import requests

import os
from getpass import getpass

# The BioPortal API key is a credential, so it must not be stored in the
# notebook. It is read from the BIOPORTAL_API_KEY environment variable; when
# that variable is unset or empty, the user is prompted for it (input hidden).
BIOPORTAL_API_KEY = os.environ.get("BIOPORTAL_API_KEY") or getpass("BioPortal API key: ")

def get_drug_name_from_ndfrt(ndfrt_code, timeout=30):
    """Fetch the preferred label of an NDF-RT class from the BioPortal API.

    Args:
        ndfrt_code: NDF-RT identifier; appended to the BioPortal class URL.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled connection would block the notebook indefinitely.

    Returns:
        The ``prefLabel`` field of the JSON response. Returns "Unknown Drug"
        when the field is absent, when the server answers with a status
        other than 200, or when the request itself fails (connection error,
        timeout); in the last two cases the problem is printed.
    """
    base_url = f"https://data.bioontology.org/ontologies/NDFRT/classes/http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FNDFRT%2F{ndfrt_code}"

    headers = {"Authorization": f"apikey token={BIOPORTAL_API_KEY}"}

    try:
        response = requests.get(base_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Same best-effort fallback as a non-200 answer, so one failed lookup
        # does not abort a loop over many codes.
        print(f"Request for {ndfrt_code} failed: {exc}")
        return "Unknown Drug"

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return "Unknown Drug"

    data = response.json()
    return data.get("prefLabel", "Unknown Drug")  # Extract drug name (preferred label)

# Example usage: resolve one NDF-RT code through the BioPortal lookup above
# and print the code together with the label that came back.
ndfrt_code = "N0000004713"
drug_name = get_drug_name_from_ndfrt(ndfrt_code)
print(f"NDF-RT Code: {ndfrt_code} -> Drug Name: {drug_name}")


NDF-RT Code: N0000004713 -> Drug Name: donepezil [Chemical/Ingredient]


In [10]:
# Resolve every selected NDF-RT code to a drug name (one BioPortal request
# per code), keeping the order of ndfrt_values.
bipolar_drugs = [get_drug_name_from_ndfrt(code) for code in ndfrt_values]
print(bipolar_drugs)

['CLONAZEPAM', 'BUPROPION', 'SERTRALINE', 'OLANZAPINE', 'ZIPRASIDONE', 'QUETIAPINE', 'CHLORPROMAZINE', 'CARBAMAZEPINE', 'ALLOPURINOL', 'PERPHENAZINE', 'VALPROIC ACID', 'CLOZAPINE', 'LITHIUM', 'RISPERIDONE', 'LAMOTRIGINE', 'ARIPIPRAZOLE']


In [12]:
DGIDB = pd.read_csv("./Data/DGIDB/converted/human/dgidb_ncbi_v3.csv")

# For each resolved drug name, count the DGIDB rows whose "drug_claim_name"
# equals it exactly (one full-column comparison per drug).
drug_claim_names = DGIDB["drug_claim_name"]
bipolar_drug_counts = {}
for drug_name_key in bipolar_drugs:
    bipolar_drug_counts[drug_name_key] = (drug_claim_names == drug_name_key).sum()

# Report the per-drug row counts.
for drug, count in bipolar_drug_counts.items():
    print(f"Drug: {drug} -> Appears {count} times in DGIDB")


Drug: CLONAZEPAM -> Appears 17 times in DGIDB
Drug: BUPROPION -> Appears 1 times in DGIDB
Drug: SERTRALINE -> Appears 7 times in DGIDB
Drug: OLANZAPINE -> Appears 7 times in DGIDB
Drug: ZIPRASIDONE -> Appears 2 times in DGIDB
Drug: QUETIAPINE -> Appears 2 times in DGIDB
Drug: CHLORPROMAZINE -> Appears 19 times in DGIDB
Drug: CARBAMAZEPINE -> Appears 25 times in DGIDB
Drug: ALLOPURINOL -> Appears 6 times in DGIDB
Drug: PERPHENAZINE -> Appears 14 times in DGIDB
Drug: VALPROIC ACID -> Appears 36 times in DGIDB
Drug: CLOZAPINE -> Appears 26 times in DGIDB
Drug: LITHIUM -> Appears 19 times in DGIDB
Drug: RISPERIDONE -> Appears 14 times in DGIDB
Drug: LAMOTRIGINE -> Appears 17 times in DGIDB
Drug: ARIPIPRAZOLE -> Appears 3 times in DGIDB


  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
import json

# Keep only the DGIDB rows whose "drug_claim_name" is one of the bipolar drugs.
filtered_dgidb = DGIDB[DGIDB["drug_claim_name"].isin(bipolar_drugs)]

# Count how many of those rows reference each NCBI gene id. The ids are
# normalised to integer-looking strings ("1813", not "1813.0") so they can be
# used as JSON keys.
ncbi_gene_counts = (
    filtered_dgidb["ncbi_gene_id"]
    .dropna()      # a row without a gene id cannot be cast to int and would raise
    .astype(int)   # Convert to integer to drop .0
    .astype(str)   # Convert to string
    .value_counts()
    .to_dict()
)

# Save JSON with correct format
with open("bipolar_gene_counts.json", "w") as f:
    json.dump(ncbi_gene_counts, f, indent=4)

print("Fixed JSON saved as 'bipolar_gene_counts.json'")


# Print confirmation and first few entries
print(json.dumps(dict(list(ncbi_gene_counts.items())[:10]), indent=4))  # Preview first 10


Fixed JSON saved as 'bipolar_gene_counts.json'
{
    "1813": 11,
    "1576": 9,
    "1544": 8,
    "1565": 7,
    "3356": 7,
    "1559": 6,
    "1557": 5,
    "1812": 3,
    "5617": 3,
    "3952": 3
}


In [18]:
from scipy.sparse import dok_matrix, save_npz
import pandas as pd
import numpy as np
# Reload the pathway -> NCBI gene id mapping from disk.
with open("pathway_to_ncbi_ids.json", "r") as f:
    pathway_data = json.load(f)

# Rows: the distinct gene ids in sorted order. Columns: pathways in file order.
genes = sorted({gene_id for member_ids in pathway_data.values() for gene_id in member_ids})
pathways = list(pathway_data)

# Position lookups for both axes.
gene_to_index = dict(zip(genes, range(len(genes))))
pathway_to_index = dict(zip(pathways, range(len(pathways))))

# DOK format is used while filling the matrix one entry at a time.
n_genes, n_pathways = len(genes), len(pathways)
binary_incidence_matrix = dok_matrix((n_genes, n_pathways), dtype=np.int8)

# Entry (gene, pathway) is 1 when the gene is listed for that pathway.
for column, pathway_name in enumerate(pathways):
    for gene_id in pathway_data[pathway_name]:
        binary_incidence_matrix[gene_to_index[gene_id], column] = 1

# CSR copy of the matrix, which is what gets written to disk.
binary_csr_matrix = binary_incidence_matrix.tocsr()
save_npz("hypergraph_incidence_matrix_msigdb_using_dict_with_missing_genes.npz", binary_csr_matrix)

print("Binary incidence matrix saved as 'hypergraph_incidence_matrix_msigdb_using_dict_with_missing_genes.npz'.")

Binary incidence matrix saved as 'hypergraph_incidence_matrix_msigdb_using_dict_with_missing_genes.npz'.


In [19]:
# Show the stored entries of the DOK matrix as "(row, column)  value" lines.
print(binary_incidence_matrix)

  (328, 0)	1
  (427, 0)	1
  (2012, 0)	1
  (7272, 0)	1
  (6103, 0)	1
  (4398, 1)	1
  (2012, 1)	1
  (4404, 1)	1
  (3104, 1)	1
  (3965, 1)	1
  (95, 2)	1
  (106, 2)	1
  (16487, 2)	1
  (278, 2)	1
  (19176, 2)	1
  (9275, 2)	1
  (15772, 2)	1
  (554, 2)	1
  (5984, 2)	1
  (1380, 2)	1
  (849, 2)	1
  (4413, 2)	1
  (17944, 2)	1
  (1387, 2)	1
  (10734, 2)	1
  :	:
  (8756, 7410)	1
  (12745, 7410)	1
  (6993, 7410)	1
  (10738, 7410)	1
  (5121, 7410)	1
  (5122, 7410)	1
  (5124, 7410)	1
  (5150, 7410)	1
  (5151, 7410)	1
  (5152, 7410)	1
  (11020, 7410)	1
  (13940, 7410)	1
  (20548, 7410)	1
  (19604, 7410)	1
  (20392, 7410)	1
  (9627, 7410)	1
  (7671, 7410)	1
  (5175, 7410)	1
  (5176, 7410)	1
  (5177, 7410)	1
  (5178, 7410)	1
  (5179, 7410)	1
  (9208, 7410)	1
  (5199, 7410)	1
  (13307, 7410)	1


In [26]:
# Gene ids (as strings) that appear in the bipolar-drug interaction counts,
# in the order of the counts dict.
bipolar_genes = [gene_id for gene_id in ncbi_gene_counts]
print(bipolar_genes)


['1813', '1576', '1544', '1565', '3356', '1559', '1557', '1812', '5617', '3952', '3363', '7157', '4922', '3757', '9734', '627', '6336', '3065', '7124', '55869', '7133', '11280', '1814', '2678', '6326', '6335', '10014', '3417', '6332', '8841', '6323', '355', '1815', '2194', '7915', '6750', '6329', '9759', '3358', '51564', '6328', '10951', '367', '6311', '3066', '7066', '3308', '6334', '6331', '6714', '7252', '885', '2554', '2561', '4915', '83933', '3350', '5243', '3727', '4005', '3628', '2559', '818', '3106', '5538', '4158', '2908', '51053', '51738', '2475', '2567', '4908', '983', '4609', '1432', '2565', '2556', '1571', '2560', '79885', '4780', '1543', '11201', '54737', '4353', '1555', '2562', '5893', '5294', '632', '2548', '2671', '3558', '2564', '3553', '6606', '596', '2558', '361', '1577', '9429', '2668', '1072', '18', '10013', '2796', '2568', '3725', '2563', '4907', '8314', '3760', '7498', '7334', '2566', '2670', '10919', '6505', '2555', '2557', '55879']


In [20]:
import json

# Column index of every pathway. It is derived from pathway_data -- the same
# mapping whose key order defined the columns of the saved incidence matrix --
# so the exported index cannot drift from the matrix if pathway_to_genes and
# the reloaded JSON ever differ.
pathway_to_index = {pathway: idx for idx, pathway in enumerate(pathway_data)}

# Save gene_to_index as a separate JSON file
with open("gene_to_index.json", "w") as gene_json_file:
    json.dump(gene_to_index, gene_json_file, indent=4)

# Save pathway_to_index as a separate JSON file
with open("pathway_to_index.json", "w") as pathway_json_file:
    json.dump(pathway_to_index, pathway_json_file, indent=4)

print("gene_to_index and pathway_to_index have been saved as separate JSON files.")


gene_to_index and pathway_to_index have been saved as separate JSON files.


In [27]:
# bipolar_genes is a list of gene-id strings; a set makes each membership
# test constant-time instead of a scan of the whole list.
bipolar_gene_set = set(bipolar_genes)

# Initialize DOK matrix (same shape as the binary incidence matrix).
weighted_incidence_matrix = dok_matrix((len(gene_to_index), len(pathway_to_index)), dtype=float)

# Populate the matrix: every gene of a pathway receives that pathway's weight,
# i.e. the fraction of the pathway's genes that are bipolar genes.
for pathway, gene_list in pathway_data.items():
    j = pathway_to_index[pathway]  # Column index for pathway
    total_genes = len(gene_list)

    # Count bipolar genes in this pathway (ids are compared as strings).
    num_bipolar_genes = sum(1 for gene in gene_list if str(gene) in bipolar_gene_set)

    # Compute weight
    weight = num_bipolar_genes / total_genes if total_genes > 0 else 0

    # Assigning 0 to a DOK entry stores nothing, so pathways without any
    # bipolar gene (or without genes) can be skipped outright.
    if weight == 0:
        continue

    # Assign weight to all involved genes
    for gene in gene_list:
        i = gene_to_index[gene]  # Row index for gene
        weighted_incidence_matrix[i, j] = weight  # Assign weight instead of binary 1

# Convert to CSR format for efficiency
weighted_csr_matrix = weighted_incidence_matrix.tocsr()

# Save the matrix
save_npz("hypergraph_weighted_incidence_matrix.npz", weighted_csr_matrix)

# Print confirmation
print("Weighted incidence matrix saved as 'hypergraph_weighted_incidence_matrix.npz'.")


Weighted incidence matrix saved as 'hypergraph_weighted_incidence_matrix.npz'.


In [28]:
# Show the stored entries of the weighted DOK matrix as "(row, column)  weight" lines.
print(weighted_incidence_matrix)

  (4398, 1)	0.2
  (2012, 1)	0.2
  (4404, 1)	0.2
  (3104, 1)	0.2
  (3965, 1)	0.2
  (95, 2)	0.015384615384615385
  (106, 2)	0.015384615384615385
  (16487, 2)	0.015384615384615385
  (278, 2)	0.015384615384615385
  (19176, 2)	0.015384615384615385
  (9275, 2)	0.015384615384615385
  (15772, 2)	0.015384615384615385
  (554, 2)	0.015384615384615385
  (5984, 2)	0.015384615384615385
  (1380, 2)	0.015384615384615385
  (849, 2)	0.015384615384615385
  (4413, 2)	0.015384615384615385
  (17944, 2)	0.015384615384615385
  (1387, 2)	0.015384615384615385
  (10734, 2)	0.015384615384615385
  (1623, 2)	0.015384615384615385
  (1630, 2)	0.015384615384615385
  (18307, 2)	0.015384615384615385
  (1677, 2)	0.015384615384615385
  (1635, 2)	0.015384615384615385
  :	:
  (9225, 7402)	0.1111111111111111
  (1337, 7402)	0.1111111111111111
  (8397, 7402)	0.1111111111111111
  (4917, 7402)	0.1111111111111111
  (4918, 7402)	0.1111111111111111
  (5029, 7402)	0.1111111111111111
  (5031, 7402)	0.1111111111111111
  (5812, 7402)	0