In [2]:
import pandas as pd
import os
# --- Input datasets -------------------------------------------------------
# Drug-gene interaction table (DGIdb export)
DGIDB_PATH = "./Data/DGIDB/converted/human/dgidb_ncbi_v2.csv"
DGIDB = pd.read_csv(DGIDB_PATH)

# HumanNet edge list (tab separated)
HUMANNET_PATH = "./Data/HumanNet/HumanNet-GSP.tsv"
HUMANNET = pd.read_csv(HUMANNET_PATH, sep="\t")

# Disease-drug table; later cells read its "SNOMED" and "NDF-RT" columns
DDDB_PATH = "./Data/DDDB/NIHMS851432-supplement-1.csv"
DDDB = pd.read_csv(DDDB_PATH)

# --- Disease selection ----------------------------------------------------
# SNOMED codes of interest:
#   Bipolar       -> 13746004
#   Schizophrenia -> 58214004
#   ADHD          -> 406506008
DISEASE_SNOMED_CODES = [13746004]

# --- Output location ------------------------------------------------------
OUTPUT_PATH = "./Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/"
# exist_ok=True makes the call idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [3]:
import requests

# BioPortal API key.
# The key is taken from the BIOPORTAL_API_KEY environment variable; when that is
# unset (or empty) the user is prompted for it, so the secret is never stored in
# the notebook itself.
# NOTE(review): a literal key used to be committed in this cell -- it should be
# revoked/rotated in BioPortal since it is part of the file history.
from getpass import getpass

BIOPORTAL_API_KEY = os.environ.get("BIOPORTAL_API_KEY") or getpass("BioPortal API key: ")

def get_drug_name_from_ndfrt(ndfrt_code, timeout=30):
    """Fetch the preferred label of an NDF-RT class from the BioPortal REST API.

    Args:
        ndfrt_code: NDF-RT identifier (e.g. "N0000004713"). It is converted to
            text and percent-encoded before being appended to the class URL.
        timeout: Seconds to wait for BioPortal before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The ``prefLabel`` value of the JSON response, or "Unknown Drug" when
        that field is absent or the HTTP status is not 200 (in the latter case
        the status code and response text are printed).

    Raises:
        requests.exceptions.RequestException: propagated from ``requests.get``
            on connection failures or when ``timeout`` elapses.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    # The class IRI is already percent-encoded in the path; the code is encoded
    # the same way so unexpected characters cannot alter the URL structure.
    base_url = (
        "https://data.bioontology.org/ontologies/NDFRT/classes/"
        "http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FNDFRT%2F"
        f"{quote(str(ndfrt_code), safe='')}"
    )

    headers = {"Authorization": f"apikey token={BIOPORTAL_API_KEY}"}

    response = requests.get(base_url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return "Unknown Drug"

    data = response.json()
    return data.get("prefLabel", "Unknown Drug")  # Extract drug name (preferred label)

# Example usage: look up a single NDF-RT code and print the label returned by
# get_drug_name_from_ndfrt ("Unknown Drug" is printed if the lookup fails).
ndfrt_code = "N0000004713"
drug_name = get_drug_name_from_ndfrt(ndfrt_code)
print(f"NDF-RT Code: {ndfrt_code} -> Drug Name: {drug_name}")


NDF-RT Code: N0000004713 -> Drug Name: donepezil [Chemical/Ingredient]


In [4]:
# Gather the drug names linked to each selected disease: take the NDF-RT codes
# that DDDB lists for the SNOMED code and resolve every one to a name.
specific_disease_drugs = []
for disease_code in DISEASE_SNOMED_CODES:
    rows_for_disease = DDDB["SNOMED"] == disease_code
    ndfrt_values = DDDB.loc[rows_for_disease, "NDF-RT"].to_numpy()
    specific_disease_drugs.extend(
        get_drug_name_from_ndfrt(code) for code in ndfrt_values
    )
    # Number of NDF-RT entries found for this disease
    print(disease_code, ":", len(ndfrt_values))

print(specific_disease_drugs)
print("Length: ", len(specific_disease_drugs))



13746004 : 16
['CLONAZEPAM', 'BUPROPION', 'SERTRALINE', 'OLANZAPINE', 'ZIPRASIDONE', 'QUETIAPINE', 'CHLORPROMAZINE', 'CARBAMAZEPINE', 'ALLOPURINOL', 'PERPHENAZINE', 'VALPROIC ACID', 'CLOZAPINE', 'LITHIUM', 'RISPERIDONE', 'LAMOTRIGINE', 'ARIPIPRAZOLE']
Length:  16


In [5]:
# For every disease drug, count the DGIDB rows whose "drug_claim_name" equals
# that name exactly (boolean comparison over the column, then a sum).
# NOTE(review): the hypergraph cells filter on "drug_name" rather than
# "drug_claim_name" -- confirm which column is the intended one.
specific_disease_drug_counts = {}
for drug in specific_disease_drugs:
    specific_disease_drug_counts[drug] = DGIDB["drug_claim_name"].eq(drug).sum()

# Report one line per drug
for drug, count in specific_disease_drug_counts.items():
    print(f"Drug: {drug} -> Appears {count} times in DGIDB")


Drug: CLONAZEPAM -> Appears 54 times in DGIDB
Drug: BUPROPION -> Appears 5 times in DGIDB
Drug: SERTRALINE -> Appears 10 times in DGIDB
Drug: OLANZAPINE -> Appears 45 times in DGIDB
Drug: ZIPRASIDONE -> Appears 15 times in DGIDB
Drug: QUETIAPINE -> Appears 14 times in DGIDB
Drug: CHLORPROMAZINE -> Appears 19 times in DGIDB
Drug: CARBAMAZEPINE -> Appears 27 times in DGIDB
Drug: ALLOPURINOL -> Appears 8 times in DGIDB
Drug: PERPHENAZINE -> Appears 18 times in DGIDB
Drug: VALPROIC ACID -> Appears 40 times in DGIDB
Drug: CLOZAPINE -> Appears 42 times in DGIDB
Drug: LITHIUM -> Appears 19 times in DGIDB
Drug: RISPERIDONE -> Appears 18 times in DGIDB
Drug: LAMOTRIGINE -> Appears 18 times in DGIDB
Drug: ARIPIPRAZOLE -> Appears 9 times in DGIDB


In [6]:
# BIPOLAR HYPERGRAPH
# Builds a weighted gene x drug incidence matrix: rows are all DGIDB genes,
# columns are the disease drugs found in DGIDB, and each stored entry is the
# gene's degree in HumanNet.
from scipy.sparse import dok_matrix, save_npz

# Weight used for DGIDB genes that do not appear in HumanNet at all
MISSING_GENE_DEGREE = 0.01

# Calculate gene degrees in HumanNet. Every edge contributes one to each of its
# endpoints, so a gene's degree is its number of occurrences across the Gene1
# and Gene2 columns -- counted in one vectorized pass instead of iterating rows.
genes_in_humannet = pd.unique(HUMANNET[['Gene1', 'Gene2']].values.ravel())
endpoint_counts = pd.concat([HUMANNET['Gene1'], HUMANNET['Gene2']]).value_counts()

# Keys are converted to str so they line up with the string-typed DGIDB ids;
# reindex keeps the genes in their order of first appearance.
gene_to_degree = {
    str(gene): int(count)
    for gene, count in endpoint_counts.reindex(genes_in_humannet).items()
}

# Add degrees to DGIDB, falling back to MISSING_GENE_DEGREE for missing genes.
# NOTE: this mutates DGIDB in place; later cells rely on the string-typed
# 'ncbi_gene_id' column produced here.
DGIDB['ncbi_gene_id'] = DGIDB['ncbi_gene_id'].astype(str)
DGIDB['degree'] = DGIDB['ncbi_gene_id'].map(gene_to_degree).fillna(MISSING_GENE_DEGREE)

# Create mappings for vertices (genes) and hyperedges (drugs)
genes = DGIDB['ncbi_gene_id'].unique()

# Keep only the DGIDB drugs that belong to the disease drug list; a set makes
# each membership test constant-time.
disease_drug_names = set(specific_disease_drugs)
filtered_drugs = [drug for drug in DGIDB['drug_name'].unique() if drug in disease_drug_names]

# Index mappings: row index per gene, column index per retained drug
drug_to_index = {drug: i for i, drug in enumerate(filtered_drugs)}
gene_to_index = {gene: i for i, gene in enumerate(genes)}

# Initialize a sparse incidence matrix
incidence_matrix = dok_matrix((len(genes), len(filtered_drugs)), dtype=float)

# Populate the incidence matrix with degree as weight. Rows are pre-filtered to
# the disease drugs and visited in their original order, so entries are
# inserted in the same sequence as a full row-by-row scan would produce.
disease_rows = DGIDB.loc[DGIDB['drug_name'].isin(filtered_drugs)]
for gene_id, drug, degree in zip(
    disease_rows['ncbi_gene_id'], disease_rows['drug_name'], disease_rows['degree']
):
    incidence_matrix[gene_to_index[gene_id], drug_to_index[drug]] = degree

# Convert the DOK matrix to CSR format
csr_matrix = incidence_matrix.tocsr()

# Save the matrix as .npz file
save_npz(OUTPUT_PATH + "hypergraph_incidence_matrix_weighted.npz", csr_matrix)
# Print confirmation
print("Weighted incidence matrix saved as 'hypergraph_incidence_matrix_weighted.npz'.")

Weighted incidence matrix saved as 'hypergraph_incidence_matrix_weighted.npz'.


In [7]:
import json

# Destination files for the two index mappings
gene_to_index_path = f"{OUTPUT_PATH}gene_to_index.json"
drug_to_index_path = f"{OUTPUT_PATH}drug_to_index.json"

# Write each mapping (gene -> row index, drug -> column index) as indented JSON
for mapping, destination in (
    (gene_to_index, gene_to_index_path),
    (drug_to_index, drug_to_index_path),
):
    with open(destination, 'w') as handle:
        json.dump(mapping, handle, indent=4)

print(f"Mappings saved to {gene_to_index_path} and {drug_to_index_path}.")


Mappings saved to ./Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/gene_to_index.json and ./Data/hypergraphs/DGIDB_HumanNet/human/undirected/bipolar/drug_to_index.json.


In [8]:
# Sanity check: the drug list and its index mapping should have the same size
for label, size in (
    ("Number of filtered drugs:", len(filtered_drugs)),
    ("Number of drug indices:", len(drug_to_index)),
):
    print(label, size)


Number of filtered drugs: 12
Number of drug indices: 12


In [9]:
from scipy.sparse import dok_matrix, save_npz
import pandas as pd
# Binary (unweighted) gene x drug incidence matrix. It reuses the `genes`,
# `filtered_drugs`, `gene_to_index` and `drug_to_index` objects built for the
# weighted matrix, so both matrices share the same row/column layout.
binary_incidence_matrix = dok_matrix((len(genes), len(filtered_drugs)), dtype=int)

# Populate the binary incidence matrix with 1 where there is a relationship.
# Rows are filtered to the disease drugs in one vectorized step and then
# visited in their original order, instead of testing list membership for
# every DGIDB row inside iterrows().
disease_rows = DGIDB.loc[DGIDB['drug_name'].isin(specific_disease_drugs)]
for gene_id, drug in zip(disease_rows['ncbi_gene_id'], disease_rows['drug_name']):
    binary_incidence_matrix[gene_to_index[gene_id], drug_to_index[drug]] = 1

# Convert the DOK matrix to CSR format
binary_csr_matrix = binary_incidence_matrix.tocsr()

# Save the binary matrix as .npz file
save_npz(OUTPUT_PATH + "hypergraph_incidence_matrix_binary.npz", binary_csr_matrix)

# Print confirmation
print("Binary incidence matrix saved as 'hypergraph_incidence_matrix_binary.npz'.")


Binary incidence matrix saved as 'hypergraph_incidence_matrix_binary.npz'.


In [10]:
# Show the stored entries of the binary DOK matrix as "(gene row, drug column)  value"
print(binary_incidence_matrix)

  (173, 0)	1
  (31, 1)	1
  (330, 2)	1
  (425, 3)	1
  (518, 4)	1
  (97, 5)	1
  (674, 0)	1
  (47, 6)	1
  (698, 1)	1
  (903, 2)	1
  (824, 1)	1
  (706, 1)	1
  (933, 2)	1
  (934, 2)	1
  (935, 2)	1
  (949, 3)	1
  (820, 1)	1
  (971, 7)	1
  (631, 8)	1
  (311, 8)	1
  (35, 8)	1
  (301, 4)	1
  (479, 7)	1
  (399, 7)	1
  (478, 3)	1
  :	:
  (238, 10)	1
  (85, 0)	1
  (1690, 8)	1
  (824, 0)	1
  (2248, 0)	1
  (1551, 0)	1
  (97, 10)	1
  (96, 5)	1
  (61, 8)	1
  (427, 4)	1
  (1333, 4)	1
  (366, 4)	1
  (140, 8)	1
  (436, 7)	1
  (47, 10)	1
  (2133, 5)	1
  (2538, 0)	1
  (822, 0)	1
  (927, 0)	1
  (4766, 0)	1
  (969, 0)	1
  (720, 0)	1
  (2610, 0)	1
  (4767, 0)	1
  (1299, 4)	1


In [11]:
# Show the stored entries of the weighted DOK matrix as "(gene row, drug column)  degree";
# 0.01 marks genes that had no degree in HumanNet (the fillna fallback).
print(incidence_matrix)

  (173, 0)	209.0
  (31, 1)	47.0
  (330, 2)	0.01
  (425, 3)	0.01
  (518, 4)	88.0
  (97, 5)	50.0
  (674, 0)	0.01
  (47, 6)	49.0
  (698, 1)	458.0
  (903, 2)	0.01
  (824, 1)	0.01
  (706, 1)	25.0
  (933, 2)	4.0
  (934, 2)	0.01
  (935, 2)	12.0
  (949, 3)	29.0
  (820, 1)	78.0
  (971, 7)	27.0
  (631, 8)	136.0
  (311, 8)	105.0
  (35, 8)	100.0
  (301, 4)	1.0
  (479, 7)	12.0
  (399, 7)	0.01
  (478, 3)	166.0
  :	:
  (238, 10)	45.0
  (85, 0)	62.0
  (1690, 8)	72.0
  (824, 0)	0.01
  (2248, 0)	244.0
  (1551, 0)	41.0
  (97, 10)	50.0
  (96, 5)	20.0
  (61, 8)	268.0
  (427, 4)	1.0
  (1333, 4)	33.0
  (366, 4)	10.0
  (140, 8)	138.0
  (436, 7)	28.0
  (47, 10)	49.0
  (2133, 5)	249.0
  (2538, 0)	0.01
  (822, 0)	96.0
  (927, 0)	20.0
  (4766, 0)	0.01
  (969, 0)	33.0
  (720, 0)	155.0
  (2610, 0)	61.0
  (4767, 0)	18.0
  (1299, 4)	0.01
