In [86]:
# Directory that receives every file written by this notebook. Keep the
# trailing slash: output paths are built by plain string concatenation.
OUTPUT_FOLDER = "./output/DGIDB_ARTHRITIS/"
# SNOMED codes of the diseases whose drugs should be kept; leave the list
# empty to keep all drugs.
# NOTE(review): 69896004 is presumably an arthritis concept, going by the
# output folder name -- confirm against the SNOMED browser.
SNOMED_DISEASE_CODES = [69896004]

In [87]:
import numpy as np
import pandas as pd
from scipy.sparse import dok_matrix, save_npz
import os
import json
# Source tables, all tab-separated. Columns read later in this notebook:
#   DGIDB    -> 'drug_name', 'ncbi_gene_id'
#   HUMANNET -> 'Gene1', 'Gene2'
#   DDDB     -> 'SNOMED', 'ndfrt_preferred_label'
_TSV_READ_KWARGS = {"sep": "\t"}
DGIDB = pd.read_csv("../Data/DGIDB/DrugToGene.tsv", **_TSV_READ_KWARGS)
HUMANNET = pd.read_csv("../Data/HumanNet/HumanNet-GSP.tsv", **_TSV_READ_KWARGS)
DDDB = pd.read_csv("../Data/DDDB/DrugToDisease.tsv", **_TSV_READ_KWARGS)

# Make sure the output directory exists; an existing one is not an error.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [88]:
# Distinct, non-null preferred drug labels of the DDDB rows whose SNOMED code
# is one of the configured disease codes. Labels are kept verbatim.
disease_row_mask = DDDB['SNOMED'].isin(SNOMED_DISEASE_CODES)
disease_drug_labels = DDDB.loc[disease_row_mask, 'ndfrt_preferred_label']
specific_disease_drugs = disease_drug_labels.dropna().unique().tolist()
print(specific_disease_drugs)

['Ibuprofen', 'Minocycline', 'TRIAMCINOLONE', 'SULFAMETHOXAZOLE', 'PREDNISONE', 'SULFASALAZINE', 'PREDNISOLONE', 'NAPROXEN', 'DICLOFENAC', 'HYDROXYCHLOROQUINE', 'METHOTREXATE', 'RITUXIMAB', 'INFLIXIMAB', 'LEFLUNOMIDE', 'THALIDOMIDE', 'CELECOXIB', 'MELOXICAM', 'ADALIMUMAB', 'ASPIRIN', 'DEXAMETHASONE', 'CYCLOSPORINE', 'AZATHIOPRINE', 'ETANERCEPT', 'TACROLIMUS', 'FLURBIPROFEN']


In [None]:
# Calculate gene degrees in HumanNet.
# Each HUMANNET row contributes one count to the gene in 'Gene1' and one to
# the gene in 'Gene2', so a gene's degree is how often it occurs in the
# flattened endpoint array. Counting that array in a single vectorised pass
# replaces the former row-by-row iterrows() loop.
edge_endpoints = HUMANNET[['Gene1', 'Gene2']].values.ravel()
genes_in_humannet = pd.unique(edge_endpoints)
endpoint_counts = pd.Series(edge_endpoints).value_counts().reindex(genes_in_humannet)

# Keys are stringified so they can be looked up with the string-typed DGIDB
# gene IDs below; iterating genes_in_humannet keeps first-appearance order.
gene_to_degree = {
    str(gene): int(count)
    for gene, count in zip(genes_in_humannet, endpoint_counts.values)
}

# Add degrees to DGIDB with fallback to 0.01 for genes missing from HumanNet
# (presumably so their weighted-matrix entries stay non-zero -- confirm).
# NOTE(review): if 'ncbi_gene_id' was parsed as float (e.g. because the column
# has missing values), astype(str) yields IDs like '1234.0' that can never
# match a HumanNet key -- confirm the column dtype.
DGIDB['ncbi_gene_id'] = DGIDB['ncbi_gene_id'].astype(str)
DGIDB['degree'] = DGIDB['ncbi_gene_id'].map(gene_to_degree).fillna(0.01)

# Vertices are the distinct gene IDs and hyperedges the distinct drug names;
# each one is numbered by its position of first appearance in DGIDB.
genes = DGIDB['ncbi_gene_id'].unique()
drugs = DGIDB['drug_name'].unique()
gene_to_index = dict(zip(genes, range(len(genes))))
drug_to_index = dict(zip(drugs, range(len(drugs))))

# Persist both lookups as JSON next to the matrices so that row/column
# positions can be translated back to identifiers.
gene_to_index_path = OUTPUT_FOLDER + "gene_to_index.json"
drug_to_index_path = OUTPUT_FOLDER + "drug_to_index.json"

for mapping_path, mapping in (
    (gene_to_index_path, gene_to_index),
    (drug_to_index_path, drug_to_index),
):
    with open(mapping_path, 'w') as mapping_file:
        json.dump(mapping, mapping_file, indent=4)

print(f"Mappings saved to {gene_to_index_path} and {drug_to_index_path}.")


# Initialize the sparse incidence matrices: one row per gene, one column per
# drug. The weighted matrix stores the gene's degree, the binary one stores 1.
incidence_matrix = dok_matrix((len(genes), len(drugs)), dtype=float)
binary_incidence_matrix = dok_matrix((len(genes), len(drugs)), dtype=int)

# Filter rows only if needed (an empty drug list means "use every drug").
if specific_disease_drugs:
    # Match names case-insensitively: the disease-drug labels are not
    # consistently cased (e.g. 'Ibuprofen' next to 'NAPROXEN'), so an exact
    # comparison silently drops drugs whose DGIDB spelling differs only by
    # case. Missing drug names stay NaN after casefold() and never match.
    wanted_drug_names = {str(name).casefold() for name in specific_disease_drugs}
    drug_name_matches = DGIDB['drug_name'].str.casefold().isin(wanted_drug_names)
    relevant_rows = DGIDB[drug_name_matches]
else:
    relevant_rows = DGIDB

# Populate only the relevant entries. Iterating the three columns in lockstep
# avoids building a Series per row as iterrows() does; a repeated (gene, drug)
# pair simply overwrites the same cell.
for row_gene, row_drug, row_degree in zip(
    relevant_rows['ncbi_gene_id'],
    relevant_rows['drug_name'],
    relevant_rows['degree'],
):
    gene_idx = gene_to_index[row_gene]
    drug_idx = drug_to_index[row_drug]
    incidence_matrix[gene_idx, drug_idx] = row_degree
    binary_incidence_matrix[gene_idx, drug_idx] = 1

# Convert the DOK matrices to CSR format before writing them with save_npz.
# NOTE(review): the name `csr_matrix` would shadow scipy.sparse.csr_matrix if
# that class were ever imported in this notebook; it is kept unchanged here
# because other cells may reference it.
csr_matrix = incidence_matrix.tocsr()
binary_csr_matrix = binary_incidence_matrix.tocsr()

# Build each output path once so the saved file and the message always agree.
weighted_matrix_path = OUTPUT_FOLDER + "hypergraph_incidence_matrix_weighted.npz"
binary_matrix_path = OUTPUT_FOLDER + "hypergraph_incidence_matrix_binary.npz"

# Save the matrices as .npz files
save_npz(weighted_matrix_path, csr_matrix)
save_npz(binary_matrix_path, binary_csr_matrix)

# Print confirmation (the stray, unbalanced quote after the path was removed).
print(f"Weighted incidence matrix saved as {weighted_matrix_path}.")
print(f"Binary incidence matrix saved as {binary_matrix_path}.")

Mappings saved to ./output/DGIDB_ARTHRITIS/gene_to_index.json and ./output/DGIDB_ARTHRITIS/drug_to_index.json.
Weighted incidence matrix saved as ./output/DGIDB_ARTHRITIS/hypergraph_incidence_matrix_weighted.npz'.
Binary incidence matrix saved as ./output/DGIDB_ARTHRITIS/hypergraph_incidence_matrix_binary.npz'.


In [90]:
# Per drug column, count how many entries of the binary matrix are non-zero.
nonzero_entries = binary_csr_matrix != 0
non_empty_drugs = np.asarray(nonzero_entries.sum(axis=0)).ravel()

# Column positions of the drugs that have at least one gene attached.
non_empty_drug_indices = np.flatnonzero(non_empty_drugs > 0)

print(f"Number of non-empty drugs: {len(non_empty_drug_indices)}")
print("Indices of non-empty drugs:", non_empty_drug_indices)


Number of non-empty drugs: 17
Indices of non-empty drugs: [ 117  261  267  330  417  565  704 1357 1453 2648 2754 2844 2849 3473
 5016 5393 7589]


In [91]:
# Report, for every disease drug, its column index in drug_to_index, or that
# the exact name is not a key of the mapping.
for drug in specific_disease_drugs:
    try:
        idx = drug_to_index[drug]
    except KeyError:
        print(f"Drug: {drug} not found in drug_to_index.")
    else:
        print(f"Drug: {drug}, Index: {idx}")


Drug: Ibuprofen not found in drug_to_index.
Drug: Minocycline not found in drug_to_index.
Drug: TRIAMCINOLONE, Index: 1453
Drug: SULFAMETHOXAZOLE, Index: 267
Drug: PREDNISONE, Index: 2754
Drug: SULFASALAZINE, Index: 5016
Drug: PREDNISOLONE, Index: 2648
Drug: NAPROXEN not found in drug_to_index.
Drug: DICLOFENAC not found in drug_to_index.
Drug: HYDROXYCHLOROQUINE, Index: 2849
Drug: METHOTREXATE, Index: 261
Drug: RITUXIMAB, Index: 565
Drug: INFLIXIMAB not found in drug_to_index.
Drug: LEFLUNOMIDE, Index: 1357
Drug: THALIDOMIDE, Index: 704
Drug: CELECOXIB, Index: 417
Drug: MELOXICAM, Index: 5393
Drug: ADALIMUMAB not found in drug_to_index.
Drug: ASPIRIN, Index: 330
Drug: DEXAMETHASONE, Index: 117
Drug: CYCLOSPORINE, Index: 2844
Drug: AZATHIOPRINE, Index: 7589
Drug: ETANERCEPT not found in drug_to_index.
Drug: TACROLIMUS not found in drug_to_index.
Drug: FLURBIPROFEN, Index: 3473


In [None]:
# # Compute gene-gene adjacency matrix by projecting via shared drugs
# adj_matrix = binary_csr_matrix @ binary_csr_matrix.T  # Matrix multiplication: shared drugs
# adj_matrix.setdiag(0)
# adj_matrix.eliminate_zeros()

# # --- Step 2: Extract Edgelist from Upper Triangle Only ---
# # Use sparse coo_matrix to iterate efficiently
# from scipy.sparse import triu

# adj_matrix_upper = triu(adj_matrix, k=1)  # upper triangle, no diag
# adj_coo = adj_matrix_upper.tocoo()

# # Optional: if you have gene names
# # gene_names = ['TP53', 'EGFR', 'BRCA1', ...]
# # Otherwise use indices as names

# edges = []
# for i, j, v in zip(adj_coo.row, adj_coo.col, adj_coo.data):
#     edges.append((i, j, v))  # replace i/j with gene_names[i] if available

# # Convert to DataFrame and save
# edge_df = pd.DataFrame(edges, columns=["Gene1", "Gene2", "Weight"])

# # If you have gene names, map them:
# # edge_df["Gene1"] = edge_df["Gene1"].map(lambda i: gene_names[i])
# # edge_df["Gene2"] = edge_df["Gene2"].map(lambda i: gene_names[i])

# edge_df.to_csv("gene_gene_edgelist.csv", index=False)

In [93]:
# import pandas as pd

# if 'NCBI_INFO' not in globals():
#     print("Reading gene2refseq.gz...")
#     NCBI_INFO = pd.read_csv("../Data/ncbi/gene2refseq.gz", sep='\t', compression='gzip')
# else:
#     print("NCBI_INFO already loaded.")

In [94]:
# index_to_ncbi = {idx: gene for gene, idx in gene_to_index.items()}
# human_gene2refseq = NCBI_INFO[NCBI_INFO['#tax_id'] == 9606]
# id_to_gene_claim = pd.Series(human_gene2refseq.Symbol.values, index=human_gene2refseq.GeneID).to_dict()

# # Your existing function to get common gene name from ncbi gene id
# def get_gene_claim_name(ncbi_gene_id):
#     try:
#         ncbi_gene_id = int(ncbi_gene_id)
#         result = id_to_gene_claim.get(ncbi_gene_id, None)
#         return result if result else "Gene name not found"
#     except:
#         return "Gene name not found"

In [95]:
# # Step 1: Map index → NCBI gene ID
# edge_df['Gene1_ncbi'] = edge_df['Gene1'].map(index_to_ncbi)
# edge_df['Gene2_ncbi'] = edge_df['Gene2'].map(index_to_ncbi)

# # Step 2: Map NCBI gene ID → gene symbol
# edge_df['Gene1'] = edge_df['Gene1_ncbi'].apply(get_gene_claim_name)
# edge_df['Gene2'] = edge_df['Gene2_ncbi'].apply(get_gene_claim_name)

# # Step 3: Drop temp NCBI ID columns
# edge_df = edge_df.drop(columns=['Gene1_ncbi', 'Gene2_ncbi'])

# # Optional: Save to CSV
# edge_df.to_csv('gene_gene_edgelist_named.csv', index=False)
