In [6]:
# Directory that receives the generated index mappings and incidence matrices.
OUTPUT_FOLDER = "./output/DGIDB_BREAST_CANCER/"
# SNOMED codes of the diseases whose drugs are kept; leave the list empty to use all drugs.
SNOMED_DISEASE_CODES = [254837009]

In [7]:
import numpy as np
import pandas as pd
from scipy.sparse import dok_matrix, save_npz
import os
import json
def _read_tsv(path):
    """Load a tab-separated file into a pandas DataFrame."""
    return pd.read_csv(path, sep="\t")


# Source tables used by the rest of the notebook.
DGIDB = _read_tsv("../Data/DGIDB/DrugToGene.tsv")
HUMANNET = _read_tsv("../Data/HumanNet/HumanNet-GSP.tsv")
DDDB = _read_tsv("../Data/DDDB/DrugToDisease.tsv")

# Create the output directory up front; an already existing one is fine.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [8]:
# Collect the distinct, non-missing drug labels of every DDDB row whose
# SNOMED code is one of the configured disease codes.
matches_disease = DDDB['SNOMED'].isin(SNOMED_DISEASE_CODES)
disease_drug_labels = DDDB.loc[matches_disease, 'ndfrt_preferred_label']
specific_disease_drugs = disease_drug_labels.dropna().unique().tolist()
print(specific_disease_drugs)

['GEMCITABINE', 'PREDNISOLONE', 'MELPHALAN', 'GOSERELIN ACETATE', 'MEDROXYPROGESTERONE', 'METHOTREXATE', 'TAMOXIFEN', 'VINCRISTINE', 'EPIRUBICIN', 'OXALIPLATIN']


In [9]:
# Build the gene x drug hypergraph incidence matrices (degree-weighted and
# binary) from DGIDB, then save them together with the gene/drug index
# mappings under OUTPUT_FOLDER.

# Weight assigned to DGIDB genes that do not occur in HumanNet.
MISSING_GENE_DEGREE = 0.01


def _canonical_gene_id(value) -> str:
    """Return a gene ID as text, writing integral floats without '.0'.

    pandas parses an integer column that contains missing values as float64,
    so a plain string cast yields '1017.0', which never matches the '1017'
    key produced from an integer column.

    Args:
        value: A raw gene ID (int, float, str or NaN).

    Returns:
        ``str(int(value))`` for integral floats and ``str(value)`` otherwise,
        so NaN still becomes 'nan' exactly as ``astype(str)`` would give.
    """
    if isinstance(value, (float, np.floating)) and float(value).is_integer():
        return str(int(value))
    return str(value)


# Calculate gene degrees in HumanNet. Every edge adds one to each of its two
# endpoints, so a gene's degree is how often it appears across both columns.
# Missing endpoints are ignored because value_counts drops NaN.
genes_in_humannet = pd.unique(HUMANNET[['Gene1', 'Gene2']].values.ravel())
endpoint_counts = pd.concat(
    [HUMANNET['Gene1'], HUMANNET['Gene2']], ignore_index=True
).value_counts()
gene_to_degree = {
    _canonical_gene_id(gene): int(count) for gene, count in endpoint_counts.items()
}

# Add degrees to DGIDB, falling back to MISSING_GENE_DEGREE for genes that are
# not in HumanNet. Both sides use the same canonical string form of the ID.
DGIDB['ncbi_gene_id'] = DGIDB['ncbi_gene_id'].map(_canonical_gene_id)
DGIDB['degree'] = DGIDB['ncbi_gene_id'].map(gene_to_degree).fillna(MISSING_GENE_DEGREE)

# Create mappings for vertices (genes) and hyperedges (drugs).
genes = DGIDB['ncbi_gene_id'].unique()
drugs = DGIDB['drug_name'].unique()
gene_to_index = {gene: i for i, gene in enumerate(genes)}
drug_to_index = {drug: i for i, drug in enumerate(drugs)}

# Define file paths. os.path.join keeps them correct whether or not
# OUTPUT_FOLDER ends with a path separator.
gene_to_index_path = os.path.join(OUTPUT_FOLDER, "gene_to_index.json")
drug_to_index_path = os.path.join(OUTPUT_FOLDER, "drug_to_index.json")
weighted_matrix_path = os.path.join(OUTPUT_FOLDER, "hypergraph_incidence_matrix_weighted.npz")
binary_matrix_path = os.path.join(OUTPUT_FOLDER, "hypergraph_incidence_matrix_binary.npz")

# Save gene_to_index mapping
with open(gene_to_index_path, 'w') as gene_file:
    json.dump(gene_to_index, gene_file, indent=4)

# Save drug_to_index mapping
with open(drug_to_index_path, 'w') as drug_file:
    json.dump(drug_to_index, drug_file, indent=4)

print(f"Mappings saved to {gene_to_index_path} and {drug_to_index_path}.")


# Initialize the sparse incidence matrices. They span every DGIDB gene and
# drug so the saved indices stay valid even when only a subset is populated.
incidence_matrix = dok_matrix((len(genes), len(drugs)), dtype=float)
binary_incidence_matrix = dok_matrix((len(genes), len(drugs)), dtype=int)

# Filter rows only if a disease-specific drug list was produced.
# NOTE(review): isin() matches drug names exactly (case-sensitive); confirm
# DGIDB 'drug_name' uses the same spelling/casing as the DDDB labels.
if specific_disease_drugs:
    relevant_rows = DGIDB[DGIDB['drug_name'].isin(specific_disease_drugs)]
else:
    relevant_rows = DGIDB

# Populate only the relevant entries. Iterating the three columns directly
# avoids building a Series per row; repeated (gene, drug) pairs simply
# overwrite the same cell.
for gene_id, drug_name, degree in zip(
    relevant_rows['ncbi_gene_id'], relevant_rows['drug_name'], relevant_rows['degree']
):
    gene_idx = gene_to_index[gene_id]
    drug_idx = drug_to_index[drug_name]
    incidence_matrix[gene_idx, drug_idx] = degree
    binary_incidence_matrix[gene_idx, drug_idx] = 1

# Convert the DOK matrices to CSR format
csr_matrix = incidence_matrix.tocsr()
binary_csr_matrix = binary_incidence_matrix.tocsr()

# Save the matrices as .npz files
save_npz(weighted_matrix_path, csr_matrix)
save_npz(binary_matrix_path, binary_csr_matrix)

# Print confirmation
print(f"Weighted incidence matrix saved as {weighted_matrix_path}'.")
print(f"Binary incidence matrix saved as {binary_matrix_path}'.")

Mappings saved to ./output/DGIDB_BREAST_CANCER/gene_to_index.json and ./output/DGIDB_BREAST_CANCER/drug_to_index.json.
Weighted incidence matrix saved as ./output/DGIDB_BREAST_CANCER/hypergraph_incidence_matrix_weighted.npz'.
Binary incidence matrix saved as ./output/DGIDB_BREAST_CANCER/hypergraph_incidence_matrix_binary.npz'.
