In [1]:
import pandas as pd
from scipy.sparse import dok_matrix, save_npz
from tqdm import tqdm
tqdm.pandas()  # Enable tqdm for pandas
import json
# --- Input file locations ---------------------------------------------------
DGIDB_PATH = "./Data/DGIDB/interactions.tsv"
HUMANNET_PATH = "./Data/HumanNet/HumanNet-GSP.tsv"
NCBI_PATH = "./Data/ncbi/gene2refseq.gz"

# --- Load the raw tables (all tab-separated) --------------------------------
DGIDB = pd.read_csv(DGIDB_PATH, sep="\t")
HUMANNET = pd.read_csv(HUMANNET_PATH, sep="\t")
# gzip-compressed table; the cells below only read its '#tax_id', 'GeneID'
# and 'Symbol' columns.
# NOTE(review): the captured output of this cell looks like the tail of a
# pandas warning (possibly a mixed-dtype warning from one of these reads) —
# confirm and, if so, consider passing explicit `dtype`/`usecols`.
NCBI_INFO = pd.read_csv(NCBI_PATH, compression='gzip', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:

# Restrict the NCBI table to Homo sapiens records (tax_id 9606)
is_human = NCBI_INFO['#tax_id'].eq(9606)
human_gene2refseq = NCBI_INFO.loc[is_human]

# Lookup table: gene symbol -> NCBI GeneID.
# A symbol that appears on several rows ends up with the GeneID of its last row.
gene_claim_to_id = human_gene2refseq.set_index('Symbol')['GeneID'].to_dict()

# Quick sanity check of the lookup with one symbol
gene_claim_name = "TP53"
ncbi_gene_id = gene_claim_to_id.get(gene_claim_name, "Gene name not found")
print(f"NCBI Gene ID for {gene_claim_name}: {ncbi_gene_id}")

NCBI Gene ID for TP53: 7157


In [3]:
def get_ncbi_gene_id(gene_claim_name):
    """Look up the NCBI GeneID for a gene claim name.

    Args:
        gene_claim_name: key to look up in the module-level ``gene_claim_to_id``
            mapping.

    Returns:
        The value stored for ``gene_claim_name``, or ``None`` when the name is
        not a key of the mapping.
    """
    try:
        return gene_claim_to_id[gene_claim_name]
    except KeyError:
        return None
# Applied in the next cell to add an 'ncbi_gene_id' column to DGIDB


In [4]:
# Translate each gene claim name through the symbol lookup; names that are not
# in the lookup come back as missing values.
mapped_ids = DGIDB['gene_claim_name'].apply(get_ncbi_gene_id)
DGIDB['ncbi_gene_id'] = mapped_ids
print(DGIDB.head())

  gene_claim_name gene_concept_id gene_name interaction_source_db_name  \
0          CYP2D6       hgnc:2625    CYP2D6                        DTC   
1           PPARG       hgnc:9236     PPARG                        DTC   
2           ATAD5      hgnc:25752     ATAD5                        DTC   
3            RGS4      hgnc:10000      RGS4                        DTC   
4           MAPK1       hgnc:6871     MAPK1                        DTC   

  interaction_source_db_version interaction_type  interaction_score  \
0                        9/2/20              NaN           0.017709   
1                        9/2/20              NaN           0.840123   
2                        9/2/20              NaN           0.177992   
3                        9/2/20              NaN           0.034319   
4                        9/2/20              NaN           0.050007   

           drug_claim_name       drug_concept_id                drug_name  \
0               RACLOPRIDE          ncit:C152139   

In [5]:
def _to_id_string(value):
    """Return a non-null ID as the string of its integer value; null becomes None."""
    if pd.isnull(value):
        return None
    return str(int(value))

# Store the IDs as integer-formatted strings (e.g. 7157.0 -> "7157")
DGIDB['ncbi_gene_id'] = DGIDB['ncbi_gene_id'].apply(_to_id_string)
print(DGIDB.head())

  gene_claim_name gene_concept_id gene_name interaction_source_db_name  \
0          CYP2D6       hgnc:2625    CYP2D6                        DTC   
1           PPARG       hgnc:9236     PPARG                        DTC   
2           ATAD5      hgnc:25752     ATAD5                        DTC   
3            RGS4      hgnc:10000      RGS4                        DTC   
4           MAPK1       hgnc:6871     MAPK1                        DTC   

  interaction_source_db_version interaction_type  interaction_score  \
0                        9/2/20              NaN           0.017709   
1                        9/2/20              NaN           0.840123   
2                        9/2/20              NaN           0.177992   
3                        9/2/20              NaN           0.034319   
4                        9/2/20              NaN           0.050007   

           drug_claim_name       drug_concept_id                drug_name  \
0               RACLOPRIDE          ncit:C152139   

In [6]:
# Checkpoint: write the table, including the new 'ncbi_gene_id' column, to
# dgidb_ncbi_v1.csv in the working directory (row index omitted).
DGIDB.to_csv("dgidb_ncbi_v1.csv", index=False)

In [7]:
# Reload the v1 checkpoint and count the rows that have no NCBI gene ID
DGIDB = pd.read_csv("dgidb_ncbi_v1.csv")
missing_mask = DGIDB['ncbi_gene_id'].isna()
none_count = missing_mask.sum()

# Report the count
print(f"Number of rows with None in 'ncbi_gene_id': {none_count}")

Number of rows with None in 'ncbi_gene_id': 44265


In [8]:
# How many gene claim names mention 'ncbi' anywhere (case-insensitive)?
mentions_ncbi = DGIDB['gene_claim_name'].str.contains('ncbi', case=False, na=False)
ncbi_count = mentions_ncbi.sum()

print("Number of genes where gene_claim_name has 'ncbi':", ncbi_count)

# How many names containing 'NCBIGENE:' still have no NCBI gene ID?
has_ncbigene_tag = DGIDB['gene_claim_name'].str.contains('NCBIGENE:', case=False, na=False)
lacks_id = DGIDB['ncbi_gene_id'].isna()
ncbi_none_count = len(DGIDB[has_ncbigene_tag & lacks_id])

print("Number of genes where gene_claim_name has 'ncbi' and ncbi_gene_id is None:", ncbi_none_count)

Number of genes where gene_claim_name has 'ncbi': 16070
Number of genes where gene_claim_name has 'ncbi' and ncbi_gene_id is None: 16070


In [9]:
# Some gene claim names are written as "NCBIGENE:<digits>" and therefore already
# carry their NCBI gene ID; pull the digits out of those names.
# The inline (?i) flag makes the extraction case-insensitive. Previously the row
# filter ignored case while the extraction pattern did not, so a claim such as
# "ncbigene:123" was selected and then had NaN written over its ID.
embedded_ids = DGIDB['gene_claim_name'].str.extract(r'(?i)NCBIGENE:(\d+)', expand=False)

# Take the embedded ID where one was found and keep the existing value
# everywhere else. Building the column with `where` never replaces a value with
# NaN and avoids writing strings into the existing column through .loc (that
# column is numeric when it comes from the CSV reload).
DGIDB['ncbi_gene_id'] = embedded_ids.where(embedded_ids.notna(), DGIDB['ncbi_gene_id'])

# Normalise every non-null ID to an integer-formatted string; nulls become None
DGIDB['ncbi_gene_id'] = DGIDB['ncbi_gene_id'].apply(lambda x: str(int(x)) if pd.notnull(x) else None)

In [10]:
# Preview the first rows after the IDs embedded in 'NCBIGENE:' claim names were filled in.
print(DGIDB.head())

  gene_claim_name gene_concept_id gene_name interaction_source_db_name  \
0          CYP2D6       hgnc:2625    CYP2D6                        DTC   
1           PPARG       hgnc:9236     PPARG                        DTC   
2           ATAD5      hgnc:25752     ATAD5                        DTC   
3            RGS4      hgnc:10000      RGS4                        DTC   
4           MAPK1       hgnc:6871     MAPK1                        DTC   

  interaction_source_db_version interaction_type  interaction_score  \
0                        9/2/20              NaN           0.017709   
1                        9/2/20              NaN           0.840123   
2                        9/2/20              NaN           0.177992   
3                        9/2/20              NaN           0.034319   
4                        9/2/20              NaN           0.050007   

           drug_claim_name       drug_concept_id                drug_name  \
0               RACLOPRIDE          ncit:C152139   

In [11]:
# Checkpoint: write the table to dgidb_ncbi_v2.csv in the working directory (row index omitted).
DGIDB.to_csv("dgidb_ncbi_v2.csv", index=False)

In [12]:
# Collect the interactions that still have no NCBI gene ID and save them for inspection
id_missing = DGIDB['ncbi_gene_id'].isna()
none_rows = DGIDB.loc[id_missing]
none_rows.to_csv("none_rows.csv")
# Show the affected gene claim names (pandas abbreviates long frames when printing)
print("Examples of rows where 'ncbi_gene_id' is None:")
print(none_rows.loc[:, ['gene_claim_name', 'ncbi_gene_id']])


Examples of rows where 'ncbi_gene_id' is None:
                                     gene_claim_name ncbi_gene_id
5                     ANGIOTENSIN II RECEPTOR TYPE-1         None
6      VASCULAR ENDOTHELIAL GROWTH FACTOR RECEPTOR 2         None
7                       POLY [ADP-RIBOSE] POLYMERASE         None
19                                    UNIPROT:P08588         None
20                                    UNIPROT:Q9Y5Y9         None
...                                              ...          ...
97974                   PROSTAGLANDIN G/H SYNTHASE 2         None
97975               EPIDERMAL GROWTH FACTOR RECEPTOR         None
97976                            ADENOSINE DEAMINASE         None
97977                                     MESOTHELIN         None
97979                                     MESOTHELIN         None

[28195 rows x 2 columns]


In [13]:
# Tally how often each gene_claim_name occurs among the rows without an NCBI gene ID
unmapped_names = none_rows['gene_claim_name']
gene_claim_name_counts = unmapped_names.value_counts()

# Display the tally (most frequent names first)
print("Number of times each gene_claim_name appears where 'ncbi_gene_id' is None:")
print(gene_claim_name_counts)


Number of times each gene_claim_name appears where 'ncbi_gene_id' is None:
BACTERIAL FIMBRIN D-MANNOSE ADHESIN               188
TRANSIENT RECEPTOR POTENTIAL CATION CHANNEL V1    178
CANNABINOID RECEPTOR 1                            176
TYROSINASE                                        143
B-LYMPHOCYTE SURFACE ANTIGEN B4                   135
                                                 ... 
NOTCH-2 RECEPTOR                                    1
GLUCOSE TRANSPORTER TYPE 4                          1
THYROTROPIN-RELEASING HORMONE                       1
STEROL O-ACYLTRANSFERASE                            1
SHORT TRANSIENT RECEPTOR POTENTIAL CHANNEL 4        1
Name: gene_claim_name, Length: 3434, dtype: int64


In [14]:
# NCBI gene IDs present in DGIDB, as a set of strings (missing IDs excluded)
dgidb_ncbi_ids = set(DGIDB['ncbi_gene_id'].dropna().astype(str))

# HumanNet gene IDs as strings: Gene1 only, and Gene1 and Gene2 combined
humannet_gene1 = set(HUMANNET['Gene1'].astype(str))
humannet_genes = humannet_gene1.union(HUMANNET['Gene2'].astype(str))

# DGIDB gene IDs that HumanNet does not contain
unique_in_dgidb1 = dgidb_ncbi_ids.difference(humannet_gene1)
unique_in_dgidb = dgidb_ncbi_ids.difference(humannet_genes)

# Summary counts
print("Number of ncbi_gene_id in DGIDB but not in HumanNet[gene1]:", len(unique_in_dgidb1))
print("Number of ncbi_gene_id in DGIDB but not in HumanNet[gene1] or HumanNet[gene2]:", len(unique_in_dgidb))
print("Total number of interactions in DGIDB:", len(DGIDB))
print("Total number of unique genes in DGIDB:",len(dgidb_ncbi_ids))

# Interaction rows whose (non-missing) gene ID is absent from HumanNet
has_id = DGIDB['ncbi_gene_id'].notna()
in_humannet = DGIDB['ncbi_gene_id'].astype(str).isin(humannet_genes)
rows_not_in_humannet = DGIDB[has_id & ~in_humannet]
print("Number of rows in DGIDB where ncbi_gene_id is not in HumanNet[gene1] or HumanNet[gene2]:", len(rows_not_in_humannet))


Number of ncbi_gene_id in DGIDB but not in HumanNet[gene1]: 2161
Number of ncbi_gene_id in DGIDB but not in HumanNet[gene1] or HumanNet[gene2]: 2032
Total number of interactions in DGIDB: 98239
Total number of unique genes in DGIDB: 4835
Number of rows in DGIDB where ncbi_gene_id is not in HumanNet[gene1] or HumanNet[gene2]: 17100


In [28]:
# Reload the v2 checkpoint from the converted-data folder.
# NOTE(review): the earlier export cell writes "dgidb_ncbi_v2.csv" to the working
# directory, while this reads it from ./Data/DGIDB/converted/human/ — presumably
# the file was moved by hand; confirm, otherwise a fresh Restart & Run All may
# read a stale or missing file.
v2 = pd.read_csv("./Data/DGIDB/converted/human/dgidb_ncbi_v2.csv")

In [29]:
# Rows of the reloaded table that have no NCBI gene ID
id_missing_v2 = v2['ncbi_gene_id'].isna()
none_rows = v2.loc[id_missing_v2]
print(none_rows.head())

                                  gene_claim_name gene_concept_id gene_name  \
5                  ANGIOTENSIN II RECEPTOR TYPE-1        hgnc:336     AGTR1   
6   VASCULAR ENDOTHELIAL GROWTH FACTOR RECEPTOR 2       hgnc:6307       KDR   
7                    POLY [ADP-RIBOSE] POLYMERASE        hgnc:270     PARP1   
19                                 UNIPROT:P08588        hgnc:285     ADRB1   
20                                 UNIPROT:Q9Y5Y9      hgnc:10582    SCN10A   

   interaction_source_db_name interaction_source_db_version interaction_type  \
5                         TTD                    2020.06.01              NaN   
6                         TTD                    2020.06.01              NaN   
7                         TTD                    2020.06.01              NaN   
19                       TEND                      1-Aug-11              NaN   
20                       TEND                      1-Aug-11              NaN   

    interaction_score drug_claim_name   drug

In [30]:
# Remove the interactions that have no NCBI gene ID.
rows_before = len(v2)
# .copy() yields an independent frame, so the column assignment below cannot be
# mistaken for a write into a slice of the original (possible
# SettingWithCopyWarning on some pandas versions).
v2 = v2.dropna(subset=['ncbi_gene_id']).copy()

# Report how many rows this drop actually removed, measured on the frame itself
# rather than taken from a variable computed in a different cell.
print(f"Number of rows removed: {rows_before - len(v2)}")
print(f"Number of rows remaining: {len(v2)}")

# Every remaining ID is non-null, so no null guard is needed: store each one as
# an integer-formatted string (e.g. 7157.0 -> "7157").
v2['ncbi_gene_id'] = v2['ncbi_gene_id'].apply(lambda x: str(int(x)))

Number of rows removed: 28195
Number of rows remaining: 70044


In [31]:
# Checkpoint: write the filtered table (rows with an NCBI gene ID only) as v3, without the row index.
v2.to_csv("./Data/DGIDB/converted/human/dgidb_ncbi_v3.csv", index=False)

In [32]:
# From here on DGIDB refers to the filtered table. This is a plain rebinding,
# not a copy: columns later added to DGIDB also appear on v2.
DGIDB=v2

In [33]:
# Create PPI subnetwork

import numpy as np

# Unique gene IDs over both edge endpoints, in order of first appearance
genes = pd.unique(HUMANNET[['Gene1', 'Gene2']].values.ravel())

# Gene ID -> row/column position in the adjacency matrix
gene_to_index = {gene: idx for idx, gene in enumerate(genes)}

# Dedicated reference to the HumanNet positions. Later cells rebind the generic
# name `gene_to_index` to a DGIDB-based mapping; get_gene_degree must keep using
# the positions that match `adj_matrix`, so it reads this name instead.
humannet_gene_to_index = gene_to_index

# Dense 0/1 adjacency matrix (len(genes) x len(genes) integers)
adj_matrix = np.zeros((len(genes), len(genes)), dtype=int)

# Translate both edge columns to matrix positions and mark every edge in both
# directions with one vectorised assignment (replaces a row-by-row iterrows loop).
gene_positions = pd.Index(genes)
src = gene_positions.get_indexer(HUMANNET['Gene1'])
dst = gene_positions.get_indexer(HUMANNET['Gene2'])
# get_indexer reports unknown labels as -1, which would silently address the
# last row/column; `genes` is built from these same columns, so none is expected.
assert (src >= 0).all() and (dst >= 0).all(), "HumanNet edge references an unknown gene"
adj_matrix[src, dst] = 1
adj_matrix[dst, src] = 1  # Ensure symmetry

# Create a DataFrame for the adjacency matrix for easy visualization if needed
adj_matrix_df = pd.DataFrame(adj_matrix, index=genes, columns=genes)


def get_gene_degree(gene):
    """Return the degree of ``gene`` in the HumanNet adjacency matrix.

    The degree is the row sum of the 0/1 adjacency matrix, so repeated edges
    between the same pair of genes are counted once.

    Args:
        gene: gene ID as it appears in the HumanNet 'Gene1'/'Gene2' columns.

    Returns:
        The sum of the gene's row in ``adj_matrix``.

    Raises:
        ValueError: if ``gene`` does not occur in HumanNet.
    """
    if gene not in humannet_gene_to_index:
        raise ValueError(f"Gene {gene} not found in the dataset.")
    return adj_matrix[humannet_gene_to_index[gene]].sum()

# Example usage
gene_example = 54936
degree = get_gene_degree(gene_example)
print(f"The degree of gene {gene_example} is: {degree}")

The degree of gene 54936 is: 48


In [34]:
from scipy.sparse import dok_matrix, save_npz

# Weight used for DGIDB genes that do not occur in HumanNet
MISSING_GENE_DEGREE = 0.01

# Calculate gene degrees in HumanNet: the number of edge endpoints each gene
# occupies (Gene1 and Gene2 occurrences pooled, so a repeated edge is counted
# every time it is listed). Counting with value_counts replaces a row-by-row
# iterrows loop and yields the same numbers.
genes_in_humannet = pd.unique(HUMANNET[['Gene1', 'Gene2']].values.ravel())
endpoint_counts = pd.Series(HUMANNET[['Gene1', 'Gene2']].values.ravel()).value_counts(dropna=False)

# Key the degrees by string so they line up with the string IDs used in DGIDB
gene_to_degree = {str(gene): int(count) for gene, count in endpoint_counts.items()}

# Add degrees to DGIDB, falling back to MISSING_GENE_DEGREE for genes absent from HumanNet
DGIDB['ncbi_gene_id'] = DGIDB['ncbi_gene_id'].astype(str)
DGIDB['degree'] = DGIDB['ncbi_gene_id'].map(gene_to_degree).fillna(MISSING_GENE_DEGREE)

# Create mappings for vertices (genes) and hyperedges (drugs).
# NOTE(review): if 'drug_name' contains missing values, they all share a single
# column here — confirm whether such rows should be dropped beforehand.
genes = DGIDB['ncbi_gene_id'].unique()
drugs = DGIDB['drug_name'].unique()
gene_to_index = {gene: i for i, gene in enumerate(genes)}
drug_to_index = {drug: i for i, drug in enumerate(drugs)}

# Initialize a sparse incidence matrix (genes x drugs)
incidence_matrix = dok_matrix((len(genes), len(drugs)), dtype=float)

# Populate the incidence matrix with the gene's degree as weight. Iterating the
# three columns directly avoids building a Series per row as iterrows does.
for gene, drug, degree in zip(DGIDB['ncbi_gene_id'], DGIDB['drug_name'], DGIDB['degree']):
    incidence_matrix[gene_to_index[gene], drug_to_index[drug]] = degree

# Convert the DOK matrix to CSR format
csr_matrix = incidence_matrix.tocsr()

# Save the matrix as .npz file
save_npz("hypergraph_incidence_matrix_weighted.npz", csr_matrix)

# Print confirmation
print("Weighted incidence matrix saved as 'hypergraph_incidence_matrix_weighted.npz'.")

Weighted incidence matrix saved as 'hypergraph_incidence_matrix_weighted.npz'.


In [35]:
import json

# Output locations for the two index mappings
gene_to_index_path = "gene_to_index.json"
drug_to_index_path = "drug_to_index.json"

# Write each mapping to its own JSON file (4-space indentation)
for out_path, mapping in (
    (gene_to_index_path, gene_to_index),
    (drug_to_index_path, drug_to_index),
):
    with open(out_path, 'w') as handle:
        json.dump(mapping, handle, indent=4)

print(f"Mappings saved to {gene_to_index_path} and {drug_to_index_path}.")


Mappings saved to gene_to_index.json and drug_to_index.json.


In [36]:
from scipy.sparse import dok_matrix, save_npz
import pandas as pd

# Create mappings for vertices (genes) and hyperedges (drugs), numbered in
# order of first appearance in DGIDB
genes = DGIDB['ncbi_gene_id'].unique()
drugs = DGIDB['drug_name'].unique()
gene_to_index = {gene: i for i, gene in enumerate(genes)}
drug_to_index = {drug: i for i, drug in enumerate(drugs)}

# Initialize a sparse binary incidence matrix (genes x drugs)
binary_incidence_matrix = dok_matrix((len(genes), len(drugs)), dtype=int)

# Set a 1 for every gene-drug pair that occurs in DGIDB. Iterating the two
# columns directly avoids building a Series per row as iterrows does.
for gene, drug in zip(DGIDB['ncbi_gene_id'], DGIDB['drug_name']):
    binary_incidence_matrix[gene_to_index[gene], drug_to_index[drug]] = 1

# Convert the DOK matrix to CSR format
binary_csr_matrix = binary_incidence_matrix.tocsr()

# Save the binary matrix as .npz file
save_npz("hypergraph_incidence_matrix_binary.npz", binary_csr_matrix)

# Print confirmation
print("Binary incidence matrix saved as 'hypergraph_incidence_matrix_binary.npz'.")


Binary incidence matrix saved as 'hypergraph_incidence_matrix_binary.npz'.


In [37]:
# Show the stored (gene index, drug index) entries of the binary DOK matrix.
print(binary_incidence_matrix)


  (0, 0)	1
  (1, 1)	1
  (2, 2)	1
  (3, 3)	1
  (4, 4)	1
  (5, 5)	1
  (6, 6)	1
  (7, 7)	1
  (8, 8)	1
  (9, 9)	1
  (10, 10)	1
  (11, 11)	1
  (12, 12)	1
  (13, 13)	1
  (14, 14)	1
  (15, 15)	1
  (16, 16)	1
  (17, 17)	1
  (18, 18)	1
  (19, 19)	1
  (20, 20)	1
  (21, 21)	1
  (22, 22)	1
  (23, 23)	1
  (0, 24)	1
  :	:
  (1285, 16398)	1
  (64, 13493)	1
  (116, 5036)	1
  (116, 3687)	1
  (42, 12949)	1
  (42, 14675)	1
  (4523, 264)	1
  (277, 198)	1
  (67, 1804)	1
  (585, 16798)	1
  (73, 8251)	1
  (73, 8927)	1
  (268, 2755)	1
  (772, 883)	1
  (212, 27)	1
  (212, 3447)	1
  (180, 57)	1
  (151, 3167)	1
  (151, 1829)	1
  (1244, 16131)	1
  (315, 6546)	1
  (315, 15780)	1
  (315, 8251)	1
  (315, 1165)	1
  (85, 16799)	1


In [38]:
# Show the stored (gene index, drug index) entries of the weighted DOK matrix;
# each value is the degree assigned to the gene (0.01 for genes missing from HumanNet).
print(incidence_matrix)

  (0, 0)	52.0
  (1, 1)	398.0
  (2, 2)	16.0
  (3, 3)	0.01
  (4, 4)	202.0
  (5, 5)	0.01
  (6, 6)	0.01
  (7, 7)	54.0
  (8, 8)	0.01
  (9, 9)	311.0
  (10, 10)	41.0
  (11, 11)	67.0
  (12, 12)	0.01
  (13, 13)	158.0
  (14, 14)	59.0
  (15, 15)	0.01
  (16, 16)	324.0
  (17, 17)	62.0
  (18, 18)	39.0
  (19, 19)	35.0
  (20, 20)	0.01
  (21, 21)	197.0
  (22, 22)	85.0
  (23, 23)	25.0
  (0, 24)	52.0
  :	:
  (1285, 16398)	23.0
  (64, 13493)	0.01
  (116, 5036)	138.0
  (116, 3687)	138.0
  (42, 12949)	152.0
  (42, 14675)	152.0
  (4523, 264)	0.01
  (277, 198)	112.0
  (67, 1804)	138.0
  (585, 16798)	7.0
  (73, 8251)	339.0
  (73, 8927)	339.0
  (268, 2755)	0.01
  (772, 883)	4.0
  (212, 27)	57.0
  (212, 3447)	57.0
  (180, 57)	58.0
  (151, 3167)	441.0
  (151, 1829)	441.0
  (1244, 16131)	46.0
  (315, 6546)	505.0
  (315, 15780)	505.0
  (315, 8251)	505.0
  (315, 1165)	505.0
  (85, 16799)	2.0
