In [1]:
import os
import json
import pandas as pd


In [4]:
NODE_FILE = "/playpen/jesse/drug_repurpose/PrimeKG/nodes.csv"
df_node = pd.read_csv(NODE_FILE)

unique_node_types = df_node['node_type'].nunique()
print(f"Number of unique node types: {unique_node_types}")

print("Unique node types:")
print(df_node['node_type'].unique())

print("\nCount of each node type:")
print(df_node['node_type'].value_counts())

Number of unique node types: 10
Unique node types:
['gene/protein' 'drug' 'effect/phenotype' 'disease' 'biological_process'
 'molecular_function' 'cellular_component' 'exposure' 'pathway' 'anatomy']

Count of each node type:
node_type
biological_process    28642
gene/protein          27671
disease               17080
effect/phenotype      15311
anatomy               14035
molecular_function    11169
drug                   7957
cellular_component     4176
pathway                2516
exposure                818
Name: count, dtype: int64


In [6]:
source_counts = df_node.groupby(['node_type', 'node_source']).size().reset_index(name='count')
print(source_counts)

             node_type    node_source  count
0              anatomy         UBERON  14035
1   biological_process             GO  28642
2   cellular_component             GO   4176
3              disease          MONDO  15813
4              disease  MONDO_grouped   1267
5                 drug       DrugBank   7957
6     effect/phenotype            HPO  15311
7             exposure            CTD    818
8         gene/protein           NCBI  27671
9   molecular_function             GO  11169
10             pathway       REACTOME   2516


In [10]:
EDGE_FILE = "/playpen/jesse/drug_repurpose/PrimeKG/edges.csv"  
df_edges = pd.read_csv(EDGE_FILE)

unique_relations = df_edges['relation'].nunique()
print(f"Number of unique relation types: {unique_relations}")

print("\nUnique relation types:")
print(df_edges['relation'].unique())

print("\nRelation counts:")
print(df_edges['relation'].value_counts())

Number of unique relation types: 30

Unique relation types:
['protein_protein' 'drug_protein' 'contraindication' 'indication'
 'off-label use' 'drug_drug' 'phenotype_protein' 'phenotype_phenotype'
 'disease_phenotype_negative' 'disease_phenotype_positive'
 'disease_protein' 'disease_disease' 'drug_effect' 'bioprocess_bioprocess'
 'molfunc_molfunc' 'cellcomp_cellcomp' 'molfunc_protein'
 'cellcomp_protein' 'bioprocess_protein' 'exposure_protein'
 'exposure_disease' 'exposure_exposure' 'exposure_bioprocess'
 'exposure_molfunc' 'exposure_cellcomp' 'pathway_pathway'
 'pathway_protein' 'anatomy_anatomy' 'anatomy_protein_present'
 'anatomy_protein_absent']

Relation counts:
relation
anatomy_protein_present       3036406
drug_drug                     2672628
protein_protein                642150
disease_phenotype_positive     300634
bioprocess_protein             289610
cellcomp_protein               166804
disease_protein                160822
molfunc_protein                139060
drug_effect

In [11]:
# Inference path relations:
needed_relations = [
    "disease_protein",
    "phenotype_protein",
    "drug_protein",
    "disease_phenotype_positive"
]

# Validation relation:
validation_relation = "indication"

In [20]:
path_edges = df_edges[df_edges['relation'].isin(needed_relations)]

disease_gene = path_edges[path_edges['relation'] == 'disease_protein'][['x_index', 'y_index']]
drug_gene = path_edges[path_edges['relation'] == 'drug_protein'][['x_index', 'y_index']]
disease_pheno = path_edges[path_edges['relation'] == 'disease_phenotype_positive'][['x_index', 'y_index']]
drug_pheno = path_edges[path_edges['relation'] == 'drug_phenotype'][['x_index', 'y_index']]

disease_gene.columns = ['disease_index', 'gene_index']
drug_gene.columns = ['drug_index', 'gene_index']
disease_pheno.columns = ['disease_index', 'phenotype_index']
drug_pheno.columns = ['drug_index', 'phenotype_index']
print(disease_gene)

         disease_index  gene_index
3235582           7097       28313
3235583           2174       28313
3235584           8038       28313
3235585           5925       28313
3235586            238       28313
...                ...         ...
6030122          28149        4152
6030123          28181          59
6030124          31190        5826
6030125          33606       10422
6030126          30032       33802

[160822 rows x 2 columns]


In [21]:
# ---- Inference Path 1: Disease → Protein → Drug ----
dg_merge = pd.merge(disease_gene, drug_gene, on='gene_index')
inferred_disease_drug_1 = dg_merge[['disease_index', 'drug_index']]

# ---- Inference Path 2: Disease → Phenotype → Protein → Drug ----
dp_merge = pd.merge(disease_pheno, drug_pheno, on='phenotype_index')
inferred_disease_drug_2 = dp_merge[['disease_index', 'drug_index']]

inferred_pairs = pd.concat([inferred_disease_drug_1, inferred_disease_drug_2]).drop_duplicates()

In [22]:

# Load known indications
indication_edges = df_edges[df_edges['relation'] == validation_relation][['x_index', 'y_index']]
indication_edges['pair'] = indication_edges.apply(
    lambda row: tuple(sorted([row['x_index'], row['y_index']])), axis=1
)
known_pairs_set = set(indication_edges['pair'])

inferred_pairs['pair'] = inferred_pairs.apply(
    lambda row: tuple(sorted([row['disease_index'], row['drug_index']])), axis=1
)
inferred_pairs = inferred_pairs.drop_duplicates(subset='pair')

# Map to names
inferred_pairs['is_validated'] = inferred_pairs['pair'].isin(known_pairs_set)
validated_hits = inferred_pairs[inferred_pairs['is_validated']].copy()

id_to_name = df_node.set_index('node_index')['node_name'].to_dict()

def map_pair_names(row):
    node1 = id_to_name.get(row['pair'][0], 'N/A')
    node2 = id_to_name.get(row['pair'][1], 'N/A')
    return pd.Series({'entity_1': node1, 'entity_2': node2})

validated_named = validated_hits.apply(map_pair_names, axis=1)

print("Total inferred pairs:", len(inferred_pairs))
print("Validated hits (via indication):", len(validated_hits))

print("\nSample validated hits (undirected):")
print(validated_named.head())


Total inferred pairs: 406600
Validated hits (via indication): 1931

Sample validated hits (undirected):
           entity_1       entity_2
121     Ziprasidone  schizophrenia
125      Olanzapine  schizophrenia
130        Loxapine  schizophrenia
131       Promazine  schizophrenia
136  Chlorpromazine  schizophrenia
