In [1]:
import os
import json
import pandas as pd


In [4]:
NODE_FILE = "/playpen/jesse/drug_repurpose/PrimeKG/nodes.csv"
df_node = pd.read_csv(NODE_FILE)

unique_node_types = df_node['node_type'].nunique()
print(f"Number of unique node types: {unique_node_types}")

print("Unique node types:")
print(df_node['node_type'].unique())

print("\nCount of each node type:")
print(df_node['node_type'].value_counts())

Number of unique node types: 10
Unique node types:
['gene/protein' 'drug' 'effect/phenotype' 'disease' 'biological_process'
 'molecular_function' 'cellular_component' 'exposure' 'pathway' 'anatomy']

Count of each node type:
node_type
biological_process    28642
gene/protein          27671
disease               17080
effect/phenotype      15311
anatomy               14035
molecular_function    11169
drug                   7957
cellular_component     4176
pathway                2516
exposure                818
Name: count, dtype: int64


In [6]:
source_counts = df_node.groupby(['node_type', 'node_source']).size().reset_index(name='count')
print(source_counts)

             node_type    node_source  count
0              anatomy         UBERON  14035
1   biological_process             GO  28642
2   cellular_component             GO   4176
3              disease          MONDO  15813
4              disease  MONDO_grouped   1267
5                 drug       DrugBank   7957
6     effect/phenotype            HPO  15311
7             exposure            CTD    818
8         gene/protein           NCBI  27671
9   molecular_function             GO  11169
10             pathway       REACTOME   2516


In [10]:
EDGE_FILE = "/playpen/jesse/drug_repurpose/PrimeKG/edges.csv"  
df_edges = pd.read_csv(EDGE_FILE)

unique_relations = df_edges['relation'].nunique()
print(f"Number of unique relation types: {unique_relations}")

print("\nUnique relation types:")
print(df_edges['relation'].unique())

print("\nRelation counts:")
print(df_edges['relation'].value_counts())

Number of unique relation types: 30

Unique relation types:
['protein_protein' 'drug_protein' 'contraindication' 'indication'
 'off-label use' 'drug_drug' 'phenotype_protein' 'phenotype_phenotype'
 'disease_phenotype_negative' 'disease_phenotype_positive'
 'disease_protein' 'disease_disease' 'drug_effect' 'bioprocess_bioprocess'
 'molfunc_molfunc' 'cellcomp_cellcomp' 'molfunc_protein'
 'cellcomp_protein' 'bioprocess_protein' 'exposure_protein'
 'exposure_disease' 'exposure_exposure' 'exposure_bioprocess'
 'exposure_molfunc' 'exposure_cellcomp' 'pathway_pathway'
 'pathway_protein' 'anatomy_anatomy' 'anatomy_protein_present'
 'anatomy_protein_absent']

Relation counts:
relation
anatomy_protein_present       3036406
drug_drug                     2672628
protein_protein                642150
disease_phenotype_positive     300634
bioprocess_protein             289610
cellcomp_protein               166804
disease_protein                160822
molfunc_protein                139060
drug_effect

In [28]:
# Inference path relations:
needed_relations = [
    "disease_protein",
    "drug_effect",
    "drug_protein",
    "disease_phenotype_positive"
]

# Validation relation:
validation_relation = "indication"

In [37]:
path_edges = df_edges[df_edges['relation'].isin(needed_relations)]

disease_gene = path_edges[path_edges['relation'] == 'disease_protein'][['x_index', 'y_index']]
drug_gene = path_edges[path_edges['relation'] == 'drug_protein'][['x_index', 'y_index']]
disease_pheno = path_edges[path_edges['relation'] == 'disease_phenotype_positive'][['x_index', 'y_index']]
drug_pheno = path_edges[path_edges['relation'] == 'drug_effect'][['x_index', 'y_index']]

disease_gene.columns = ['gene_index', 'disease_index']
drug_gene.columns = ['drug_index', 'gene_index']
disease_pheno.columns = ['disease_index', 'phenotype_index']
drug_pheno.columns = ['drug_index', 'phenotype_index']
print(disease_gene)

         gene_index  disease_index
3235582        7097          28313
3235583        2174          28313
3235584        8038          28313
3235585        5925          28313
3235586         238          28313
...             ...            ...
6030122       28149           4152
6030123       28181             59
6030124       31190           5826
6030125       33606          10422
6030126       30032          33802

[160822 rows x 2 columns]


In [57]:
# ---- Inference Path 1: Disease → Protein → Drug ----
inferred_disease_drug_protein = pd.merge(disease_gene, drug_gene, on='gene_index')

# ---- Inference Path 2: Disease → Phenotype → Drug ----
inferred_disease_drug_phenotype = pd.merge(disease_pheno, drug_pheno, on='phenotype_index')

print(inferred_disease_drug_protein)
print(inferred_disease_drug_phenotype)

        gene_index  disease_index  drug_index
0             7097          28313       14679
1             7097          28313       14012
2             7097          28313       14680
3             2174          28313       15852
4             2174          28313       14280
...            ...            ...         ...
563726        4152          28149       14499
563727        4152          28149       14050
563728        4152          28149       15752
563729        5826          31190       14490
563730        5826          31190       14491

[563731 rows x 3 columns]
         disease_index  phenotype_index  drug_index
0                27158            22784       14881
1                27158            22784       14537
2                27158            22784       14140
3                27158            22272       14544
4                27158            22272       14275
...                ...              ...         ...
6418453          32207            22574       15432
64184

In [41]:
# Load known indications
indication_edges = df_edges[df_edges['relation'] == validation_relation][['x_index', 'y_index']]
indication_edges.columns = ['drug_index', 'disease_index']
print(indication_edges)

         drug_index  disease_index
346730        16687          33577
346731        16687          36035
346764        20297          33577
346765        20297          36035
346768        16693          33577
...             ...            ...
5776153       84333          14471
5776154       27527          16634
5776155       38622          16634
5776156       28673          16634
5776158       39497          17237

[18776 rows x 2 columns]


In [59]:
valid_ddprotein = pd.merge(inferred_disease_drug_protein, indication_edges, on=['drug_index', 'disease_index'])
valid_ddprotein

Unnamed: 0,gene_index,disease_index,drug_index
0,1659,28313,14140
1,1659,28313,14161
2,1659,28313,16011
3,1659,28313,15098
4,1659,28313,14178
...,...,...,...
4784,8192,33605,15417
4785,361,33605,14641
4786,2329,33605,14807
4787,11665,33625,20290


In [60]:
valid_ddphenotype = pd.merge(inferred_disease_drug_phenotype, indication_edges, on=['drug_index', 'disease_index'])
valid_ddphenotype

Unnamed: 0,disease_index,phenotype_index,drug_index
0,27878,24492,14024
1,27921,24285,15074
2,27921,24285,15173
3,27921,24285,16855
4,27921,24285,17315
...,...,...,...
4020,39540,22262,14024
4021,39540,22262,14295
4022,39540,22262,14023
4023,28313,22398,14952


In [65]:
matched_pairs = pd.concat([
    valid_ddphenotype[['drug_index', 'disease_index']],
    valid_ddprotein[['drug_index', 'disease_index']]
])
rmd_paris = matched_pairs.drop_duplicates()
rmd_paris

Unnamed: 0,drug_index,disease_index
0,14024,27878
1,15074,27921
2,15173,27921
3,16855,27921
4,17315,27921
...,...,...
4773,20151,33605
4777,14641,33605
4781,15417,33605
4787,20290,33625


In [69]:
unmatched_pairs = pd.merge(
    indication_edges, rmd_paris,
    on=['drug_index', 'disease_index'],
    how='left', indicator=True
)


not_valid_pairs = unmatched_pairs[unmatched_pairs['_merge'] == 'left_only'].drop(columns=['_merge'])
not_valid_pairs

Unnamed: 0,drug_index,disease_index
1,16687,36035
2,20297,33577
3,20297,36035
4,16693,33577
5,16693,36035
...,...,...
18771,84333,14471
18772,27527,16634
18773,38622,16634
18774,28673,16634
