In [1]:
import os
import json
import pandas as pd


In [2]:
NODE_FILE = "/playpen/jesse/drug_repurpose/PrimeKG/nodes.csv"
df_node = pd.read_csv(NODE_FILE)

unique_node_types = df_node['node_type'].nunique()
print(f"Number of unique node types: {unique_node_types}")

print("Unique node types:")
print(df_node['node_type'].unique())

print("\nCount of each node type:")
print(df_node['node_type'].value_counts())

df_node

Number of unique node types: 10
Unique node types:
['gene/protein' 'drug' 'effect/phenotype' 'disease' 'biological_process'
 'molecular_function' 'cellular_component' 'exposure' 'pathway' 'anatomy']

Count of each node type:
node_type
biological_process    28642
gene/protein          27671
disease               17080
effect/phenotype      15311
anatomy               14035
molecular_function    11169
drug                   7957
cellular_component     4176
pathway                2516
exposure                818
Name: count, dtype: int64


Unnamed: 0,node_index,node_id,node_type,node_name,node_source
0,0,9796,gene/protein,PHYHIP,NCBI
1,1,7918,gene/protein,GPANK1,NCBI
2,2,8233,gene/protein,ZRSR2,NCBI
3,3,4899,gene/protein,NRF1,NCBI
4,4,5297,gene/protein,PI4KA,NCBI
...,...,...,...,...,...
129370,129370,R-HSA-936837,pathway,Ion transport by P-type ATPases,REACTOME
129371,129371,R-HSA-997272,pathway,Inhibition of voltage gated Ca2+ channels via...,REACTOME
129372,129372,1062,anatomy,anatomical entity,UBERON
129373,129373,468,anatomy,multi-cellular organism,UBERON


In [3]:
source_counts = df_node.groupby(['node_type', 'node_source']).size().reset_index(name='count')
print(source_counts)

             node_type    node_source  count
0              anatomy         UBERON  14035
1   biological_process             GO  28642
2   cellular_component             GO   4176
3              disease          MONDO  15813
4              disease  MONDO_grouped   1267
5                 drug       DrugBank   7957
6     effect/phenotype            HPO  15311
7             exposure            CTD    818
8         gene/protein           NCBI  27671
9   molecular_function             GO  11169
10             pathway       REACTOME   2516


In [4]:
EDGE_FILE = "/playpen/jesse/drug_repurpose/PrimeKG/edges.csv"  
df_edges = pd.read_csv(EDGE_FILE)

unique_relations = df_edges['relation'].nunique()
print(f"Number of unique relation types: {unique_relations}")

print("\nUnique relation types:")
print(df_edges['relation'].unique())

print("\nRelation counts:")
print(df_edges['relation'].value_counts())

Number of unique relation types: 30

Unique relation types:
['protein_protein' 'drug_protein' 'contraindication' 'indication'
 'off-label use' 'drug_drug' 'phenotype_protein' 'phenotype_phenotype'
 'disease_phenotype_negative' 'disease_phenotype_positive'
 'disease_protein' 'disease_disease' 'drug_effect' 'bioprocess_bioprocess'
 'molfunc_molfunc' 'cellcomp_cellcomp' 'molfunc_protein'
 'cellcomp_protein' 'bioprocess_protein' 'exposure_protein'
 'exposure_disease' 'exposure_exposure' 'exposure_bioprocess'
 'exposure_molfunc' 'exposure_cellcomp' 'pathway_pathway'
 'pathway_protein' 'anatomy_anatomy' 'anatomy_protein_present'
 'anatomy_protein_absent']

Relation counts:
relation
anatomy_protein_present       3036406
drug_drug                     2672628
protein_protein                642150
disease_phenotype_positive     300634
bioprocess_protein             289610
cellcomp_protein               166804
disease_protein                160822
molfunc_protein                139060
drug_effect

In [5]:
# Inference path relations:
needed_relations = [
    "disease_protein",
    "drug_effect",
    "drug_protein",
    "disease_phenotype_positive"
]

# Validation relation:
validation_relation = "indication"

In [6]:
path_edges = df_edges[df_edges['relation'].isin(needed_relations)]

disease_gene = path_edges[path_edges['relation'] == 'disease_protein'][['x_index', 'y_index']]
drug_gene = path_edges[path_edges['relation'] == 'drug_protein'][['x_index', 'y_index']]
disease_pheno = path_edges[path_edges['relation'] == 'disease_phenotype_positive'][['x_index', 'y_index']]
drug_pheno = path_edges[path_edges['relation'] == 'drug_effect'][['x_index', 'y_index']]

print(drug_gene)

         x_index  y_index
321075     14012     7183
321076     14012     8256
321077     14013     4107
321078     14014     1424
321079     14015     1424
...          ...      ...
5733523     6069    14491
5733524    12188    14490
5733525    12188    14491
5733526    10452    14712
5733527     4776    14712

[51306 rows x 2 columns]


In [7]:
merged = disease_gene.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='x_index',
    right_on='node_index',
    how='left'
).rename(columns={
    'node_name': 'x_name',
    'node_type': 'x_type'
}).drop(columns=['node_index'])

merged = merged.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='y_index',
    right_on='node_index',
    how='left'
).rename(columns={
    'node_name': 'y_name',
    'node_type': 'y_type'
}).drop(columns=['node_index'])

def clarify_roles(row):
    if row['x_type'] == 'gene/protein':
        return pd.Series({
            'gene_index': row['x_index'],
            'gene_name': row['x_name'],
            'disease_index': row['y_index'],
            'disease_name': row['y_name']
        })
    else:
        return pd.Series({
            'gene_index': row['y_index'],
            'gene_name': row['y_name'],
            'disease_index': row['x_index'],
            'disease_name': row['x_name']
        })

disease_gene = merged.apply(clarify_roles, axis=1)
disease_gene = disease_gene[['gene_index', 'disease_index']]
disease_gene

Unnamed: 0,gene_index,disease_index
0,7097,28313
1,2174,28313
2,8038,28313
3,5925,28313
4,238,28313
...,...,...
160817,4152,28149
160818,59,28181
160819,5826,31190
160820,10422,33606


In [8]:
merged = drug_gene.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='x_index',
    right_on='node_index',
    how='left'
).rename(columns={
    'node_name': 'x_name',
    'node_type': 'x_type'
}).drop(columns=['node_index'])

merged = merged.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='y_index',
    right_on='node_index',
    how='left'
).rename(columns={
    'node_name': 'y_name',
    'node_type': 'y_type'
}).drop(columns=['node_index'])

def clarify_roles(row):
    if row['x_type'] == 'gene/protein':
        return pd.Series({
            'gene_index': row['x_index'],
            'gene_name': row['x_name'],
            'drug_index': row['y_index'],
            'drug_name': row['y_name']
        })
    else:
        return pd.Series({
            'gene_index': row['y_index'],
            'gene_name': row['y_name'],
            'drug_index': row['x_index'],
            'drug_name': row['x_name']
        })

drug_gene = merged.apply(clarify_roles, axis=1)
drug_gene = drug_gene[['gene_index', 'drug_index']]
drug_gene


Unnamed: 0,gene_index,drug_index
0,7183,14012
1,8256,14012
2,4107,14013
3,1424,14014
4,1424,14015
...,...,...
51301,6069,14491
51302,12188,14490
51303,12188,14491
51304,10452,14712


In [9]:
merged = disease_pheno.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='x_index',
    right_on='node_index',
    how='left'
).rename(columns={
    'node_name': 'x_name',
    'node_type': 'x_type'
}).drop(columns=['node_index'])

merged = merged.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='y_index',
    right_on='node_index',
    how='left'
).rename(columns={
    'node_name': 'y_name',
    'node_type': 'y_type'
}).drop(columns=['node_index'])

def clarify_roles(row):
    if row['x_type'] == 'effect/phenotype':
        return pd.Series({
            'phenotype_index': row['x_index'],
            'phenotype_name': row['x_name'],
            'disease_index': row['y_index'],
            'disease_name': row['y_name']
        })
    else:
        return pd.Series({
            'phenotype_index': row['y_index'],
            'phenotype_name': row['y_name'],
            'disease_index': row['x_index'],
            'disease_name': row['x_name']
        })

disease_pheno = merged.apply(clarify_roles, axis=1)
disease_pheno = disease_pheno[['phenotype_index', 'disease_index']]
disease_pheno


Unnamed: 0,phenotype_index,disease_index
0,24442,27472
1,22784,27158
2,84344,27158
3,22488,27158
4,22272,27158
...,...,...
300629,25218,33713
300630,22574,32207
300631,26287,32207
300632,22204,33561


In [10]:
merged = drug_pheno.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='x_index',
    right_on='node_index',
    how='left'
).rename(columns={
    'node_name': 'x_name',
    'node_type': 'x_type'
}).drop(columns=['node_index'])

merged = merged.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='y_index',
    right_on='node_index',
    how='left'
).rename(columns={
    'node_name': 'y_name',
    'node_type': 'y_type'
}).drop(columns=['node_index'])

def clarify_roles(row):
    if row['x_type'] == 'effect/phenotype':
        return pd.Series({
            'phenotype_index': row['x_index'],
            'phenotype_name': row['x_name'],
            'drug_index': row['y_index'],
            'drug_name': row['y_name']
        })
    else:
        return pd.Series({
            'phenotype_index': row['y_index'],
            'phenotype_name': row['y_name'],
            'drug_index': row['x_index'],
            'drug_name': row['x_name']
        })

drug_pheno = merged.apply(clarify_roles, axis=1)
drug_pheno = drug_pheno[['phenotype_index', 'drug_index']]
drug_pheno



Unnamed: 0,phenotype_index,drug_index
0,23158,16322
1,85849,16322
2,22447,16322
3,22831,16322
4,23469,16322
...,...,...
129563,25235,15288
129564,24289,15288
129565,23834,15288
129566,84853,15288


In [11]:
# ---- Inference Path 1: Disease → Protein → Drug ----
inferred_disease_drug_protein = pd.merge(disease_gene, drug_gene, on='gene_index')

# ---- Inference Path 2: Disease → Phenotype → Drug ----
inferred_disease_drug_phenotype = pd.merge(disease_pheno, drug_pheno, on='phenotype_index')

print(inferred_disease_drug_protein)
print(inferred_disease_drug_phenotype)

         gene_index  disease_index  drug_index
0              7097          28313       14679
1              7097          28313       14012
2              7097          28313       14680
3              7097          28313       14679
4              7097          28313       14012
...             ...            ...         ...
2254919        4152          28149       15752
2254920        5826          31190       14490
2254921        5826          31190       14491
2254922        5826          31190       14490
2254923        5826          31190       14491

[2254924 rows x 3 columns]
          phenotype_index  disease_index  drug_index
0                   22784          27158       14881
1                   22784          27158       14537
2                   22784          27158       14140
3                   22784          27158       14881
4                   22784          27158       14537
...                   ...            ...         ...
25673827            22574          32

In [12]:
# Load known indications
indication_edges = df_edges[df_edges['relation'] == validation_relation][['x_index', 'y_index']]
print(indication_edges)

         x_index  y_index
346730     16687    33577
346731     16687    36035
346764     20297    33577
346765     20297    36035
346768     16693    33577
...          ...      ...
5776153    84333    14471
5776154    27527    16634
5776155    38622    16634
5776156    28673    16634
5776158    39497    17237

[18776 rows x 2 columns]


In [13]:
merged = indication_edges.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='x_index',
    right_on='node_index',
    how='left'
).rename(columns={
    'node_name': 'x_name',
    'node_type': 'x_type'
}).drop(columns=['node_index'])

merged = merged.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='y_index',
    right_on='node_index',
    how='left'
).rename(columns={
    'node_name': 'y_name',
    'node_type': 'y_type'
}).drop(columns=['node_index'])
merged

Unnamed: 0,x_index,y_index,x_name,x_type,y_name,y_type
0,16687,33577,Fosinopril,drug,hypertensive disorder,disease
1,16687,36035,Fosinopril,drug,hypertension,disease
2,20297,33577,Imidapril,drug,hypertensive disorder,disease
3,20297,36035,Imidapril,drug,hypertension,disease
4,16693,33577,Cilazapril,drug,hypertensive disorder,disease
...,...,...,...,...,...,...
18771,84333,14471,"tenosynovial giant cell tumor, localized type",disease,Pexidartinib,drug
18772,27527,16634,Glanzmann thrombasthenia,disease,Coagulation factor VIIa Recombinant Human,drug
18773,38622,16634,factor VII deficiency,disease,Coagulation factor VIIa Recombinant Human,drug
18774,28673,16634,congenital factor VII deficiency,disease,Coagulation factor VIIa Recombinant Human,drug


In [14]:

def clarify_roles(row):
    if row['x_type'] == 'drug':
        return pd.Series({
            'drug_index': row['x_index'],
            'drug_name': row['x_name'],
            'disease_index': row['y_index'],
            'disease_name': row['y_name']
        })
    else:
        return pd.Series({
            'drug_index': row['y_index'],
            'drug_name': row['y_name'],
            'disease_index': row['x_index'],
            'disease_name': row['x_name']
        })

indication_edges = merged.apply(clarify_roles, axis=1)
indication_edges = indication_edges[['drug_index', 'disease_index']]
indication_edges

Unnamed: 0,drug_index,disease_index
0,16687,33577
1,16687,36035
2,20297,33577
3,20297,36035
4,16693,33577
...,...,...
18771,14471,84333
18772,16634,27527
18773,16634,38622
18774,16634,28673


In [15]:
valid_ddprotein = pd.merge(inferred_disease_drug_protein, indication_edges, on=['drug_index', 'disease_index'])
valid_ddprotein.drop_duplicates()

Unnamed: 0,gene_index,disease_index,drug_index
0,1659,28313,14140
2,1659,28313,14161
4,1659,28313,16011
6,1659,28313,15098
8,1659,28313,14178
...,...,...,...
19132,8192,33605,15417
19140,361,33605,14641
19144,2329,33605,14807
19148,11665,33625,20290


In [16]:
valid_ddprotein = valid_ddprotein.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='drug_index',
    right_on='node_index',
    how='left'
).rename(columns={'node_name': 'drug_name', 'node_type': 'drug_type'}).drop(columns=['node_index'])

valid_ddprotein = valid_ddprotein.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='disease_index',
    right_on='node_index',
    how='left'
).rename(columns={'node_name': 'disease_name', 'node_type': 'disease_type'}).drop(columns=['node_index'])

valid_ddprotein = valid_ddprotein.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='gene_index',
    right_on='node_index',
    how='left'
).rename(columns={'node_name': 'gene_name', 'node_type': 'gene_type'}).drop(columns=['node_index'])


valid_ddprotein_dd = valid_ddprotein.drop_duplicates()
# valid_ddprotein_dd.to_csv("ddprotein.csv", index=False)

In [17]:
valid_ddphenotype = pd.merge(inferred_disease_drug_phenotype, indication_edges, on=['drug_index', 'disease_index'])
valid_ddphenotype.drop_duplicates()

Unnamed: 0,phenotype_index,disease_index,drug_index
0,24492,27878,14024
4,24285,27921,15074
6,24285,27921,15173
8,24285,27921,16855
10,24285,27921,17315
...,...,...,...
16102,22262,39540,14024
16104,22262,39540,14295
16106,22262,39540,14023
16124,22398,28313,14952


In [18]:
valid_ddphenotype = valid_ddphenotype.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='drug_index',
    right_on='node_index',
    how='left'
).rename(columns={'node_name': 'drug_name', 'node_type': 'drug_type'}).drop(columns=['node_index'])

valid_ddphenotype = valid_ddphenotype.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='disease_index',
    right_on='node_index',
    how='left'
).rename(columns={'node_name': 'disease_name', 'node_type': 'disease_type'}).drop(columns=['node_index'])

valid_ddphenotype = valid_ddphenotype.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='phenotype_index',
    right_on='node_index',
    how='left'
).rename(columns={'node_name': 'phenotype_name', 'node_type': 'phenotype_type'}).drop(columns=['node_index'])

valid_ddphenotype_dd = valid_ddphenotype.drop_duplicates()
print(valid_ddphenotype_dd)
# valid_ddphenotype_dd.to_csv("ddphenotype.csv", index=False)

       phenotype_index  disease_index  drug_index           drug_name  \
0                24492          27878       14024          Prednisone   
4                24285          27921       15074         Amphetamine   
6                24285          27921       15173   Dextroamphetamine   
8                24285          27921       16855     Methylphenidate   
10               24285          27921       17315  Dexmethylphenidate   
...                ...            ...         ...                 ...   
16102            22262          39540       14024          Prednisone   
16104            22262          39540       14295        Promethazine   
16106            22262          39540       14023       Triamcinolone   
16124            22398          28313       14952         Haloperidol   
16126            22398          28313       14351         Thiothixene   

      drug_type                              disease_name disease_type  \
0          drug     X-linked adrenal hypoplasia c

In [19]:
matched_pairs = pd.concat([
    valid_ddphenotype[['drug_index', 'disease_index']],
    valid_ddprotein[['drug_index', 'disease_index']]
])
rmd_paris = matched_pairs.drop_duplicates()
rmd_paris

Unnamed: 0,drug_index,disease_index
0,14024,27878
4,15074,27921
6,15173,27921
8,16855,27921
10,17315,27921
...,...,...
19086,20151,33605
19108,14641,33605
19120,15417,33605
19148,20290,33625


In [20]:
unmatched_pairs = pd.merge(
    indication_edges, rmd_paris,
    on=['drug_index', 'disease_index'],
    how='left', indicator=True
)


not_valid_pairs = unmatched_pairs[unmatched_pairs['_merge'] == 'left_only'].drop(columns=['_merge'])
not_valid_pairs

Unnamed: 0,drug_index,disease_index
1,16687,36035
2,20297,33577
3,20297,36035
4,16693,33577
5,16693,36035
...,...,...
18769,14471,36638
18770,14471,33151
18771,14471,84333
18772,16634,27527


In [21]:
matches = indication_edges.merge(rmd_paris, on=["drug_index", "disease_index"])
print(len(matches))

5326


In [22]:
import pandas as pd

edges_df = pd.read_csv("/playpen/jesse/drug_repurpose/PrimeKG/edges.csv")

phenotype_edges = edges_df[edges_df['relation'] == 'disease_phenotype_positive']
protein_edges = edges_df[edges_df['relation'] == 'disease_protein']

In [23]:
def get_related_items(disease_id, edges, target='phenotype'):
    matches_x = edges[edges['x_index'] == disease_id]['y_index'].tolist()
    matches_y = edges[edges['y_index'] == disease_id]['x_index'].tolist()
    return list(set(matches_x + matches_y))

drug_disease_df = not_valid_pairs
disease_ids = drug_disease_df['disease_index'].unique()

disease_to_phenotypes = {
    d: get_related_items(d, phenotype_edges, target='phenotype') for d in disease_ids
}
disease_to_proteins = {
    d: get_related_items(d, protein_edges, target='protein') for d in disease_ids
}

drug_disease_df['related_phenotypes'] = drug_disease_df['disease_index'].map(disease_to_phenotypes)
drug_disease_df['related_proteins'] = drug_disease_df['disease_index'].map(disease_to_proteins)

In [24]:
filtered_ddf = drug_disease_df[
    drug_disease_df['related_phenotypes'].apply(lambda x: len(x) > 0) &
    drug_disease_df['related_proteins'].apply(lambda x: len(x) > 0)
].reset_index(drop=True)
filtered_ddf

Unnamed: 0,drug_index,disease_index,related_phenotypes,related_proteins
0,20297,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ..."
1,16693,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ..."
2,15762,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ..."
3,14836,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ..."
4,14421,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ..."
...,...,...,...,...
3095,20652,32483,"[25987, 88592, 23059, 23060, 85285, 90791, 229...","[13319, 4494, 4369, 9469, 2331, 3485, 9375, 84..."
3096,14129,28869,"[88486, 22759, 91246, 26159, 90480, 25618, 244...","[5255, 5136, 2834, 12572, 6173, 2593, 3879, 69..."
3097,20653,28197,"[22272, 23937, 23172, 23173, 22788, 85516, 928...","[5255, 7570, 5267, 6167, 35223, 546, 803, 1335..."
3098,20164,28265,"[22272, 85249, 27009, 22534, 84747, 85518, 231...","[11130, 1007]"


In [25]:
test_data_positive = filtered_ddf.sample(n=505, random_state=42).reset_index(drop=True)
print(test_data_positive.head())

   drug_index  disease_index  \
0       15583          31850   
1       20250          33645   
2       20450          29078   
3       15609          33577   
4       14779          27292   

                                  related_phenotypes  \
0                              [25256, 91026, 22757]   
1                              [94360, 26340, 23116]   
2  [23937, 86562, 22952, 84874, 24045, 85550, 885...   
3                       [23513, 94180, 22741, 94390]   
4  [23552, 85504, 23050, 85002, 23053, 26125, 860...   

                                    related_proteins  
0                                        [1641, 150]  
1                                [2384, 8530, 10795]  
2  [34816, 4610, 4098, 8707, 6664, 3595, 1037, 41...  
3  [8805, 3912, 11080, 2667, 33776, 4497, 22002, ...  
4  [12289, 2310, 7302, 2055, 13706, 6031, 11283, ...  


In [26]:
test_data_positive = test_data_positive.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='drug_index',
    right_on='node_index',
    how='left'
).rename(columns={'node_name': 'drug_name', 'node_type': 'drug_type'}).drop(columns=['node_index'])

test_data_positive = test_data_positive.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='disease_index',
    right_on='node_index',
    how='left'
).rename(columns={'node_name': 'disease_name', 'node_type': 'disease_type'}).drop(columns=['node_index'])

print(test_data_positive)

     drug_index  disease_index  \
0         15583          31850   
1         20250          33645   
2         20450          29078   
3         15609          33577   
4         14779          27292   
..          ...            ...   
500       14961          29220   
501       20218          28374   
502       15095          33577   
503       15429          31116   
504       20538          30826   

                                    related_phenotypes  \
0                                [25256, 91026, 22757]   
1                                [94360, 26340, 23116]   
2    [23937, 86562, 22952, 84874, 24045, 85550, 885...   
3                         [23513, 94180, 22741, 94390]   
4    [23552, 85504, 23050, 85002, 23053, 26125, 860...   
..                                                 ...   
500  [22788, 88454, 24199, 84361, 24461, 23958, 848...   
501                       [84456, 93465, 25403, 22759]   
502                       [23513, 94180, 22741, 94390]   
503        

In [27]:
conter_relation = "contraindication"
contraindication_edges = df_edges[df_edges['relation'] == conter_relation][['x_index', 'y_index']]
contraindication_edges.columns = ['drug_index', 'disease_index']
print(contraindication_edges)

         drug_index  disease_index
346728        15193          33577
346729        15193          36035
346732        14483          33577
346733        14483          36035
346734        16476          33577
...             ...            ...
5776145       35751          14251
5776146       35846          20456
5776147       35751          20456
5776148       27446          17286
5776157       84334          18277

[61350 rows x 2 columns]


In [28]:
# test_data_negative = contraindication_edges.sample(n=500, random_state=42).reset_index(drop=True)
# print(test_data_negative)
test_data_negative = contraindication_edges

In [29]:
def get_related_items(disease_id, edges, target='phenotype'):
    matches_x = edges[edges['x_index'] == disease_id]['y_index'].tolist()
    matches_y = edges[edges['y_index'] == disease_id]['x_index'].tolist()
    return list(set(matches_x + matches_y))

dd_negative_df = test_data_negative
disease_ids = dd_negative_df['disease_index'].unique()

disease_to_phenotypes = {
    d: get_related_items(d, phenotype_edges, target='phenotype') for d in disease_ids
}
disease_to_proteins = {
    d: get_related_items(d, protein_edges, target='protein') for d in disease_ids
}

dd_negative_df['related_phenotypes'] = dd_negative_df['disease_index'].map(disease_to_phenotypes)
dd_negative_df['related_proteins'] = dd_negative_df['disease_index'].map(disease_to_proteins)

In [30]:
dd_negative_df = dd_negative_df[
    dd_negative_df['related_phenotypes'].apply(lambda x: len(x) > 0) &
    dd_negative_df['related_proteins'].apply(lambda x: len(x) > 0)
].reset_index(drop=True)
dd_negative_df

Unnamed: 0,drug_index,disease_index,related_phenotypes,related_proteins
0,15193,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ..."
1,14483,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ..."
2,16476,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ..."
3,20148,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ..."
4,15087,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ..."
...,...,...,...,...
11214,16741,33579,"[94337, 94338, 22570, 26061, 24371, 94421, 243...","[2002, 2228, 34124]"
11215,16741,27462,"[22569, 22759, 24389, 33725]","[2002, 2228, 34124]"
11216,14919,27761,"[23172, 26375, 85515, 85131, 86929, 89121, 234...","[7665, 1682, 2103]"
11217,16593,28591,"[23168, 25608, 25865, 25596, 24461, 26127, 262...",[1722]


In [31]:
dd_negative_df = dd_negative_df.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='drug_index',
    right_on='node_index',
    how='left'
).rename(columns={'node_name': 'drug_name', 'node_type': 'drug_type'}).drop(columns=['node_index'])

dd_negative_df = dd_negative_df.merge(
    df_node[['node_index', 'node_name', 'node_type']],
    left_on='disease_index',
    right_on='node_index',
    how='left'
).rename(columns={'node_name': 'disease_name', 'node_type': 'disease_type'}).drop(columns=['node_index'])


dd_negative_df

Unnamed: 0,drug_index,disease_index,related_phenotypes,related_proteins,drug_name,drug_type,disease_name,disease_type
0,15193,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",Rotigotine,drug,hypertensive disorder,disease
1,14483,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",Estradiol valerate,drug,hypertensive disorder,disease
2,16476,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",Phenazopyridine,drug,hypertensive disorder,disease
3,20148,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",Synephrine,drug,hypertensive disorder,disease
4,15087,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",Venlafaxine,drug,hypertensive disorder,disease
...,...,...,...,...,...,...,...,...
11214,16741,33579,"[94337, 94338, 22570, 26061, 24371, 94421, 243...","[2002, 2228, 34124]",Azelaic acid,drug,hypopigmentation of the skin (disease),disease
11215,16741,27462,"[22569, 22759, 24389, 33725]","[2002, 2228, 34124]",Azelaic acid,drug,"dilution, pigmentary",disease
11216,14919,27761,"[23172, 26375, 85515, 85131, 86929, 89121, 234...","[7665, 1682, 2103]",Nelarabine,drug,inflammatory demyelinating polyradiculoneuropathy,disease
11217,16593,28591,"[23168, 25608, 25865, 25596, 24461, 26127, 262...",[1722],Tranexamic acid,drug,hypoplasminogenemia,disease


In [32]:
test_data_positive

Unnamed: 0,drug_index,disease_index,related_phenotypes,related_proteins,drug_name,drug_type,disease_name,disease_type
0,15583,31850,"[25256, 91026, 22757]","[1641, 150]",Halcinonide,drug,seborrheic keratosis,disease
1,20250,33645,"[94360, 26340, 23116]","[2384, 8530, 10795]",Ceftizoxime,drug,infectious meningitis,disease
2,20450,29078,"[23937, 86562, 22952, 84874, 24045, 85550, 885...","[34816, 4610, 4098, 8707, 6664, 3595, 1037, 41...",Proglumetacin,drug,rheumatoid arthritis,disease
3,15609,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",Aliskiren,drug,hypertensive disorder,disease
4,14779,27292,"[23552, 85504, 23050, 85002, 23053, 26125, 860...","[12289, 2310, 7302, 2055, 13706, 6031, 11283, ...",Tiopronin,drug,glycogen storage disease,disease
...,...,...,...,...,...,...,...,...
500,14961,29220,"[22788, 88454, 24199, 84361, 24461, 23958, 848...","[1122, 35366, 5671, 9767, 3879, 5104, 9459, 20...",Simvastatin,drug,hyperlipoproteinemia,disease
501,20218,28374,"[84456, 93465, 25403, 22759]","[2978, 9029, 4425, 12842, 1740, 12766, 4959]",Sodium citrate,drug,gastroesophageal reflux disease,disease
502,15095,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",Timolol,drug,hypertensive disorder,disease
503,15429,31116,"[22757, 86677, 94326, 22759]","[2053, 4104, 8201, 2060, 15, 4112, 8212, 22, 8...",Vindesine,drug,lung cancer,disease


In [33]:
test_data_negative = dd_negative_df.sample(n=505, random_state=42).reset_index(drop=True)
test_data_negative

Unnamed: 0,drug_index,disease_index,related_phenotypes,related_proteins,drug_name,drug_type,disease_name,disease_type
0,14309,27468,"[89256, 24461, 23984, 22933, 84885]","[362, 4237, 6002, 1908, 8733, 3231]",Levofloxacin,drug,X-linked severe congenital neutropenia,disease
1,14783,27878,"[22788, 84517, 84363, 24492, 85837, 25515, 873...","[5574, 11114, 8204, 8336, 12240, 1239, 2616, 6...",Caffeine,drug,X-linked adrenal hypoplasia congenita,disease
2,15297,31618,"[23168, 22272, 86923, 23186, 22291, 22292, 253...","[35033, 6567, 6504, 5203, 1012, 341, 13014, 63...",Secobarbital,drug,congenital central hypoventilation syndrome,disease
3,15836,29387,"[22759, 22472, 25486, 86799, 84764]","[6283, 8044, 2384, 4990, 1748, 789, 6233, 1113...",Telmisartan,drug,hyperparathyroidism,disease
4,14711,33594,"[94242, 22979, 94148, 23080, 94522, 94356, 945...","[3233, 2498, 10984, 4941, 6926, 13453, 2965, 4...",Chenodeoxycholic acid,drug,diarrheal disease,disease
...,...,...,...,...,...,...,...,...
500,14530,29113,"[22497, 26180, 22951, 22759, 86572, 22483, 25308]","[5124, 3078, 7688, 7568, 4497, 7059, 7958, 232...",Stanolone,drug,diabetic ketoacidosis,disease
501,14846,33605,"[25744, 94267, 94549, 27095]","[7040, 33913, 8192, 11136, 11143, 1927, 4620, ...",Perindopril,drug,hypotensive disorder,disease
502,20312,37785,[33765],"[772, 4997, 3460, 657, 3474, 1299, 279, 2712, ...",Difenoxin,drug,ulcerative colitis (disease),disease
503,16588,33598,"[26376, 94264, 23039]","[6144, 8069, 9485, 34574, 34575, 5137, 1810, 1...",Metformin,drug,stroke disorder,disease


In [35]:
# test_data_negative = dd_negative_df.sample(n=500, random_state=42).reset_index(drop=True)
test_data = pd.concat([test_data_positive, test_data_negative], ignore_index=True)
test_data['relation'] = ['positive'] * 505 + ['negative'] * 505


test_main_data = test_data.iloc[5:-5].copy()
test_main_data.to_csv("test_data.csv", index=False)
test_data

Unnamed: 0,drug_index,disease_index,related_phenotypes,related_proteins,drug_name,drug_type,disease_name,disease_type,relation
0,15583,31850,"[25256, 91026, 22757]","[1641, 150]",Halcinonide,drug,seborrheic keratosis,disease,positive
1,20250,33645,"[94360, 26340, 23116]","[2384, 8530, 10795]",Ceftizoxime,drug,infectious meningitis,disease,positive
2,20450,29078,"[23937, 86562, 22952, 84874, 24045, 85550, 885...","[34816, 4610, 4098, 8707, 6664, 3595, 1037, 41...",Proglumetacin,drug,rheumatoid arthritis,disease,positive
3,15609,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",Aliskiren,drug,hypertensive disorder,disease,positive
4,14779,27292,"[23552, 85504, 23050, 85002, 23053, 26125, 860...","[12289, 2310, 7302, 2055, 13706, 6031, 11283, ...",Tiopronin,drug,glycogen storage disease,disease,positive
...,...,...,...,...,...,...,...,...,...
1005,14530,29113,"[22497, 26180, 22951, 22759, 86572, 22483, 25308]","[5124, 3078, 7688, 7568, 4497, 7059, 7958, 232...",Stanolone,drug,diabetic ketoacidosis,disease,negative
1006,14846,33605,"[25744, 94267, 94549, 27095]","[7040, 33913, 8192, 11136, 11143, 1927, 4620, ...",Perindopril,drug,hypotensive disorder,disease,negative
1007,20312,37785,[33765],"[772, 4997, 3460, 657, 3474, 1299, 279, 2712, ...",Difenoxin,drug,ulcerative colitis (disease),disease,negative
1008,16588,33598,"[26376, 94264, 23039]","[6144, 8069, 9485, 34574, 34575, 5137, 1810, 1...",Metformin,drug,stroke disorder,disease,negative


In [37]:
shot_data = pd.concat([test_data.head(5), test_data.tail(5)])
shot_data.to_csv("shot_data.csv", index=False)
shot_data

Unnamed: 0,drug_index,disease_index,related_phenotypes,related_proteins,drug_name,drug_type,disease_name,disease_type,relation
0,15583,31850,"[25256, 91026, 22757]","[1641, 150]",Halcinonide,drug,seborrheic keratosis,disease,positive
1,20250,33645,"[94360, 26340, 23116]","[2384, 8530, 10795]",Ceftizoxime,drug,infectious meningitis,disease,positive
2,20450,29078,"[23937, 86562, 22952, 84874, 24045, 85550, 885...","[34816, 4610, 4098, 8707, 6664, 3595, 1037, 41...",Proglumetacin,drug,rheumatoid arthritis,disease,positive
3,15609,33577,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",Aliskiren,drug,hypertensive disorder,disease,positive
4,14779,27292,"[23552, 85504, 23050, 85002, 23053, 26125, 860...","[12289, 2310, 7302, 2055, 13706, 6031, 11283, ...",Tiopronin,drug,glycogen storage disease,disease,positive
1005,14530,29113,"[22497, 26180, 22951, 22759, 86572, 22483, 25308]","[5124, 3078, 7688, 7568, 4497, 7059, 7958, 232...",Stanolone,drug,diabetic ketoacidosis,disease,negative
1006,14846,33605,"[25744, 94267, 94549, 27095]","[7040, 33913, 8192, 11136, 11143, 1927, 4620, ...",Perindopril,drug,hypotensive disorder,disease,negative
1007,20312,37785,[33765],"[772, 4997, 3460, 657, 3474, 1299, 279, 2712, ...",Difenoxin,drug,ulcerative colitis (disease),disease,negative
1008,16588,33598,"[26376, 94264, 23039]","[6144, 8069, 9485, 34574, 34575, 5137, 1810, 1...",Metformin,drug,stroke disorder,disease,negative
1009,14792,33632,"[94305, 22921, 94349, 22929, 23729, 94200, 94363]","[11136, 2434, 130, 7815, 6664, 6926, 2834, 789...",Paclitaxel,drug,anemia (disease),disease,negative
