# Init Data

In [44]:
import os
import json
import pandas as pd


# Read the PrimeKG node table and summarise the node types it contains.
NODE_FILE = "/playpen/jesse/drug_repurpose/PrimeKG/nodes.csv"
df_node = pd.read_csv(NODE_FILE)

node_type_col = df_node['node_type']
unique_node_types = node_type_col.nunique()
print(f"Number of unique node types: {unique_node_types}")

# Distinct node_type labels.
print("Unique node types:", node_type_col.unique(), sep="\n")

# Number of rows carrying each node_type label.
print("\nCount of each node type:", node_type_col.value_counts(), sep="\n")

# Last expression of the cell: rich display of the node frame.
df_node

Number of unique node types: 10
Unique node types:
['gene/protein' 'drug' 'effect/phenotype' 'disease' 'biological_process'
 'molecular_function' 'cellular_component' 'exposure' 'pathway' 'anatomy']

Count of each node type:
node_type
biological_process    28642
gene/protein          27671
disease               17080
effect/phenotype      15311
anatomy               14035
molecular_function    11169
drug                   7957
cellular_component     4176
pathway                2516
exposure                818
Name: count, dtype: int64


Unnamed: 0,node_index,node_id,node_type,node_name,node_source
0,0,9796,gene/protein,PHYHIP,NCBI
1,1,7918,gene/protein,GPANK1,NCBI
2,2,8233,gene/protein,ZRSR2,NCBI
3,3,4899,gene/protein,NRF1,NCBI
4,4,5297,gene/protein,PI4KA,NCBI
...,...,...,...,...,...
129370,129370,R-HSA-936837,pathway,Ion transport by P-type ATPases,REACTOME
129371,129371,R-HSA-997272,pathway,Inhibition of voltage gated Ca2+ channels via...,REACTOME
129372,129372,1062,anatomy,anatomical entity,UBERON
129373,129373,468,anatomy,multi-cellular organism,UBERON


In [45]:
# Read the PrimeKG edge table and summarise the relation types it contains.
EDGE_FILE = "/playpen/jesse/drug_repurpose/PrimeKG/edges.csv"
df_edges = pd.read_csv(EDGE_FILE)

relation_col = df_edges['relation']
unique_relations = relation_col.nunique()
print(f"Number of unique relation types: {unique_relations}")

# Distinct relation labels.
print("\nUnique relation types:", relation_col.unique(), sep="\n")

# Number of edges carrying each relation label.
print("\nRelation counts:", relation_col.value_counts(), sep="\n")

Number of unique relation types: 30

Unique relation types:
['protein_protein' 'drug_protein' 'contraindication' 'indication'
 'off-label use' 'drug_drug' 'phenotype_protein' 'phenotype_phenotype'
 'disease_phenotype_negative' 'disease_phenotype_positive'
 'disease_protein' 'disease_disease' 'drug_effect' 'bioprocess_bioprocess'
 'molfunc_molfunc' 'cellcomp_cellcomp' 'molfunc_protein'
 'cellcomp_protein' 'bioprocess_protein' 'exposure_protein'
 'exposure_disease' 'exposure_exposure' 'exposure_bioprocess'
 'exposure_molfunc' 'exposure_cellcomp' 'pathway_pathway'
 'pathway_protein' 'anatomy_anatomy' 'anatomy_protein_present'
 'anatomy_protein_absent']

Relation counts:
relation
anatomy_protein_present       3036406
drug_drug                     2672628
protein_protein                642150
disease_phenotype_positive     300634
bioprocess_protein             289610
cellcomp_protein               166804
disease_protein                160822
molfunc_protein                139060
drug_effect

In [46]:
# Plain-text dump of the edge table (pandas truncates the middle rows).
print(df_edges)

                       relation   display_relation  x_index  y_index
0               protein_protein                ppi        0     8889
1               protein_protein                ppi        1     2798
2               protein_protein                ppi        2     5646
3               protein_protein                ppi        3    11592
4               protein_protein                ppi        4     2122
...                         ...                ...      ...      ...
8100493  anatomy_protein_absent  expression absent    66747     5259
8100494  anatomy_protein_absent  expression absent    63824    58254
8100495  anatomy_protein_absent  expression absent    63826    58254
8100496  anatomy_protein_absent  expression absent    64523    58254
8100497  anatomy_protein_absent  expression absent    67302    58254

[8100498 rows x 4 columns]


# Step1: Test data check and combination

In [37]:
# Load test_positive_full.csv.  The variable keeps its `test_postive`
# spelling because later cells refer to it under that name.
TEST_POSITIVE_FILE = "/playpen/jesse/drug_repurpose/split_data/data_analysis/test_positive_full.csv"
test_postive = pd.read_csv(TEST_POSITIVE_FILE)
print(test_postive)

     drug_index  disease_index drug_type           drug_name disease_type  \
0         14520          33651      drug           Allantoin      disease   
1         15098          33683      drug           Promazine      disease   
2         15493          29637      drug       Bromocriptine      disease   
3         14275          30654      drug         Doxorubicin      disease   
4         20375          33714      drug  Aluminum hydroxide      disease   
..          ...            ...       ...                 ...          ...   
495       15778          28208      drug          Tolazamide      disease   
496       15179          33703      drug        Nicotinamide      disease   
497       14269          32531      drug  Methylprednisolone      disease   
498       15451          33577      drug          Bisoprolol      disease   
499       14196          28851      drug         Propranolol      disease   

                                  disease_name  \
0                        

In [38]:
# Load test_positive_full_new.csv, the "new" variant of the positive test file.
TEST_POSITIVE_NEW_FILE = "/playpen/jesse/drug_repurpose/split_data/data_analysis/test_positive_full_new.csv"
test_postive_new = pd.read_csv(TEST_POSITIVE_NEW_FILE)
print(test_postive_new)

     drug_index  disease_index drug_type           drug_name disease_type  \
0         14520          33651      drug           Allantoin      disease   
1         15098          33683      drug           Promazine      disease   
2         15493          29637      drug       Bromocriptine      disease   
3         14275          30654      drug         Doxorubicin      disease   
4         20375          33714      drug  Aluminum hydroxide      disease   
..          ...            ...       ...                 ...          ...   
496       15179          33703      drug        Nicotinamide      disease   
497       14269          32531      drug  Methylprednisolone      disease   
498       15451          33577      drug          Bisoprolol      disease   
499       14196          28851      drug         Propranolol      disease   
500       15098          37888      drug           Promazine      disease   

                                  disease_name  \
0                        

In [39]:
# Outer-merge the two frames on all shared columns; the `_merge` indicator
# column records which side each row came from.
merged_with_origin = pd.merge(test_postive_new, test_postive, how='outer', indicator=True)
# Keep the rows that exist in the new frame but not in the old one.
diff_new = merged_with_origin[merged_with_origin['_merge'] == "left_only"]
print("Rows only in new DF:", diff_new, sep="\n")

Rows only in new DF:
     drug_index  disease_index drug_type  drug_name disease_type  \
238       15098          37888      drug  Promazine      disease   

          disease_name related_phenotypes          related_proteins     _merge  
238  allergic rhinitis     [25610, 23083]  [8354, 3451, 3818, 4315]  left_only  


In [40]:

# Look up every (drug_index, disease_index) pair among the PrimeKG edges,
# matching drug_index to x_index and disease_index to y_index, then keep only
# the matches whose relation is 'indication'.
positive_pair_edges = test_postive.merge(
    df_edges,
    left_on=['drug_index', 'disease_index'],
    right_on=['x_index', 'y_index'],
)
is_indication_edge = positive_pair_edges['relation'].eq('indication')
test_positive_gd = positive_pair_edges.loc[is_indication_edge]
# CSV export is disabled for this frame:
# test_positive_gd.to_csv('./data_analysis/test_positive_gd.csv', index=False)
test_positive_gd

Unnamed: 0,drug_index,disease_index,drug_type,drug_name,disease_type,disease_name,related_phenotypes,related_proteins,relation,display_relation,x_index,y_index
0,14520,33651,drug,Allantoin,disease,seborrheic dermatitis,"[94334, 22550]","[1641, 150]",indication,indication,14520,33651
2,15098,33683,drug,Promazine,disease,urticaria (disease),"[94533, 22540, 25518, 94454, 94520]","[3552, 417, 8834, 5667, 4968, 2889, 7083, 1004...",indication,indication,15098,33683
3,15493,29637,drug,Bromocriptine,disease,prolactin producing pituitary gland tumor,"[25479, 84488, 90495, 25620, 25599, 84764, 225...","[3104, 2401, 2789, 3046, 1884, 22023, 5320, 10...",indication,indication,15493,29637
4,14275,30654,drug,Doxorubicin,disease,acute lymphoblastic/lymphocytic leukemia,"[23970, 86308, 22757, 84890]","[2, 9221, 1549, 4111, 13840, 4114, 1558, 1559,...",indication,indication,14275,30654
6,20375,33714,drug,Aluminum hydroxide,disease,esophagitis (disease),"[93737, 22987, 22988, 94534]","[2978, 9029, 4425, 12842, 1740, 12766, 4959]",indication,indication,20375,33714
...,...,...,...,...,...,...,...,...,...,...,...,...
504,15778,28208,drug,Tolazamide,disease,type 2 diabetes mellitus,"[22759, 24491, 22483, 91382, 85559]","[34295, 13830, 7688, 10248, 34312, 34317, 3431...",indication,indication,15778,28208
505,15179,33703,drug,Nicotinamide,disease,acne (disease),"[22539, 94372, 94243, 94244]","[12305, 1587, 6229, 1485]",indication,indication,15179,33703
506,14269,32531,drug,Methylprednisolone,disease,mantle cell lymphoma,"[22952, 25547, 23214, 24143, 85040, 24337, 256...","[769, 134, 6926, 4369, 7186, 7441, 8212, 1558,...",indication,indication,14269,32531
507,15451,33577,drug,Bisoprolol,disease,hypertensive disorder,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",indication,indication,15451,33577


In [41]:

# Same edge lookup for the new positive test pairs: match
# (drug_index, disease_index) against (x_index, y_index) and keep the
# 'indication' edges.
positive_new_pair_edges = test_postive_new.merge(
    df_edges,
    left_on=['drug_index', 'disease_index'],
    right_on=['x_index', 'y_index'],
)
is_indication_edge_new = positive_new_pair_edges['relation'].eq('indication')
test_positive_gd_new = positive_new_pair_edges.loc[is_indication_edge_new]

# Persist the result (without the index column), then display it.
test_positive_gd_new.to_csv('./data_analysis/test_positive_gd_new.csv', index=False)
test_positive_gd_new

Unnamed: 0,drug_index,disease_index,drug_type,drug_name,disease_type,disease_name,related_phenotypes,related_proteins,relation,display_relation,x_index,y_index
0,14520,33651,drug,Allantoin,disease,seborrheic dermatitis,"[94334, 22550]","[1641, 150]",indication,indication,14520,33651
2,15098,33683,drug,Promazine,disease,urticaria (disease),"[94533, 22540, 25518, 94454, 94520]","[3552, 417, 8834, 5667, 4968, 2889, 7083, 1004...",indication,indication,15098,33683
3,15493,29637,drug,Bromocriptine,disease,prolactin producing pituitary gland tumor,"[25479, 84488, 90495, 25620, 25599, 84764, 225...","[3104, 2401, 2789, 3046, 1884, 22023, 5320, 10...",indication,indication,15493,29637
4,14275,30654,drug,Doxorubicin,disease,acute lymphoblastic/lymphocytic leukemia,"[23970, 86308, 22757, 84890]","[2, 9221, 1549, 4111, 13840, 4114, 1558, 1559,...",indication,indication,14275,30654
6,20375,33714,drug,Aluminum hydroxide,disease,esophagitis (disease),"[93737, 22987, 22988, 94534]","[2978, 9029, 4425, 12842, 1740, 12766, 4959]",indication,indication,20375,33714
...,...,...,...,...,...,...,...,...,...,...,...,...
505,15179,33703,drug,Nicotinamide,disease,acne (disease),"[22539, 94372, 94243, 94244]","[12305, 1587, 6229, 1485]",indication,indication,15179,33703
506,14269,32531,drug,Methylprednisolone,disease,mantle cell lymphoma,"[22952, 25547, 23214, 24143, 85040, 24337, 256...","[769, 134, 6926, 4369, 7186, 7441, 8212, 1558,...",indication,indication,14269,32531
507,15451,33577,drug,Bisoprolol,disease,hypertensive disorder,"[23513, 94180, 22741, 94390]","[8805, 3912, 11080, 2667, 33776, 4497, 22002, ...",indication,indication,15451,33577
508,14196,28851,drug,Propranolol,disease,Kasabach-Merritt syndrome,"[22791, 87306, 85009, 22933, 22934, 86296, 225...","[2840, 177, 7212, 9965]",indication,indication,14196,28851


In [42]:
# Load both variants of the negative test file and dump each right after
# it is read.
TEST_NEGATIVE_FILE = "/playpen/jesse/drug_repurpose/split_data/data_analysis/test_negative_full.csv"
TEST_NEGATIVE_NEW_FILE = "/playpen/jesse/drug_repurpose/split_data/data_analysis/test_negative_full_new.csv"

test_negative = pd.read_csv(TEST_NEGATIVE_FILE)
print(test_negative)

test_negative_new = pd.read_csv(TEST_NEGATIVE_NEW_FILE)
print(test_negative_new)

     drug_index  disease_index drug_type            drug_name disease_type  \
0         20391          28811      drug  Silver sulfadiazine      disease   
1         15294          28547      drug              Tacrine      disease   
2         14783          33679      drug             Caffeine      disease   
3         20239          29188      drug        Penicillamine      disease   
4         17243          27326      drug          Alimemazine      disease   
..          ...            ...       ...                  ...          ...   
495       15542          28456      drug          Milnacipran      disease   
496       15584          31790      drug           Lanreotide      disease   
497       14168          31493      drug       Spironolactone      disease   
498       20427          31918      drug       Cholestyramine      disease   
499       20350          30491      drug      Pentaerithrityl      disease   

                                          disease_name  \
0    

In [47]:

# Join keys: a test pair (drug_index, disease_index) is matched against an
# edge's (x_index, y_index).
negative_pair_keys = ['drug_index', 'disease_index']
negative_edge_keys = ['x_index', 'y_index']

# Original negative pairs: keep only the matching 'contraindication' edges.
test_negative_gd = (
    test_negative
    .merge(df_edges, left_on=negative_pair_keys, right_on=negative_edge_keys)
    .loc[lambda merged: merged['relation'] == 'contraindication']
)
# CSV export is disabled for this frame:
# test_negative_gd.to_csv('./data_analysis/test_negative_gd.csv', index=False)

# New negative pairs: same lookup, and this one is written to disk.
test_negative_gd_new = (
    test_negative_new
    .merge(df_edges, left_on=negative_pair_keys, right_on=negative_edge_keys)
    .loc[lambda merged: merged['relation'] == 'contraindication']
)
test_negative_gd_new.to_csv('./data_analysis/test_negative_gd_new.csv', index=False)
test_negative_gd_new

Unnamed: 0,drug_index,disease_index,drug_type,drug_name,disease_type,disease_name,related_phenotypes,related_proteins,relation,display_relation,x_index,y_index
0,20391,28811,drug,Silver sulfadiazine,disease,cystinuria,"[25699, 26180, 22759, 22472, 85642, 85419, 850...","[7457, 12289, 5829, 7302, 13706, 34890, 10413,...",contraindication,contraindication,20391,28811
1,15294,28547,drug,Tacrine,disease,Parkinson disease,"[25858, 84357, 24327, 25224, 24331, 23564, 226...","[8192, 22017, 519, 11, 8212, 9237, 1567, 6175,...",contraindication,contraindication,15294,28547
2,14783,33679,drug,Caffeine,disease,potassium deficiency disease,"[94408, 23033]","[1760, 8805, 12165, 172, 213, 1494, 509]",contraindication,contraindication,14783,33679
3,20239,29188,drug,Penicillamine,disease,autosomal recessive severe congenital neutrope...,"[22124, 22933, 26180]","[6671, 6002, 1908, 8733, 3231]",contraindication,contraindication,20239,29188
4,17243,27326,drug,Alimemazine,disease,long QT syndrome,"[89225, 94223, 26257, 84369, 94225, 84887, 238...","[7177, 9233, 34962, 7198, 1968, 177, 10417, 35...",contraindication,contraindication,17243,27326
...,...,...,...,...,...,...,...,...,...,...,...,...
500,14168,31493,drug,Spironolactone,disease,pancreatitis,"[24224, 87330, 26180, 22759, 86573, 23950, 861...","[12165, 33925, 33926, 10376, 33929, 2698, 3405...",contraindication,contraindication,14168,31493
501,20427,31918,drug,Cholestyramine,disease,phenylketonuria,"[88450, 85125, 22277, 84616, 22156, 24462, 245...","[9905, 10181]",contraindication,contraindication,20427,31918
502,20350,30491,drug,Pentaerithrityl,disease,obstructive sleep apnea syndrome,"[26449, 84853, 25854, 22759]","[5203, 341, 9207]",contraindication,contraindication,20350,30491
503,15776,27933,drug,Nabilone,disease,anxiety disorder,"[22759, 22447]","[8192, 10257, 22, 6175, 33, 2082, 2096, 6195, ...",contraindication,contraindication,15776,27933


In [None]:
# Stack the positive rows on top of the negative rows and renumber the index
# from 0, once for the original split and once for the new one.
original_test_parts = [test_positive_gd, test_negative_gd]
test_data = pd.concat(original_test_parts, ignore_index=True)
# CSV export is disabled:
# test_data.to_csv('./test_data.csv', index=False)

new_test_parts = [test_positive_gd_new, test_negative_gd_new]
test_data_new = pd.concat(new_test_parts, ignore_index=True)
# CSV export is disabled:
# test_data_new.to_csv('./data_analysis/test_data_new.csv', index=False)

Manually edit the new test data (`test_data_new.csv`): remove the duplicate row for the pair (14559, 33656), which is caused by a conflict in PrimeKG.

In [None]:
# Re-read test_data from the CSV on disk; this replaces any in-memory
# `test_data` built earlier.
TEST_DATA_FILE = "/playpen/jesse/drug_repurpose/split_data/test_data.csv"
test_data = pd.read_csv(TEST_DATA_FILE)
print(test_data)

     drug_index  disease_index drug_type           drug_name disease_type  \
0         14520          33651      drug           Allantoin      disease   
1         15098          33683      drug           Promazine      disease   
2         15493          29637      drug       Bromocriptine      disease   
3         14275          30654      drug         Doxorubicin      disease   
4         20375          33714      drug  Aluminum hydroxide      disease   
..          ...            ...       ...                 ...          ...   
995       15584          31790      drug          Lanreotide      disease   
996       14168          31493      drug      Spironolactone      disease   
997       20427          31918      drug      Cholestyramine      disease   
998       20350          30491      drug     Pentaerithrityl      disease   
999       15776          27933      drug            Nabilone      disease   

                                  disease_name  \
0                        

In [19]:

# Count how many rows share each (drug_index, disease_index) pair.
rows_per_pair = test_data_new.groupby(['drug_index', 'disease_index']).size()
dupes = rows_per_pair.reset_index(name='count')

# Show only the pairs that occur more than once; an empty frame means none do.
print(dupes.loc[dupes['count'].gt(1)])

Empty DataFrame
Columns: [drug_index, disease_index, count]
Index: []


# Step2: Training data check and combination

In [15]:
# Load the four training tables in one pass.  Note that `train_positive_gene`
# is read from the file named train_positive_protein.csv.
(
    train_negative_gene,
    train_positive_gene,
    train_negative_phenotype,
    train_positive_phenotype,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data_analysis/train_negative_gene.csv",
        "./data_analysis/train_positive_protein.csv",
        "./data_analysis/train_negative_phenotype.csv",
        "./data_analysis/train_positive_phenotype.csv",
    )
)

In [16]:
# Display the gene-based negative training table as the cell's output.
train_negative_gene

Unnamed: 0,drug_index,disease_index,gene_index,drug_name,drug_type,disease_name,disease_type,gene_name,gene_type
0,14198,36187,4497,Clonidine,drug,coronary artery disease,disease,NOS3,gene/protein
1,15495,37703,1876,Estazolam,drug,neurotic disorder,disease,NTRK2,gene/protein
2,15078,36035,10901,Tramadol,drug,hypertension,disease,CYP3A5,gene/protein
3,14806,83760,10047,Entacapone,drug,dysthymic disorder,disease,OPRK1,gene/protein
4,14286,27933,2551,Rifampicin,drug,anxiety disorder,disease,PAWR,gene/protein
...,...,...,...,...,...,...,...,...,...
495,18277,83760,3466,Levetiracetam,drug,dysthymic disorder,disease,HTR3A,gene/protein
496,15783,35641,1863,Cyclizine,drug,epilepsy,disease,GFAP,gene/protein
497,15776,37703,2591,Nabilone,drug,neurotic disorder,disease,SLC6A4,gene/protein
498,14476,35966,1497,Testosterone cypionate,drug,prostate cancer,disease,GSTP1,gene/protein


In [17]:
# Match each gene-based negative training pair (drug_index, disease_index)
# to PrimeKG edges (x_index, y_index) and keep the 'contraindication' ones.
negative_gene_edges = train_negative_gene.merge(
    df_edges,
    left_on=['drug_index', 'disease_index'],
    right_on=['x_index', 'y_index'],
)
train_negative_gene_gd = negative_gene_edges.loc[
    negative_gene_edges['relation'].eq('contraindication')
]
# CSV export is disabled for this frame:
# train_negative_gene_gd.to_csv('./data_analysis/train_negative_gene_gd.csv', index=False)
train_negative_gene_gd

Unnamed: 0,drug_index,disease_index,gene_index,drug_name,drug_type,disease_name,disease_type,gene_name,gene_type,relation,display_relation,x_index,y_index
0,14198,36187,4497,Clonidine,drug,coronary artery disease,disease,NOS3,gene/protein,contraindication,contraindication,14198,36187
1,15495,37703,1876,Estazolam,drug,neurotic disorder,disease,NTRK2,gene/protein,contraindication,contraindication,15495,37703
2,15078,36035,10901,Tramadol,drug,hypertension,disease,CYP3A5,gene/protein,contraindication,contraindication,15078,36035
3,14806,83760,10047,Entacapone,drug,dysthymic disorder,disease,OPRK1,gene/protein,contraindication,contraindication,14806,83760
4,14286,27933,2551,Rifampicin,drug,anxiety disorder,disease,PAWR,gene/protein,contraindication,contraindication,14286,27933
...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,18277,83760,3466,Levetiracetam,drug,dysthymic disorder,disease,HTR3A,gene/protein,contraindication,contraindication,18277,83760
498,15783,35641,1863,Cyclizine,drug,epilepsy,disease,GFAP,gene/protein,contraindication,contraindication,15783,35641
499,15776,37703,2591,Nabilone,drug,neurotic disorder,disease,SLC6A4,gene/protein,contraindication,contraindication,15776,37703
500,14476,35966,1497,Testosterone cypionate,drug,prostate cancer,disease,GSTP1,gene/protein,contraindication,contraindication,14476,35966


In [18]:
# Match each gene-based positive training pair (drug_index, disease_index)
# to PrimeKG edges (x_index, y_index) and keep the 'indication' ones.
positive_gene_edges = train_positive_gene.merge(
    df_edges,
    left_on=['drug_index', 'disease_index'],
    right_on=['x_index', 'y_index'],
)
train_positive_gene_gd = positive_gene_edges.loc[
    positive_gene_edges['relation'].eq('indication')
]
train_positive_gene_gd

Unnamed: 0,gene_index,disease_index,drug_index,drug_name,drug_type,disease_name,disease_type,gene_name,gene_type,relation,display_relation,x_index,y_index
0,7880,27933,15119,Tranylcypromine,drug,anxiety disorder,disease,CYP2C9,gene/protein,indication,indication,15119,27933
1,22002,28313,14913,Trifluoperazine,drug,schizophrenia,disease,CALY,gene/protein,indication,indication,14913,28313
2,3955,36634,14945,Sorafenib,drug,liver cancer,disease,CYP1A2,gene/protein,indication,indication,14945,36634
3,2401,28801,16842,Pramipexole,drug,juvenile onset Parkinson disease 19A,disease,DRD2,gene/protein,indication,indication,16842,28801
4,4315,83845,17376,Cefotaxime,drug,viral meningitis,disease,ALB,gene/protein,indication,indication,17376,83845
...,...,...,...,...,...,...,...,...,...,...,...,...,...
518,4315,37703,14310,Doxepin,drug,neurotic disorder,disease,ALB,gene/protein,indication,indication,14310,37703
519,864,27933,15197,Mianserin,drug,anxiety disorder,disease,HTR2C,gene/protein,indication,indication,15197,27933
520,125,29375,17605,Cetuximab,drug,squamous cell carcinoma,disease,EGFR,gene/protein,indication,indication,17605,29375
521,337,83868,14411,Ibrutinib,drug,Richter syndrome,disease,BTK,gene/protein,indication,indication,14411,83868


In [19]:
# Match each phenotype-based positive training pair (drug_index,
# disease_index) to PrimeKG edges (x_index, y_index) and keep the
# 'indication' ones.
positive_phenotype_edges = train_positive_phenotype.merge(
    df_edges,
    left_on=['drug_index', 'disease_index'],
    right_on=['x_index', 'y_index'],
)
train_positive_phenotype_gd = positive_phenotype_edges.loc[
    positive_phenotype_edges['relation'].eq('indication')
]
train_positive_phenotype_gd

Unnamed: 0,drug_index,disease_index,phenotype_index,drug_name,drug_type,disease_name,disease_type,phenotype_name,phenotype_type,relation,display_relation,x_index,y_index
0,15772,28547,93516,Tolcapone,drug,Parkinson disease,disease,Impulsivity,effect/phenotype,indication,indication,15772,28547
1,20597,33643,24226,Tivozanib,drug,renal cell carcinoma (disease),disease,Renal neoplasm,effect/phenotype,indication,indication,20597,33643
2,14042,29078,25620,Hydrocortisone acetate,drug,rheumatoid arthritis,disease,Fatigue,effect/phenotype,indication,indication,14042,29078
3,15860,32325,93521,Sodium aurothiomalate,drug,juvenile idiopathic arthritis,disease,Mediastinal lymphadenopathy,effect/phenotype,indication,indication,15860,32325
4,14390,33609,85707,Azapropazone,drug,osteoarthritis,disease,Osteoarthritis of the elbow,effect/phenotype,indication,indication,14390,33609
...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,15333,28547,22679,Rasagiline,drug,Parkinson disease,disease,Sleep disturbance,effect/phenotype,indication,indication,15333,28547
501,15077,32531,24143,Bortezomib,drug,mantle cell lymphoma,disease,Splenomegaly,effect/phenotype,indication,indication,15077,32531
502,20565,27304,22788,Elosulfase alfa,drug,mucopolysaccharidosis,disease,Failure to thrive,effect/phenotype,indication,indication,20565,27304
503,17042,33688,94467,Ramucirumab,drug,non-small cell lung carcinoma (disease),disease,Squamous cell lung carcinoma,effect/phenotype,indication,indication,17042,33688


In [20]:
# Match each phenotype-based negative training pair (drug_index,
# disease_index) to PrimeKG edges (x_index, y_index) and keep the
# 'contraindication' ones.
negative_phenotype_edges = train_negative_phenotype.merge(
    df_edges,
    left_on=['drug_index', 'disease_index'],
    right_on=['x_index', 'y_index'],
)
train_negative_phenotype_gd = negative_phenotype_edges.loc[
    negative_phenotype_edges['relation'].eq('contraindication')
]
train_negative_phenotype_gd

Unnamed: 0,drug_index,disease_index,phenotype_index,drug_name,drug_type,disease_name,disease_type,phenotype_name,phenotype_type,relation,display_relation,x_index,y_index
0,17064,27626,84771,Meprobamate,drug,thrombocytopenia,disease,Petechiae,effect/phenotype,contraindication,contraindication,17064,27626
1,14249,32719,25974,Pseudoephedrine,drug,congenital hypothyroidism,disease,Macroglossia,effect/phenotype,contraindication,contraindication,14249,32719
2,14976,31650,84675,Triazolam,drug,myasthenia gravis,disease,Diplopia,effect/phenotype,contraindication,contraindication,14976,31650
3,15116,33091,22437,Paroxetine,drug,serotonin syndrome,disease,Restlessness,effect/phenotype,contraindication,contraindication,15116,33091
4,14561,31502,24199,Cholecalciferol,drug,Wilson disease,disease,Hepatomegaly,effect/phenotype,contraindication,contraindication,14561,31502
...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,16071,28158,23157,Isopropamide,drug,inflammatory bowel disease,disease,Vomiting,effect/phenotype,contraindication,contraindication,16071,28158
498,20303,31122,23368,Trolnitrate,drug,phaeochromocytoma,disease,Proteinuria,effect/phenotype,contraindication,contraindication,20303,31122
499,14026,32899,89372,Flumethasone,drug,pituitary deficiency due to Rathke's pouch cysts,disease,Enlarged pituitary gland,effect/phenotype,contraindication,contraindication,14026,32899
500,14385,32909,24227,Liraglutide,drug,medullary thyroid gland carcinoma,disease,Pheochromocytoma,effect/phenotype,contraindication,contraindication,14385,32909


In [23]:
# Stack the four edge-checked training tables (gene tables first, then
# phenotype tables) under a fresh 0..n-1 index.  Columns that exist in only
# some of the tables are filled with NaN for the others.
train_parts = [
    train_negative_gene_gd,
    train_positive_gene_gd,
    train_negative_phenotype_gd,
    train_positive_phenotype_gd,
]
train_data = pd.concat(train_parts, ignore_index=True)

# Persist the combined table without the index column.
train_data.to_csv('./data_analysis/train_data_full.csv', index=False)

In [22]:
import jsonlines
import pandas as pd

train_path = "./train.jsonl"

# The "Answer:" part of a prompt is determined solely by the edge relation.
ANSWER_BY_RELATION = {'indication': 'YES', 'contraindication': 'NO'}


def build_train_record(row):
    """Turn one row of `train_data` into the dict destined for train.jsonl.

    The record always carries `drug_name`, `disease_name` and `relation`.
    When the row has a non-null `gene_name` and/or `phenotype_name`, that name
    is added together with a `prompt` made of the question, a templated
    reasoning sentence and a YES/NO answer.  If both names are present the
    phenotype prompt is the one kept, because it is assigned last.

    Raises:
        ValueError: if `relation` is neither 'indication' nor
            'contraindication'.  Without this check a prompt belonging to a
            previously processed row could be attached to the current one.
    """
    relation = row['relation']
    if relation not in ANSWER_BY_RELATION:
        raise ValueError(
            f"Unexpected relation {relation!r} for drug {row['drug_name']!r} "
            f"and disease {row['disease_name']!r}"
        )
    answer = ANSWER_BY_RELATION[relation]
    is_indication = relation == 'indication'

    drug = row['drug_name']
    disease = row['disease_name']
    question = f"Is {disease} an indication for {drug}?"
    line_dict = {'drug_name': drug, 'disease_name': disease}

    # Gene-based evidence (rows coming from the gene training tables).
    if pd.notna(row['gene_name']):
        gene = row['gene_name']
        line_dict['gene_name'] = gene
        if is_indication:
            reason = f"The drug {drug} acts as a carrier for the gene {gene}. The disease {disease} is associated with the gene {gene}, so the drug {drug} can be used to treat {disease}."
        else:
            reason = f"The drug {drug} cannot act as a carrier for the gene {gene}. But the disease {disease} is associated with the gene {gene}, so the drug {drug} cannot be used to treat {disease}."
        line_dict['prompt'] = f"Question: {question}\nReasoning: {reason}\nAnswer: {answer}."

    # Phenotype-based evidence (rows coming from the phenotype training tables).
    if pd.notna(row['phenotype_name']):
        phenotype = row['phenotype_name']
        line_dict['phenotype_name'] = phenotype
        if is_indication:
            reason = f"The disease {disease} is characterized by the phenotype {phenotype}, indicating that the drug {drug} may be effective in treating {disease}."
        else:
            reason = f"The disease {disease} is characterized by the phenotype {phenotype}, indicating that the drug {drug} may not be effective in treating {disease}."
        line_dict['prompt'] = f"Question: {question}\nReasoning: {reason}\nAnswer: {answer}."

    line_dict['relation'] = relation
    return line_dict


# Build every record up front so it can be inspected without touching disk.
train_records = [build_train_record(row) for _, row in train_data.iterrows()]

# Writing is off by default so that re-running the notebook leaves
# train.jsonl untouched.  The file is opened in append mode: switching the
# flag on for more than one run adds the same records again.
WRITE_TRAIN_JSONL = False
if WRITE_TRAIN_JSONL:
    with jsonlines.open(train_path, "a") as f_write:
        f_write.write_all(train_records)

# Step3: Get the remaining data (pairs not used in the train or test sets) for few-shot

In [34]:
# From here on `test_data` is the content of test_data_new.csv.
TEST_DATA_NEW_FILE = "/playpen/jesse/drug_repurpose/split_data/data_analysis/test_data_new.csv"
test_data = pd.read_csv(TEST_DATA_NEW_FILE)
print(test_data)

     drug_index  disease_index drug_type           drug_name disease_type  \
0         14520          33651      drug           Allantoin      disease   
1         15098          33683      drug           Promazine      disease   
2         15493          29637      drug       Bromocriptine      disease   
3         14275          30654      drug         Doxorubicin      disease   
4         20375          33714      drug  Aluminum hydroxide      disease   
..          ...            ...       ...                 ...          ...   
995       14168          31493      drug      Spironolactone      disease   
996       20427          31918      drug      Cholestyramine      disease   
997       20350          30491      drug     Pentaerithrityl      disease   
998       15776          27933      drug            Nabilone      disease   
999       15075          31790      drug          Cevimeline      disease   

                                  disease_name  \
0                        

In [35]:
# Reduce both splits to their distinct (disease_index, drug_index) pairs.
pair_columns = ['disease_index', 'drug_index']
test_data_sub = test_data.loc[:, pair_columns].drop_duplicates()
train_data_sub = train_data.loc[:, pair_columns].drop_duplicates()

In [36]:
# Pairs present in both the test and the train split; the inner join keeps
# only pairs found on both sides.
overlap = pd.merge(
    test_data_sub,
    train_data_sub,
    how='inner',
    on=['disease_index', 'drug_index'],
)
if overlap.empty:
    print("No overlap found.")
else:
    print("There is overlap:")
    print(overlap)

No overlap found.


In [None]:
# Endpoint columns of every PrimeKG edge labelled 'indication'.
indication_relation = "indication"
is_indication_row = df_edges['relation'] == indication_relation
indication_edges = df_edges.loc[is_indication_row, ['x_index', 'y_index']]
print(indication_edges)

# Endpoint columns of every PrimeKG edge labelled 'contraindication'.
contraindication_relation = "contraindication"
is_contraindication_row = df_edges['relation'] == contraindication_relation
contraindication_edges = df_edges.loc[is_contraindication_row, ['x_index', 'y_index']]
print(contraindication_edges)

In [28]:
# Every distinct pair used by either split, with the columns renamed so that
# 'a' holds disease_index and 'b' holds drug_index.
combined_sub = (
    pd.concat([test_data_sub, train_data_sub])
    .drop_duplicates()
    .rename(columns={'disease_index': 'a', 'drug_index': 'b'})
)

# Collect each used pair in both orientations, (a, b) and (b, a), so an edge
# is excluded whichever way round its endpoints are stored.
forward_pairs = {tuple(pair) for pair in combined_sub[['a', 'b']].values}
pairs_to_remove = forward_pairs | {pair[::-1] for pair in forward_pairs}

# Keep only the indication edges whose (x_index, y_index) pair is not used by
# the train or test split.
indication_pair_keys = indication_edges[['x_index', 'y_index']].apply(tuple, axis=1)
indication_edges_filtered = indication_edges.loc[~indication_pair_keys.isin(pairs_to_remove)]

print(indication_edges_filtered)

         x_index  y_index
346730     16687    33577
346731     16687    36035
346764     20297    33577
346765     20297    36035
346768     16693    33577
...          ...      ...
5776153    84333    14471
5776154    27527    16634
5776155    38622    16634
5776156    28673    16634
5776158    39497    17237

[16222 rows x 2 columns]


drug: Fosinopril disease: hypertensive disorder


drug: Rotigotine disease: hypertensive disorder

In [29]:
# Keep only the contraindication edges whose (x_index, y_index) pair is not
# in `pairs_to_remove`.
contraindication_pair_keys = contraindication_edges[['x_index', 'y_index']].apply(tuple, axis=1)
contraindication_edges_filtered = contraindication_edges.loc[
    ~contraindication_pair_keys.isin(pairs_to_remove)
]

print(contraindication_edges_filtered)

         x_index  y_index
346728     15193    33577
346729     15193    36035
346732     14483    33577
346733     14483    36035
346734     16476    33577
...          ...      ...
5776145    35751    14251
5776146    35846    20456
5776147    35751    20456
5776148    27446    17286
5776157    84334    18277

[58450 rows x 2 columns]
