In [106]:
import pandas as pd
import os
from tqdm import tqdm
from dotenv import load_dotenv
from neo4j import GraphDatabase, basic_auth


In [2]:
load_dotenv(os.path.join(os.path.expanduser('~'), '.neo4j_config.env'))
USER = os.environ.get('SPOKE_USER')
PSW = os.environ.get('SPOKE_PSW')
URI = os.environ.get('SPOKE_URI')


In [3]:
%%time

DISEASE_QUERY = """
    MATCH(d:Disease)-[r:ASSOCIATES_DaG]->(g:Gene)
    WHERE r.diseases_sources <> ["textmining"]
    RETURN DISTINCT d.identifier AS d_id, d.name AS d_name
"""

auth = basic_auth(USER, PSW)
sdb = GraphDatabase.driver(URI, auth=auth)

disease_list = []
with sdb.session() as session:
    with session.begin_transaction() as tx:
        result = tx.run(DISEASE_QUERY)
        for row in result:
            disease_list.append((row["d_id"], row["d_name"]))

disease_df = pd.DataFrame(disease_list, columns=["disease_id", "disease_name"])
disease_id_list = list(disease_df.disease_id.values)


CPU times: user 140 ms, sys: 28.4 ms, total: 169 ms
Wall time: 8.67 s


In [36]:
%%time

QUERY = """
    MATCH(c:Compound)-[r:TREATS_CtD]->(d1:Disease)-[:RESEMBLES_DrD]-(d2:Disease) 
    WHERE r.phase>=3 AND NOT EXISTS((c)-[:TREATS_CtD]->(d2)) AND d1.identifier IN {}
    RETURN c.name as c_name, d1.name AS d1_name, d2.name AS d2_name LIMIT 20000
""".format(disease_id_list)

auth = basic_auth(USER, PSW)
sdb = GraphDatabase.driver(URI, auth=auth)
edge_list = []
with sdb.session() as session:
    with session.begin_transaction() as tx:
        result = tx.run(QUERY)
        for row in result:
            edge_list.append((row["c_name"], row["d1_name"], row["d2_name"]))

drug_repurp_df = pd.DataFrame(edge_list, columns=["compound", "disease_1", "disease_2"])



CPU times: user 466 ms, sys: 56.6 ms, total: 523 ms
Wall time: 1.35 s


In [53]:

result_df = drug_repurp_df.groupby("disease_1").agg({
    'compound': list,
    'disease_2': list
}).reset_index()

result_df.rename(columns={'compound': 'Compounds', 'disease_2': 'Diseases'}, inplace=True)
result_df['Compounds'] = result_df['Compounds'].apply(lambda x: list(set(x)))
result_df['Diseases'] = result_df['Diseases'].apply(lambda x: list(set(x)))
result_df.loc[:, "text"] = "What compounds treat '" + result_df.disease_1 + "' and what diseases resemble '" + result_df.disease_1 + "'?"
result_df.shape



(143, 4)

In [60]:
result_df.columns

Index(['disease_1', 'Compounds', 'Diseases', 'text'], dtype='object')

In [59]:
result_df.to_csv("../../../../data/benchmark_datasets/one_hop_graph_traversal_questions.csv", index=False, header=True)


### Changing the formatting of questions from 75th index onwards (Because questions until 75 are used for hyperparameter analysis)



In [43]:
result_df = pd.read_csv("../../../../data/benchmark_datasets/one_hop_graph_traversal_questions.csv")
result_df = result_df.iloc[75:]
result_df.drop("text", axis=1, inplace=True)
result_df.loc[:, "text"] = "What are the drugs that treat '" + result_df.disease_1 + "'? And what diseases can these drugs be repurposed to? To answer the second question, find all diseases that resemble to '" + result_df.disease_1 + "'."

result_df.to_csv("../../../../data/benchmark_datasets/drug_repurposing_questions.csv", index=False, header=True)


In [54]:
result_df.iloc[3].text

    

"What are the drugs that treat 'hereditary hemorrhagic telangiectasia'? And what diseases can these drugs be repurposed to? To answer the second question, find all diseases that resemble to 'hereditary hemorrhagic telangiectasia'."

In [47]:
result_df.iloc[3].Compounds


"['BEVACIZUMAB', 'Propranolol', 'Mupirocin', 'Sodium Chloride', 'Tranexamic acid']"

In [53]:
result_df.iloc[3].Diseases

'[\'hepatopulmonary syndrome\', \'Coats disease\', \'polycythemia vera\', \'nose disease\', "von Willebrand\'s disease", \'endocarditis\', \'hepatic encephalopathy\', \'Raynaud disease\', \'vascular skin disease\', \'hepatic coma\', \'angiodysplasia\', \'telangiectasis\', \'intracranial embolism\', \'arteriovenous malformations of the brain\']'

In [30]:
ind = 5
print(result_df.text.values[ind])
print(result_df.Compounds.values[ind])
print(result_df.Diseases.values[ind])


What compounds treat 'GM2 gangliosidosis' and what diseases resemble 'GM2 gangliosidosis'?
['Miglustat']
['lateral sclerosis', 'gangliosidosis', 'motor neuron disease', 'GM1 gangliosidosis', 'anterior horn cell disease', 'Tay-Sachs disease']


In [17]:
import ast
result_df.loc[:,"disease_count"] = result_df.Diseases.apply(lambda x:len(ast.literal_eval(x)))

result_df[result_df.disease_count == 3]


Unnamed: 0,disease_1,Compounds,Diseases,text,disease_count
19,Sjogren-Larsson syndrome,['2-(3-Amino-6-chloroquinolin-2-yl)propan-2-ol'],"['autosomal recessive congenital ichthyosis', ...",What compounds treat 'Sjogren-Larsson syndrome...,3
61,fatty liver disease,"['Saroglitazar', 'Rimonabant', 'Metformin', 'D...","['drug-induced hepatitis', 'hemolytic anemia',...",What compounds treat 'fatty liver disease' and...,3
115,pituitary adenoma,"['Lanreotide acetate', 'Cabergoline', 'Iopamid...","['adamantinous craniopharyngioma', 'craniophar...",What compounds treat 'pituitary adenoma' and w...,3


In [83]:
df = pd.read_csv("../../../../data/benchmark_datasets/drug_repurposing_questions.csv")
print(df.text.values[3])


What are the drugs that treat 'hereditary hemorrhagic telangiectasia'? And what diseases can these drugs be repurposed to? To answer the second question, find all diseases that resemble to 'hereditary hemorrhagic telangiectasia'.


In [82]:
df.Diseases.values[3]

'[\'hepatopulmonary syndrome\', \'Coats disease\', \'polycythemia vera\', \'nose disease\', "von Willebrand\'s disease", \'endocarditis\', \'hepatic encephalopathy\', \'Raynaud disease\', \'vascular skin disease\', \'hepatic coma\', \'angiodysplasia\', \'telangiectasis\', \'intracranial embolism\', \'arteriovenous malformations of the brain\']'

In [None]:
If Coats disease resembles hereditary hemorrhagic telangiectasia, what compounds that treat hereditary hemorrhagic telangiectasia can be repurposed to treat Coats disease?


In [101]:
import random
random.seed(1)
drug_rp_data = []
for index, row in df.iterrows():
    disease_that_need_repurposing = random.sample(ast.literal_eval(row["Diseases"]), len(ast.literal_eval(row["Diseases"])))[0]
    ref_disease = row["disease_1"]
    cmpGroundTruth = ast.literal_eval(row["Compounds"])
    text = "If {} resembles {}, what compounds that treat {} can be repurposed to treat {}?".format(disease_that_need_repurposing, ref_disease, ref_disease, disease_that_need_repurposing)
    drug_rp_data.append((disease_that_need_repurposing, ref_disease, cmpGroundTruth, text))
drug_rp_data_df = pd.DataFrame(drug_rp_data, columns=["disease_in_question", "refDisease", "compoundGroundTruth", "text"])
    

In [104]:
drug_rp_data_df.to_csv("../../../../data/benchmark_datasets/drug_repurposing_questions_v2.csv")


In [105]:
drug_rp_data_df.text.values[0]

'If atypical hemolytic-uremic syndrome resembles hemolytic-uremic syndrome, what compounds that treat hemolytic-uremic syndrome can be repurposed to treat atypical hemolytic-uremic syndrome?'

In [107]:
df = pd.read_csv("../../../../data/benchmark_datasets/one_hop_graph_traversal_questions.csv")


In [118]:
ind = 0
df.text.values[ind]

"What compounds treat 'Alagille syndrome' and what diseases resemble 'Alagille syndrome'?"

In [121]:
df.Compounds.values[ind]


"['Odevixibat', 'Maralixibat']"

In [122]:
df.Diseases.values[ind]

"['optic disk drusen', 'intrahepatic cholestasis', 'Moyamoya disease', 'Williams-Beuren syndrome', 'pulmonary valve stenosis', 'xanthomatosis']"