In [135]:
import requests
import pandas as pd
import random
import os
from tqdm import tqdm
from dotenv import load_dotenv
from neo4j import GraphDatabase, basic_auth
import re
from utility import *


In [136]:
def create_mcq_with_shuffle(df, source_column, target_column, node_type, predicate, n_negative=4):
    """Build two-hop multiple-choice questions from a source/target edge table.

    For every ordered pair of distinct sources that share at least one target,
    one question is produced: a shared target is the correct answer and
    ``n_negative`` other targets are the distractors.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with one row per (source, target) association.
    source_column : str
        Column holding the source entity (e.g. the disease name).
    target_column : str
        Column holding the target entity (e.g. gene, variant, organism).
    node_type : str
        Target node type, inserted verbatim into the question text.
    predicate : str
        Relation phrase, inserted verbatim into the question text.
    n_negative : int, default 4
        Number of distractors per question.

    Returns
    -------
    pandas.DataFrame
        Columns ``disease_pair``, ``correct_node``, ``negative_samples``,
        ``disease_1``, ``disease_2``, ``options_combined`` and ``text``.
        Row labels are the position of each pair in the enumeration of all
        ordered source pairs, so they are not necessarily contiguous.

    Notes
    -----
    * Distractors are drawn, without repetition, only from targets that are
      associated with neither source of the pair, so a distractor can never
      be a valid answer.
    * Pairs with fewer than ``n_negative`` eligible distractors are skipped
      instead of raising.
    * When a pair shares several targets, the one appearing first in ``df``
      for the first source is used, which makes the choice reproducible.
    * Relies on ``shuffle_list`` (star-imported from ``utility``) returning
      the shuffled options.
    """
    output_columns = [
        "disease_pair", "correct_node", "negative_samples",
        "disease_1", "disease_2", "options_combined", "text",
    ]

    # Rows lacking either endpoint cannot contribute to a question.
    edges = df[[source_column, target_column]].dropna()

    # Distinct targets per source, kept in order of first appearance. Computed
    # once here instead of re-filtering the whole frame for every pair.
    targets_in_order = {
        source: list(dict.fromkeys(targets))
        for source, targets in edges.groupby(source_column, sort=False)[target_column]
    }
    target_sets = {source: set(targets) for source, targets in targets_in_order.items()}
    sources = list(targets_in_order)
    all_targets = list(dict.fromkeys(edges[target_column]))

    records = []
    pair_positions = []
    position = -1
    # NOTE(review): both (a, b) and (b, a) are emitted, so every question
    # appears twice with the two sources swapped; kept to preserve the
    # existing dataset size.
    for disease1 in sources:
        for disease2 in sources:
            if disease1 == disease2:
                continue
            position += 1

            shared = [t for t in targets_in_order[disease1] if t in target_sets[disease2]]
            if not shared:
                continue
            correct_node = shared[0]

            # Anything linked to either source would be a (possibly correct)
            # answer, so it must not be offered as a distractor.
            excluded = target_sets[disease1] | target_sets[disease2]
            candidates = [t for t in all_targets if t not in excluded]
            if len(candidates) < n_negative:
                continue
            negatives = pd.Series(candidates, dtype=object).sample(n_negative).tolist()

            negative_labels = [str(item) for item in negatives]
            # Shuffle the option list itself; no join/split round trip, so
            # names keep their exact spelling and spacing.
            options = list(shuffle_list(negative_labels + [str(correct_node)]))
            options_combined = ", ".join(options)
            text = (
                "Out of the given list, which " + node_type + " " + predicate + " "
                + str(disease1) + " and " + str(disease2)
                + ". Given list is: " + options_combined
            )

            records.append((
                (disease1, disease2),
                correct_node,
                ", ".join(negative_labels),
                disease1,
                disease2,
                options_combined,
                text,
            ))
            pair_positions.append(position)

    return pd.DataFrame(records, columns=output_columns, index=pair_positions)

# Only include entities that are represented in SPOKE
#### Note: 
#### If the data has '*A-ASSOCIATES-B*', we are making sure A and B are present in SPOKE, and NOT checking if '*A-ASSOCIATES-B*' is present in SPOKE. 
#### Otherwise, the comparison with and without SPOKE would not be fair (SPOKE cannot represent entities that it does not contain)


In [137]:
%%time


# SPOKE (neo4j) connection settings are read from ~/.neo4j_config.env so that
# no credentials are stored in the notebook.
load_dotenv(os.path.join(os.path.expanduser('~'), '.neo4j_config.env'))
USER = os.environ.get('SPOKE_USER')
PSW = os.environ.get('SPOKE_PSW')
URI = os.environ.get('SPOKE_URI')

# Fail early with a readable message instead of an obscure driver error.
_missing_settings = [
    name
    for name, value in (("SPOKE_USER", USER), ("SPOKE_PSW", PSW), ("SPOKE_URI", URI))
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing SPOKE connection settings: " + ", ".join(_missing_settings)
        + " (expected in the environment or in ~/.neo4j_config.env)"
    )

# Each query returns the distinct entities of one type that are linked to at
# least one Disease node through the given relationship.
GENE_QUERY = """
    MATCH(d:Disease)-[r:ASSOCIATES_DaG]->(g:Gene) 
    RETURN DISTINCT g.name AS g_name     
"""

VARIANT_QUERY = """
    MATCH(d:Disease)<-[r:ASSOCIATES_VaP]-(v:Variant) 
    RETURN DISTINCT v.identifier AS v_id     
"""

ORGANISM_QUERY = """
    MATCH(d:Disease)<-[r:CAUSES_OcD]-(o:Organism) 
    RETURN DISTINCT o.identifier AS o_id     
"""


def _fetch_column(driver, query, key):
    """Run ``query`` in its own session/transaction and return field ``key`` of every record."""
    with driver.session() as session:
        with session.begin_transaction() as tx:
            # Materialise the records while the transaction is still open.
            return [record[key] for record in tx.run(query)]


auth = basic_auth(USER, PSW)
sdb = GraphDatabase.driver(URI, auth=auth)
try:
    gene_list = _fetch_column(sdb, GENE_QUERY, "g_name")
    variant_list = _fetch_column(sdb, VARIANT_QUERY, "v_id")
    organism_list = _fetch_column(sdb, ORGANISM_QUERY, "o_id")
finally:
    # Release the connection pool even when one of the queries fails.
    sdb.close()


CPU times: user 6.31 s, sys: 939 ms, total: 7.25 s
Wall time: 55.6 s


# MONARCH

In [138]:
def get_api_resp(URI, params=None, timeout=60):
    """Send a GET request and return the ``requests`` response object.

    Parameters
    ----------
    URI : str
        Fully formatted request URL.
    params : dict, optional
        Query-string parameters; ``None`` or an empty dict sends none.
    timeout : float or tuple, default 60
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Without it a single stalled connection would block the
        per-disease download loop indefinitely.

    Returns
    -------
    requests.Response
        The response as returned by ``requests.get``; the status code is not
        checked here.
    """
    # requests treats params=None and params={} identically (no query string),
    # so no separate branch is needed for the "no parameters" case.
    return requests.get(URI, params=params, timeout=timeout)
    
def get_association(URI, disease_id, params, object_attribute="label"):
    """Fetch the objects associated with one disease as a two-column frame.

    Parameters
    ----------
    URI : str
        URL template with one ``{}`` placeholder that receives ``disease_id``.
    disease_id : str
        Identifier substituted into ``URI`` and written to the ``subject`` column.
    params : dict or None
        Query-string parameters forwarded to ``get_api_resp``.
    object_attribute : str, default "label"
        Key read from each association's ``"object"`` entry.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``object`` and ``subject`` (one row per returned
        association), or ``None`` when the response status is not 200.
    """
    endpoint = URI.format(disease_id)
    reply = get_api_resp(endpoint, params=params)

    # Any status other than HTTP 200 is reported to the caller as None.
    if reply.status_code != 200:
        return None

    payload = reply.json()
    objects = [entry["object"][object_attribute] for entry in payload["associations"]]
    frame = pd.DataFrame(objects, columns=["object"])
    frame["subject"] = disease_id
    return frame
    

In [139]:
# Monarch endpoint templates; "{}" is replaced with a disease identifier.
DISEASE_GENE_URI = "https://api.monarchinitiative.org/api/bioentity/disease/{}/genes"
DISEASE_VARIANT_URI = "https://api.monarchinitiative.org/api/bioentity/disease/{}/variants"
disease_path = "../../../data/benchmark_datasets/monarch/gwas_diseases.csv"

# Monarch data is extracted for a subset of SPOKE diseases that have GWAS-based
# connections to genes. These diseases were obtained by running a Cypher query
# in the neo4j browser and saved to the CSV file loaded here.
disease_df = pd.read_csv(disease_path)
disease_df.columns = ["disease_id", "disease_name"]
# Keep the text between the first pair of double quotes in each identifier.
disease_df["disease_id"] = disease_df["disease_id"].map(lambda raw_id: raw_id.split('"')[1])


# Query-string parameters sent with every Monarch request.
params = {"rows": 2, "direct": "true", "direct_taxon": "true"}


## Disease-Gene 

In [140]:
# One Monarch request per disease; get_association returns None for failed
# requests, and pd.concat ignores those entries.
edge_df_list = [
    get_association(DISEASE_GENE_URI, row["disease_id"], params)
    for index, row in tqdm(disease_df.iterrows())
]

# Swap the disease identifier for the disease name via the lookup table.
edge_df = (
    pd.concat(edge_df_list, ignore_index=True)
    .merge(disease_df, left_on="subject", right_on="disease_id")
    .drop(columns=["subject", "disease_id"])
)
# Keep the text between the first pair of double quotes in each name.
edge_df["disease_name"] = edge_df["disease_name"].map(lambda quoted: quoted.split('"')[1])
# Retain only genes that exist in SPOKE (entity check, not association check).
edge_df = edge_df[edge_df["object"].isin(gene_list)]
monarch_disease_gene_mcq = create_mcq_with_shuffle(
    edge_df, "disease_name", "object", "Gene", "is associated with"
)



254it [01:46,  2.39it/s]


## Disease-Variant 

In [141]:
# One Monarch request per disease, reading each variant's "id" attribute;
# entries that are None (failed requests) are ignored by pd.concat.
edge_df_list_2 = [
    get_association(DISEASE_VARIANT_URI, row["disease_id"], params, object_attribute="id")
    for index, row in tqdm(disease_df.iterrows())
]

# Swap the disease identifier for the disease name via the lookup table.
edge_df_2 = (
    pd.concat(edge_df_list_2, ignore_index=True)
    .merge(disease_df, left_on="subject", right_on="disease_id")
    .drop(columns=["subject", "disease_id"])
)
# Keep the text between the first pair of double quotes in each name.
edge_df_2["disease_name"] = edge_df_2["disease_name"].map(lambda quoted: quoted.split('"')[1])
# Drop the "dbSNP:" prefix so identifiers can be compared with variant_list.
edge_df_2["object"] = edge_df_2["object"].map(lambda curie: curie.split("dbSNP:")[-1])
# Retain only variants that exist in SPOKE (entity check, not association check).
edge_df_2 = edge_df_2[edge_df_2["object"].isin(variant_list)]

monarch_disease_variant_mcq = create_mcq_with_shuffle(
    edge_df_2, "disease_name", "object", "Variant", "is associated with"
)


254it [01:44,  2.43it/s]


# ROBOKOP

In [142]:
# Directory (relative to the working directory) from which the ROBOKOP CSV files below are read.
ROBOKOP_PATH = "../../../data/benchmark_datasets/robokop"


## Disease-Variant

In [143]:
FILES_LIST_2 = ["disease_variant_1.csv", "disease_variant_2.csv"]

# Read every ROBOKOP disease-variant file and stack them into one frame.
data_robokop = pd.concat(
    [pd.read_csv(os.path.join(ROBOKOP_PATH, file_name)) for file_name in FILES_LIST_2],
    ignore_index=True,
)
data_robokop.columns = ["source", "target"]
# Retain only variants that exist in SPOKE (entity check, not association check).
data_robokop = data_robokop.loc[data_robokop["target"].isin(variant_list)]
robokop_disease_variant_mcq = create_mcq_with_shuffle(
    data_robokop, "source", "target", "Variant", "is associated with"
)


## Disease-Organism 

In [144]:
FILES_LIST_3 = ["disease_organism_1.csv"]

# Read every ROBOKOP disease-organism file and stack them into one frame.
data_robokop = pd.concat(
    [pd.read_csv(os.path.join(ROBOKOP_PATH, file_name)) for file_name in FILES_LIST_3],
    ignore_index=True,
)

def extract_doid(entry):
    """Return every ``DOID:<digits>`` token found in ``entry``, in order of appearance.

    An empty list is returned when ``entry`` contains no such token.
    """
    doid_pattern = re.compile(r'DOID:\d+')
    return doid_pattern.findall(entry)

# Pull the DOID tokens out of the equivalent-identifier text, give each DOID
# its own row, and discard rows that had none.
data_robokop["extracted_DOIDs"] = data_robokop["d.equivalent_identifiers"].map(extract_doid)
data_robokop = (
    data_robokop
    .explode("extracted_DOIDs")
    .dropna(subset=["extracted_DOIDs"])
    .drop(columns="d.equivalent_identifiers")
)
data_robokop.columns = ["source", "target", "target_id", "source_id"]
# Drop the "NCBITaxon:" prefix and store the remaining identifier as an integer.
taxon_ids = data_robokop["target_id"].map(lambda curie: curie.split("NCBITaxon:")[-1])
data_robokop["target_id"] = taxon_ids.astype(int)

# Keep only entities that are present in SPOKE (entities only, not the association).
in_spoke = data_robokop["target_id"].isin(organism_list)
data_robokop = data_robokop[in_spoke]

robokop_disease_organism_mcq = create_mcq_with_shuffle(
    data_robokop, "source", "target", "Organism", "causes"
)



In [145]:
# Stack the four question sets into a single frame with a fresh 0..n-1 index.
mcq_frames = [
    monarch_disease_gene_mcq,
    monarch_disease_variant_mcq,
    robokop_disease_variant_mcq,
    robokop_disease_organism_mcq,
]
data_combined = pd.concat(mcq_frames, ignore_index=True)



In [149]:
# Write the combined question set to CSV with a header row and without the index.
output_csv = "../../../data/benchmark_datasets/test_questions_two_hop_mcq_from_monarch_and_robokop.csv"
data_combined.to_csv(output_csv, header=True, index=False)



In [159]:
# Show one generated question followed by its correct answer.
example_row = 3
print(data_combined["text"].values[example_row])
print(data_combined["correct_node"].values[example_row])

Out of the given list, which Gene is associated with psoriasis and allergic rhinitis. Given list is:  ATP2B1, HLA-B,  STAT4,  TERT, FADS1
HLA-B


In [8]:
import pandas as pd

# Reload the saved question set and keep only the questions whose answer is a
# variant. The pattern is anchored so that only answers shaped like an rsID
# ("rs" followed by digits) match; a bare substring test for "rs" would also
# keep any other answer that merely contains those two letters.
data_combined = pd.read_csv("../../../data/benchmark_datasets/test_questions_two_hop_mcq_from_monarch_and_robokop.csv")
is_variant_answer = data_combined.correct_node.str.contains(r"^rs\d+$", na=False)
data_combined = data_combined[is_variant_answer]
# Only the last expression of a cell is displayed, so a single row is shown.
data_combined.iloc[23]


disease_pair                 ('atrophic gastritis', 'duodenal ulcer')
correct_node                                                rs2294008
negative_samples         rs12524487, rs9275260, rs12203592, rs4459895
disease_1                                          atrophic gastritis
disease_2                                              duodenal ulcer
options_combined     rs4459895,  rs12203592,  rs9275260, rs1252448...
text                Out of the given list, which Variant is associ...
Name: 279, dtype: object