In [1]:
import requests
import pandas as pd
from tenacity import retry, stop_after_attempt, wait_random_exponential


In [2]:
BASE_URI = "https://spoke.rbvi.ucsf.edu"

In [3]:
def get_api_resp(END_POINT, params=None):
    URI = BASE_URI + END_POINT
    if params:
        return requests.get(URI, params=params)
    else:
        return requests.get(URI)

In [12]:
@retry(wait=wait_random_exponential(min=10, max=30), stop=stop_after_attempt(5))
def get_context_using_api(node_value):
    type_end_point = "/api/v1/types"
    result = get_api_resp(type_end_point)
    data_spoke_types = result.json()
    node_types = list(data_spoke_types["nodes"].keys())
    edge_types = list(data_spoke_types["edges"].keys())
    node_types_to_remove = ["DatabaseTimestamp", "Version"]
    filtered_node_types = [node_type for node_type in node_types if node_type not in node_types_to_remove]
    api_params = {
        'node_filters' : filtered_node_types,
        'edge_filters': edge_types,
        'cutoff_Compound_max_phase': 3,
        'cutoff_Protein_source': ['SwissProt'],
        'cutoff_DaG_diseases_sources': ['knowledge', 'experiments'],
        'cutoff_DaG_textmining': 3,
        'cutoff_CtD_phase': 3,
        'cutoff_PiP_confidence': 0.7,
        'cutoff_ACTeG_level': ['Low', 'Medium', 'High'],
        'depth' : 2
    }
    node_type = "Disease"
    attribute = "name"
    nbr_end_point = "/api/v1/neighborhood/{}/{}/{}".format(node_type, attribute, node_value)
    result = get_api_resp(nbr_end_point, params=api_params)
    node_context = result.json()
    nbr_nodes = []
    nbr_edges = []
    for item in node_context:
        if "_" not in item["data"]["neo4j_type"]:
            try:
                if item["data"]["neo4j_type"] == "Protein":
                    nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["description"]))
                else:
                    nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["name"]))
            except:
                nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["identifier"]))
        elif "_" in item["data"]["neo4j_type"]:
            try:
                provenance = ", ".join(item["data"]["properties"]["sources"])
            except:
                try:
                    provenance = item["data"]["properties"]["source"]
                    if isinstance(provenance, list):
                        provenance = ", ".join(provenance)                    
                except:
                    provenance = "SPOKE-KG"                                    
            nbr_edges.append((item["data"]["source"], item["data"]["neo4j_type"], item["data"]["target"], provenance))
    nbr_nodes_df = pd.DataFrame(nbr_nodes, columns=["node_type", "node_id", "node_name"])
    nbr_edges_df = pd.DataFrame(nbr_edges, columns=["source", "edge_type", "target", "provenance"])
    merge_1 = pd.merge(nbr_edges_df, nbr_nodes_df, left_on="source", right_on="node_id").drop("node_id", axis=1)
    merge_1.loc[:,"node_name"] = merge_1.node_type + " " + merge_1.node_name
    merge_1.drop(["source", "node_type"], axis=1, inplace=True)
    merge_1 = merge_1.rename(columns={"node_name":"source"})
    merge_2 = pd.merge(merge_1, nbr_nodes_df, left_on="target", right_on="node_id").drop("node_id", axis=1)
    merge_2.loc[:,"node_name"] = merge_2.node_type + " " + merge_2.node_name
    merge_2.drop(["target", "node_type"], axis=1, inplace=True)
    merge_2 = merge_2.rename(columns={"node_name":"target"})
    merge_2 = merge_2[["source", "edge_type", "target", "provenance"]]
    merge_2.loc[:, "predicate"] = merge_2.edge_type.apply(lambda x:x.split("_")[0])
    merge_2.loc[:, "context"] =  merge_2.source + " " + merge_2.predicate.str.lower() + " " + merge_2.target + " and Provenance of this association is from " + merge_2.provenance + "."
    context = merge_2['context'].str.cat(sep=' ')
    return context



In [36]:
@retry(wait=wait_random_exponential(min=10, max=30), stop=stop_after_attempt(5))
def get_context_using_api_v2(node_value, nhop=1):
    type_end_point = "/api/v1/types"
    result = get_api_resp(type_end_point)
    data_spoke_types = result.json()
    node_types = list(data_spoke_types["nodes"].keys())
    edge_types = list(data_spoke_types["edges"].keys())
    node_types_to_remove = ["DatabaseTimestamp", "Version"]
    filtered_node_types = [node_type for node_type in node_types if node_type not in node_types_to_remove]
    api_params = {
        'node_filters' : filtered_node_types,
        'edge_filters': edge_types,
        'cutoff_Compound_max_phase': 3,
        'cutoff_Protein_source': ['SwissProt'],
        'cutoff_DaG_diseases_sources': ['knowledge', 'experiments'],
        'cutoff_DaG_textmining': 3,
        'cutoff_CtD_phase': 3,
        'cutoff_PiP_confidence': 0.7,
        'cutoff_ACTeG_level': ['Low', 'Medium', 'High'],
        'depth' : nhop
    }
    node_type = "Disease"
    attribute = "name"
    nbr_end_point = "/api/v1/neighborhood/{}/{}/{}".format(node_type, attribute, node_value)
    result = get_api_resp(nbr_end_point, params=api_params)
    node_context = result.json()
    nbr_nodes = []
    nbr_edges = []
    for item in node_context:
        if "_" not in item["data"]["neo4j_type"]:
            try:
                if item["data"]["neo4j_type"] == "Protein":
                    nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["description"]))
                else:
                    nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["name"]))
            except:
                nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["identifier"]))
        elif "_" in item["data"]["neo4j_type"]:
            try:
                provenance = ", ".join(item["data"]["properties"]["sources"])
            except:
                try:
                    provenance = item["data"]["properties"]["source"]
                    if isinstance(provenance, list):
                        provenance = ", ".join(provenance)                    
                except:
                    provenance = "SPOKE-KG"                                    
            nbr_edges.append((item["data"]["source"], item["data"]["neo4j_type"], item["data"]["target"], provenance))
    nbr_nodes_df = pd.DataFrame(nbr_nodes, columns=["node_type", "node_id", "node_name"])
    nbr_edges_df = pd.DataFrame(nbr_edges, columns=["source", "edge_type", "target", "provenance"])
    merge_1 = pd.merge(nbr_edges_df, nbr_nodes_df, left_on="source", right_on="node_id").drop("node_id", axis=1)
    merge_1.loc[:,"node_name"] = merge_1.node_type + " " + merge_1.node_name
    merge_1.drop(["source", "node_type"], axis=1, inplace=True)
    merge_1 = merge_1.rename(columns={"node_name":"source"})
    merge_2 = pd.merge(merge_1, nbr_nodes_df, left_on="target", right_on="node_id").drop("node_id", axis=1)
    merge_2.loc[:,"node_name"] = merge_2.node_type + " " + merge_2.node_name
    merge_2.drop(["target", "node_type"], axis=1, inplace=True)
    merge_2 = merge_2.rename(columns={"node_name":"target"})
    merge_2 = merge_2[["source", "edge_type", "target", "provenance"]]
    merge_2.loc[:, "predicate"] = merge_2.edge_type.apply(lambda x:x.split("_")[0])
    merge_2.loc[:, "context"] =  merge_2.source + " " + merge_2.predicate.str.lower() + " " + merge_2.target + " and Provenance of this association is from " + merge_2.provenance + "."
#     context = merge_2['context'].str.cat(sep=' ')
    return merge_2



In [9]:
%%time
context = get_context_using_api("multiple sclerosis")

CPU times: user 1.57 s, sys: 435 ms, total: 2.01 s
Wall time: 19.7 s


In [37]:
%%time
context = get_context_using_api_v2("multiple sclerosis", nhop=2)

CPU times: user 1.43 s, sys: 466 ms, total: 1.89 s
Wall time: 31 s


In [15]:
context

Unnamed: 0,source,edge_type,target,provenance,predicate,context
0,Disease secondary progressive multiple sclerosis,ISA_DiD,Disease multiple sclerosis,Disease Ontology,ISA,Disease secondary progressive multiple scleros...
1,Disease primary progressive multiple sclerosis,ISA_DiD,Disease multiple sclerosis,Disease Ontology,ISA,Disease primary progressive multiple sclerosis...
2,Disease relapsing-remitting multiple sclerosis,ISA_DiD,Disease multiple sclerosis,Disease Ontology,ISA,Disease relapsing-remitting multiple sclerosis...
3,Disease Balo concentric sclerosis,ISA_DiD,Disease multiple sclerosis,Disease Ontology,ISA,Disease Balo concentric sclerosis isa Disease ...
4,Disease progressive relapsing multiple sclerosis,ISA_DiD,Disease multiple sclerosis,Disease Ontology,ISA,Disease progressive relapsing multiple scleros...
...,...,...,...,...,...,...
70721,Gene HLA-DRB1,PARTICIPATES_GpPW,Pathway Cancer immunotherapy by CTLA4 blockade,WikiPathways,PARTICIPATES,Gene HLA-DRB1 participates Pathway Cancer immu...
70722,Gene HLA-DRB1,PARTICIPATES_GpPW,Pathway Network map of SARS-CoV-2 signaling pa...,WikiPathways,PARTICIPATES,Gene HLA-DRB1 participates Pathway Network map...
70723,Gene HLA-DRB1,PARTICIPATES_GpPW,Pathway Cancer immunotherapy by PD-1 blockade,WikiPathways,PARTICIPATES,Gene HLA-DRB1 participates Pathway Cancer immu...
70724,Gene HLA-DRB1,PARTICIPATES_GpPW,Pathway T-cell activation SARS-CoV-2,WikiPathways,PARTICIPATES,Gene HLA-DRB1 participates Pathway T-cell acti...


In [16]:
import networkx as nx

In [18]:
%%time
G = nx.from_pandas_edgelist(context, 'source', 'target', edge_attr='predicate')



CPU times: user 138 ms, sys: 2.63 ms, total: 141 ms
Wall time: 139 ms


In [31]:
%%time

n1 = "Disease multiple sclerosis"
n2 = "Disease progressive relapsing multiple sclerosis"
path = nx.shortest_path(G, n1, n2)

path_with_predicates = []

for i in range(len(path) - 1):
    predicate = G.get_edge_data(path[i], path[i + 1])['predicate']    
    path_with_predicates.extend([path[i], predicate])

path_with_predicates.append(path[-1])

CPU times: user 150 µs, sys: 2 µs, total: 152 µs
Wall time: 161 µs


In [32]:
path_with_predicates

['Disease multiple sclerosis',
 'ISA',
 'Disease progressive relapsing multiple sclerosis']

In [43]:
def find_paths_of_length(G, source, target, length):
    if length == 0:
        return [[source]] if source == target else []
    
    paths = []
    for neighbor in G.neighbors(source):
        for path in find_paths_of_length(G, neighbor, target, length - 1):
            if source not in path:
                paths.append([source, G.get_edge_data(source, neighbor)['predicate']] + path)
    return paths


In [47]:
find_paths_of_length(G, n1, n2, 3)

[['Disease multiple sclerosis',
  'ISA',
  'Disease secondary progressive multiple sclerosis',
  'RESEMBLES',
  'Disease relapsing-remitting multiple sclerosis',
  'RESEMBLES',
  'Disease progressive relapsing multiple sclerosis'],
 ['Disease multiple sclerosis',
  'ISA',
  'Disease secondary progressive multiple sclerosis',
  'RESEMBLES',
  'Disease venous insufficiency',
  'RESEMBLES',
  'Disease progressive relapsing multiple sclerosis'],
 ['Disease multiple sclerosis',
  'ISA',
  'Disease secondary progressive multiple sclerosis',
  'RESEMBLES',
  'Disease primary progressive multiple sclerosis',
  'RESEMBLES',
  'Disease progressive relapsing multiple sclerosis'],
 ['Disease multiple sclerosis',
  'ISA',
  'Disease secondary progressive multiple sclerosis',
  'RESEMBLES',
  'Disease optic neuritis',
  'RESEMBLES',
  'Disease progressive relapsing multiple sclerosis'],
 ['Disease multiple sclerosis',
  'ISA',
  'Disease secondary progressive multiple sclerosis',
  'RESEMBLES',
  'D

In [6]:
type_end_point = "/api/v1/types"
result = get_api_resp(type_end_point)
data_spoke_types = result.json()
node_types = list(data_spoke_types["nodes"].keys())
edge_types = list(data_spoke_types["edges"].keys())
node_types_to_remove = ["DatabaseTimestamp", "Version"]
filtered_node_types = [node_type for node_type in node_types if node_type not in node_types_to_remove]
api_params = {
    'node_filters' : filtered_node_types,
    'edge_filters': edge_types,
    'cutoff_Compound_max_phase': 3,
    'cutoff_Protein_source': ['SwissProt'],
    'cutoff_DaG_diseases_sources': ['knowledge', 'experiments'],
    'cutoff_DaG_textmining': 3,
    'cutoff_CtD_phase': 3,
    'cutoff_PiP_confidence': 0.7,
    'cutoff_ACTeG_level': ['Low', 'Medium', 'High']
}
node_type = "Disease"
attribute = "name"
nbr_end_point = "/api/v1/neighborhood/{}/{}/{}".format(node_type, attribute, "COVID-19")
result = get_api_resp(nbr_end_point, params=api_params)
node_context = result.json()

In [15]:
node_context[0]

'Disease Ontology'

In [10]:
import ast
nbr_nodes = []
nbr_edges = []
for item in node_context:
    if "_" not in item["data"]["neo4j_type"]:
        try:
            if item["data"]["neo4j_type"] == "Protein":
                nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["description"]))
            else:
                nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["name"]))
        except:
            nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["identifier"]))
    elif "_" in item["data"]["neo4j_type"]:
        try:
            provenance = ", ".join(item["data"]["properties"]["sources"])
        except:
            try:
                provenance = item["data"]["properties"]["source"]
                if isinstance(provenance, list):
                    provenance = ", ".join(provenance)                    
            except:
                try:                    
                    preprint_list = ast.literal_eval(item["data"]["properties"]["preprint_list"])
                    if len(preprint_list) > 0:                                                    
                        provenance = ", ".join(preprint_list)
                    else:
                        pmid_list = ast.literal_eval(item["data"]["properties"]["pmid_list"])
                        pmid_list = map(lambda x:"pubmedId:"+x, pmid_list)
                        if len(pmid_list) > 0:
                            provenance = ", ".join(pmid_list)
                        else:
                            provenance = "Based on data from Institute For Systems Biology (ISB)"
                except:                                
                    provenance = "SPOKE-KG"                                    
        nbr_edges.append((item["data"]["source"], item["data"]["neo4j_type"], item["data"]["target"], provenance))
nbr_nodes_df = pd.DataFrame(nbr_nodes, columns=["node_type", "node_id", "node_name"])
nbr_edges_df = pd.DataFrame(nbr_edges, columns=["source", "edge_type", "target", "provenance"])
merge_1 = pd.merge(nbr_edges_df, nbr_nodes_df, left_on="source", right_on="node_id").drop("node_id", axis=1)
merge_1.loc[:,"node_name"] = merge_1.node_type + " " + merge_1.node_name
merge_1.drop(["source", "node_type"], axis=1, inplace=True)
merge_1 = merge_1.rename(columns={"node_name":"source"})
merge_2 = pd.merge(merge_1, nbr_nodes_df, left_on="target", right_on="node_id").drop("node_id", axis=1)
merge_2.loc[:,"node_name"] = merge_2.node_type + " " + merge_2.node_name
merge_2.drop(["target", "node_type"], axis=1, inplace=True)
merge_2 = merge_2.rename(columns={"node_name":"target"})
merge_2 = merge_2[["source", "edge_type", "target", "provenance"]]
merge_2.loc[:, "predicate"] = merge_2.edge_type.apply(lambda x:x.split("_")[0])
merge_2.loc[:, "context"] =  merge_2.source + " " + merge_2.predicate.str.lower() + " " + merge_2.target + " and Provenance of this association is from " + merge_2.provenance + "."
context = merge_2['context'].str.cat(sep=' ')



In [118]:
preprint_list

['https://doi.org/10.1101/2020.03.02.20029975']

In [125]:
context

"Disease critical COVID-19 isa Disease COVID-19 and Provenance of this association is from Disease Ontology. Disease severe COVID-19 isa Disease COVID-19 and Provenance of this association is from Disease Ontology. Disease non-severe COVID-19 isa Disease COVID-19 and Provenance of this association is from Disease Ontology. Organism Severe acute respiratory syndrome coronavirus 2 causes Disease COVID-19 and Provenance of this association is from SPOKE-KG. Protein Interleukin-23 subunit alpha (IL-23 subunit alpha) (IL-23-A) (Interleukin-23 subunit p19) (IL-23p19) increasedin Disease COVID-19 and Provenance of this association is from SPOKE-KG. Protein C-C motif chemokine 7 (Monocyte chemoattractant protein 3) (Monocyte chemotactic protein 3) (MCP-3) (NC28) (Small-inducible cytokine A7) increasedin Disease COVID-19 and Provenance of this association is from https://doi.org/10.1101/2020.03.02.20029975. Protein Fibroblast growth factor 2 (FGF-2) (Basic fibroblast growth factor) (bFGF) (Hepa