# Example for querying compounds connected to a known set of genes from NCATS Translator

The following functions are used to send queries to Translator and display the results.

In [32]:

import json
import csv
import glob

from kgx.transformer import Transformer
from kgx.validator import Validator
from bmt import Toolkit

from translatorpy.trapigraph import TrapiGraph
from translatorpy.translatorquery import TranslatorQuery
from translatorpy import utilities as translator_util


## Utility functions

In [20]:

def node2kgx(nid,node,biolink_classes):
    """
    Convert translator node to KGX format

    Args:
        nid: Identifier of the node; stored under ``'id'``.
        node: Mapping describing the node. ``'name'``, ``'attributes'`` and
            ``'categories'`` are read with ``.get``, so each may be absent.
        biolink_classes: Container of accepted category names; only
            categories found in it are kept.

    Returns:
        A dict with the keys ``'id'``, ``'name'``, ``'attributes'`` and
        ``'category'``. ``'category'`` is the list of accepted categories,
        or ``['biolink:NamedThing']`` when none is accepted.
    """
    # 'categories' can be missing or null; treat both as "no categories"
    # instead of failing while iterating over None.
    node_cats = node.get('categories') or []
    named_things = [cat for cat in node_cats if cat in biolink_classes]

    return {
        'id': nid,
        'name': node.get('name'),
        'attributes': node.get('attributes'),
        # Fall back to the root class when no category is recognised.
        'category': named_things or ['biolink:NamedThing'],
    }
    
def edge2kgx(eid,edge):
    """
    Convert translator edge to KGX format

    Args:
        eid: Identifier of the edge; stored under ``'id'``.
        edge: Mapping describing the edge. ``'subject'``, ``'predicate'``,
            ``'object'`` and ``'attributes'`` are read with ``.get``, so
            each may be absent.

    Returns:
        A dict with the keys ``'id'``, ``'subject'``, ``'predicate'``,
        ``'object'``, ``'relation'`` and ``'attributes'``. ``'relation'`` is
        the value of the first attribute whose ``'attribute_type_id'`` is
        ``'biolink:relation'``, or ``None`` when there is none.
        ``'attributes'`` is passed through unchanged.
    """
    attr = edge.get('attributes')

    # Edges may carry no attribute list at all, and individual attributes
    # may lack the keys inspected here; neither case should abort the
    # conversion, so both simply yield relation=None.
    relation = next(
        (
            item.get('value')
            for item in attr or []
            if item.get('attribute_type_id') == 'biolink:relation'
        ),
        None,
    )

    return {
        'id': eid,
        'subject': edge.get('subject'),
        'predicate': edge.get('predicate'),
        'object': edge.get('object'),
        'relation': relation,
        'attributes': attr,
    }


def trapi2kgx(kg,biolink_classes):
    """
    Convert a translator knowledge graph to KGX format

    ``kg`` must provide ``'nodes'`` and ``'edges'`` mappings keyed by
    identifier. Every node is converted with ``node2kgx`` and every edge
    with ``edge2kgx``; the result is a dict holding the two converted lists
    under ``'nodes'`` and ``'edges'``.
    """
    converted_nodes = []
    for node_id, node_body in kg['nodes'].items():
        converted_nodes.append(node2kgx(node_id,node_body,biolink_classes))

    converted_edges = []
    for edge_id, edge_body in kg['edges'].items():
        converted_edges.append(edge2kgx(edge_id,edge_body))

    return {'nodes': converted_nodes, 'edges': converted_edges}

def write_results(query,prefix,target,biolink_classes):
    """
    Write each usable response held by ``query`` to a KGX JSON file

    For every key in ``query.results`` whose ``'message'`` has a non-null
    ``'results'`` entry, the knowledge graph found at
    ``message.knowledge_graph`` is converted with ``trapi2kgx`` and dumped
    to ``data/kgx_files/<target>_<prefix>_<key>.json``.

    Returns the list of file names, in iteration order.
    """
    written = []

    for result_key in query.results:
        response = query.results[result_key]

        # Responses without a 'results' entry are skipped.
        if response['message'].get('results') is None:
            continue

        knowledge_graph = translator_util.getpath(response,["message","knowledge_graph"])
        kgx_doc = trapi2kgx(knowledge_graph,biolink_classes)

        out_path = "data/kgx_files/{0}_{1}_{2}.json".format(target,prefix,result_key)
        written.append(out_path)
        with open(out_path,encoding='utf-8',mode='w') as out_handle:
            json.dump(kgx_doc,out_handle,ensure_ascii=False, indent=2)

    return written



def myquery(gene_symbol,genename2ncbicurie,biolink_classes):
    """
    Query Translator for chemical entities related to one gene

    The gene symbol is looked up in ``genename2ncbicurie``, a one-hop query
    graph (gene -- biolink:related_to --> biolink:ChemicalEntity) is built
    and submitted, and the responses are written with ``write_results``.
    When at least one JSON file was written, the files are transformed to
    TSV under ``data/kgx_files/<gene_symbol>``.

    Returns 0 on success. Any exception is printed and reported as 1.
    """
    try:
        gene_curie = genename2ncbicurie[gene_symbol]

        query_graph = TrapiGraph(
            [[(0,gene_curie),(1,'biolink:ChemicalEntity'),'biolink:related_to']],
            format='SOP',
            node_data={gene_curie:['biolink:Gene']},
        )

        translator_query = TranslatorQuery()
        translator_query.query(query_graph,delay=30)
        kgx_json_paths = write_results(translator_query,'direct',gene_symbol,biolink_classes)

        if kgx_json_paths:
            source_spec = {'filename': kgx_json_paths, 'format': 'json'}
            sink_spec = {'filename': "data/kgx_files/{0}".format(gene_symbol), 'format': 'tsv'}
            converter = Transformer()
            converter.transform(input_args=source_spec, output_args=sink_spec)
    except Exception as err:
        print(err)
        return 1
    else:
        return 0

## Running queries

In [7]:
# Resolve gene symbols to NCBIGene identifiers
gene_list = ['EGFR','ERBB2']
translator_gene_names = translator_util.translate_node_name(gene_list,'NCBIGene')

# Map the first element of each resolved entry to its second element
genename2ncbicurie = {}
for resolved in translator_gene_names:
    genename2ncbicurie[resolved[0]] = resolved[1]

In [6]:
# Display the gene symbol -> NCBIGene CURIE mapping
genename2ncbicurie

{'EGFR': 'NCBIGene:1956', 'ERBB2': 'NCBIGene:2064'}

In [14]:
# Example: build the TRAPI query graph for EGFR and display it
gene = genename2ncbicurie['EGFR']
node_categories = {gene:['biolink:Gene']}

# One edge: gene (node 0) --biolink:related_to--> any biolink:ChemicalEntity (node 1)
direct_edge_list = [[(0,gene),(1,'biolink:ChemicalEntity'),'biolink:related_to']]

candidate_direct_trapi = TrapiGraph(
    direct_edge_list,
    format='SOP',
    node_data=node_categories,
)
candidate_direct_trapi.query

{'message': {'query_graph': {'edges': {'e00': {'subject': 'n00',
     'object': 'n01',
     'predicates': ['biolink:related_to']}},
   'nodes': {'n00': {'ids': ['NCBIGene:1956'], 'categories': ['biolink:Gene']},
    'n01': {'categories': ['biolink:ChemicalEntity']}}}}}

In [None]:
# Biolink Model Toolkit is used to enforce the data model
tk = Toolkit()
# Ask bmt for the class names already formatted as CURIEs
# (e.g. 'biolink:ChemicalEntity'). Rebuilding them with str.title()
# lowercases the tail of every word, so class names containing acronyms
# ("RNA product" -> "RnaProduct") would never match the categories
# carried by Translator nodes.
biolink_classes = tk.get_descendants('named thing', formatted=True)

In [24]:
# Queries are run one gene at a time. They can be parallelized with joblib:
#
#query_success = Parallel(n_jobs=-1)(delayed(myquery)(gene,genename2ncbicurie,biolink_classes) for gene in gene_list)
for gene in gene_list:
    status = myquery(gene,genename2ncbicurie,biolink_classes)
    # myquery returns 0 on success; anything else is skipped silently here
    if status != 0:
        continue
    print("{0} successfully queried".format(gene))
    

b29b5097-2aab-4e7a-8423-fc4ab5c5fffa
Still Running
Still Running
Still Running
Still Running
Still Running
Still Running
Still Running
Still Running
Still Running
Done
Done
ARS Error ara-aragorn 0
Done
Done ara-arax 500
Done
Done ara-bte 2873
Error
Error ara-unsecret 0
Done
Done kp-genetics 0
Done
Done kp-molecular 1913
Done
Done ara-explanatory 2902
Done
Done ara-improving 1000
Done
Done kp-cam 77
Done
Done kp-textmining 778
Done
Done kp-openpredict 0
Done
Done kp-icees 0
Done
Done kp-chp 20
Done
ARS Error kp-cohd 0
Done
Done kp-icees-dili 0
Replacement index 1 out of range for positional args tuple
d551f608-3e24-4ab3-939b-4e2ac7615336
Still Running
Still Running
Still Running
Still Running
Still Running
Still Running
Still Running
Still Running
Still Running
Done
Done
ARS Error ara-aragorn 0
Done
ARS Error ara-bte 0
Done
Done ara-arax 500
Error
Error ara-unsecret 0
Done
Done kp-genetics 0
Done
Done kp-molecular 555
Done
Done ara-explanatory 1947
Done
Done ara-improving 1000
Done
Done

## Post-processing

Combine the results into one file for nodes and one file for edges. Node/edge attributes can be added as columns to these files. The script kgx-neo4j.sh shows how to upload these results to a local neo4j database for analysis and visualization. 



In [34]:
import sys

# Raise the csv field size limit so that very long fields do not abort
# reading. sys.maxsize does not fit into the C long used by the csv module
# on platforms where long is 32 bits wide (OverflowError), so fall back to
# the largest value that always fits.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

def clean_up_files(ofname,single_files,fields):
    """
    Merge several tab-separated files into one, keeping selected columns

    Args:
        ofname: Path of the merged file to write (overwritten if present).
        single_files: Iterable of paths of the tab-separated input files;
            each must start with a header row.
        fields: Column names to keep, in output order. They are written as
            the header of the merged file.

    Rows are copied in the order of ``single_files``. A column listed in
    ``fields`` that an input file does not provide is written as an empty
    string, so files with differing headers can be merged.
    """
    # newline='' is required by the csv module: it keeps line breaks inside
    # quoted fields intact on reading and avoids doubled '\r' on writing.
    with open(ofname,'w',newline='') as clean_file:
        clean_writer = csv.DictWriter(clean_file,fieldnames=fields,delimiter='\t')
        clean_writer.writeheader()

        for ifname in single_files:
            with open(ifname,'r',newline='') as single_file:
                single_reader = csv.DictReader(single_file,delimiter='\t')
                for row in single_reader:
                    clean_writer.writerow({field: row.get(field, '') for field in fields})

# Columns kept in the merged node and edge files
core_node_fields=["id","category","name","provided_by","knowledge_source"]
core_edge_fields=["id","subject","predicate","object","knowledge_source",'attributes']

# glob returns matches in a filesystem-dependent order; sort them so the
# merged files have the same row order on every run.
single_node_files = sorted(glob.glob("data/kgx_files/*_nodes.tsv"))
single_edges_files = sorted(glob.glob("data/kgx_files/*_edges.tsv"))

node_fname = "data/gene_compound_nodes.tsv"
edge_fname = "data/gene_compound_edges.tsv"

clean_up_files(node_fname,single_node_files,core_node_fields)
clean_up_files(edge_fname,single_edges_files,core_edge_fields)