Skip to content

Commit

Permalink
Merge pull request #207 from dianakolusheva/xdd
Browse files Browse the repository at this point in the history
xDD integration
  • Loading branch information
bgyori committed Mar 12, 2021
2 parents d789c7c + 9d4f651 commit 047a035
Show file tree
Hide file tree
Showing 10 changed files with 378 additions and 30 deletions.
1 change: 1 addition & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,4 @@ jobs:
INDRALAB_USERS_DB: ${{ secrets.INDRALAB_USERS_DB }}
EMMAADBTEST: ${{ secrets.EMMAADBTEST }}
EMAIL_SIGN_SECRET: ${{ secrets.EMAIL_SIGN_SECRET }}
XDD_API_KEY: ${{ secrets.XDD_API_KEY }}
1 change: 1 addition & 0 deletions doc/modules/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ EMMAA modules reference
readers
database
aws_lambda_functions
xdd
util
10 changes: 10 additions & 0 deletions doc/modules/xdd.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
xDD client
==========

.. automodule:: emmaa.xdd
:members:
:show-inheritance:

.. automodule:: emmaa.xdd.xdd_client
:members:
:show-inheritance:
25 changes: 25 additions & 0 deletions emmaa/analyze_tests_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -1173,6 +1173,31 @@ def _get_pmcid_title(pmcid):
return title


def _get_trid_title(trid):
    """Return the title of the paper with the given TextRef ID.

    The title stored as text content in the database is preferred. If there
    is none, the other identifiers of the text ref (PMID, then PMCID, then
    DOI) are tried in that order.

    Parameters
    ----------
    trid : int
        A TextRef ID to look up in the primary database.

    Returns
    -------
    title : str or None
        The title if one could be found, otherwise None.
    """
    db = get_db('primary')
    tc = db.select_one(db.TextContent,
                       db.TextContent.text_ref_id == trid,
                       db.TextContent.text_type == 'title')
    if tc:
        return unpack(tc.content)
    tr = db.select_one(db.TextRef, db.TextRef.id == trid)
    if not tr:
        # No text ref with this ID, so there are no identifiers to fall
        # back on (previously this raised AttributeError on tr).
        return None
    ref_dict = tr.get_ref_dict()
    if 'PMID' in ref_dict:
        pmid = ref_dict['PMID']
        pmids_to_titles = _get_pmid_titles([pmid])
        if pmid in pmids_to_titles:
            return pmids_to_titles[pmid]
    if 'PMCID' in ref_dict:
        title = _get_pmcid_title(ref_dict['PMCID'])
        if title:
            return title
    if 'DOI' in ref_dict:
        title = _get_doi_title(ref_dict['DOI'])
        if title:
            return title
    # None of the identifiers produced a title
    return None


def _get_publication_link(text_refs):
if text_refs.get('PMCID'):
name = 'PMC'
Expand Down
50 changes: 50 additions & 0 deletions emmaa/tests/test_xdd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from nose.plugins.attrib import attr
from emmaa.xdd import get_document_figures, get_figures_from_query


@attr('nonpublic')
def test_document_figures_doi():
    """Figures can be fetched directly from a DOI (no database lookup)."""
    figures = get_document_figures('10.1101/2020.08.23.20180281', 'DOI')
    assert figures
    # Every entry is a (title, image bytes) pair
    first_figure = figures[0]
    assert len(first_figure) == 2


# This would call database
@attr('notravis', 'nonpublic')
def test_document_figures_other_types():
    """Figures can be fetched from TRID, PMID and PMCID identifiers."""
    # Each of these ID types is resolved to a DOI through the database
    # before figures are requested. Keep the order TRID, PMID, PMCID.
    identifiers = (
        (31859624, 'TRID'),
        ('32838361', 'PMID'),
        ('PMC7362813', 'PMCID'),
    )
    for paper_id, paper_id_type in identifiers:
        figures = get_document_figures(paper_id, paper_id_type)
        assert figures
        assert len(figures[0]) == 2


@attr('nonpublic')
def test_figures_from_query():
    """Query results honor the limit and are (links, title, bytes) triples."""
    query = 'ATG12,ATG5'

    # Without a limit we get everything
    all_figures = get_figures_from_query(query)
    assert all_figures
    assert len(all_figures[0]) == 3
    total = len(all_figures)
    assert total > 15, total

    # A limit below the total truncates the result
    limited = get_figures_from_query(query, limit=10)
    assert limited
    assert len(limited[0]) == 3
    assert len(limited) == 10

    # A limit above the total returns the full result
    capped = get_figures_from_query(query, limit=(total+10))
    assert capped
    assert len(capped[0]) == 3
    assert len(capped) == total
3 changes: 3 additions & 0 deletions emmaa/xdd/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""This module provides an interface to query xDD content for figures and
tables."""
from .xdd_client import get_document_figures, get_figures_from_query
190 changes: 190 additions & 0 deletions emmaa/xdd/xdd_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
import os
import requests
import logging
from indra_db import get_db


# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# API key sent with every request; read once at import time and None if the
# XDD_API_KEY environment variable is not set.
api_key = os.environ.get('XDD_API_KEY')
# Endpoint queried with a DOI (used by send_document_search_request).
doc_url = 'https://xdddev.chtc.io/sets/xdd-covid-19/cosmos/api/document'
# Base URL for a single object; the object ID is appended to it
# (used by get_figure_from_document_object).
obj_url = 'https://xdddev.chtc.io/sets/xdd-covid-19/cosmos/api/object/'
# Endpoint queried with entity names (used by send_query_search_request).
# NOTE(review): doc_url and obj_url point at xdddev.chtc.io while query_url
# points at xdd.wisc.edu - confirm the mixed hosts are intentional.
query_url = 'https://xdd.wisc.edu/sets/xdd-covid-19/cosmos/api/search'


def get_document_objects(doi):
    """Get a list of figure/table object dictionaries for a given DOI.

    Result pages are requested one after another until the number of
    collected objects reaches the total reported by the first response, a
    request fails, or a page comes back empty.

    Parameters
    ----------
    doi : str
        DOI of the paper to get objects for.

    Returns
    -------
    filtered_objects : list[dict]
        Object dictionaries whose 'cls' is either 'Figure' or 'Table'.
        Empty if the first request did not return any results.
    """
    logger.info(f'Got a request to get figures for DOI {doi}')
    # Get first batch of results and find the total number of results
    rj = send_document_search_request(doi, page=0)
    if not rj:
        return []
    total = rj.get('total', 0)
    logger.info(f'Got a total of {total} objects')
    # Copy so that the response payload is not modified while paging
    objects = list(rj['objects'])
    page = 0
    while len(objects) < total:
        page += 1
        rj = send_document_search_request(doi, page=page)
        if not rj:
            logger.warning(f'Did not get results for {doi} page {page}')
            break
        new_objects = rj['objects']
        if not new_objects:
            # An empty page cannot bring us closer to the reported total,
            # so stop here instead of requesting pages forever.
            logger.warning(f'Got an empty page {page} for {doi}, stopping '
                           f'with {len(objects)} of {total} objects')
            break
        objects.extend(new_objects)
    # Objects without a 'cls' key are treated as neither figure nor table
    return [obj for obj in objects if obj.get('cls') in ('Figure', 'Table')]


def get_figure_from_document_object(obj_dict):
    """Get a figure title and bytes content from figure object dictionary.

    Parameters
    ----------
    obj_dict : dict
        An object dictionary with at least 'header_content' and 'id' keys
        (as returned by get_document_objects).

    Returns
    -------
    tuple
        A tuple of the figure title and its bytes content. The content is
        None if the response could not be decoded or did not contain it.
    """
    txt = obj_dict['header_content']
    url = f"{obj_url}{obj_dict['id']}"
    res = requests.get(url, params={'api_key': api_key})
    try:
        rj = res.json()
    except ValueError as e:
        # The body was not valid JSON (e.g. an HTML error page)
        logger.warning(f"Could not decode response for object "
                       f"{obj_dict['id']}: {e}")
        return txt, None
    try:
        b = rj['objects'][0]['children'][0]['bytes']
    except (KeyError, IndexError, TypeError):
        # Missing or empty 'objects'/'children': same fallback the function
        # already used when 'objects' was absent from the response.
        return txt, None
    return txt, b


def get_document_figures(paper_id, paper_id_type):
    """Get figures and tables from a given paper.

    A DOI is used directly. Any other supported ID type is first resolved
    to a DOI through the text ref stored in the primary database.

    Parameters
    ----------
    paper_id : str or int
        ID of a paper.
    paper_id_type : str
        A name of a paper ID type (PMID, PMCID, DOI, TRID). The name is
        case-insensitive.

    Returns
    -------
    figures : list[tuple]
        A list of tuples where each tuple is a figure title and bytes content.
        The list is empty if the ID type is not supported, if no DOI could
        be found for the paper or if no objects were returned for the DOI.
    """
    paper_id_type = paper_id_type.upper()
    if paper_id_type == 'DOI':
        doi = paper_id
    else:
        # Name of the TextRef column to match for each supported ID type
        id_columns = {'TRID': 'id', 'PMID': 'pmid', 'PMCID': 'pmcid'}
        column = id_columns.get(paper_id_type)
        if column is None:
            # Previously this fell through to an UnboundLocalError on tr
            logger.warning(f'Unsupported paper ID type {paper_id_type}, '
                           'returning 0 figures and tables')
            return []
        db = get_db('primary')
        tr = db.select_one(db.TextRef,
                           getattr(db.TextRef, column) == paper_id)
        # No matching text ref is handled like a text ref without a DOI
        doi = tr.get_ref_dict().get('DOI') if tr else None
    if not doi:
        logger.warning(f'Could not get DOI from {paper_id_type} {paper_id}, '
                       'returning 0 figures and tables')
        return []
    objects = get_document_objects(doi)
    if not objects:
        return []
    # One extra request per object to fetch its image bytes
    figures = [get_figure_from_document_object(obj) for obj in objects]
    logger.info(f'Returning {len(figures)} figures and tables.')
    return figures


def get_figures_from_query(query, limit=None):
    """Get figures and tables from a query.

    Result pages are requested one after another. With a limit, paging stops
    as soon as enough figures were collected; without one, all objects are
    collected first. In both cases paging also stops when the total reported
    by the first response is reached, a request fails, or a page comes back
    empty.

    Parameters
    ----------
    query : str
        An entity name or comma-separated entity names to query for.
    limit : int or None
        A number of figures and tables to return. A falsy value (None or 0)
        means no limit.

    Returns
    -------
    figures : list[tuple]
        A list of tuples where each tuple is a link to the paper, a figure
        title and bytes content.
    """
    logger.info(f'Got a request for query {query} with limit {limit}')
    # Get first batch of results and find the total number of results
    rj = send_query_search_request(query, page=0)
    if not rj:
        return []
    total = rj.get('total', 0)
    logger.info(f'Got a total of {total} objects')
    # Copy so that the response payload is not modified while paging
    objects = list(rj['objects'])
    page = 0
    # If there is a limit on the number of figures, we can stop as soon as
    # we reach it or when we run out of objects
    if limit:
        figures = get_figures_from_query_objects(objects)
        n_objects = len(objects)
        while len(figures) < limit and n_objects < total:
            page += 1
            rj = send_query_search_request(query, page)
            if not rj:
                logger.warning(f'Did not get results for {query}, page {page}')
                break
            new_objects = rj['objects']
            if not new_objects:
                # An empty page cannot bring us closer to the reported
                # total, so stop instead of requesting pages forever.
                logger.warning(f'Got an empty page {page} for {query}, '
                               f'stopping with {n_objects} of {total} objects')
                break
            figures += get_figures_from_query_objects(new_objects)
            n_objects += len(new_objects)
        figures = figures[:limit]
        logger.info(f'Returning {len(figures)} figures and tables.')
        return figures
    # There's no limit so we want to get all objects before getting figures
    while len(objects) < total:
        page += 1
        rj = send_query_search_request(query, page)
        if not rj:
            logger.warning(f'Did not get results for {query} page {page}')
            break
        new_objects = rj['objects']
        if not new_objects:
            # Same guard as above against paging forever
            logger.warning(f'Got an empty page {page} for {query}, '
                           f'stopping with {len(objects)} of {total} objects')
            break
        objects.extend(new_objects)
    figures = get_figures_from_query_objects(objects)
    logger.info(f'Returning {len(figures)} figures and tables.')
    return figures


def send_request(url, params):
    """Send a request and handle potential errors.

    Parameters
    ----------
    url : str
        URL to send a GET request to.
    params : dict
        Query parameters of the request. The dictionary is not modified.

    Returns
    -------
    rj : dict or None
        The decoded JSON response if it contains an 'objects' key, otherwise
        None (the problem is logged).
    """
    res = requests.get(url, params=params)
    try:
        rj = res.json()
    except ValueError as e:
        # The body was not valid JSON (e.g. an HTML error page)
        logger.info(e)
        return None
    is_dict = isinstance(rj, dict)
    if not is_dict or 'objects' not in rj:
        # Keep the API key out of the logs without touching the caller's
        # dictionary; this also works when no API key was passed at all.
        safe_params = {k: v for k, v in params.items() if k != 'api_key'}
        logger.warning(f'Could not get objects for {safe_params}')
        if is_dict and 'error' in rj:
            logger.warning(rj['error'])
        return None
    return rj


def send_query_search_request(query, page):
    """Request a single page of search results for an entity query."""
    logger.info(f'Sending a request for query {query}, page {page}')
    request_params = {
        'query': query,
        'inclusive': True,
        'page': page,
        'api_key': api_key,
    }
    return send_request(query_url, request_params)


def send_document_search_request(doi, page):
    """Request a single page of document objects for a DOI."""
    logger.info(f'Sending a request for DOI {doi}, page {page}')
    request_params = {
        'doi': doi,
        'api_key': api_key,
        'page': page,
    }
    return send_request(doc_url, request_params)


def get_figures_from_query_objects(objects):
    """Collect figures and tables from object dictionaries of the query api.

    Parameters
    ----------
    objects : list[dict]
        Object dictionaries with a 'children' list and, for objects that
        have figure or table children, links under 'bibjson'.

    Returns
    -------
    figures : list[tuple]
        One tuple per child whose 'cls' is 'Figure' or 'Table', made of the
        set of paper link URLs, the figure title and its bytes content.
    """
    wanted_classes = ('Figure', 'Table')
    collected = []
    for paper in objects:
        for element in paper['children']:
            if element['cls'] not in wanted_classes:
                continue
            title = element['header_content']
            content = element['bytes']
            # Links are only read for papers that do have a figure or table,
            # and every figure gets its own set of URLs.
            paper_urls = {entry['url'] for entry in paper['bibjson']['link']}
            collected.append((paper_urls, title, content))
    return collected

0 comments on commit 047a035

Please sign in to comment.