-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #207 from dianakolusheva/xdd
xDD integration
- Loading branch information
Showing
10 changed files
with
378 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,4 +16,5 @@ EMMAA modules reference | |
readers | ||
database | ||
aws_lambda_functions | ||
xdd | ||
util |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
xDD client | ||
========== | ||
|
||
.. automodule:: emmaa.xdd | ||
:members: | ||
:show-inheritance: | ||
|
||
.. automodule:: emmaa.xdd.xdd_client | ||
:members: | ||
:show-inheritance: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
from nose.plugins.attrib import attr | ||
from emmaa.xdd import get_document_figures, get_figures_from_query | ||
|
||
|
||
@attr('nonpublic')
def test_document_figures_doi():
    """Figures can be fetched directly when the paper ID is a DOI."""
    figures = get_document_figures('10.1101/2020.08.23.20180281', 'DOI')
    assert figures
    # Each entry should be a tuple of a title and image bytes
    first_figure = figures[0]
    assert len(first_figure) == 2
|
||
|
||
# This would call database | ||
@attr('notravis', 'nonpublic')
def test_document_figures_other_types():
    """Figures can be fetched for each non-DOI paper ID type."""
    # Should get results from different paper ID types
    paper_ids = (
        (31859624, 'TRID'),
        ('32838361', 'PMID'),
        ('PMC7362813', 'PMCID'),
    )
    for paper_id, id_type in paper_ids:
        figures = get_document_figures(paper_id, id_type)
        assert figures
        assert len(figures[0]) == 2
|
||
|
||
@attr('nonpublic')
def test_figures_from_query():
    """Query results respect the optional limit on returned figures."""
    entities = 'ATG12,ATG5'

    # Without a limit the full result is returned
    all_figures = get_figures_from_query(entities)
    assert all_figures
    assert len(all_figures[0]) == 3
    total = len(all_figures)
    assert total > 15, total

    # A limit smaller than the total truncates the result
    limited = get_figures_from_query(entities, limit=10)
    assert limited
    assert len(limited[0]) == 3
    assert len(limited) == 10

    # A limit larger than the total returns everything available
    oversized = get_figures_from_query(entities, limit=(total + 10))
    assert oversized
    assert len(oversized[0]) == 3
    assert len(oversized) == total
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
"""This modules provides an interface to query xDD content for figures and | ||
tables.""" | ||
from .xdd_client import get_document_figures, get_figures_from_query |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,190 @@ | ||
import os | ||
import requests | ||
import logging | ||
from indra_db import get_db | ||
|
||
|
||
# Module-level logger named after this module
logger = logging.getLogger(__name__)

# API key sent with every request; None when XDD_API_KEY is not set
api_key = os.getenv('XDD_API_KEY')

# Endpoints of the COSMOS API for the xdd-covid-19 set.
# NOTE(review): the document and object endpoints are on xdddev.chtc.io
# while the search endpoint is on xdd.wisc.edu - confirm this is intended.
query_url = 'https://xdd.wisc.edu/sets/xdd-covid-19/cosmos/api/search'
doc_url = 'https://xdddev.chtc.io/sets/xdd-covid-19/cosmos/api/document'
obj_url = 'https://xdddev.chtc.io/sets/xdd-covid-19/cosmos/api/object/'
|
||
|
||
def get_document_objects(doi):
    """Get a list of figure/table object dictionaries for a given DOI.

    Parameters
    ----------
    doi : str
        DOI of the paper to get objects for.

    Returns
    -------
    list[dict]
        Object dictionaries whose 'cls' is 'Figure' or 'Table'. Empty if
        the first page request did not return results.
    """
    logger.info(f'Got a request to get figures for DOI {doi}')
    # Get first batch of results and find the total number of results
    rj = send_document_search_request(doi, page=0)
    if not rj:
        return []
    total = rj.get('total', 0)
    logger.info(f'Got a total of {total} objects')
    # Copy so that accumulating pages does not mutate the response payload
    objects = list(rj['objects'])
    page = 0
    while len(objects) < total:
        page += 1
        rj = send_document_search_request(doi, page=page)
        if not rj:
            logger.warning(f'Did not get results for {doi} page {page}')
            break
        new_objects = rj['objects']
        if not new_objects:
            # An empty page makes no progress towards the reported total;
            # stop here rather than requesting pages forever
            logger.warning(f'Got an empty page for {doi} page {page}')
            break
        objects += new_objects
    # Objects without a 'cls' key are skipped rather than raising
    return [obj for obj in objects if obj.get('cls') in ('Figure', 'Table')]
|
||
|
||
def get_figure_from_document_object(obj_dict):
    """Get a figure title and bytes content from figure object dictionary.

    Parameters
    ----------
    obj_dict : dict
        An object dictionary with 'header_content' and 'id' keys.

    Returns
    -------
    tuple
        The 'header_content' of the given object and the 'bytes' of the
        first child of the first object returned for its ID. The second
        element is None if the response cannot be parsed as JSON or does
        not contain that content.
    """
    txt = obj_dict['header_content']
    url = f"{obj_url}{obj_dict['id']}"
    res = requests.get(url, params={'api_key': api_key})
    try:
        rj = res.json()
    except ValueError:
        # Response.json() raises a ValueError subclass on an invalid body
        logger.warning(f"Could not parse response for object "
                       f"{obj_dict['id']}")
        return txt, None
    try:
        b = rj['objects'][0]['children'][0]['bytes']
    except (KeyError, IndexError, TypeError):
        # Missing keys or empty 'objects'/'children' lists: treat it the
        # same way as a response without objects
        return txt, None
    return txt, b
|
||
|
||
def get_document_figures(paper_id, paper_id_type):
    """Get figures and tables from a given paper.

    Parameters
    ----------
    paper_id : str or int
        ID of a paper.
    paper_id_type : str
        A name of a paper ID type (PMID, PMCID, DOI, TRID). The name is
        matched case-insensitively.

    Returns
    -------
    figures : list[tuple]
        A list of tuples where each tuple is a figure title and bytes content.
        Empty if no DOI could be found for the paper or no objects were
        returned for it.

    Raises
    ------
    ValueError
        If paper_id_type is not one of the supported ID types.
    """
    paper_id_type = paper_id_type.upper()
    if paper_id_type == 'DOI':
        doi = paper_id
    else:
        # Name of the TextRef column to match for each supported ID type
        id_columns = {'TRID': 'id', 'PMID': 'pmid', 'PMCID': 'pmcid'}
        if paper_id_type not in id_columns:
            # Fail with a clear message before touching the database
            raise ValueError(
                f'Unsupported paper ID type {paper_id_type}, expected one '
                f'of DOI, TRID, PMID, PMCID')
        db = get_db('primary')
        column = getattr(db.TextRef, id_columns[paper_id_type])
        tr = db.select_one(db.TextRef, column == paper_id)
        if not tr:
            # No matching text ref, so there is nothing to get a DOI from
            logger.warning(f'Could not find a text ref for {paper_id_type} '
                           f'{paper_id}, returning 0 figures and tables')
            return []
        doi = tr.get_ref_dict().get('DOI')
    if not doi:
        logger.warning(f'Could not get DOI from {paper_id_type} {paper_id}, '
                       'returning 0 figures and tables')
        return []
    objects = get_document_objects(doi)
    if not objects:
        return []
    figures = [get_figure_from_document_object(obj) for obj in objects]
    logger.info(f'Returning {len(figures)} figures and tables.')
    return figures
|
||
|
||
def get_figures_from_query(query, limit=None):
    """Get figures and tables from a query.

    Parameters
    ----------
    query : str
        An entity name or comma-separated entity names to query for.
    limit : int or None
        A number of figures and tables to return. A falsy value (None or 0)
        means that all available figures and tables are returned.

    Returns
    -------
    figures : list[tuple]
        A list of tuples where each tuple is a link to the paper, a figure
        title and bytes content. Empty if the first page request did not
        return results.
    """
    logger.info(f'Got a request for query {query} with limit {limit}')
    # Get first batch of results and find the total number of results
    rj = send_query_search_request(query, page=0)
    if not rj:
        return []
    total = rj.get('total', 0)
    logger.info(f'Got a total of {total} objects')
    # Copy so that accumulating pages does not mutate the response payload
    objects = list(rj['objects'])
    page = 0
    # If there is a limit on the number of figures, we can stop as soon as
    # we reach it or when we run out of objects
    if limit:
        figures = get_figures_from_query_objects(objects)
        # Only the count of objects seen so far is needed in this branch
        n_objects = len(objects)
        while len(figures) < limit and n_objects < total:
            page += 1
            rj = send_query_search_request(query, page)
            if not rj:
                logger.warning(f'Did not get results for {query}, page {page}')
                break
            new_objects = rj['objects']
            if not new_objects:
                # An empty page makes no progress towards the reported
                # total; stop here rather than requesting pages forever
                logger.warning(f'Got an empty page for {query}, page {page}')
                break
            figures += get_figures_from_query_objects(new_objects)
            n_objects += len(new_objects)
        figures = figures[:limit]
        logger.info(f'Returning {len(figures)} figures and tables.')
        return figures
    # There's no limit so we want to get all objects before getting figures
    while len(objects) < total:
        page += 1
        rj = send_query_search_request(query, page)
        if not rj:
            logger.warning(f'Did not get results for {query} page {page}')
            break
        new_objects = rj['objects']
        if not new_objects:
            # Same guard as above against a page that adds nothing
            logger.warning(f'Got an empty page for {query} page {page}')
            break
        objects += new_objects
    figures = get_figures_from_query_objects(objects)
    logger.info(f'Returning {len(figures)} figures and tables.')
    return figures
|
||
|
||
def send_request(url, params):
    """Send a request and handle potential errors.

    Parameters
    ----------
    url : str
        URL to send the GET request to.
    params : dict
        Query parameters of the request. This dictionary is not modified.

    Returns
    -------
    dict or None
        The JSON content of the response if it contains an 'objects' key,
        otherwise None (the request failed, the response was not valid
        JSON, or it did not contain objects).
    """
    # Log a copy without the API key instead of popping it from the caller's
    # dictionary (which also failed when the key was absent)
    log_params = {k: v for k, v in params.items() if k != 'api_key'}
    try:
        res = requests.get(url, params=params)
    except requests.exceptions.RequestException as e:
        # Only the exception type is logged: the message of a connection
        # error can contain the full URL including the API key
        logger.warning(f'Request failed for {log_params}: '
                       f'{type(e).__name__}')
        return None
    try:
        rj = res.json()
    except ValueError as e:
        # Response.json() raises a ValueError subclass on an invalid body
        logger.warning(f'Could not parse response for {log_params}: {e}')
        return None
    if not isinstance(rj, dict) or 'objects' not in rj:
        logger.warning(f'Could not get objects for {log_params}')
        if isinstance(rj, dict) and 'error' in rj:
            logger.warning(rj['error'])
        return None
    return rj
|
||
|
||
def send_query_search_request(query, page):
    """Request a single page of search results for a query.

    Returns whatever send_request returns for the search endpoint.
    """
    logger.info(f'Sending a request for query {query}, page {page}')
    search_params = {
        'query': query,
        'inclusive': True,
        'page': page,
        'api_key': api_key,
    }
    return send_request(query_url, search_params)
|
||
|
||
def send_document_search_request(doi, page):
    """Request a single page of document results for a DOI.

    Returns whatever send_request returns for the document endpoint.
    """
    logger.info(f'Sending a request for DOI {doi}, page {page}')
    document_params = {
        'doi': doi,
        'api_key': api_key,
        'page': page,
    }
    return send_request(doc_url, document_params)
|
||
|
||
def get_figures_from_query_objects(objects):
    """Collect figures and tables from object dictionaries of the query API.

    Parameters
    ----------
    objects : list[dict]
        Object dictionaries, each with a 'children' list and, for objects
        that have figure or table children, a 'bibjson' entry with a list
        of links.

    Returns
    -------
    list[tuple]
        One tuple per child whose 'cls' is 'Figure' or 'Table', made of the
        set of paper URLs, the child's 'header_content' and its 'bytes'.
    """
    wanted_classes = ['Figure', 'Table']
    figures = []
    for entry in objects:
        for item in entry['children']:
            if item['cls'] not in wanted_classes:
                continue
            title = item['header_content']
            content = item['bytes']
            # The links are only read for figure/table children, and each
            # tuple gets its own set of URLs
            paper_urls = {ref['url'] for ref in entry['bibjson']['link']}
            figures.append((paper_urls, title, content))
    return figures
Oops, something went wrong.