Skip to content

Commit

Permalink
Merge pull request #113 from pagreene/xdd-updat
Browse files Browse the repository at this point in the history
Update xDD manager to handle extra information included in tag.
  • Loading branch information
pagreene committed Jun 10, 2020
2 parents a8d70f3 + a418c40 commit e120cc6
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 9 deletions.
37 changes: 29 additions & 8 deletions indra_db/managers/xdd_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def load_statements(self, db):
for group in self.groups:
logger.info(f"Processing {group.key}")
file_pair_dict = _get_file_pairs_from_group(s3, group)
for run_id, (bibs, stmts) in file_pair_dict.items():
for (run_id, id_src), (bibs, stmts) in file_pair_dict.items():
logger.info(f"Loading {run_id}")
doi_lookup = {bib['_xddid']: bib['identifier'][0]['id'].upper()
for bib in bibs if 'identifier' in bib}
Expand All @@ -64,8 +64,12 @@ def load_statements(self, db):

self.statements[trid][ev['text_refs']['READER']].append(sj)
if trid not in self.text_content:
if id_src:
src = f'xdd-{id_src}'
else:
src = 'xdd'
self.text_content[trid] = \
(trid, 'xdd', 'xdd', 'fulltext',
(trid, src, 'xdd', 'fulltext',
pub_lookup[xddid] == 'bioRxiv')
return

Expand All @@ -79,7 +83,7 @@ def dump_statements(self, db):
tcids = db.select_all(
[db.TextContent.text_ref_id, db.TextContent.id],
db.TextContent.text_ref_id.in_(self.statements.keys()),
db.TextContent.source == 'xdd'
db.TextContent.format == 'xdd'
)
tcid_lookup = {trid: tcid for trid, tcid in tcids}

Expand Down Expand Up @@ -131,29 +135,46 @@ def run(self, db):
self.dump_statements(db)


class XDDFileError(Exception):
    """Raised when an XDD file key does not match a known naming format."""


def _get_file_pairs_from_group(s3, group: S3Path):
    """Collect paired bib/stmts JSON files from an S3 group prefix.

    File keys take the form ``<run_id>_<suffix>`` or
    ``<run_id>_<id_src>_<suffix>`` (any other shape raises
    :class:`XDDFileError`), where ``<suffix>`` begins with the file type
    (``bib`` or ``stmts``).

    Parameters
    ----------
    s3 :
        A client exposing ``get_object`` (presumably a boto3 S3 client —
        TODO confirm against callers).
    group : S3Path
        The S3 prefix whose objects are listed and loaded.

    Returns
    -------
    dict
        Maps ``(run_id, id_src)`` (``id_src`` is None when absent from the
        key) to a ``(bib_json, stmts_json)`` tuple. Batches that failed to
        load or that lack either file are skipped with a log message.
    """
    files = group.list_objects(s3)
    file_pairs = defaultdict(dict)
    for file_path in files:
        # Get information from the filename, including the cases with and
        # without the id_src label.
        parts = file_path.key.split('_')
        if len(parts) == 2:
            run_id, file_suffix = parts
            id_src = None
        elif len(parts) == 3:
            run_id, id_src, file_suffix = parts
        else:
            raise XDDFileError(f"XDD file does not match known standards: "
                               f"{file_path.key}")
        file_type = file_suffix.split('.')[0]

        # Try getting the file; on failure, drop the whole batch so a
        # partial (bib-only or stmts-only) batch is never returned.
        try:
            file_obj = s3.get_object(**file_path.kw())
            file_json = json.loads(file_obj['Body'].read())
            file_pairs[(run_id, id_src)][file_type] = file_json
        except Exception as e:
            logger.error(f"Failed to load {file_path}")
            logger.exception(e)
            # Bug fix: keys are (run_id, id_src) tuples, so the previous
            # bare `run_id` lookup never matched and failed batches leaked
            # through to the pairing step below.
            if (run_id, id_src) in file_pairs:
                del file_pairs[(run_id, id_src)]

    # Create a dict of tuples from the pairs of files.
    ret = {}
    for batch_id, batch_files in file_pairs.items():
        # Require exactly the 'bib' and 'stmts' files for a usable batch.
        if len(batch_files) != 2 or 'bib' not in batch_files \
                or 'stmts' not in batch_files:
            logger.warning(f"Run {batch_id} does not have both 'bib' and "
                           f"'stmts' in files: {batch_files.keys()}. "
                           f"Skipping.")
            continue
        ret[batch_id] = (batch_files['bib'], batch_files['stmts'])
    return ret


Expand Down
2 changes: 1 addition & 1 deletion indra_db/reading/read_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ def iter_over_content(self):
tc_query = self._db.filter_query(
self._db.TextContent,
self._db.TextContent.id.in_(self.tcids),
self._db.TextContent.source != 'xdd'
self._db.TextContent.format != 'xdd'
)

if self.reading_mode != 'all':
Expand Down

0 comments on commit e120cc6

Please sign in to comment.