Merge pull request #171 from pagreene/add-ref-data

Refactor Content Updates
gyorilab · May 25, 2021 · 053df45 · 053df45
2 parents dccb4f0 + dd2a286
commit 053df45
Show file tree

Hide file tree

Showing 7 changed files with 948 additions and 633 deletions.
diff --git a/indra_db/cli/__init__.py b/indra_db/cli/__init__.py
@@ -101,15 +101,12 @@ def pipeline_stats(task):
 @click.argument("sources", nargs=-1,
                 type=click.Choice(["pubmed", "pmc_oa", "manuscripts"]),
                 required=False)
-@click.option("-n", "--num_procs", type=int, default=1,
-              help=('Select the number of processors to use during this '
-                    'operation. Default is 1.'))
 @click.option('-c', '--continuing', is_flag=True,
               help=('Continue uploading or updating, picking up where you left '
                     'off.'))
 @click.option('-d', '--debug', is_flag=True,
               help='Run with debugging level output.')
-def content_run(task, sources, num_procs, continuing, debug):
+def content_run(task, sources, continuing, debug):
     """Upload/update text refs and content on the database.
 
     \b
@@ -144,26 +141,42 @@ def content_run(task, sources, num_procs, continuing, debug):
     for ContentManager in selected_managers:
         if task == 'upload':
             print(f"Uploading {ContentManager.my_source}.")
-            ContentManager().populate(db, num_procs, continuing)
+            ContentManager().populate(db, continuing)
         elif task == 'update':
             print(f"Updating {ContentManager.my_source}")
-            ContentManager().update(db, num_procs)
+            ContentManager().update(db)
 
 
 @main.command()
-def content_list():
+@click.option('-l', '--long', is_flag=True,
+              help="Include a list of the most recently added content for all "
+                   "source types.")
+def content_list(long):
     """List the current knowledge sources and their status."""
     # Import what is needed.
     import tabulate
+    from sqlalchemy import func
     from .content import Pubmed, PmcOA, Manuscripts
+
     content_managers = [Pubmed, PmcOA, Manuscripts]
     db = get_db('primary')
 
     # Generate the rows.
-    rows = [(cm.my_source, format_date(cm.get_latest_update(db)))
-            for cm in content_managers]
-    print(tabulate.tabulate(rows, ('Source', 'Last Updated'),
-                            tablefmt='simple'))
+    source_updates = {cm.my_source: format_date(cm.get_latest_update(db))
+                      for cm in content_managers}
+    if long:
+        print("This may take a while...", end='', flush=True)
+        q = (db.session.query(db.TextContent.source,
+                              func.max(db.TextContent.insert_date))
+                       .group_by(db.TextContent.source))
+        rows = [(src, source_updates.get(src, '-'), format_date(last_insert))
+                for src, last_insert in q.all()]
+        headers = ('Source', 'Last Updated', 'Latest Insert')
+        print("\r", end='')
+    else:
+        rows = [(k, v) for k, v in source_updates.items()]
+        headers = ('Source', 'Last Updated')
+    print(tabulate.tabulate(rows, headers, tablefmt='simple'))
 
 
 @main.command()