Non-MARC import from archive.org (#1952)
* add subjects to multivalues parsed from ia

* improve non-marc import

* add missing braces

* correct data format to catalog.add_book.load()

* add publication_date from ia item

* add other identifiers

* change history comment to avoid specifying MARC if from an IA record
hornc authored and mekarpeles committed Mar 12, 2019
1 parent 1c8e8f7 commit 8e964b5
Showing 3 changed files with 31 additions and 18 deletions.
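
In short, these three diffs let an archive.org item be imported from its metadata API response alone, without a MARC record: ia.py treats more metadata fields as multi-valued, the import API builds a fuller edition dict from the item metadata and passes it to catalog.add_book.load(), and the history comment stops labelling such imports as MARC records. A minimal sketch of that flow, written against Python 3 with a hypothetical fetch_metadata() helper standing in for the repository's urllib2-based code:

import json
from urllib.request import urlopen

def fetch_metadata(itemid):
    # Hypothetical helper: fetch the archive.org metadata API response and
    # return its 'metadata' section. In the real code, ia.get_metadata()
    # does this, plus normalization and caching (see the ia.py diff below).
    url = 'https://archive.org/metadata/%s' % itemid
    with urlopen(url) as response:
        return json.load(response).get('metadata', {})

def import_from_ia(itemid):
    # Sketch of the non-MARC path: IA metadata -> edition dict -> loader.
    metadata = fetch_metadata(itemid)
    edition = {
        'title': metadata.get('title', ''),
        'authors': [{'name': name} for name in metadata.get('creator', '').split(';')],
    }
    # get_ia_record() in the code.py diff adds publish_date, publisher,
    # identifiers and subjects; the result then goes to catalog.add_book.load().
    return edition
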
19 changes: 8 additions & 11 deletions openlibrary/core/ia.py
@@ -1,6 +1,5 @@
"""Library for interacting wih archive.org.
"""
from __future__ import print_function
import os
import urllib2
import datetime
@@ -15,16 +14,15 @@

import six

logger = logging.getLogger('openlibrary.ia')

logger = logging.getLogger("openlibrary.ia")

VALID_READY_REPUB_STATES = ["4", "19", "20", "22"]
VALID_READY_REPUB_STATES = ['4', '19', '20', '22']

def get_item_json(itemid):
    itemid = web.safestr(itemid.strip())
    url = 'http://archive.org/metadata/%s' % itemid
    try:
        stats.begin("archive.org", url=url)
        stats.begin('archive.org', url=url)
        metadata_json = urllib2.urlopen(url).read()
        stats.end()
        return simplejson.loads(metadata_json)
@@ -33,12 +31,12 @@ def get_item_json(itemid):
        return {}

def extract_item_metadata(item_json):
    metadata = process_metadata_dict(item_json.get("metadata", {}))
    metadata = process_metadata_dict(item_json.get('metadata', {}))
    if metadata:
        # if any of the files is access restricted, consider it as
        # an access-restricted item.
        files = item_json.get('files', [])
        metadata['access-restricted'] = any(f.get("private") == "true" for f in files)
        metadata['access-restricted'] = any(f.get('private') == 'true' for f in files)

        # remember the filenames to construct download links
        metadata['_filenames'] = [f['name'] for f in files]
@@ -48,7 +46,7 @@ def get_metadata(itemid):
    item_json = get_item_json(itemid)
    return extract_item_metadata(item_json)

get_metadata = cache.memcache_memoize(get_metadata, key_prefix="ia.get_metadata", timeout=5*60)
get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60)

def process_metadata_dict(metadata):
"""Process metadata dict to make sure multi-valued fields like
@@ -59,7 +57,7 @@ def process_metadata_dict(metadata):
    non-list cases. This function makes sure the known multi-valued fields are
    always lists.
    """
    mutlivalued = set(["collection", "external-identifier"])
    mutlivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'])
    def process_item(k, v):
        if k in mutlivalued and not isinstance(v, list):
            v = [v]
@@ -74,7 +72,7 @@ def _old_get_meta_xml(itemid):
    itemid = web.safestr(itemid.strip())
    url = 'http://www.archive.org/download/%s/%s_meta.xml' % (itemid, itemid)
    try:
        stats.begin("archive.org", url=url)
        stats.begin('archive.org', url=url)
        metaxml = urllib2.urlopen(url).read()
        stats.end()
    except IOError:
@@ -130,7 +128,6 @@ def xml2dict(xml, **defaults):
def _get_metadata(itemid):
"""Returns metadata by querying the archive.org metadata API.
"""
print("_get_metadata", itemid, file=web.debug)
url = "http://www.archive.org/metadata/%s" % itemid
try:
stats.begin("archive.org", url=url)
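
The expanded set of multi-valued fields above is what the first commit bullet ("add subjects to multivalues parsed from ia") delivers: fields such as subject, isbn and oclc-id can come back from the metadata API as either a single string or a list, and downstream code should only ever see lists. A standalone sketch of that normalization, assuming the same field set as the diff:

MULTIVALUED = {'collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'}

def normalize_multivalued(metadata):
    # Mirrors the behaviour of process_metadata_dict()/process_item():
    # wrap single values of known multi-valued fields in a list.
    normalized = {}
    for key, value in metadata.items():
        if key in MULTIVALUED and not isinstance(value, list):
            value = [value]
        normalized[key] = value
    return normalized

# Example: a lone subject string becomes a one-element list.
print(normalize_multivalued({'subject': 'Botany', 'title': 'Flora'}))
# -> {'subject': ['Botany'], 'title': 'Flora'}
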
25 changes: 20 additions & 5 deletions openlibrary/plugins/importapi/code.py
@@ -284,16 +284,31 @@ def get_ia_record(self, metadata):
        :rtype: dict
        :return: Edition record
        """
        #TODO: include identifiers: isbn, oclc, lccn
        authors = [{'name': name} for name in metadata.get('creator', '').split(';')]
        description = metadata.get('description')
        isbn = metadata.get('isbn')
        language = metadata.get('language')
        lccn = metadata.get('lccn')
        subject = metadata.get('subject')
        oclc = metadata.get('oclc-id')
        d = {
            "title": metadata.get('title', ''),
            "authors": authors,
            "language": metadata.get('language', ''),
            'title': metadata.get('title', ''),
            'authors': authors,
            'publish_date': metadata.get('date'),
            'publisher': metadata.get('publisher'),
        }
        if description:
            d['description'] = description
        if isbn:
            d['isbn'] = isbn
        if language and len(language) == 3:
            d['languages'] = [language]
        if lccn:
            d['lccn'] = [lccn]
        if subject:
            d["subjects"] = isinstance(subject, list) and subject or [subject]
            d['subjects'] = subject
        if oclc:
            d['oclc'] = oclc
        return d

    def load_book(self, edition_data):
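
For reference, a standalone approximation of get_ia_record() as it stands after this commit, applied to an invented metadata dict to show the shape of the edition record it now produces (field names follow the diff; the sample values are illustrative only):

def ia_metadata_to_edition(metadata):
    # Approximation of get_ia_record() after this change: required fields
    # first, then optional identifiers, language and subjects when present.
    edition = {
        'title': metadata.get('title', ''),
        'authors': [{'name': name} for name in metadata.get('creator', '').split(';')],
        'publish_date': metadata.get('date'),
        'publisher': metadata.get('publisher'),
    }
    if metadata.get('description'):
        edition['description'] = metadata['description']
    if metadata.get('isbn'):
        edition['isbn'] = metadata['isbn']
    language = metadata.get('language')
    if language and len(language) == 3:  # three-letter language codes only, as in the diff
        edition['languages'] = [language]
    if metadata.get('lccn'):
        edition['lccn'] = [metadata['lccn']]
    if metadata.get('subject'):
        edition['subjects'] = metadata['subject']  # already a list after the ia.py change
    if metadata.get('oclc-id'):
        edition['oclc'] = metadata['oclc-id']
    return edition

sample = {
    'title': 'An Example Flora',
    'creator': 'Doe, Jane',
    'date': '1901',
    'publisher': 'Example Press',
    'language': 'eng',
    'subject': ['Botany'],
}
print(ia_metadata_to_edition(sample))
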
5 changes: 3 additions & 2 deletions openlibrary/templates/history/comment.html
@@ -14,7 +14,8 @@
$if record.source_name == "amazon.com":
Inital record created, from an $:link(record.source_url, "amazon.com") <a href="$record.url">record</a>.
$else:
Initial record created, from $:link(record.source_url, record.source_name) <a href="$record.url">MARC record</a>.
$ record_type = 'item' if record.source_name == 'Internet Archive' else 'MARC'
Initial record created, from $:link(record.source_url, record.source_name) <a href="$record.url">$record_type record</a>.
$else:
Found a <a href="$record.url">matching record</a> from $:link(record.source_url, record.source_name).
$elif "history_v2" in ctx.features:
@@ -23,4 +24,4 @@
$elif v.comment:
$v.comment
$else:
<em>$_("Edited without comment.")</em>
<em>$_("Edited without comment.")</em>
