Non-MARC import from archive.org (#1952)
* add subjects to multivalues parsed from ia

* improve non-marc import

* add missing braces

* correct data format to catalog.add_book.load()

* add publication_date from ia item

* add other identifiers

* change history comment to avoid specifying MARC if from an IA record
hornc authored and mekarpeles committed Mar 12, 2019
1 parent 1c8e8f7 commit 8e964b5
Showing 3 changed files with 31 additions and 18 deletions.
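
In short, these three diffs let an archive.org item be imported from its metadata API response alone, without a MARC record: ia.py treats more metadata fields as multi-valued, the import API builds a fuller edition dict from the item metadata and passes it to catalog.add_book.load(), and the history comment stops labelling such imports as MARC records. A minimal sketch of that flow, written against Python 3 with a hypothetical fetch_metadata() helper standing in for the repository's urllib2-based code:

import json
from urllib.request import urlopen

def fetch_metadata(itemid):
    # Hypothetical helper: fetch the archive.org metadata API response and
    # return its 'metadata' section. In the real code, ia.get_metadata()
    # does this, plus normalization and caching (see the ia.py diff below).
    url = 'https://archive.org/metadata/%s' % itemid
    with urlopen(url) as response:
        return json.load(response).get('metadata', {})

def import_from_ia(itemid):
    # Sketch of the non-MARC path: IA metadata -> edition dict -> loader.
    metadata = fetch_metadata(itemid)
    edition = {
        'title': metadata.get('title', ''),
        'authors': [{'name': name} for name in metadata.get('creator', '').split(';')],
    }
    # get_ia_record() in the code.py diff adds publish_date, publisher,
    # identifiers and subjects; the result then goes to catalog.add_book.load().
    return edition
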
19 changes: 8 additions & 11 deletions openlibrary/core/ia.py
@@ -1,6 +1,5 @@
"""Library for interacting wih archive.org.
"""
from __future__ import print_function
import os
import urllib2
import datetime
@@ -15,16 +14,15 @@

import six

logger = logging.getLogger('openlibrary.ia')

logger = logging.getLogger("openlibrary.ia")

VALID_READY_REPUB_STATES = ["4", "19", "20", "22"]
VALID_READY_REPUB_STATES = ['4', '19', '20', '22']

def get_item_json(itemid):
    itemid = web.safestr(itemid.strip())
    url = 'http://archive.org/metadata/%s' % itemid
    try:
        stats.begin("archive.org", url=url)
        stats.begin('archive.org', url=url)
        metadata_json = urllib2.urlopen(url).read()
        stats.end()
        return simplejson.loads(metadata_json)
@@ -33,12 +31,12 @@ def get_item_json(itemid):
        return {}

def extract_item_metadata(item_json):
    metadata = process_metadata_dict(item_json.get("metadata", {}))
    metadata = process_metadata_dict(item_json.get('metadata', {}))
    if metadata:
        # if any of the files is access restricted, consider it as
        # an access-restricted item.
        files = item_json.get('files', [])
        metadata['access-restricted'] = any(f.get("private") == "true" for f in files)
        metadata['access-restricted'] = any(f.get('private') == 'true' for f in files)

        # remember the filenames to construct download links
        metadata['_filenames'] = [f['name'] for f in files]
@@ -48,7 +46,7 @@ def get_metadata(itemid):
    item_json = get_item_json(itemid)
    return extract_item_metadata(item_json)

get_metadata = cache.memcache_memoize(get_metadata, key_prefix="ia.get_metadata", timeout=5*60)
get_metadata = cache.memcache_memoize(get_metadata, key_prefix='ia.get_metadata', timeout=5*60)

def process_metadata_dict(metadata):
"""Process metadata dict to make sure multi-valued fields like
@@ -59,7 +57,7 @@ def process_metadata_dict(metadata):
    non-list cases. This function makes sure the known multi-valued fields are
    always lists.
    """
    mutlivalued = set(["collection", "external-identifier"])
    mutlivalued = set(['collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'])
    def process_item(k, v):
        if k in mutlivalued and not isinstance(v, list):
            v = [v]
@@ -74,7 +72,7 @@ def _old_get_meta_xml(itemid):
    itemid = web.safestr(itemid.strip())
    url = 'http://www.archive.org/download/%s/%s_meta.xml' % (itemid, itemid)
    try:
        stats.begin("archive.org", url=url)
        stats.begin('archive.org', url=url)
        metaxml = urllib2.urlopen(url).read()
        stats.end()
    except IOError:
@@ -130,7 +128,6 @@ def xml2dict(xml, **defaults):
def _get_metadata(itemid):
"""Returns metadata by querying the archive.org metadata API.
"""
print("_get_metadata", itemid, file=web.debug)
url = "http://www.archive.org/metadata/%s" % itemid
try:
stats.begin("archive.org", url=url)
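
The expanded set of multi-valued fields above is what the first commit bullet ("add subjects to multivalues parsed from ia") delivers: fields such as subject, isbn and oclc-id can come back from the metadata API as either a single string or a list, and downstream code should only ever see lists. A standalone sketch of that normalization, assuming the same field set as the diff:

MULTIVALUED = {'collection', 'external-identifier', 'isbn', 'subject', 'oclc-id'}

def normalize_multivalued(metadata):
    # Mirrors the behaviour of process_metadata_dict()/process_item():
    # wrap single values of known multi-valued fields in a list.
    normalized = {}
    for key, value in metadata.items():
        if key in MULTIVALUED and not isinstance(value, list):
            value = [value]
        normalized[key] = value
    return normalized

# Example: a lone subject string becomes a one-element list.
print(normalize_multivalued({'subject': 'Botany', 'title': 'Flora'}))
# -> {'subject': ['Botany'], 'title': 'Flora'}
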
25 changes: 20 additions & 5 deletions openlibrary/plugins/importapi/code.py
@@ -284,16 +284,31 @@ def get_ia_record(self, metadata):
        :rtype: dict
        :return: Edition record
        """
        #TODO: include identifiers: isbn, oclc, lccn
        authors = [{'name': name} for name in metadata.get('creator', '').split(';')]
        description = metadata.get('description')
        isbn = metadata.get('isbn')
        language = metadata.get('language')
        lccn = metadata.get('lccn')
        subject = metadata.get('subject')
        oclc = metadata.get('oclc-id')
        d = {
            "title": metadata.get('title', ''),
            "authors": authors,
            "language": metadata.get('language', ''),
            'title': metadata.get('title', ''),
            'authors': authors,
            'publish_date': metadata.get('date'),
            'publisher': metadata.get('publisher'),
        }
        if description:
            d['description'] = description
        if isbn:
            d['isbn'] = isbn
        if language and len(language) == 3:
            d['languages'] = [language]
        if lccn:
            d['lccn'] = [lccn]
        if subject:
            d["subjects"] = isinstance(subject, list) and subject or [subject]
            d['subjects'] = subject
        if oclc:
            d['oclc'] = oclc
        return d

    def load_book(self, edition_data):
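
For reference, a standalone approximation of get_ia_record() as it stands after this commit, applied to an invented metadata dict to show the shape of the edition record it now produces (field names follow the diff; the sample values are illustrative only):

def ia_metadata_to_edition(metadata):
    # Approximation of get_ia_record() after this change: required fields
    # first, then optional identifiers, language and subjects when present.
    edition = {
        'title': metadata.get('title', ''),
        'authors': [{'name': name} for name in metadata.get('creator', '').split(';')],
        'publish_date': metadata.get('date'),
        'publisher': metadata.get('publisher'),
    }
    if metadata.get('description'):
        edition['description'] = metadata['description']
    if metadata.get('isbn'):
        edition['isbn'] = metadata['isbn']
    language = metadata.get('language')
    if language and len(language) == 3:  # three-letter language codes only, as in the diff
        edition['languages'] = [language]
    if metadata.get('lccn'):
        edition['lccn'] = [metadata['lccn']]
    if metadata.get('subject'):
        edition['subjects'] = metadata['subject']  # already a list after the ia.py change
    if metadata.get('oclc-id'):
        edition['oclc'] = metadata['oclc-id']
    return edition

sample = {
    'title': 'An Example Flora',
    'creator': 'Doe, Jane',
    'date': '1901',
    'publisher': 'Example Press',
    'language': 'eng',
    'subject': ['Botany'],
}
print(ia_metadata_to_edition(sample))
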
5 changes: 3 additions & 2 deletions openlibrary/templates/history/comment.html
@@ -14,7 +14,8 @@
$if record.source_name == "amazon.com":
Inital record created, from an $:link(record.source_url, "amazon.com") <a href="$record.url">record</a>.
$else:
Initial record created, from $:link(record.source_url, record.source_name) <a href="$record.url">MARC record</a>.
$ record_type = 'item' if record.source_name == 'Internet Archive' else 'MARC'
Initial record created, from $:link(record.source_url, record.source_name) <a href="$record.url">$record_type record</a>.
$else:
Found a <a href="$record.url">matching record</a> from $:link(record.source_url, record.source_name).
$elif "history_v2" in ctx.features:
@@ -23,4 +24,4 @@
$elif v.comment:
$v.comment
$else:
<em>$_("Edited without comment.")</em>
<em>$_("Edited without comment.")</em>
