Import archive.org bulk marc items #1058

Merged: 36 commits, Sep 15, 2018

Commits
4050d5c
fixes #1029 import subjects from ia metadata
hornc Aug 24, 2018
0236ecd
raise error if identifier not provided
hornc Aug 24, 2018
5cfb8f0
add docstrings
hornc Aug 27, 2018
17f45e5
raise 400 Bad Requests on error instead of 200s
hornc Aug 27, 2018
68e36fb
document add_book methods used by the import API
hornc Aug 27, 2018
f28a84a
add import from archive.org bulk_marc items
hornc Aug 27, 2018
3e417e1
comment and remove deprecated imports
hornc Aug 29, 2018
729a321
found and commented a bug in deprecated functions
hornc Aug 29, 2018
5872309
remove utils.ia find_item lookup
hornc Aug 29, 2018
0f34ee1
make it clear which fns belong to fast_parse
hornc Aug 29, 2018
c8bb135
don't keep looping if we got a response
hornc Aug 29, 2018
add6383
handle missing coverstore servers in dev
hornc Aug 29, 2018
b6f5174
return length and offset of next record when reading bulk MARC
hornc Aug 29, 2018
5e83d32
link to all initial source records in item history
hornc Aug 29, 2018
cb25df8
refactor and remove unused deprecated code
hornc Aug 29, 2018
0dd5770
remove non-functioning tests
hornc Aug 29, 2018
6b85403
fix True assert tests in add_book
hornc Aug 30, 2018
3bb8884
MarcBinary to raise exceptions itself rather than have get_ia return …
hornc Aug 30, 2018
23b8ec5
parametrize get_marc_record_from_ia tests for better reporting
hornc Aug 30, 2018
3e4e271
rename ils only import tests
hornc Sep 5, 2018
f968ba5
address docstring issues
hornc Sep 6, 2018
972f793
move MARC tests to own dir
hornc Sep 6, 2018
647cb54
move fast_parse tests to test dir
hornc Sep 6, 2018
cb88426
correct rtype
hornc Sep 6, 2018
3035437
re-add missing test data
hornc Sep 6, 2018
0c75981
raise BadMarc exception if unexpected/empty data
hornc Sep 6, 2018
0383d86
remove conditionals from tests
hornc Sep 6, 2018
82d7e5f
confirm behaviour of ia MARC import add_book
hornc Sep 6, 2018
26326ca
docstrings for load_book used in import api path
hornc Sep 6, 2018
1bf18cf
docstrings and tests for merge_marc code used by import
hornc Sep 7, 2018
0292910
DRY isbn gathering code
hornc Sep 7, 2018
1792e9b
refactor add_book matching
hornc Sep 7, 2018
51baf2e
address review comments
hornc Sep 7, 2018
c81e186
make ia urls constants
hornc Sep 12, 2018
54e2cc6
load ia base url from config
hornc Sep 12, 2018
09b6817
address review comments, refactor
hornc Sep 13, 2018
1 change: 1 addition & 0 deletions conf/openlibrary.yml
@@ -205,3 +205,4 @@ internal_tests_api_key: '8oPd1tx747YH374ohs48ZO5s2Nt1r9yD'
ia_availability_api_url: 'https://archive.org/services/loans/beta/loan/index.php' # to be deprecated in favor of _v1 below
ia_availability_api_v1_url: 'https://archive.org/services/loans/beta/loan/index.php'
ia_availability_api_v2_url: 'https://archive.org/services/availability/'
ia_base_url: 'https://archive.org'
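
The new ia_base_url key lets the importer read the Internet Archive base URL from configuration instead of hard-coding it (see the "make ia urls constants" and "load ia base url from config" commits). A minimal sketch of the intended pattern, assuming the same config accessor used for coverstore_url further down in this PR; the import path and constant name here are illustrative, not taken from the diff:

from infogami import config

IA_BASE_URL = config.get('ia_base_url', 'https://archive.org')

def ia_download_url(item_id, filename):
    # e.g. https://archive.org/download/<item_id>/<filename>
    return '%s/download/%s/%s' % (IA_BASE_URL, item_id, filename)
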
177 changes: 116 additions & 61 deletions openlibrary/catalog/add_book/__init__.py
Expand Up @@ -46,6 +46,7 @@


re_normalize = re.compile('[^[:alphanum:] ]', re.U)
re_lang = re.compile('^/languages/([a-z]{3})$')

def strip_accents(s):
"""http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
@@ -146,6 +147,24 @@ def new_work(q, rec, cover_id):
return w

def load_data(rec):
"""
Adds a new Edition to Open Library. Creates a new Work if required,
otherwise associates the new Edition with an existing Work.

:param dict rec: Edition record to add (no further checks at this point)
:rtype: dict
:return:
{
"success": False,
"error": <error msg>
}
OR
{
"success": True,
"work": {"key": <key>, "status": "created" | "modified" | "matched"},
"edition": {"key": <key>, "status": "created"}
}
"""
cover_url = None
if 'cover' in rec:
cover_url = rec['cover']
@@ -163,7 +182,6 @@ def load_data(rec):
author_in = [import_author(a, eastern=east_in_by_statement(rec, a)) for a in q.get('authors', [])]
(authors, author_reply) = build_author_reply(author_in, edits)

#q['source_records'] = [loc]
if authors:
q['authors'] = authors
reply['authors'] = author_reply
@@ -249,42 +267,47 @@ def find_match(e1, edition_pool):
if try_merge(e1, edition_key, thing):
return edition_key

def isbns_from_record(rec):
"""
Returns a list of all isbns from the various possible isbn fields.

:param dict rec: Edition import record
:rtype: list
"""
isbns = rec.get('isbn', []) + rec.get('isbn_10', []) + rec.get('isbn_13', [])
isbns = [isbn.replace('-', '').strip() for isbn in isbns]
return isbns
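# Illustrative only (not part of this diff): isbns_from_record flattens and
# de-hyphenates all three ISBN fields, e.g.
#   isbns_from_record({'isbn_10': ['0-19-852663-6'], 'isbn_13': ['978-0-19-852663-6']})
#   returns ['0198526636', '9780198526636']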

def build_pool(rec):
"""
Searches for existing edition matches on title and bibliographic keys.

:param dict rec: Edition record
:rtype: dict
:return: {<identifier: title | isbn | lccn etc>: [list of /books/OL..M keys that match rec on <identifier>]}
"""
pool = defaultdict(set)
match_fields = ('title', 'oclc_numbers', 'lccn', 'ocaid')

## Find records with matching title
assert isinstance(rec.get('title'), basestring)
q = {
'type': '/type/edition',
'normalized_title_': normalize(rec['title'])
}
pool['title'] = set(web.ctx.site.things(q))
# Find records with matching fields
for field in match_fields:
pool[field] = set(editions_matched(rec, field))

q['title'] = rec['title']
del q['normalized_title_']
pool['title'].update(web.ctx.site.things(q))
# update title pool with normalized title matches
pool['title'].update(set(editions_matched(rec, 'normalized_title_', normalize(rec['title']))))

## Find records with matching ISBNs
isbns = rec.get('isbn', []) + rec.get('isbn_10', []) + rec.get('isbn_13', [])
isbns = [isbn.replace("-", "").strip() for isbn in isbns] # strip hyphens
# Find records with matching ISBNs
isbns = isbns_from_record(rec)
if isbns:
# Make a single request to find records matching the given ISBNs
keys = web.ctx.site.things({"isbn_": isbns, 'type': '/type/edition'})
if keys:
pool['isbn'] = set(keys)

## Find records with matching oclc_numbers and lccn
for field in 'oclc_numbers', 'lccn':
values = rec.get(field, [])
if values:
for v in values:
q = {field: v, 'type': '/type/edition'}
found = web.ctx.site.things(q)
if found:
pool[field] = set(found)
pool['isbn'] = set(editions_matched(rec, 'isbn_', isbns))

return dict((k, list(v)) for k, v in pool.iteritems() if v)
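# Illustrative only (not part of this diff): a possible build_pool() result for
# a record matching one edition on title and two editions on ISBN would be
#   {'title': ['/books/OL1M'], 'isbn': ['/books/OL1M', '/books/OL2M']}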

def add_db_name(rec):
"""
db_name = Author name followed by dates.
adds 'db_name' in place for each author.
"""
if 'authors' not in rec:
return

@@ -297,49 +320,68 @@ def add_db_name(rec):
date = a.get('birth_date', '') + '-' + a.get('death_date', '')
a['db_name'] = ' '.join([a['name'], date]) if date else a['name']
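# Illustrative only (not part of this diff): an author entry such as
#   {'name': 'Mark Twain', 'birth_date': '1835', 'death_date': '1910'}
# gains 'db_name': 'Mark Twain 1835-1910' in place.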

re_lang = re.compile('^/languages/([a-z]{3})$')
def editions_matched(rec, key, value=None):
"""
Search OL for editions matching record's 'key' value.

:param dict rec: Edition import record
:param str key: Key to search on
:param list|str value: Value or Values to use, overriding record values
:rtype: list
:return: List of edition keys ["/books/OL..M",]
"""
if value is None and key not in rec:
return []

if value is None:
value = rec[key]
q = {
'type':'/type/edition',
key: value
}
ekeys = list(web.ctx.site.things(q))
return ekeys
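# Illustrative only (not part of this diff): editions_matched issues a single
# web.ctx.site.things query, e.g. editions_matched(rec, 'isbn_', ['9780198526636'])
# queries {'type': '/type/edition', 'isbn_': ['9780198526636']} and returns the
# matching '/books/OL..M' keys (an empty list if there is no match).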

def early_exit(rec):
f = 'ocaid'
# Anand - August 2014
# If openlibrary ID is already specified in the record, then use it.
# This will be the case when the item metadata already has openlibrary field.
if 'openlibrary' in rec:
return rec['openlibrary']
"""
Attempts to quickly find an existing item match using bibliographic keys.

if 'ocaid' in rec:
q = {
'type':'/type/edition',
f: rec[f],
}
ekeys = list(web.ctx.site.things(q))
if ekeys:
return ekeys[0]
:param dict rec: Edition record
:rtype: str|bool
:return: First key matched of format "/books/OL..M" or False if no match found.
"""

if 'isbn_10' or 'isbn_13' in rec:
isbns = rec.get("isbn_10", []) + rec.get("isbn_13", [])
isbns = [isbn.strip().replace("-", "") for isbn in isbns]
if 'openlibrary' in rec:
return '/books/' + rec['openlibrary']

q = {
'type':'/type/edition',
'isbn_': isbns
}
ekeys = list(web.ctx.site.things(q))
ekeys = editions_matched(rec, 'ocaid')
if ekeys:
return ekeys[0]

isbns = isbns_from_record(rec)
if isbns:
ekeys = editions_matched(rec, 'isbn_', isbns)
if ekeys:
return ekeys[0]

# only searches for the first value from these lists
for f in 'source_records', 'oclc_numbers', 'lccn':
if rec.get(f):
q = {
'type':'/type/edition',
f: rec[f][0],
}
ekeys = list(web.ctx.site.things(q))
ekeys = editions_matched(rec, f, rec[f][0])
if ekeys:
return ekeys[0]
return False

def find_exact_match(rec, edition_pool):
"""
Returns an edition key match for rec from edition_pool
Only returns a key if all values match?

:param dict rec: Edition import record
:param dict edition_pool:
:rtype: str|bool
:return: edition key
"""
seen = set()
for field, editions in edition_pool.iteritems():
for ekey in editions:
@@ -378,6 +420,14 @@ def find_exact_match(rec, edition_pool):
return False

def add_cover(cover_url, ekey):
"""
Adds a cover to coverstore and returns the cover id.

:param str cover_url: URL of cover image
:param str ekey: Edition key /book/OL..M
:rtype: int
:return: Cover id
"""
olid = ekey.split("/")[-1]
coverstore_url = config.get('coverstore_url').rstrip('/')
upload_url = coverstore_url + '/b/upload2'
@@ -391,6 +441,7 @@ def add_cover(cover_url, ekey):
'olid': olid,
'ip': web.ctx.ip,
}
reply = None
for attempt in range(10):
try:
res = urllib.urlopen(upload_url, urllib.urlencode(params))
@@ -399,10 +450,9 @@ def add_cover(cover_url, ekey):
sleep(2)
continue
body = res.read()
if body != '':
if body not in ['', 'None']:
reply = json.loads(body)
if res.getcode() == 200 and body != '':
if 'id' in reply:
if res.getcode() == 200 and 'id' in reply:
break
print 'retry, attempt', attempt
sleep(2)
@@ -479,7 +529,8 @@ def load(rec):

edition_pool = build_pool(rec)
if not edition_pool:
return load_data(rec) # 'no books in pool, loading'
# No match candidates found, add edition
return load_data(rec)

#matches = set(item for sublist in edition_pool.values() for item in sublist)
#if len(matches) == 1:
@@ -498,9 +549,11 @@ def load(rec):

match = find_match(e1, edition_pool)

if not match: # 'match found:', match, rec['ia']
if not match:
# No match found, add edition
return load_data(rec)

# We have an edition match at this point
need_work_save = False
need_edition_save = False
w = None
@@ -509,6 +562,7 @@ def load(rec):
w = e.works[0].dict()
work_created = False
else:
# Found an edition without a work
work_created = True
need_work_save = True
need_edition_save = True
@@ -517,6 +571,7 @@ def load(rec):
'title': get_title(rec),
'key': web.ctx.site.new_key('/type/work'),
}
#TODO: add edition covers and author to new work
e.works = [{'key': w['key']}]

reply = {