Skip to content
Permalink
Browse files

Merge pull request #2186 from hornc/1571/orphaned-redirect-authors

Resolve redirects on orphaned editions when importing
  • Loading branch information...
mekarpeles committed Jun 13, 2019
2 parents 34b58c8 + 0f69279 commit e10c3f40683905949daf40e9d7b1d5bbe3993de1
@@ -22,8 +22,6 @@
response = load(record)
"""
from __future__ import print_function

import re
import json
from time import sleep
@@ -263,11 +261,22 @@ def load_data(rec, account=None):
return reply

def is_redirect(thing):
    """
    Check whether an OL Thing record is a redirect.

    :param Thing thing: an OL record (or a falsy placeholder)
    :rtype: bool
    :return: True only for a truthy thing whose type key is /type/redirect
    """
    return bool(thing) and thing.type.key == '/type/redirect'

def find_match(e1, edition_pool):
"""
Find the best match for e1 in edition_pool and return its key.
:param dict e1: the new edition we are trying to match
:param list edition_pool: list of possible edition matches
:rtype: str|None
:return: None or the edition key '/books/OL...M' of the best edition match for e1 in edition_pool
"""
seen = set()
for k, v in edition_pool.iteritems():
for edition_key in v:
@@ -282,7 +291,6 @@ def find_match(e1, edition_pool):
found = False
break
if is_redirect(thing):
print('following redirect %s => %s' % (edition_key, thing['location']))
edition_key = thing['location']
if not found:
continue
@@ -465,7 +473,6 @@ def add_cover(cover_url, ekey, account=None):
try:
res = urllib.urlopen(upload_url, urllib.urlencode(params))
except IOError:
print('retry, attempt', attempt)
sleep(2)
continue
body = res.read()
@@ -475,7 +482,6 @@ def add_cover(cover_url, ekey, account=None):
reply = json.loads(body)
if res.getcode() == 200 and 'id' in reply:
break
print('retry, attempt', attempt)
sleep(2)
if not reply or reply.get('message') == 'Invalid URL':
return
@@ -511,7 +517,7 @@ def create_ol_subjects_for_ocaid(ocaid, subjects):
return ("success for %s" % item.identifier)

def update_ia_metadata_for_ol_edition(edition_id):
"""An ol_edition is of the form OL...M"""
"""edition_id is of the form OL...M"""

data = {'error': 'No qualifying edition'}
if edition_id:
@@ -572,6 +578,15 @@ def load(rec, account=None):
need_work_save = need_edition_save = False
w = None
e = web.ctx.site.get(match)
# check for, and resolve, author redirects
for a in e.authors:
while is_redirect(a):
if a in e.authors:
e.authors.remove(a)
a = web.ctx.site.get(a.location)
if not is_redirect(a):
e.authors.append(a)

if e.get('works'):
w = e.works[0].dict()
work_created = False
@@ -623,7 +638,7 @@ def load(rec, account=None):

edition_fields = [
'local_id', 'ia_box_id', 'ia_loaded_id', 'source_records']
# XXX Todos:
# TODO:
# only consider `source_records` for newly created work
# or if field originally missing:
#if work_created and not e.get('source_records'):
@@ -41,6 +41,26 @@ def do_flip(author):
author['name'] = name
author['personal_name'] = name

def pick_from_matches(author, match):
    """
    Finds the best match for author from a list of OL authors records, match.

    :param dict author: Author import representation
    :param list match: List of matching OL author records
    :rtype: dict
    :return: A single OL author record from match
    """
    # Narrow the candidates to those carrying the same date fields as the
    # incoming author record.
    if 'birth_date' in author and 'death_date' in author:
        narrowed = [m for m in match if 'birth_date' in m and 'death_date' in m]
    elif 'date' in author:
        narrowed = [m for m in match if 'date' in m]
    else:
        narrowed = []
    # Fall back to the full match list when no candidate shares date fields.
    candidates = narrowed if narrowed else match
    if len(candidates) == 1:
        return candidates[0]
    return min(candidates, key=key_int)

def find_author(name):
"""
Searches OL for an author by name.
@@ -65,29 +85,9 @@ def walk_redirects(obj, seen):
authors = [walk_redirects(a, seen) for a in authors if a['key'] not in seen]
return authors

def pick_from_matches(author, match):
    """
    Finds the best match for author from a list of OL authors records, match.

    :param dict author: Author import representation
    :param list match: List of matching OL author records
    :rtype: dict
    :return: A single OL author record from match
    """
    # Prefer candidates that carry the same date fields as the incoming author.
    maybe = []
    if 'birth_date' in author and 'death_date' in author:
        maybe = [m for m in match if 'birth_date' in m and 'death_date' in m]
    elif 'date' in author:
        maybe = [m for m in match if 'date' in m]
    # No candidate shares the author's date fields: fall back to all matches.
    if not maybe:
        maybe = match
    if len(maybe) == 1:
        return maybe[0]
    # Tie-break among multiple candidates; key_int is defined elsewhere in
    # this module -- presumably it ranks records by OL key number (confirm).
    return min(maybe, key=key_int)

def find_entity(author):
"""
Looks for an existing Author record in OL
Looks for an existing Author record in OL by name
and returns it if found.
:param dict author: Author import dict
@@ -129,7 +129,7 @@ def find_entity(author):

def import_author(author, eastern=False):
"""
Converts an import stlye Author dictionary into an
Converts an import style new-author dictionary into an
Open Library author representation.
:param dict author: Author import record
@@ -8,16 +8,24 @@ def db_name(a):
if a.birth_date or a.death_date:
date = a.get('birth_date', '') + '-' + a.get('death_date', '')
elif a.date:
#assert not a.birth_date and not a.death_date
date = a.date
return ' '.join([a['name'], date]) if date else a['name']

def try_merge(e1, edition_key, existing):
"""
Converts the existing edition into a comparable dict and performs a
thresholded comparison to decide whether they are the same.
:param dict e1:
:param str edition_key:
:param Thing existing: Edition object that most likely matches e1, the object of edition_key
:rtype: bool
:return: Whether e1 is sufficiently the same as the 'existing' edition
"""
thing_type = existing.type.key
if thing_type == '/type/delete':
return False
assert thing_type == '/type/edition'

rec2 = {}
rec2['full_title'] = existing.title
if existing.subtitle:
@@ -28,12 +36,9 @@ def try_merge(e1, edition_key, existing):
if existing.authors:
rec2['authors'] = []
for a in existing.authors:
author_type = a.type.key
while author_type == '/type/redirect':
while a.type.key == '/type/redirect':
a = web.ctx.site.get(a.location)
author_type = a.type.key
continue
if author_type == '/type/author':
if a.type.key == '/type/author':
assert a['name']
rec2['authors'].append({'name': a['name'], 'db_name': db_name(a)})
e2 = build_marc(rec2)
@@ -149,6 +149,46 @@ def test_load_with_new_author(mock_site, ia_writeback):
assert len(w.authors) == 1
assert len(e.authors) == 1

def test_load_with_redirected_author(mock_site, add_languages):
    """Test importing existing editions without works
    which have author redirects. A work should be created with
    the final author.
    """
    # Seed the mock site with an orphaned edition whose author key is a
    # redirect pointing at the real author record.
    records = [
        {
            'title': 'Test item HATS',
            'key': '/books/OL10M',
            'publishers': ['TestPub'],
            'publish_date': '1994',
            'authors': [{'key': '/authors/OL55A'}],
            'type': {'key': '/type/edition'},
        },
        {
            'type': {'key': '/type/redirect'},
            'name': 'John Smith',
            'key': '/authors/OL55A',
            'location': '/authors/OL10A',
        },
        {
            'type': {'key': '/type/author'},
            'name': 'John Smith',
            'key': '/authors/OL10A',
        },
    ]
    for record in records:
        mock_site.save(record)

    reply = load({
        'title': 'Test item HATS',
        'authors': [{'name': 'John Smith'}],
        'publishers': ['TestPub'],
        'publish_date': '1994',
        'source_records': 'ia:test_redir_author',
    })
    assert reply['edition']['status'] == 'modified'
    assert reply['edition']['key'] == '/books/OL10M'
    assert reply['work']['status'] == 'created'
    edition = mock_site.get(reply['edition']['key'])
    assert edition.authors[0].key == '/authors/OL10A'
    work = mock_site.get(reply['work']['key'])
    assert work.authors[0].author.key == '/authors/OL10A'

def test_duplicate_ia_book(mock_site, add_languages, ia_writeback):
rec = {
'ocaid': 'test_item',
@@ -1,4 +1,3 @@
from __future__ import print_function
from openlibrary.catalog.marc.marc_xml import MarcXml, BadSubtag, BlankTag
from openlibrary.catalog.marc.marc_binary import MarcBinary
from openlibrary.catalog.marc.marc_subject import read_subjects, tidy_subject, four_types
@@ -120,29 +119,3 @@ def test_subjects_bin(self, item, expected):
data = data.decode('utf-8').encode('raw_unicode_escape')
rec = MarcBinary(data)
assert read_subjects(rec) == expected

# One-off, module-level data-gathering script (not a test): parse every
# sample MARC record and aggregate subject counts across all of them.

# Collect per-record subject dicts from the XML samples.
subjects = []
for item, expect in xml_samples:
    filename = os.path.dirname(__file__) + '/test_data/xml_input/' + item + '_marc.xml'
    element = etree.parse(filename).getroot()
    # Some sample files wrap the record element; unwrap to the record itself.
    if element.tag != record_tag and element[0].tag == record_tag:
        element = element[0]
    rec = MarcXml(element)
    subjects.append(read_subjects(rec))

# Collect per-record subject dicts from the binary MARC samples.
for item, expect in bin_samples:
    filename = os.path.dirname(__file__) + '/test_data/bin_input/' + item

    data = open(filename).read()
    # First 5 bytes of a MARC leader are the record length; a mismatch is
    # taken as a sign the file needs re-encoding before parsing (Python 2
    # str semantics -- decode then re-encode as raw_unicode_escape).
    if len(data) != int(data[:5]):
        data = data.decode('utf-8').encode('raw_unicode_escape')
    rec = MarcBinary(data)
    subjects.append(read_subjects(rec))

# Merge counts: all_subjects[subject_type][subject_name] -> total count.
all_subjects = defaultdict(lambda: defaultdict(int))
for a in subjects:
    for b, c in a.items():
        for d, e in c.items():
            all_subjects[b][d] += e

print(four_types(dict((k, dict(v)) for k, v in all_subjects.items())))
@@ -96,12 +96,16 @@ def compare_isbn10(e1, e2):
for j in e2['isbn']:
if i == j:
return ('ISBN', 'match', isbn_match)

return ('ISBN', 'mismatch', -225)

# 450 + 200 + 85 + 200

def level1_merge(e1, e2):
"""
:param dict e1, e2: editions to match
:rtype: list
:return: a list of tuples (field/category, result str, score int)
"""
score = []
if e1['short_title'] == e2['short_title']:
score.append(('short-title', 'match', 450))
@@ -113,6 +117,24 @@ def level1_merge(e1, e2):
score.append(compare_isbn10(e1, e2))
return score

def level2_merge(e1, e2):
    """
    Run the full (second-pass) set of field comparisons between two editions.

    :rtype: list
    :return: a list of tuples (field/category, result str, score int)
    """
    # Unconditional comparators, applied in a fixed order.
    comparators = [
        compare_date,
        compare_country,
        compare_isbn10,
        compare_title,
        compare_lccn,
    ]
    score = [compare(e1, e2) for compare in comparators]
    # Page-count comparison is only recorded when it yields a result.
    page_score = compare_number_of_pages(e1, e2)
    if page_score:
        score.append(page_score)
    score.append(compare_publisher(e1, e2))
    score.append(compare_authors(e1, e2))
    return score

def compare_author_fields(e1_authors, e2_authors):
for i in e1_authors:
for j in e2_authors:
@@ -158,7 +180,6 @@ def compare_authors(e1, e2):
return ('authors', 'no authors', 75)
return ('authors', 'field missing from one record', -25)


def title_replace_amp(amazon):
    """Return the normalized, lowercased full title with ' & ' spelled out as ' and '."""
    full_title = amazon['full-title'].replace(" & ", " and ")
    return normalize(full_title).lower()

@@ -256,29 +277,14 @@ def compare_publisher(e1, e2):
if 'publishers' not in e1 or 'publishers' not in e2:
return ('publisher', 'either missing', 0)

def level2_merge(e1, e2):
    """
    Run the full (second-pass) set of field comparisons between two editions.

    :param dict e1, e2: editions to match
    :rtype: list
    :return: a list of (field/category, result str, score int) tuples
    """
    score = []
    score.append(compare_date(e1, e2))
    score.append(compare_country(e1, e2))
    score.append(compare_isbn10(e1, e2))
    score.append(compare_title(e1, e2))
    score.append(compare_lccn(e1, e2))
    # compare_number_of_pages may yield a falsy result (only truthy results
    # are recorded in the score list).
    page_score = compare_number_of_pages(e1, e2)
    if page_score:
        score.append(page_score)

    score.append(compare_publisher(e1, e2))
    score.append(compare_authors(e1, e2))

    return score

def build_marc(edition):
"""
Called from openlibrary.catalog.add_book.load()
Returns an expanded representation of an edition dict
:param dict edition: Import edition representation
:rtype: dict
:return: An expanded version of an Import edition
:return: An expanded version of an edition dict
more titles, normalized + short
all isbns in "isbn": []
"""
@@ -295,14 +301,25 @@ def build_marc(edition):
return marc

def attempt_merge(e1, e2, threshold, debug=False):
    """
    Determines (according to a threshold) whether two edition representations are
    sufficiently the same. Used when importing new books.

    :param dict e1: dict representing an edition
    :param dict e2: dict representing an edition
    :param int threshold: each field match or difference adds or subtracts a
        score. Example: 875 for standard edition matching
    :param bool debug: print comparison details when True
    :rtype: bool
    :return: Whether two editions have sufficient fields in common to be
        considered the same
    """
    # NOTE(review): the pasted version interleaved the old (l1/l2) and new
    # (level1/level2) lines, running every comparison twice and printing
    # duplicate debug output; this is the single, coherent version.
    # Cheap first pass: if level-1 fields alone clear the threshold, skip
    # the more expensive level-2 comparison.
    level1 = level1_merge(e1, e2)
    total = sum(i[2] for i in level1)
    if debug:
        print("E1: %s\nE2: %s" % (e1, e2))
        print(total, level1)
    if total >= threshold:
        return True
    level2 = level2_merge(e1, e2)
    total = sum(i[2] for i in level2)
    if debug:
        print(total, level2)
    return total >= threshold

0 comments on commit e10c3f4

Please sign in to comment.
You can’t perform that action at this time.