Import archive.org bulk marc items #1058

Merged
36 commits merged on Sep 15, 2018. The file changes below show changes from 16 of the 36 commits.

Commits
4050d5c  fixes #1029 import subjects from ia metadata (hornc, Aug 24, 2018)
0236ecd  raise error if identifier not provided (hornc, Aug 24, 2018)
5cfb8f0  add docstrings (hornc, Aug 27, 2018)
17f45e5  raise 400 Bad Requests on error instead of 200s (hornc, Aug 27, 2018)
68e36fb  document add_book methods used by the import API (hornc, Aug 27, 2018)
f28a84a  add import from archive.org bulk_marc items (hornc, Aug 27, 2018)
3e417e1  comment and remove deprecated imports (hornc, Aug 29, 2018)
729a321  found and commented a bug in deprecated functions (hornc, Aug 29, 2018)
5872309  remove utils.ia find_item lookup (hornc, Aug 29, 2018)
0f34ee1  make it clear which fns belong to fast_parse (hornc, Aug 29, 2018)
c8bb135  don't keep looping if we got a response (hornc, Aug 29, 2018)
add6383  handle missing coverstore servers in dev (hornc, Aug 29, 2018)
b6f5174  return length and offset of next record when reading bulk MARC (hornc, Aug 29, 2018)
5e83d32  link to all initial source records in item history (hornc, Aug 29, 2018)
cb25df8  refactor and remove unused deprecated code (hornc, Aug 29, 2018)
0dd5770  remove non-functioning tests (hornc, Aug 29, 2018)
6b85403  fix True assert tests in add_book (hornc, Aug 30, 2018)
3bb8884  MarcBinary to raise exceptions itself rather than have get_ia return … (hornc, Aug 30, 2018)
23b8ec5  parametrize get_marc_record_from_ia tests for better reporting (hornc, Aug 30, 2018)
3e4e271  rename ils only import tests (hornc, Sep 5, 2018)
f968ba5  address docstring issues (hornc, Sep 6, 2018)
972f793  move MARC tests to own dir (hornc, Sep 6, 2018)
647cb54  move fast_parse tests to test dir (hornc, Sep 6, 2018)
cb88426  correct rtype (hornc, Sep 6, 2018)
3035437  re-add missing test data (hornc, Sep 6, 2018)
0c75981  raise BadMarc exception if unexpected/empty data (hornc, Sep 6, 2018)
0383d86  remove conditionals from tests (hornc, Sep 6, 2018)
82d7e5f  confirm behaviour of ia MARC import add_book (hornc, Sep 6, 2018)
26326ca  docstrings for load_book used in import api path (hornc, Sep 6, 2018)
1bf18cf  docstrings and tests for merge_marc code used by import (hornc, Sep 7, 2018)
0292910  DRY isbn gathering code (hornc, Sep 7, 2018)
1792e9b  refactor add_book matching (hornc, Sep 7, 2018)
51baf2e  address review comments (hornc, Sep 7, 2018)
c81e186  make ia urls constants (hornc, Sep 12, 2018)
54e2cc6  load ia base url from config (hornc, Sep 12, 2018)
09b6817  address review comments, refactor (hornc, Sep 13, 2018)
28 changes: 27 additions & 1 deletion openlibrary/catalog/add_book/__init__.py
@@ -146,6 +146,15 @@ def new_work(q, rec, cover_id):
return w

def load_data(rec):
"""
Creates a new Edition,
hornc marked this conversation as resolved.
Show resolved Hide resolved
searches for an existing Work,
creates a new one if required,
otherwise adds to existing Work,
possibly with modification.
:param dict rec: Edition record to load (now we have established it should be added)
hornc marked this conversation as resolved.
Show resolved Hide resolved
:rtype: dict {"success": bool, "error": string | "work": {"key": <key>, "status": "created" | "modified" | "matched"} , "edition": {"key": <key>, "status": "created"}}
hornc marked this conversation as resolved.
Show resolved Hide resolved
hornc marked this conversation as resolved.
Show resolved Hide resolved
"""
cover_url = None
if 'cover' in rec:
cover_url = rec['cover']
@@ -250,6 +259,11 @@ def find_match(e1, edition_pool):
return edition_key

def build_pool(rec):
"""
Searches for existing edition matches on title and bibliographic keys.
:param dict rec: Edition record
:rtype: dict {<identifier: title | isbn | lccn etc>: [list of /books/OL..M keys that match rec on <identifier>]}
Collaborator: Put dict info in :return: section
"""
pool = defaultdict(set)

## Find records with matching title
@@ -300,6 +314,10 @@ def add_db_name(rec):
re_lang = re.compile('^/languages/([a-z]{3})$')

def early_exit(rec):
"""Attempts to quickly find an existing item match using bibliographic keys.
Collaborator: Newline before/after

:param dict rec: Edition record
:rtype: (str|None) First key matched of format "/books/OL..M" or False if no match found.
Collaborator: brackets unnecessary here
Collaborator: add :return: block
Collaborator: shouldn't this be str|bool?
"""
f = 'ocaid'
# Anand - August 2014
# If openlibrary ID is already specified in the record, then use it.
@@ -316,6 +334,7 @@ def early_exit(rec):
if ekeys:
return ekeys[0]

#TODO: Check whether 'isbn' should also be used here.
if 'isbn_10' in rec or 'isbn_13' in rec:
isbns = rec.get("isbn_10", []) + rec.get("isbn_13", [])
isbns = [isbn.strip().replace("-", "") for isbn in isbns]
@@ -378,6 +397,12 @@ def find_exact_match(rec, edition_pool):
return False

def add_cover(cover_url, ekey):
"""
Adds a cover to coverstore and returns the cover id.
:param str cover_url: URL of cover image
:param str ekey: Edition key /books/OL..M
:rtype: int Cover id
Collaborator: add :return:
"""
olid = ekey.split("/")[-1]
coverstore_url = config.get('coverstore_url').rstrip('/')
upload_url = coverstore_url + '/b/upload2'
@@ -391,6 +416,7 @@ def add_cover(cover_url, ekey):
'olid': olid,
'ip': web.ctx.ip,
}
reply = None
for attempt in range(10):
try:
res = urllib.urlopen(upload_url, urllib.urlencode(params))
@@ -399,7 +425,7 @@
sleep(2)
continue
body = res.read()
if body != '':
if body not in ['', 'None']:
reply = json.loads(body)
if res.getcode() == 200 and body != '':
if 'id' in reply:
215 changes: 74 additions & 141 deletions openlibrary/catalog/get_ia.py
@@ -1,13 +1,11 @@
from openlibrary.catalog.marc import fast_parse, read_xml
from openlibrary.catalog.utils import error_mail
from openlibrary.catalog.marc.marc_binary import MarcBinary
from openlibrary.catalog.marc.marc_xml import MarcXml
from openlibrary.catalog.marc import parse
from lxml import etree
import xml.parsers.expat
import urllib2, os.path, socket
from time import sleep
import traceback
from openlibrary.utils.ia import find_item
from openlibrary.core import ia

base = "https://archive.org/download/"
@@ -19,20 +17,13 @@ def urlopen_keep_trying(url):
for i in range(3):
try:
f = urllib2.urlopen(url)
return f
except urllib2.HTTPError, error:
if error.code in (403, 404):
#print "404 for '%s'" % url
if error.code in (403, 404, 416):
raise
else:
print 'error:', error.code, error.msg
pass
except urllib2.URLError:
pass
else:
return f
print url, "failed"
sleep(2)
print "trying again"

def bad_ia_xml(ia):
if ia == 'revistadoinstit01paulgoog':
@@ -43,33 +34,24 @@ def bad_ia_xml(ia):
return '<!--' in urlopen_keep_trying(base + loc).read()

def get_marc_ia_data(ia, host=None, path=None):
Member: especially confusing since this clobbers ia the variable imported above

Collaborator Author: great catch, this ia is supposed to be an ocaid -- I didn't change it earlier because these are (mostly?) deprecated methods and I'm not sure that they are even used, but this naming conflict is a great reason to tidy it up, thanks!

ia = ia.strip() # 'cyclopdiaofedu00kidd '
"""
DEPRECATED
"""
ending = 'meta.mrc'
if host and path:
url = 'http://%s%s/%s_%s' % (host, path, ia, ending)
else:
url = 'http://www.archive.org/download/' + ia + '/' + ia + '_' + ending
url = base + ia + '/' + ia + '_' + ending
Member: where's base coming from, can this be capitalized so it's more clear it's a constant?

Collaborator Author: This should really be read from the config file, does capitalizing still apply if we read it from config?
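# A sketch of the config-driven approach discussed above (the key name is
# assumed; commit 54e2cc6 later loads the IA base URL from config):
#   IA_BASE_URL = config.get('ia_base_url', 'https://archive.org')
#   base = IA_BASE_URL + '/download/'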

f = urlopen_keep_trying(url)
return f.read() if f else None

def get_marc_ia(ia):
ia = ia.strip() # 'cyclopdiaofedu00kidd '
url = base + ia + "/" + ia + "_meta.mrc"
data = urlopen_keep_trying(url).read()
length = int(data[0:5])
if len(data) != length:
data = data.decode('utf-8').encode('raw_unicode_escape')
assert len(data) == length

assert 'Internet Archive: Error' not in data
print 'leader:', data[:24]
return data
return fast_parse.read_edition(data, accept_electronic = True)

def get_marc_record_from_ia(identifier):
"""Takes IA identifiers and returns MARC record instance.
"""
Takes IA identifiers and returns MARC record instance.
11/2017: currently called by openlibrary/plugins/importapi/code.py
when the /api/import/ia endpoint is POSTed to.
:param str identifier: ocaid
:rtype: (MarcXml | MarcBinary)
"""
metadata = ia.get_metadata(identifier)
filenames = metadata['_filenames']
@@ -97,62 +79,18 @@ def get_marc_record_from_ia(identifier):
# BinaryMARCs with incorrectly converted unicode characters do not match.
return MarcBinary(data)

def get_ia(ia):
ia = ia.strip() # 'cyclopdiaofedu00kidd '
# read MARC record of scanned book from archive.org
# try the XML first because it has better character encoding
# if there is a problem with the XML switch to the binary MARC
xml_file = ia + "_marc.xml"
loc = ia + "/" + xml_file
try:
print base + loc
f = urlopen_keep_trying(base + loc)
except urllib2.HTTPError, error:
if error.code == 404:
raise NoMARCXML
else:
print 'error:', error.code, error.msg
raise
assert f
if f:
try:
return read_xml.read_edition(f)
except read_xml.BadXML:
print "read_xml BADXML"
pass
except xml.parsers.expat.ExpatError:
#print 'IA:', repr(ia)
#print 'XML parse error:', base + loc
print "read_xml ExpatError"
pass
print base + loc
if '<title>Internet Archive: Page Not Found</title>' in urllib2.urlopen(base + loc).read(200):
raise NoMARCXML
url = base + ia + "/" + ia + "_meta.mrc"
print url
try:
f = urlopen_keep_trying(url)
except urllib2.URLError:
pass
if not f:
return None
data = f.read()
length = data[0:5]
loc = ia + "/" + ia + "_meta.mrc:0:" + length
if len(data) == 0:
print 'zero length MARC for', url
return None
if 'Internet Archive: Error' in data:
print 'internet archive error for', url
return None
if data.startswith('<html>\n<head>'):
print 'internet archive error for', url
return None
try:
return fast_parse.read_edition(data, accept_electronic = True)
except (ValueError, AssertionError, fast_parse.BadDictionary):
print(repr(data))
raise
def get_ia(identifier):
"""
DEPRECATED: Use get_marc_record_from_ia() above + parse.read_edition()
Triggers UnboundLocalError: local variable 'v' referenced before assignment
Read MARC record of scanned book from archive.org
try the XML first because it has better character encoding
if there is a problem with the XML switch to the binary MARC
:param str identifier: ocaid
:rtype: (None | dict)
"""
marc = get_marc_record_from_ia(identifier)
return parse.read_edition(marc)

def files(archive_id):
url = base + archive_id + "/" + archive_id + "_files.xml"
@@ -196,49 +134,57 @@ def get_data(loc):
return buf

def get_from_archive(locator):
"""
Gets a single binary MARC record from within an Archive.org
bulk MARC item - data only.

:param str locator: Locator ocaid/filename:offset:length
:rtype: (str|None) Binary MARC data
Collaborator: split

"""
data, offset, length = get_from_archive_bulk(locator)
return data

def get_from_archive_bulk(locator):
"""
Gets a single binary MARC record from within an Archive.org
bulk MARC item, and return the offset and length of the next
item.
If offset or length are `None`, then there is no next record.

:param str locator: Locator ocaid/filename:offset:length
:rtype: (str, int|None, int|None) (Binary MARC data, Next record offset, Next record length)
Collaborator: split

"""
if locator.startswith('marc:'):
locator = locator[5:]
filename, offset, length = locator.split (":")
offset = int (offset)
length = int (length)

ia, rest = filename.split('/', 1)

for attempt in range(5):
try:
host, path = find_item(ia)
break
except socket.timeout:
if attempt == 4:
raise
print 'retry, attempt', attempt
offset = int(offset)
length = int(length)

r0, r1 = offset, offset+length-1
url = 'http://' + host + path + '/' + rest
# get the next record's length in this request
r1 += 5
url = base + filename

assert 0 < length < 100000

ureq = urllib2.Request(url, None, {'Range':'bytes=%d-%d'% (r0, r1)},)

f = None
for i in range(3):
try:
f = urllib2.urlopen(ureq)
except urllib2.HTTPError, error:
if error.code == 416:
raise
elif error.code == 404:
print "404 for '%s'" % url
raise
else:
print url
print 'error:', error.code, error.msg
except urllib2.URLError:
pass
ureq = urllib2.Request(url, None, {'Range': 'bytes=%d-%d' % (r0, r1)})
f = urlopen_keep_trying(ureq)
data = None
if f:
return f.read(100000)
else:
print locator, url, 'failed'
data = f.read(100000)
len_in_rec = int(data[:5])
if len_in_rec != length:
data, next_offset, next_length = get_from_archive_bulk('%s:%d:%d' % (filename, offset, len_in_rec))
else:
next_length = data[length:]
data = data[:length]
if len(next_length) == 5:
# We have data for the next record
next_offset = offset + len_in_rec
next_length = int(next_length)
else:
next_offset = next_length = None
return data, next_offset, next_length

def get_from_local(locator):
try:
@@ -253,6 +199,12 @@ def get_from_local(locator):
return buf

def read_marc_file(part, f, pos=0):
"""
:param str part: Item and filename, e.g. ocaid/filename, used to build the source_record locator
:param str f: Full binary MARC data containing many records
:param int pos: Start position within the data
:rtype: (int, str, str) (Next position, Current source_record name, Current single MARC record)
Collaborator: split

"""
try:
for data, int_length in fast_parse.read_file(f):
loc = "marc:%s:%d:%d" % (part, pos, int_length)
@@ -272,18 +224,15 @@ def marc_formats(ia, host=None, path=None):
if host and path:
url = 'http://%s%s/%s_%s' % (host, path, ia, ending)
else:
url = 'http://www.archive.org/download/' + ia + '/' + ia + '_' + ending
url = base + ia + '/' + ia + '_' + ending
Member: Can we please make base more apparently a CONST?

for attempt in range(10):
f = urlopen_keep_trying(url)
if f is not None:
break
sleep(10)
if f is None:
msg_from = 'load_scribe@archive.org'
msg_to = ['edward@archive.org']
subject = "error reading %s_files.xml" % ia
msg = url
error_mail(msg_from, msg_to, subject, msg)
#TODO: log this, if anything uses this code
msg = "error reading %s_files.xml" % ia
return has
data = f.read()
try:
Expand All @@ -298,19 +247,3 @@ def marc_formats(ia, host=None, path=None):
if all(has.values()):
break
return has

def test_get_ia():
ia = "poeticalworksoft00grayiala"
expect = {
'publisher': ['Printed by C. Whittingham for T. N. Longman and O. Rees [etc]'],
'number_of_pages': 223,
'full_title': 'The poetical works of Thomas Gray with some account of his life and writings ; the whole carefully revised and illustrated by notes ; to which are annexed, Poems addressed to, and in memory of Mr. Gray ; several of which were never before collected.',
'publish_date': '1800',
'publish_country': 'enk',
'authors': [
{'db_name': 'Gray, Thomas 1716-1771.', 'name': 'Gray, Thomas'}
],
'oclc': ['5047966']
}
assert get_ia(ia) == expect

2 changes: 1 addition & 1 deletion openlibrary/catalog/importer/load_scribe.py
@@ -10,7 +10,7 @@
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.merge import try_merge
from openlibrary.catalog.importer.update import add_source_records
from openlibrary.catalog.get_ia import get_ia, urlopen_keep_trying, NoMARCXML, bad_ia_xml, marc_formats, get_marc_ia, get_marc_ia_data
from openlibrary.catalog.get_ia import get_ia, urlopen_keep_trying, NoMARCXML, bad_ia_xml, marc_formats, get_marc_ia_data
from openlibrary.catalog.title_page_img.load import add_cover_image
from openlibrary.solr.update_work import update_work, solr_update
#from openlibrary.catalog.works.find_works import find_title_redirects, find_works, get_books, books_query, update_works
Expand Down