Merge pull request #1664 from cclauss/fix-unicode-for-python-3

unicode() was removed in Python 3
internetarchive · Dec 2, 2018 · d3f63c1
2 parents 8fb2c8a + 3990a85
commit d3f63c1
Show file tree

Hide file tree

Showing 39 changed files with 126 additions and 85 deletions.
diff --git a/openlibrary/api.py b/openlibrary/api.py
@@ -25,6 +25,8 @@
 import web
 import logging
 
+import six
+
 logger = logging.getLogger("openlibrary.api")
 
 class OLError(Exception):
@@ -212,9 +214,9 @@ def marshal(data):
     elif isinstance(data, datetime.datetime):
         return {"type": "/type/datetime", "value": data.isoformat()}
     elif isinstance(data, Text):
-        return {"type": "/type/text", "value": unicode(data)}
+        return {"type": "/type/text", "value": six.text_type(data)}
     elif isinstance(data, Reference):
-        return {"key": unicode(data)}
+        return {"key": six.text_type(data)}
     else:
         return data
 
@@ -258,11 +260,11 @@ def parse_datetime(value):
         return datetime.datetime(*map(int, tokens))
 
 
-class Text(unicode):
+class Text(six.text_type):
     def __repr__(self):
-        return "<text: %s>" % unicode.__repr__(self)
+        return u"<text: %s>" % six.text_type.__repr__(self)
 
 
-class Reference(unicode):
+class Reference(six.text_type):
     def __repr__(self):
-        return "<ref: %s>" % unicode.__repr__(self)
+        return u"<ref: %s>" % six.text_type.__repr__(self)
diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py
@@ -35,6 +35,8 @@
 import web
 from infogami import config
 
+import six
+
 from openlibrary.catalog.merge.merge_marc import build_marc
 from openlibrary.catalog.utils import mk_norm
 from openlibrary.core import lending
@@ -55,7 +57,7 @@ def strip_accents(s):
     """
     if isinstance(s, str):
         return s
-    assert isinstance(s, unicode)
+    assert isinstance(s, six.text_type)
     return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
 
 def normalize(s): # strip non-alphanums and truncate at 25 chars

diff --git a/openlibrary/catalog/author/merge.py b/openlibrary/catalog/author/merge.py
@@ -5,6 +5,9 @@
 from openlibrary.catalog.utils import key_int, match_with_bad_chars, pick_best_author, remove_trailing_number_dot
 from unicodedata import normalize
 import web, re, sys, codecs, urllib
+
+import six
+
 sys.path.append('/home/edward/src/olapi')
 from olapi import OpenLibrary, unmarshal, Reference
 from openlibrary.catalog.utils.edit import fix_edition
@@ -113,7 +116,7 @@ def do_normalize(author_key, best_key, authors):
                 if m:
                     need_update = True
                     v = v[:-len(m.group(1))]
-            if not isinstance(v, unicode):
+            if not isinstance(v, six.text_type):
                 continue
             norm_v = norm(v)
             if v == norm_v:
@@ -126,7 +129,7 @@ def do_normalize(author_key, best_key, authors):
         for k in author_keys:
             if k not in best:
                 v = a[k]
-                if not isinstance(v, unicode):
+                if not isinstance(v, six.text_type):
                     continue
                 norm_v = norm(v)
                 if v == norm_v:
@@ -137,7 +140,7 @@ def do_normalize(author_key, best_key, authors):
             v = best[k]
             if 'date' in k:
                 v = remove_trailing_number_dot(v)
-            if isinstance(v, unicode):
+            if isinstance(v, six.text_type):
                 v = norm(v)
             if k not in a or v != a[k]:
                 a[k] = v

diff --git a/openlibrary/catalog/importer/add_source_records.py b/openlibrary/catalog/importer/add_source_records.py
@@ -15,6 +15,8 @@
 
 from catalog.read_rc import read_rc
 
+import six
+
 rc = read_rc()
 
 marc_index = web.database(dbn='postgres', db='marc_index')
@@ -82,7 +84,7 @@ def fix_toc(e):
         return
     if isinstance(toc[0], dict) and toc[0]['type'] == '/type/toc_item':
         return
-    return [{'title': unicode(i), 'type': '/type/toc_item'} for i in toc if i != u'']
+    return [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i != u'']
 
 re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')
 

diff --git a/openlibrary/catalog/importer/import_marc.py b/openlibrary/catalog/importer/import_marc.py
@@ -113,7 +113,7 @@ def fix_toc(e):
         return
     if isinstance(toc[0], dict) and toc[0]['type'] == '/type/toc_item':
         return
-    return [{'title': unicode(i), 'type': '/type/toc_item'} for i in toc if i != u'']
+    return [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i != u'']
 
 re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')
 

diff --git a/openlibrary/catalog/importer/update.py b/openlibrary/catalog/importer/update.py
@@ -9,6 +9,8 @@
 from openlibrary.api import OpenLibrary, unmarshal, marshal
 from pprint import pprint
 
+import six
+
 rc = read_rc()
 ol = OpenLibrary("http://openlibrary.org")
 ol.login('ImportBot', rc['ImportBot'])
@@ -28,7 +30,7 @@ def fix_toc(e):
     # http://openlibrary.org/books/OL789133M - /type/toc_item missing from table_of_contents
     if isinstance(toc[0], dict) and ('pagenum' in toc[0] or toc[0]['type'] == '/type/toc_item'):
         return
-    return [{'title': unicode(i), 'type': '/type/toc_item'} for i in toc if i != u'']
+    return [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i != u'']
 
 re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')
 

diff --git a/openlibrary/catalog/marc/marc_binary.py b/openlibrary/catalog/marc/marc_binary.py
@@ -14,7 +14,7 @@ class BadLength(MarcException):
     pass
 
 def norm(s):
-    return normalize('NFC', unicode(s))
+    return normalize('NFC', six.text_type(s))
 
 class BinaryDataField():
     def __init__(self, rec, line):

diff --git a/openlibrary/catalog/marc/marc_xml.py b/openlibrary/catalog/marc/marc_xml.py
@@ -2,6 +2,8 @@
 from marc_base import MarcBase, MarcException
 from unicodedata import normalize
 
+import six
+
 data_tag = '{http://www.loc.gov/MARC21/slim}datafield'
 control_tag = '{http://www.loc.gov/MARC21/slim}controlfield'
 subfield_tag = '{http://www.loc.gov/MARC21/slim}subfield'
@@ -21,7 +23,7 @@ def read_marc_file(f):
         elem.clear()
 
 def norm(s):
-    return normalize('NFC', unicode(s.replace(u'\xa0', ' ')))
+    return normalize('NFC', six.text_type(s.replace(u'\xa0', ' ')))
 
 def get_text(e):
     return norm(e.text) if e.text else u''

diff --git a/openlibrary/catalog/marc/parse_xml.py b/openlibrary/catalog/marc/parse_xml.py
@@ -4,6 +4,8 @@
 from parse import read_edition
 from unicodedata import normalize
 
+import six
+
 slim = '{http://www.loc.gov/MARC21/slim}'
 leader_tag = slim + 'leader'
 data_tag = slim + 'datafield'
@@ -13,7 +15,7 @@
 record_tag = slim + 'record'
 
 def norm(s):
-    return normalize('NFC', unicode(s))
+    return normalize('NFC', six.text_type(s))
 
 class BadSubtag:
     pass

diff --git a/openlibrary/catalog/marc/read_xml.py b/openlibrary/catalog/marc/read_xml.py
@@ -5,6 +5,8 @@
 from time import sleep
 from unicodedata import normalize
 
+import six
+
 re_question = re.compile('^\?+$')
 re_lccn = re.compile('(...\d+).*')
 re_letters = re.compile('[A-Za-z]')
@@ -155,10 +157,10 @@ def read_edition(f):
         #        return None
         #    continue
         if tag == '008':
-            publish_date = unicode(line)[7:11]
+            publish_date = six.text_type(line)[7:11]
             if publish_date.isdigit():
                 edition["publish_date"] = publish_date
-            publish_country = unicode(line)[15:18]
+            publish_country = six.text_type(line)[15:18]
             if publish_country not in ('|||', '   '):
                 edition["publish_country"] = publish_country
             continue

diff --git a/openlibrary/catalog/marc/simple_html.py b/openlibrary/catalog/marc/simple_html.py
@@ -5,6 +5,8 @@
 from build_record import build_record
 import sys, re
 
+import six
+
 trans = {'&':'&amp;','<':'&lt;','>':'&gt;','\n':'<br>'}
 re_html_replace = re.compile('([&<>\n])')
 
@@ -84,7 +86,7 @@ def output_record_as_html(rec):
         elif rec[k] is None:
             v = '<em>empty</em>'
         else:
-            v = esc(unicode(rec[k]))
+            v = esc(six.text_type(rec[k]))
         rows.append('<tr><th>%s</th><td>%s</td></tr>\n' % (label, v))
 
     return '<table>' + ''.join(rows) + '</table>'

diff --git a/openlibrary/catalog/merge/amazon.py b/openlibrary/catalog/merge/amazon.py
@@ -3,6 +3,8 @@
 from names import match_name
 from normalize import normalize
 
+import six
+
 re_year = re.compile('(\d{4})$')
 re_amazon_title_paren = re.compile('^(.*) \([^)]+?\)$')
 re_and_of_space = re.compile(' and | of | ')
@@ -291,7 +293,7 @@ def test_merge_titles():
         'title': 'Spytime',
     }
 
-    amazon = build_titles(unicode(full_title(amazon)))
+    amazon = build_titles(six.text_type(full_title(amazon)))
     marc = build_titles(marc['title_with_subtitles'])
     assert amazon['short_title'] == marc['short_title']
     assert compare_title(amazon, marc) == ('full-title', 'containted within other title', 350)
@@ -303,7 +305,7 @@ def test_merge_titles2():
         'title': u'seabirds of Britain and Ireland',
         'full_title': u'The seabirds of Britain and Ireland',
     }
-    amazon = build_titles(unicode(full_title(amazon)))
+    amazon = build_titles(six.text_type(full_title(amazon)))
     marc = build_titles(marc['title_with_subtitles'])
     assert compare_title(amazon, marc) == ('full-title', 'exact match', 600)
 

diff --git a/openlibrary/catalog/merge/merge.py b/openlibrary/catalog/merge/merge.py
@@ -3,6 +3,8 @@
 from names import match_name
 from normalize import normalize
 
+import six
+
 re_year = re.compile('(\d{4})$')
 re_amazon_title_paren = re.compile('^(.*) \([^)]+?\)$')
 re_and_of_space = re.compile(' and | of | ')
@@ -285,7 +287,7 @@ def test_merge_titles():
         'title': 'Spytime',
     }
 
-    amazon = build_titles(unicode(full_title(amazon)))
+    amazon = build_titles(six.text_type(full_title(amazon)))
     marc = build_titles(marc['title_with_subtitles'])
     assert amazon['short_title'] == marc['short_title']
     assert compare_title(amazon, marc) == ('full-title', 'containted within other title', 350)
@@ -297,7 +299,7 @@ def test_merge_titles2():
         'title': u'seabirds of Britain and Ireland',
         'full_title': u'The seabirds of Britain and Ireland',
     }
-    amazon = build_titles(unicode(full_title(amazon)))
+    amazon = build_titles(six.text_type(full_title(amazon)))
     marc = build_titles(marc['title_with_subtitles'])
     assert compare_title(amazon, marc) == ('full-title', 'exact match', 600)
 

diff --git a/openlibrary/catalog/merge/normalize.py b/openlibrary/catalog/merge/normalize.py
@@ -1,11 +1,13 @@
 import re, unicodedata
 
+import six
+
 #re_brace = re.compile('{[^{}]+?}')
 re_normalize = re.compile('[^[:alpha:] ]', re.I)
 re_whitespace = re.compile('[-\s,;.]+')
 
 def normalize(s):
-    if isinstance(s, unicode):
+    if isinstance(s, six.text_type):
         s = unicodedata.normalize('NFC', s.replace(u'\u0142', u'l'))
     s = s.replace(' & ', ' and ')
     # remove {mlrhring} and friends
@@ -23,4 +25,3 @@ def normalize(s):
 #    a = "Tha{mllhring}{macr}alib{macr}i, {mllhring}Abd al-Malik ibn Mu{dotb}hammad 961 or 2-1037 or 8."
 #    b = u"Tha\xb0\xe5alib\xe5i, \xb0Abd al-Malik ibn Mu\xf2hammad 961 or 2-1037 or 8."
 #    assert normalize(a) == normalize(b)
-
diff --git a/openlibrary/catalog/onix/parse.py b/openlibrary/catalog/onix/parse.py
@@ -13,6 +13,8 @@
 from thread_utils import AsyncChannel, threaded_generator
 from onix import OnixProduct, OnixHandler, onix_codelists
 
+import six
+
 def parser (input):
 	# returns a generator that produces dicts representing Open Library items
 
@@ -262,16 +264,13 @@ def person_name (x):
 	return name
 
 def elt_get (e, tag, reference_name):
-       ee = e.get (tag) or e.get (reference_name.lower ())
-       if ee:
-               return unicode (ee)
-       else:
-               return None
+     ee = e.get (tag) or e.get (reference_name.lower ())
+     return six.text_type(ee) if ee else None
+
 
 re_by = re.compile ("^\s*by\s+", re.IGNORECASE)
 re_iname = re.compile ("^(.*),\s*(.*)$")
 
 def add_val (o, key, val):
 	if val is not None:
 		o.setdefault (key, []).append (val)
-
diff --git a/openlibrary/catalog/onix/xmltramp.py b/openlibrary/catalog/onix/xmltramp.py
@@ -5,6 +5,8 @@
 __credits__ = "Many thanks to pjz, bitsko, and DanC."
 __copyright__ = "(C) 2003-2006 Aaron Swartz. GNU GPL 2."
 
+import six
+
 def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u''))
 def islst(f): return isinstance(f, type(())) or isinstance(f, type([]))
 
@@ -16,6 +18,7 @@ def quote(x, elt=True):
     if not elt: x = x.replace('"', '&quot;')
     return x
 
+@six.python_2_unicode_compatible
 class Element:
     def __init__(self, name, attrs=None, children=None, prefixes=None, line=None):
         if islst(name) and name[0] == None: name = name[1]
@@ -97,15 +100,12 @@ def arep(a, inprefixes, addns=1):
 
         return out
 
-    def __unicode__(self):
+    def __str__(self):
         text = ''
         for x in self._dir:
-            text += unicode(x)
+            text += six.text_type(x)
         return ' '.join(text.split())
 
-    def __str__(self):
-        return self.__unicode__().encode('utf-8')
-
     def __getattr__(self, n):
         if n[0] == '_': raise AttributeError("Use foo['"+n+"'] to access the child element.")
         if self._dNS: n = (self._dNS, n)

diff --git a/openlibrary/catalog/utils/__init__.py b/openlibrary/catalog/utils/__init__.py
@@ -4,7 +4,6 @@
 import openlibrary.catalog.merge.normalize as merge
 
 import six
-from six.moves import range
 
 try:
     cmp = cmp       # Python 2
@@ -129,15 +128,15 @@ def pick_first_date(dates):
     return { 'date': fix_l_in_date(' '.join([remove_trailing_number_dot(d) for d in dates])) }
 
 def strip_accents(s):
-    return normalize('NFKD', unicode(s)).encode('ASCII', 'ignore')
+    return normalize('NFKD', six.text_type(s)).encode('ASCII', 'ignore')
 
 re_drop = re.compile('[?,]')
 
 def match_with_bad_chars(a, b):
-    if unicode(a) == unicode(b):
+    if six.text_type(a) == six.text_type(b):
         return True
-    a = normalize('NFKD', unicode(a)).lower()
-    b = normalize('NFKD', unicode(b)).lower()
+    a = normalize('NFKD', six.text_type(a)).lower()
+    b = normalize('NFKD', six.text_type(b)).lower()
     if a == b:
         return True
     a = a.encode('ASCII', 'ignore')
@@ -152,7 +151,7 @@ def accent_count(s):
     return len([c for c in norm(s) if ord(c) > 127])
 
 def norm(s):
-    return normalize('NFC', s) if isinstance(s, unicode) else s
+    return normalize('NFC', s) if isinstance(s, six.text_type) else s
 
 def pick_best_name(names):
     names = [norm(n) for n in names]