Permalink
Browse files

Merge pull request #1664 from cclauss/fix-unicode-for-python-3

unicode() was removed in Python 3
  • Loading branch information...
mekarpeles committed Dec 2, 2018
2 parents 8fb2c8a + 3990a85 commit d3f63c1d6f8864d1a57f7e439bc2e02ad0a8411f
Showing with 126 additions and 85 deletions.
  1. +8 −6 openlibrary/api.py
  2. +3 −1 openlibrary/catalog/add_book/__init__.py
  3. +6 −3 openlibrary/catalog/author/merge.py
  4. +3 −1 openlibrary/catalog/importer/add_source_records.py
  5. +1 −1 openlibrary/catalog/importer/import_marc.py
  6. +3 −1 openlibrary/catalog/importer/update.py
  7. +1 −1 openlibrary/catalog/marc/marc_binary.py
  8. +3 −1 openlibrary/catalog/marc/marc_xml.py
  9. +3 −1 openlibrary/catalog/marc/parse_xml.py
  10. +4 −2 openlibrary/catalog/marc/read_xml.py
  11. +3 −1 openlibrary/catalog/marc/simple_html.py
  12. +4 −2 openlibrary/catalog/merge/amazon.py
  13. +4 −2 openlibrary/catalog/merge/merge.py
  14. +3 −2 openlibrary/catalog/merge/normalize.py
  15. +5 −6 openlibrary/catalog/onix/parse.py
  16. +5 −5 openlibrary/catalog/onix/xmltramp.py
  17. +5 −6 openlibrary/catalog/utils/__init__.py
  18. +3 −1 openlibrary/catalog/utils/edit.py
  19. +1 −1 openlibrary/catalog/works/by_author.py
  20. +2 −2 openlibrary/catalog/works/find_works.py
  21. +1 −1 openlibrary/catalog/works/live.py
  22. +3 −4 openlibrary/conftest.py
  23. +3 −1 openlibrary/core/helpers.py
  24. +1 −1 openlibrary/mocks/mock_infobase.py
  25. +1 −1 openlibrary/plugins/ol_infobase.py
  26. +3 −1 openlibrary/plugins/openlibrary/lists.py
  27. +4 −3 openlibrary/plugins/openlibrary/opds.py
  28. +7 −5 openlibrary/plugins/openlibrary/tests/test_home.py
  29. +3 −1 openlibrary/plugins/search/code.py
  30. +4 −3 openlibrary/plugins/search/facet_hash.py
  31. +4 −4 openlibrary/plugins/search/solr_client.py
  32. +1 −1 openlibrary/plugins/upstream/models.py
  33. +6 −4 openlibrary/plugins/upstream/utils.py
  34. +4 −3 openlibrary/solr/facet_hash.py
  35. +3 −1 openlibrary/solr/inside/index_all.py
  36. +3 −1 openlibrary/solr/inside/index_gevent.py
  37. +1 −1 openlibrary/solr/update_work.py
  38. +1 −1 openlibrary/tests/catalog/test_get_ia.py
  39. +3 −2 scripts/copydocs.py
@@ -25,6 +25,8 @@
import web
import logging
import six
logger = logging.getLogger("openlibrary.api")
class OLError(Exception):
@@ -212,9 +214,9 @@ def marshal(data):
elif isinstance(data, datetime.datetime):
return {"type": "/type/datetime", "value": data.isoformat()}
elif isinstance(data, Text):
return {"type": "/type/text", "value": unicode(data)}
return {"type": "/type/text", "value": six.text_type(data)}
elif isinstance(data, Reference):
return {"key": unicode(data)}
return {"key": six.text_type(data)}
else:
return data
@@ -258,11 +260,11 @@ def parse_datetime(value):
return datetime.datetime(*map(int, tokens))
class Text(six.text_type):
    """Unicode subclass marking Open Library /type/text values.

    marshal() serializes instances as {"type": "/type/text", "value": ...}.
    """
    def __repr__(self):
        return u"<text: %s>" % six.text_type.__repr__(self)
class Reference(six.text_type):
    """Unicode subclass marking Open Library document keys.

    marshal() serializes instances as {"key": ...}.
    """
    def __repr__(self):
        return u"<ref: %s>" % six.text_type.__repr__(self)
@@ -35,6 +35,8 @@
import web
from infogami import config
import six
from openlibrary.catalog.merge.merge_marc import build_marc
from openlibrary.catalog.utils import mk_norm
from openlibrary.core import lending
@@ -55,7 +57,7 @@ def strip_accents(s):
"""
if isinstance(s, str):
return s
assert isinstance(s, unicode)
assert isinstance(s, six.text_type)
return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
def normalize(s): # strip non-alphanums and truncate at 25 chars
@@ -5,6 +5,9 @@
from openlibrary.catalog.utils import key_int, match_with_bad_chars, pick_best_author, remove_trailing_number_dot
from unicodedata import normalize
import web, re, sys, codecs, urllib
import six
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, unmarshal, Reference
from openlibrary.catalog.utils.edit import fix_edition
@@ -113,7 +116,7 @@ def do_normalize(author_key, best_key, authors):
if m:
need_update = True
v = v[:-len(m.group(1))]
if not isinstance(v, unicode):
if not isinstance(v, six.text_type):
continue
norm_v = norm(v)
if v == norm_v:
@@ -126,7 +129,7 @@ def do_normalize(author_key, best_key, authors):
for k in author_keys:
if k not in best:
v = a[k]
if not isinstance(v, unicode):
if not isinstance(v, six.text_type):
continue
norm_v = norm(v)
if v == norm_v:
@@ -137,7 +140,7 @@ def do_normalize(author_key, best_key, authors):
v = best[k]
if 'date' in k:
v = remove_trailing_number_dot(v)
if isinstance(v, unicode):
if isinstance(v, six.text_type):
v = norm(v)
if k not in a or v != a[k]:
a[k] = v
@@ -15,6 +15,8 @@
from catalog.read_rc import read_rc
import six
rc = read_rc()
marc_index = web.database(dbn='postgres', db='marc_index')
@@ -82,7 +84,7 @@ def fix_toc(e):
return
if isinstance(toc[0], dict) and toc[0]['type'] == '/type/toc_item':
return
return [{'title': unicode(i), 'type': '/type/toc_item'} for i in toc if i != u'']
return [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i != u'']
re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')
@@ -113,7 +113,7 @@ def fix_toc(e):
return
if isinstance(toc[0], dict) and toc[0]['type'] == '/type/toc_item':
return
return [{'title': unicode(i), 'type': '/type/toc_item'} for i in toc if i != u'']
return [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i != u'']
re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')
@@ -9,6 +9,8 @@
from openlibrary.api import OpenLibrary, unmarshal, marshal
from pprint import pprint
import six
rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])
@@ -28,7 +30,7 @@ def fix_toc(e):
# http://openlibrary.org/books/OL789133M - /type/toc_item missing from table_of_contents
if isinstance(toc[0], dict) and ('pagenum' in toc[0] or toc[0]['type'] == '/type/toc_item'):
return
return [{'title': unicode(i), 'type': '/type/toc_item'} for i in toc if i != u'']
return [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i != u'']
re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')
@@ -14,7 +14,7 @@ class BadLength(MarcException):
pass
def norm(s):
    """Return *s* coerced to a unicode string in Unicode NFC form."""
    return normalize('NFC', six.text_type(s))
class BinaryDataField():
def __init__(self, rec, line):
@@ -2,6 +2,8 @@
from marc_base import MarcBase, MarcException
from unicodedata import normalize
import six
data_tag = '{http://www.loc.gov/MARC21/slim}datafield'
control_tag = '{http://www.loc.gov/MARC21/slim}controlfield'
subfield_tag = '{http://www.loc.gov/MARC21/slim}subfield'
@@ -21,7 +23,7 @@ def read_marc_file(f):
elem.clear()
def norm(s):
    """NFC-normalise *s*, first replacing non-breaking spaces (U+00A0) with plain spaces."""
    return normalize('NFC', six.text_type(s.replace(u'\xa0', ' ')))
def get_text(e):
    """Return the normalised text content of element *e*, or u'' when it has none."""
    if e.text:
        return norm(e.text)
    return u''
@@ -4,6 +4,8 @@
from parse import read_edition
from unicodedata import normalize
import six
slim = '{http://www.loc.gov/MARC21/slim}'
leader_tag = slim + 'leader'
data_tag = slim + 'datafield'
@@ -13,7 +15,7 @@
record_tag = slim + 'record'
def norm(s):
    """Return *s* coerced to a unicode string in Unicode NFC form."""
    return normalize('NFC', six.text_type(s))
# NOTE(review): empty marker class; its usage is not visible in this hunk —
# presumably used by the MARC XML parser to flag malformed subfield tags. Confirm at call sites.
class BadSubtag:
    pass
@@ -5,6 +5,8 @@
from time import sleep
from unicodedata import normalize
import six
re_question = re.compile('^\?+$')
re_lccn = re.compile('(...\d+).*')
re_letters = re.compile('[A-Za-z]')
@@ -155,10 +157,10 @@ def read_edition(f):
# return None
# continue
if tag == '008':
publish_date = unicode(line)[7:11]
publish_date = six.text_type(line)[7:11]
if publish_date.isdigit():
edition["publish_date"] = publish_date
publish_country = unicode(line)[15:18]
publish_country = six.text_type(line)[15:18]
if publish_country not in ('|||', ' '):
edition["publish_country"] = publish_country
continue
@@ -5,6 +5,8 @@
from build_record import build_record
import sys, re
import six
trans = {'&':'&amp;','<':'&lt;','>':'&gt;','\n':'<br>'}
re_html_replace = re.compile('([&<>\n])')
@@ -84,7 +86,7 @@ def output_record_as_html(rec):
elif rec[k] is None:
v = '<em>empty</em>'
else:
v = esc(unicode(rec[k]))
v = esc(six.text_type(rec[k]))
rows.append('<tr><th>%s</th><td>%s</td></tr>\n' % (label, v))
return '<table>' + ''.join(rows) + '</table>'
@@ -3,6 +3,8 @@
from names import match_name
from normalize import normalize
import six
re_year = re.compile('(\d{4})$')
re_amazon_title_paren = re.compile('^(.*) \([^)]+?\)$')
re_and_of_space = re.compile(' and | of | ')
@@ -291,7 +293,7 @@ def test_merge_titles():
'title': 'Spytime',
}
amazon = build_titles(unicode(full_title(amazon)))
amazon = build_titles(six.text_type(full_title(amazon)))
marc = build_titles(marc['title_with_subtitles'])
assert amazon['short_title'] == marc['short_title']
assert compare_title(amazon, marc) == ('full-title', 'containted within other title', 350)
@@ -303,7 +305,7 @@ def test_merge_titles2():
'title': u'seabirds of Britain and Ireland',
'full_title': u'The seabirds of Britain and Ireland',
}
amazon = build_titles(unicode(full_title(amazon)))
amazon = build_titles(six.text_type(full_title(amazon)))
marc = build_titles(marc['title_with_subtitles'])
assert compare_title(amazon, marc) == ('full-title', 'exact match', 600)
@@ -3,6 +3,8 @@
from names import match_name
from normalize import normalize
import six
re_year = re.compile('(\d{4})$')
re_amazon_title_paren = re.compile('^(.*) \([^)]+?\)$')
re_and_of_space = re.compile(' and | of | ')
@@ -285,7 +287,7 @@ def test_merge_titles():
'title': 'Spytime',
}
amazon = build_titles(unicode(full_title(amazon)))
amazon = build_titles(six.text_type(full_title(amazon)))
marc = build_titles(marc['title_with_subtitles'])
assert amazon['short_title'] == marc['short_title']
assert compare_title(amazon, marc) == ('full-title', 'containted within other title', 350)
@@ -297,7 +299,7 @@ def test_merge_titles2():
'title': u'seabirds of Britain and Ireland',
'full_title': u'The seabirds of Britain and Ireland',
}
amazon = build_titles(unicode(full_title(amazon)))
amazon = build_titles(six.text_type(full_title(amazon)))
marc = build_titles(marc['title_with_subtitles'])
assert compare_title(amazon, marc) == ('full-title', 'exact match', 600)
@@ -1,11 +1,13 @@
import re, unicodedata
import six
#re_brace = re.compile('{[^{}]+?}')
re_normalize = re.compile('[^[:alpha:] ]', re.I)
re_whitespace = re.compile('[-\s,;.]+')
def normalize(s):
if isinstance(s, unicode):
if isinstance(s, six.text_type):
s = unicodedata.normalize('NFC', s.replace(u'\u0142', u'l'))
s = s.replace(' & ', ' and ')
# remove {mlrhring} and friends
@@ -23,4 +25,3 @@ def normalize(s):
# a = "Tha{mllhring}{macr}alib{macr}i, {mllhring}Abd al-Malik ibn Mu{dotb}hammad 961 or 2-1037 or 8."
# b = u"Tha\xb0\xe5alib\xe5i, \xb0Abd al-Malik ibn Mu\xf2hammad 961 or 2-1037 or 8."
# assert normalize(a) == normalize(b)
@@ -13,6 +13,8 @@
from thread_utils import AsyncChannel, threaded_generator
from onix import OnixProduct, OnixHandler, onix_codelists
import six
def parser (input):
# returns a generator that produces dicts representing Open Library items
@@ -262,16 +264,13 @@ def person_name (x):
return name
def elt_get (e, tag, reference_name):
    """Look up child *tag* (falling back to reference_name.lower()) on *e*.

    Returns the value as a unicode string, or None when absent/empty.
    """
    ee = e.get (tag) or e.get (reference_name.lower ())
    return six.text_type(ee) if ee else None
re_by = re.compile ("^\s*by\s+", re.IGNORECASE)
re_iname = re.compile ("^(.*),\s*(.*)$")
def add_val (o, key, val):
    """Append *val* to the list stored under *key* in mapping *o*; None values are skipped."""
    if val is None:
        return
    o.setdefault (key, []).append (val)
@@ -5,6 +5,8 @@
__credits__ = "Many thanks to pjz, bitsko, and DanC."
__copyright__ = "(C) 2003-2006 Aaron Swartz. GNU GPL 2."
import six
def isstr(f):
    """Return True when *f* is a (byte-era or unicode) string."""
    return isinstance(f, (type(''), type(u'')))
def islst(f):
    """Return True when *f* is a tuple or a list."""
    return isinstance(f, (type(()), type([])))
@@ -16,6 +18,7 @@ def quote(x, elt=True):
if not elt: x = x.replace('"', '&quot;')
return x
@six.python_2_unicode_compatible
class Element:
def __init__(self, name, attrs=None, children=None, prefixes=None, line=None):
if islst(name) and name[0] == None: name = name[1]
@@ -97,15 +100,12 @@ def arep(a, inprefixes, addns=1):
return out
def __str__(self):
    """Concatenate the unicode text of all children and collapse whitespace runs to single spaces."""
    # post-commit form: __unicode__ and the encoding __str__ were folded into one
    # __str__ (the class carries @six.python_2_unicode_compatible for Python 2).
    text = ''
    for x in self._dir:
        text += six.text_type(x)
    return ' '.join(text.split())
def __getattr__(self, n):
if n[0] == '_': raise AttributeError("Use foo['"+n+"'] to access the child element.")
if self._dNS: n = (self._dNS, n)
@@ -4,7 +4,6 @@
import openlibrary.catalog.merge.normalize as merge
import six
from six.moves import range
try:
cmp = cmp # Python 2
@@ -129,15 +128,15 @@ def pick_first_date(dates):
return { 'date': fix_l_in_date(' '.join([remove_trailing_number_dot(d) for d in dates])) }
def strip_accents(s):
    """Return an ASCII byte string of *s* with accents dropped (NFKD decompose, then ASCII-ignore encode)."""
    return normalize('NFKD', six.text_type(s)).encode('ASCII', 'ignore')
re_drop = re.compile('[?,]')
def match_with_bad_chars(a, b):
if unicode(a) == unicode(b):
if six.text_type(a) == six.text_type(b):
return True
a = normalize('NFKD', unicode(a)).lower()
b = normalize('NFKD', unicode(b)).lower()
a = normalize('NFKD', six.text_type(a)).lower()
b = normalize('NFKD', six.text_type(b)).lower()
if a == b:
return True
a = a.encode('ASCII', 'ignore')
@@ -152,7 +151,7 @@ def accent_count(s):
return len([c for c in norm(s) if ord(c) > 127])
def norm(s):
    """NFC-normalise unicode strings; return any non-unicode value unchanged."""
    return normalize('NFC', s) if isinstance(s, six.text_type) else s
def pick_best_name(names):
names = [norm(n) for n in names]
Oops, something went wrong.

0 comments on commit d3f63c1

Please sign in to comment.