Skip to content

Commit

Permalink
Merge pull request #1664 from cclauss/fix-unicode-for-python-3
Browse files — browse the repository at this point in the history
unicode() was removed in Python 3
  • Loading branch information
mekarpeles committed Dec 2, 2018
2 parents (8fb2c8a + 3990a85), commit d3f63c1
Show file tree
Hide file tree
Showing 39 changed files with 126 additions and 85 deletions.
14 changes: 8 additions & 6 deletions openlibrary/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import web
import logging

import six

logger = logging.getLogger("openlibrary.api")

class OLError(Exception):
Expand Down Expand Up @@ -212,9 +214,9 @@ def marshal(data):
elif isinstance(data, datetime.datetime):
return {"type": "/type/datetime", "value": data.isoformat()}
elif isinstance(data, Text):
return {"type": "/type/text", "value": unicode(data)}
return {"type": "/type/text", "value": six.text_type(data)}
elif isinstance(data, Reference):
return {"key": unicode(data)}
return {"key": six.text_type(data)}
else:
return data

Expand Down Expand Up @@ -258,11 +260,11 @@ def parse_datetime(value):
return datetime.datetime(*map(int, tokens))


class Text(unicode):
class Text(six.text_type):
def __repr__(self):
return "<text: %s>" % unicode.__repr__(self)
return u"<text: %s>" % six.text_type.__repr__(self)


class Reference(unicode):
class Reference(six.text_type):
def __repr__(self):
return "<ref: %s>" % unicode.__repr__(self)
return u"<ref: %s>" % six.text_type.__repr__(self)
4 changes: 3 additions & 1 deletion openlibrary/catalog/add_book/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
import web
from infogami import config

import six

from openlibrary.catalog.merge.merge_marc import build_marc
from openlibrary.catalog.utils import mk_norm
from openlibrary.core import lending
Expand All @@ -55,7 +57,7 @@ def strip_accents(s):
"""
if isinstance(s, str):
return s
assert isinstance(s, unicode)
assert isinstance(s, six.text_type)
return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))

def normalize(s): # strip non-alphanums and truncate at 25 chars
Expand Down
9 changes: 6 additions & 3 deletions openlibrary/catalog/author/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from openlibrary.catalog.utils import key_int, match_with_bad_chars, pick_best_author, remove_trailing_number_dot
from unicodedata import normalize
import web, re, sys, codecs, urllib

import six

sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, unmarshal, Reference
from openlibrary.catalog.utils.edit import fix_edition
Expand Down Expand Up @@ -113,7 +116,7 @@ def do_normalize(author_key, best_key, authors):
if m:
need_update = True
v = v[:-len(m.group(1))]
if not isinstance(v, unicode):
if not isinstance(v, six.text_type):
continue
norm_v = norm(v)
if v == norm_v:
Expand All @@ -126,7 +129,7 @@ def do_normalize(author_key, best_key, authors):
for k in author_keys:
if k not in best:
v = a[k]
if not isinstance(v, unicode):
if not isinstance(v, six.text_type):
continue
norm_v = norm(v)
if v == norm_v:
Expand All @@ -137,7 +140,7 @@ def do_normalize(author_key, best_key, authors):
v = best[k]
if 'date' in k:
v = remove_trailing_number_dot(v)
if isinstance(v, unicode):
if isinstance(v, six.text_type):
v = norm(v)
if k not in a or v != a[k]:
a[k] = v
Expand Down
4 changes: 3 additions & 1 deletion openlibrary/catalog/importer/add_source_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

from catalog.read_rc import read_rc

import six

rc = read_rc()

marc_index = web.database(dbn='postgres', db='marc_index')
Expand Down Expand Up @@ -82,7 +84,7 @@ def fix_toc(e):
return
if isinstance(toc[0], dict) and toc[0]['type'] == '/type/toc_item':
return
return [{'title': unicode(i), 'type': '/type/toc_item'} for i in toc if i != u'']
return [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i != u'']

re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')

Expand Down
2 changes: 1 addition & 1 deletion openlibrary/catalog/importer/import_marc.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def fix_toc(e):
return
if isinstance(toc[0], dict) and toc[0]['type'] == '/type/toc_item':
return
return [{'title': unicode(i), 'type': '/type/toc_item'} for i in toc if i != u'']
return [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i != u'']

re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')

Expand Down
4 changes: 3 additions & 1 deletion openlibrary/catalog/importer/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from openlibrary.api import OpenLibrary, unmarshal, marshal
from pprint import pprint

import six

rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])
Expand All @@ -28,7 +30,7 @@ def fix_toc(e):
# http://openlibrary.org/books/OL789133M - /type/toc_item missing from table_of_contents
if isinstance(toc[0], dict) and ('pagenum' in toc[0] or toc[0]['type'] == '/type/toc_item'):
return
return [{'title': unicode(i), 'type': '/type/toc_item'} for i in toc if i != u'']
return [{'title': six.text_type(i), 'type': '/type/toc_item'} for i in toc if i != u'']

re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')

Expand Down
2 changes: 1 addition & 1 deletion openlibrary/catalog/marc/marc_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class BadLength(MarcException):
pass

def norm(s):
return normalize('NFC', unicode(s))
return normalize('NFC', six.text_type(s))

class BinaryDataField():
def __init__(self, rec, line):
Expand Down
4 changes: 3 additions & 1 deletion openlibrary/catalog/marc/marc_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from marc_base import MarcBase, MarcException
from unicodedata import normalize

import six

data_tag = '{http://www.loc.gov/MARC21/slim}datafield'
control_tag = '{http://www.loc.gov/MARC21/slim}controlfield'
subfield_tag = '{http://www.loc.gov/MARC21/slim}subfield'
Expand All @@ -21,7 +23,7 @@ def read_marc_file(f):
elem.clear()

def norm(s):
return normalize('NFC', unicode(s.replace(u'\xa0', ' ')))
return normalize('NFC', six.text_type(s.replace(u'\xa0', ' ')))

def get_text(e):
return norm(e.text) if e.text else u''
Expand Down
4 changes: 3 additions & 1 deletion openlibrary/catalog/marc/parse_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from parse import read_edition
from unicodedata import normalize

import six

slim = '{http://www.loc.gov/MARC21/slim}'
leader_tag = slim + 'leader'
data_tag = slim + 'datafield'
Expand All @@ -13,7 +15,7 @@
record_tag = slim + 'record'

def norm(s):
return normalize('NFC', unicode(s))
return normalize('NFC', six.text_type(s))

class BadSubtag:
pass
Expand Down
6 changes: 4 additions & 2 deletions openlibrary/catalog/marc/read_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from time import sleep
from unicodedata import normalize

import six

re_question = re.compile('^\?+$')
re_lccn = re.compile('(...\d+).*')
re_letters = re.compile('[A-Za-z]')
Expand Down Expand Up @@ -155,10 +157,10 @@ def read_edition(f):
# return None
# continue
if tag == '008':
publish_date = unicode(line)[7:11]
publish_date = six.text_type(line)[7:11]
if publish_date.isdigit():
edition["publish_date"] = publish_date
publish_country = unicode(line)[15:18]
publish_country = six.text_type(line)[15:18]
if publish_country not in ('|||', ' '):
edition["publish_country"] = publish_country
continue
Expand Down
4 changes: 3 additions & 1 deletion openlibrary/catalog/marc/simple_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from build_record import build_record
import sys, re

import six

trans = {'&':'&amp;','<':'&lt;','>':'&gt;','\n':'<br>'}
re_html_replace = re.compile('([&<>\n])')

Expand Down Expand Up @@ -84,7 +86,7 @@ def output_record_as_html(rec):
elif rec[k] is None:
v = '<em>empty</em>'
else:
v = esc(unicode(rec[k]))
v = esc(six.text_type(rec[k]))
rows.append('<tr><th>%s</th><td>%s</td></tr>\n' % (label, v))

return '<table>' + ''.join(rows) + '</table>'
Expand Down
6 changes: 4 additions & 2 deletions openlibrary/catalog/merge/amazon.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from names import match_name
from normalize import normalize

import six

re_year = re.compile('(\d{4})$')
re_amazon_title_paren = re.compile('^(.*) \([^)]+?\)$')
re_and_of_space = re.compile(' and | of | ')
Expand Down Expand Up @@ -291,7 +293,7 @@ def test_merge_titles():
'title': 'Spytime',
}

amazon = build_titles(unicode(full_title(amazon)))
amazon = build_titles(six.text_type(full_title(amazon)))
marc = build_titles(marc['title_with_subtitles'])
assert amazon['short_title'] == marc['short_title']
assert compare_title(amazon, marc) == ('full-title', 'containted within other title', 350)
Expand All @@ -303,7 +305,7 @@ def test_merge_titles2():
'title': u'seabirds of Britain and Ireland',
'full_title': u'The seabirds of Britain and Ireland',
}
amazon = build_titles(unicode(full_title(amazon)))
amazon = build_titles(six.text_type(full_title(amazon)))
marc = build_titles(marc['title_with_subtitles'])
assert compare_title(amazon, marc) == ('full-title', 'exact match', 600)

Expand Down
6 changes: 4 additions & 2 deletions openlibrary/catalog/merge/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from names import match_name
from normalize import normalize

import six

re_year = re.compile('(\d{4})$')
re_amazon_title_paren = re.compile('^(.*) \([^)]+?\)$')
re_and_of_space = re.compile(' and | of | ')
Expand Down Expand Up @@ -285,7 +287,7 @@ def test_merge_titles():
'title': 'Spytime',
}

amazon = build_titles(unicode(full_title(amazon)))
amazon = build_titles(six.text_type(full_title(amazon)))
marc = build_titles(marc['title_with_subtitles'])
assert amazon['short_title'] == marc['short_title']
assert compare_title(amazon, marc) == ('full-title', 'containted within other title', 350)
Expand All @@ -297,7 +299,7 @@ def test_merge_titles2():
'title': u'seabirds of Britain and Ireland',
'full_title': u'The seabirds of Britain and Ireland',
}
amazon = build_titles(unicode(full_title(amazon)))
amazon = build_titles(six.text_type(full_title(amazon)))
marc = build_titles(marc['title_with_subtitles'])
assert compare_title(amazon, marc) == ('full-title', 'exact match', 600)

Expand Down
5 changes: 3 additions & 2 deletions openlibrary/catalog/merge/normalize.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import re, unicodedata

import six

#re_brace = re.compile('{[^{}]+?}')
re_normalize = re.compile('[^[:alpha:] ]', re.I)
re_whitespace = re.compile('[-\s,;.]+')

def normalize(s):
if isinstance(s, unicode):
if isinstance(s, six.text_type):
s = unicodedata.normalize('NFC', s.replace(u'\u0142', u'l'))
s = s.replace(' & ', ' and ')
# remove {mlrhring} and friends
Expand All @@ -23,4 +25,3 @@ def normalize(s):
# a = "Tha{mllhring}{macr}alib{macr}i, {mllhring}Abd al-Malik ibn Mu{dotb}hammad 961 or 2-1037 or 8."
# b = u"Tha\xb0\xe5alib\xe5i, \xb0Abd al-Malik ibn Mu\xf2hammad 961 or 2-1037 or 8."
# assert normalize(a) == normalize(b)

11 changes: 5 additions & 6 deletions openlibrary/catalog/onix/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from thread_utils import AsyncChannel, threaded_generator
from onix import OnixProduct, OnixHandler, onix_codelists

import six

def parser (input):
# returns a generator that produces dicts representing Open Library items

Expand Down Expand Up @@ -262,16 +264,13 @@ def person_name (x):
return name

def elt_get (e, tag, reference_name):
ee = e.get (tag) or e.get (reference_name.lower ())
if ee:
return unicode (ee)
else:
return None
ee = e.get (tag) or e.get (reference_name.lower ())
return six.text_type(ee) if ee else None


re_by = re.compile ("^\s*by\s+", re.IGNORECASE)
re_iname = re.compile ("^(.*),\s*(.*)$")

def add_val (o, key, val):
if val is not None:
o.setdefault (key, []).append (val)

10 changes: 5 additions & 5 deletions openlibrary/catalog/onix/xmltramp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
__credits__ = "Many thanks to pjz, bitsko, and DanC."
__copyright__ = "(C) 2003-2006 Aaron Swartz. GNU GPL 2."

import six

def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u''))
def islst(f): return isinstance(f, type(())) or isinstance(f, type([]))

Expand All @@ -16,6 +18,7 @@ def quote(x, elt=True):
if not elt: x = x.replace('"', '&quot;')
return x

@six.python_2_unicode_compatible
class Element:
def __init__(self, name, attrs=None, children=None, prefixes=None, line=None):
if islst(name) and name[0] == None: name = name[1]
Expand Down Expand Up @@ -97,15 +100,12 @@ def arep(a, inprefixes, addns=1):

return out

def __unicode__(self):
def __str__(self):
text = ''
for x in self._dir:
text += unicode(x)
text += six.text_type(x)
return ' '.join(text.split())

def __str__(self):
return self.__unicode__().encode('utf-8')

def __getattr__(self, n):
if n[0] == '_': raise AttributeError("Use foo['"+n+"'] to access the child element.")
if self._dNS: n = (self._dNS, n)
Expand Down
11 changes: 5 additions & 6 deletions openlibrary/catalog/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import openlibrary.catalog.merge.normalize as merge

import six
from six.moves import range

try:
cmp = cmp # Python 2
Expand Down Expand Up @@ -129,15 +128,15 @@ def pick_first_date(dates):
return { 'date': fix_l_in_date(' '.join([remove_trailing_number_dot(d) for d in dates])) }

def strip_accents(s):
return normalize('NFKD', unicode(s)).encode('ASCII', 'ignore')
return normalize('NFKD', six.text_type(s)).encode('ASCII', 'ignore')

re_drop = re.compile('[?,]')

def match_with_bad_chars(a, b):
if unicode(a) == unicode(b):
if six.text_type(a) == six.text_type(b):
return True
a = normalize('NFKD', unicode(a)).lower()
b = normalize('NFKD', unicode(b)).lower()
a = normalize('NFKD', six.text_type(a)).lower()
b = normalize('NFKD', six.text_type(b)).lower()
if a == b:
return True
a = a.encode('ASCII', 'ignore')
Expand All @@ -152,7 +151,7 @@ def accent_count(s):
return len([c for c in norm(s) if ord(c) > 127])

def norm(s):
return normalize('NFC', s) if isinstance(s, unicode) else s
return normalize('NFC', s) if isinstance(s, six.text_type) else s

def pick_best_name(names):
names = [norm(n) for n in names]
Expand Down
Loading

0 comments on commit d3f63c1

Please sign in to comment.