Allow for Python implementations that don't support lone surrogates (read: Jython).

This is based on earlier work by Jim Baker (thanks!).

The two major parts of this are:

 * Avoiding having lone surrogates in any string literals, and
 * Avoiding tests that contain lone surrogates.

As part of this, the decoder for double-escaped tokenizer tests is rewritten
to avoid unicode_escape as that has bogus behaviour with non-ASCII characters.
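
As a quick illustration of that unicode_escape problem (this sketch is illustrative only, not part of the commit, and handles BMP \uXXXX escapes only, unlike the full decoder in the diff below):

import re

from six import unichr  # the same compatibility helper the test suite uses

text = u"caf\u00e9 \\u0041"  # a non-ASCII character plus an escaped "A"

# unicode_escape treats its byte input as Latin-1, so round-tripping through
# UTF-8 decodes the escape but mangles the non-ASCII character.
broken = text.encode("utf-8").decode("unicode-escape")
assert broken == u"caf\u00c3\u00a9 A"  # mojibake

# A targeted substitution of \uXXXX sequences leaves everything else alone.
fixed = re.sub(r"\\u([0-9A-Fa-f]{4})",
               lambda m: unichr(int(m.group(1), 16)),
               text)
assert fixed == u"caf\u00e9 A"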
gsnedders committed Apr 28, 2015
1 parent b293489 commit b51828b
Showing 5 changed files with 87 additions and 14 deletions.
1 change: 1 addition & 0 deletions AUTHORS.rst
@@ -32,3 +32,4 @@ Patches and suggestions
- Juan Carlos Garcia Segovia
- Mike West
- Marc DM
- Jim Baker
5 changes: 3 additions & 2 deletions CHANGES.rst
@@ -4,9 +4,10 @@ Change Log
0.9999
~~~~~~

Released on XXX, 2014
Released on XXX, 2015

* XXX
* Add support for Python implementations that don't support lone surrogates
(read: Jython).


0.999
35 changes: 26 additions & 9 deletions html5lib/inputstream.py
@@ -28,7 +28,18 @@ class BufferedIOBase(object):
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")

invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"

if utils.supports_lone_surrogates:
    # Use one extra step of indirection and create surrogates with
    # unichr. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # surrogates.
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
                                    eval('"\\uD800-\\uDFFF"'))
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +175,18 @@ def __init__(self, source):
        """

        # Craziness
        if len("\U0010FFFF") == 1:
        if not utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
            self.replaceCharactersRegexp = None
        elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
            self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
            self.replaceCharactersRegexp = re.compile(
                eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))

        # List of where new lines occur
        self.newLines = [0]
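
Aside (not part of the diff): the len("\U0010FFFF") == 1 test above is the usual narrow-versus-wide build probe; it decides whether a lone surrogate shows up as a single code unit or only as an unpaired half of a surrogate pair, and hence which error reporter and regexp are needed. A standalone illustration:

# Illustrative sketch only.  On a wide (UCS-4) build, or any Python 3.3+
# interpreter with PEP 393 strings, an astral-plane character is a single
# code unit; on a narrow (UCS-2) build it is stored as two surrogate units.
astral = u"\U0010FFFF"
if len(astral) == 1:
    print("wide build: characterErrorsUCS4 and a plain surrogate class suffice")
else:
    print("narrow build: characterErrorsUCS2 must target unpaired surrogates")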
@@ -265,11 +281,12 @@ def readChunk(self, chunkSize=None):
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        self.reportCharacterErrors(data)
        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

        # Replace invalid characters
        # Note U+0000 is dealt with in the tokenizer
        data = self.replaceCharactersRegexp.sub("\ufffd", data)
            # Replace invalid characters
            # Note U+0000 is dealt with in the tokenizer
            data = self.replaceCharactersRegexp.sub("\ufffd", data)

        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")
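
The eval(...) calls in the hunks above are the key trick: the surrogate range never appears as a string literal, so the module still parses and imports on an implementation such as Jython that rejects lone surrogates at compile time, while capable platforms build the full pattern at run time. A minimal standalone sketch of the same pattern (mine, not from the commit; BASE_INVALID stands in for the long character class above):

import re

from html5lib.utils import supports_lone_surrogates

BASE_INVALID = u"\u0001-\u0008\u000B"  # stand-in for the real ranges

if supports_lone_surrogates:
    # The escape is only parsed here, at run time, behind the platform check.
    surrogate_range = eval('u"\\uD800-\\uDFFF"')
    invalid_re = re.compile(u"[" + BASE_INVALID + surrogate_range + u"]")
else:
    invalid_re = re.compile(u"[" + BASE_INVALID + u"]")

print(len(invalid_re.pattern))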
37 changes: 35 additions & 2 deletions html5lib/tests/test_tokenizer.py
@@ -4,10 +4,12 @@
import warnings
import re

from six import unichr

from .support import get_data_files

from html5lib.tokenizer import HTMLTokenizer
from html5lib import constants
from html5lib import constants, utils


class TokenizerTestParser(object):
@@ -122,9 +124,38 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
    return tokens["expected"] == tokens["received"]


_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")


def unescape(test):
    def decode(inp):
        return inp.encode("utf-8").decode("unicode-escape")
        """Decode \\uXXXX escapes

        This decodes \\uXXXX escapes, possibly into non-BMP characters when
        two surrogate character escapes are adjacent to each other.
        """
        # This cannot be implemented using the unicode_escape codec
        # because that requires its input be ISO-8859-1, and we need
        # arbitrary unicode as input.
        def repl(m):
            if m.group(2) is not None:
                high = int(m.group(1), 16)
                low = int(m.group(2), 16)
                if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
                    cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
                    return unichr(cp)
                else:
                    return unichr(high) + unichr(low)
            else:
                return unichr(int(m.group(1), 16))
        try:
            return _surrogateRe.sub(repl, inp)
        except ValueError:
            # This occurs when unichr throws ValueError, which should
            # only be for a lone-surrogate.
            if utils.supports_lone_surrogates:
                raise
            return None

    test["input"] = decode(test["input"])
    for token in test["output"]:
@@ -183,6 +214,8 @@ def testTokenizer():
                test["initialStates"] = ["Data state"]
            if 'doubleEscaped' in test:
                test = unescape(test)
                if test["input"] is None:
                    continue  # Not valid input for this platform
            for initialState in test["initialStates"]:
                test["initialState"] = capitalize(initialState)
                yield runTokenizerTest, test
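
For reference, the surrogate-pair arithmetic inside repl() can be checked by hand (illustrative only, not part of the diff):

# The escaped pair \uD83D\uDE00 should combine to U+1F600.
high, low = 0xD83D, 0xDE00
cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
assert cp == 0x1F600

# On a platform without lone-surrogate support, a test whose input carries an
# *unpaired* escape such as \uD800 makes unichr() raise ValueError, decode()
# returns None, and the `continue` added above skips that test.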
23 changes: 22 additions & 1 deletion html5lib/utils.py
@@ -2,14 +2,35 @@

from types import ModuleType

from six import text_type

try:
    import xml.etree.cElementTree as default_etree
except ImportError:
    import xml.etree.ElementTree as default_etree


__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
           "surrogatePairToCodepoint", "moduleFactoryFactory"]
           "surrogatePairToCodepoint", "moduleFactoryFactory",
           "supports_lone_surrogates"]


# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
# Jython. This is because UTF-16 itself is based on the use of such
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
    _x = eval('"\\uD800"')
    if not isinstance(_x, text_type):
        # We need this with u"" because of http://bugs.jython.org/issue2039
        _x = eval('u"\\uD800"')
        assert isinstance(_x, text_type)
except:
    supports_lone_surrogates = False
else:
    supports_lone_surrogates = True


class MethodDispatcher(dict):
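
Unrolled, the probe above asks two things: can a lone-surrogate escape be evaluated at all, and does the result come back as a text string (the u"" retry works around http://bugs.jython.org/issue2039). A hypothetical consumer (not part of the commit) branches on the exported flag instead of ever writing such a literal directly:

from six import text_type

from html5lib import utils

print("supports_lone_surrogates = %r" % utils.supports_lone_surrogates)

if utils.supports_lone_surrogates:
    probe = eval('u"\\uD800"')  # safe: guarded by the flag
    assert isinstance(probe, text_type) and len(probe) == 1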
