Skip to content

Commit

Permalink
Fix against vendor package html5lib to not directly use surrogates in…
Browse files Browse the repository at this point in the history
… unicode literals
  • Loading branch information
jimbaker committed Apr 1, 2014
1 parent 1e0711e commit 53b5e25
Showing 1 changed file with 32 additions and 8 deletions.
40 changes: 32 additions & 8 deletions pip/_vendor/html5lib/inputstream.py
Expand Up @@ -2,6 +2,7 @@
from pip._vendor.six import text_type

import codecs
import platform
import re

from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
Expand All @@ -27,7 +28,18 @@ class BufferedIOBase(object):
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
# Regex character class listing the code points treated as invalid here.
# The trailing %s is a slot for the surrogate range (U+D800-U+DFFF), which is
# filled in at runtime so that no surrogate escape appears in a literal.
invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"

# unichr() only exists on Python 2 / Jython; chr() is its Python 3 equivalent.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

if platform.python_implementation() == "Jython":
    # Jython does not allow the use of solitary surrogate escapes
    # (\uD800-\uDFFF) in literals or other usage. This is because it
    # uses UTF-16, which is based on the use of such surrogates.
    # The surrogate range is therefore left out of the character class.
    invalid_unicode_re = re.compile(invalid_unicode_template % "")
else:
    # Use one extra step of indirection and create the surrogate range
    # endpoints at runtime instead of writing them as escapes.
    invalid_unicode_re = re.compile(invalid_unicode_template % (
        "%s-%s" % (_unichr(0xD800), _unichr(0xDFFF)),))

non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
Expand Down Expand Up @@ -160,12 +172,21 @@ def __init__(self, source):
"""

# Craziness
if len("\U0010FFFF") == 1:
if platform.python_implementation() == "Jython":
# By its nature Jython's UTF-16 support does not allow
# surrogate errors, so no need to do this check.
self.reportCharacterErrors = None
self.replaceCharactersRegexp = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
self.replaceCharactersRegexp = re.compile("{}".format(
"[{}-{}]".format(unichr(0xD800), unichr(0xDFFF))))
else:
self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
# Narrow (UCS-2) build: match a lone surrogate, i.e. a high surrogate that is
# not followed by a low surrogate, or a low surrogate that is not preceded by
# a high one. The endpoints are created with unichr() so that no surrogate
# escape appears in a literal. Explicit field indices keep the template valid
# on Python 2.6 and allow each endpoint to be passed only once.
# The look-ahead class "[{2}-{3}]" must be closed with "]"; without it the
# class swallows the rest of the alternation and the pattern never matches.
self.replaceCharactersRegexp = re.compile(
    "([{0}-{1}](?![{2}-{3}])|(?<![{0}-{1}])[{2}-{3}])".format(
        unichr(0xD800), unichr(0xDBFF), unichr(0xDC00), unichr(0xDFFF)))

# List of where new lines occur
self.newLines = [0]
Expand Down Expand Up @@ -260,11 +281,14 @@ def readChunk(self, chunkSize=None):
self._bufferedCharacter = data[-1]
data = data[:-1]

self.reportCharacterErrors(data)
if platform.python_implementation() != "Jython":
# data is already Unicode, so Jython already has dealt
# with any surrogate character errors, no need to go here
self.reportCharacterErrors(data)

# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)

data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
Expand Down

1 comment on commit 53b5e25

@tseaver
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you submitted this as a PR to the pypa folks?

Please sign in to comment.