From 53b5e2525edf9f27e40bbc11fd7a1377e83f94cc Mon Sep 17 00:00:00 2001 From: Jim Baker Date: Mon, 31 Mar 2014 21:16:22 -0600 Subject: [PATCH] Fix against vendor package html5lib to not directly use surrogates in unicode literals --- pip/_vendor/html5lib/inputstream.py | 40 +++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/pip/_vendor/html5lib/inputstream.py b/pip/_vendor/html5lib/inputstream.py index 0ac70bb3a45..dc39ad09332 100644 --- a/pip/_vendor/html5lib/inputstream.py +++ b/pip/_vendor/html5lib/inputstream.py @@ -2,6 +2,7 @@ from pip._vendor.six import text_type import codecs +import platform import re from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase @@ -27,7 +28,18 @@ class BufferedIOBase(object): asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase]) spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) -invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") +invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]" + +if platform.python_implementation() == "Jython": + # Jython does not allow the use of solitary surrogate escapes + # (\uD800-\uDFFF) in literals or other usage. This is because it + # uses UTF-16, which is based on the use of such surrogates. + invalid_unicode_re = re.compile(invalid_unicode_template % "") +else: + # Instead use one extra step of indirection and create surrogates with + # unichr + invalid_unicode_re = re.compile(invalid_unicode_template % ( + "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),)) non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, @@ -160,12 +172,21 @@ def __init__(self, source): """ # Craziness - if len("\U0010FFFF") == 1: + if platform.python_implementation() == "Jython": + # By its nature Jython's UTF-16 support does not allow + # surrogate errors, so no need to do this check. + self.reportCharacterErrors = None + self.replaceCharactersRegexp = None + elif len("\U0010FFFF") == 1: self.reportCharacterErrors = self.characterErrorsUCS4 - self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]") + self.replaceCharactersRegexp = re.compile("{}".format( + "[{}-{}]".format(unichr(0xD800), unichr(0xDFFF)))) else: self.reportCharacterErrors = self.characterErrorsUCS2 - self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?