Skip to content
Permalink
Browse files

Fix #11, #12: quote attributes that need escaping in legacy browsers

These are mostly out of the market now, so this isn't massively
needed any more; nevertheless, avoiding XSS as much as possible is
inevitably desirable.

This alters the API so that quote_attr_values is now a ternary
setting, choosing between legacy-safe behaviour, spec behaviour, and
always quoting.
  • Loading branch information...
gsnedders committed Jul 19, 2013
1 parent 4768c64 commit 9b8d8eb5afbc066b7fac9390f5ec75e5e8a7cab7
@@ -33,6 +33,13 @@ Released on XXX
* **Use scripting disabled by default (as we don't implement
scripting).**

* **Fix #11, avoiding the XSS bug potentially caused by the serializer
allowing attribute values to be broken out of in old browser versions.
The quote_attr_values option on the serializer now takes one of
three values: "always" (the old True value), "legacy" (the new option,
and the new default), and "spec" (the old False value, and the old
default).**


0.9999999/1.0b8
~~~~~~~~~~~~~~~
@@ -10,7 +10,16 @@

# Join spaceCharacters into one string so it can be concatenated into the
# regex character classes built below.
spaceCharacters = "".join(spaceCharacters)

# NOTE(review): this assignment is superseded by the re-assignment of
# quoteAttributeSpec two statements below; it looks like the pre-change
# line of the diff -- confirm before relying on it.
quoteAttributeSpec = re.compile("[" + spaceCharacters + "\"'=<>`]")
# Characters whose presence makes the "spec" pattern match: the space
# characters plus " ' = < > and the backtick.
quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
# Wider character class for the "legacy" pattern: everything in the spec set
# plus the code points U+0000-U+0020, "/", "`", U+00A0 and a list of further
# Unicode code points in the range U+1680-U+3000.
quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")

try:
from codecs import register_error, xmlcharrefreplace_errors
@@ -72,7 +81,7 @@ def htmlentityreplace_errors(exc):
class HTMLSerializer(object):

# attribute quoting options
quote_attr_values = False
quote_attr_values = "legacy" # be secure by default
quote_char = '"'
use_best_quote_char = True

@@ -108,9 +117,9 @@ def __init__(self, **kwargs):
inject_meta_charset=True|False
Whether to insert a meta element to define the character set of the
document.
quote_attr_values=True|False
quote_attr_values="legacy"|"spec"|"always"
Whether to quote attribute values that don't require quoting
per HTML5 parsing rules.
per legacy browser behaviour, when required by the standard, or always.
quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote,
@@ -239,10 +248,15 @@ def serialize(self, treewalker, encoding=None):
(k not in booleanAttributes.get(name, tuple()) and
k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=")
if self.quote_attr_values:
if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True
elif self.quote_attr_values == "spec":
quote_attr = quoteAttributeSpec.search(v) is not None
elif self.quote_attr_values == "legacy":
quote_attr = quoteAttributeLegacy.search(v) is not None
else:
quote_attr = len(v) == 0 or quoteAttributeSpec.search(v)
raise ValueError("quote_attr_values must be one of: "
"'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;")
@@ -242,7 +242,7 @@
},
{
"expected": [
"<span title=foo\u000bbar>"
"<span title=\"foo\u000bbar\">"
],
"input": [
[
@@ -41,9 +41,9 @@
]
]
],
"description": "quote_attr_values=true",
"description": "quote_attr_values='always'",
"options": {
"quote_attr_values": true
"quote_attr_values": "always"
}
},
{
@@ -64,9 +64,78 @@
]
]
],
"description": "quote_attr_values=true with irrelevant",
"description": "quote_attr_values='always' with irrelevant",
"options": {
"quote_attr_values": true
"quote_attr_values": "always"
}
},
{
"expected": [
"<div class=\"foo\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='always'",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div class=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='legacy'",
"options": {
"quote_attr_values": "legacy"
}
},
{
"expected": [
"<div class=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='spec'",
"options": {
"quote_attr_values": "spec"
}
},
{
@@ -146,6 +146,37 @@ def testComment():
throwsWithLatin1([["Comment", "\u0101"]])


@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"))
def testSpecQuoteAttribute(c):
    """Serialize a one-character attribute value with quote_attr_values="spec".

    The expected markup wraps the value in single quotes when ``c`` is a
    double quote, and in double quotes for every other character.
    """
    attribute = {"namespace": None, "name": "foo", "value": c}
    start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "span",
                 [attribute]]
    # A double-quote value cannot sit inside double quotes, so the expected
    # output switches to single quotes for that one case.
    template = "<span foo='%s'>" if c == '"' else '<span foo="%s">'
    runSerializerTest([start_tag], [template % c],
                      {"quote_attr_values": "spec"})


@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000"))
def testLegacyQuoteAttribute(c):
    """Serialize a one-character attribute value with quote_attr_values="legacy".

    The expected markup wraps the value in single quotes when ``c`` is a
    double quote, and in double quotes for every other character.
    """
    attribute = {"namespace": None, "name": "foo", "value": c}
    start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "span",
                 [attribute]]
    # A double-quote value cannot sit inside double quotes, so the expected
    # output switches to single quotes for that one case.
    template = "<span foo='%s'>" if c == '"' else '<span foo="%s">'
    runSerializerTest([start_tag], [template % c],
                      {"quote_attr_values": "legacy"})


@pytest.fixture
def lxml_parser():
    """Provide an ``etree.XMLParser`` built with ``resolve_entities=False``."""
    parser = etree.XMLParser(resolve_entities=False)
    return parser

0 comments on commit 9b8d8eb

Please sign in to comment.
You can’t perform that action at this time.