Skip to content

Commit

Permalink
Fix #11, #12: quote attributes that need escaping in legacy browsers
Browse files Browse the repository at this point in the history
These browsers are mostly out of the market now, so this isn't massively
needed any more; nevertheless, avoiding XSS as much as possible is
still desirable.

This alters the API so that quote_attr_values is now a ternary
setting, choosing between legacy-safe behaviour, spec behaviour, and
always quoting.
  • Loading branch information
gsnedders committed May 11, 2016
1 parent 4768c64 commit 9b8d8eb
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 11 deletions.
7 changes: 7 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ Released on XXX
* **Use scripting disabled by default (as we don't implement
scripting).**

* **Fix #11, avoiding the XSS bug potentially caused by the serializer
letting content break out of unquoted attribute values in old browser
versions, by changing the quote_attr_values option on the serializer to
take one of three values: "always" (the old True value), "legacy" (the
new option, and the new default), and "spec" (the old False value, and
the old default).**


0.9999999/1.0b8
~~~~~~~~~~~~~~~
Expand Down
26 changes: 20 additions & 6 deletions html5lib/serializer/htmlserializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,16 @@

# Rebind spaceCharacters as one string (joined from whatever iterable of
# strings the name held before; that definition is outside this excerpt) so it
# can be spliced directly into the regex character classes below.
spaceCharacters = "".join(spaceCharacters)

# NOTE(review): this first binding of quoteAttributeSpec is overwritten two
# statements further down by a pattern built from the same characters. It looks
# like the pre-change line of the diff and is presumably absent from the
# committed file -- confirm.
quoteAttributeSpec = re.compile("[" + spaceCharacters + "\"'=<>`]")
# Characters that make the serializer quote an attribute value when
# quote_attr_values == "spec": the space characters plus " ' = < > and `.
quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
# Pattern consulted when quote_attr_values == "legacy": the spec set extended
# with U+0000 through U+001F, U+0020, "/" (U+002F), "`" (U+0060) and the
# code points from U+00A0 to U+3000 listed below. Several of these repeat
# members of the spec set; repeats inside a character class are harmless.
quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")

try:
from codecs import register_error, xmlcharrefreplace_errors
Expand Down Expand Up @@ -72,7 +81,7 @@ def htmlentityreplace_errors(exc):
class HTMLSerializer(object):

# attribute quoting options
quote_attr_values = False
quote_attr_values = "legacy" # be secure by default
quote_char = '"'
use_best_quote_char = True

Expand Down Expand Up @@ -108,9 +117,9 @@ def __init__(self, **kwargs):
inject_meta_charset=True|False
Whether to insert a meta element to define the character set of the
document.
quote_attr_values=True|False
quote_attr_values="legacy"|"spec"|"always"
Whether to quote attribute values that don't require quoting
per HTML5 parsing rules.
per legacy browser behaviour, when required by the standard, or always.
quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote,
Expand Down Expand Up @@ -239,10 +248,15 @@ def serialize(self, treewalker, encoding=None):
(k not in booleanAttributes.get(name, tuple()) and
k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=")
if self.quote_attr_values:
if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True
elif self.quote_attr_values == "spec":
quote_attr = quoteAttributeSpec.search(v) is not None
elif self.quote_attr_values == "legacy":
quote_attr = quoteAttributeLegacy.search(v) is not None
else:
quote_attr = len(v) == 0 or quoteAttributeSpec.search(v)
raise ValueError("quote_attr_values must be one of: "
"'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;")
Expand Down
2 changes: 1 addition & 1 deletion html5lib/tests/serializer-testdata/core.test
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@
},
{
"expected": [
"<span title=foo\u000bbar>"
"<span title=\"foo\u000bbar\">"
],
"input": [
[
Expand Down
77 changes: 73 additions & 4 deletions html5lib/tests/serializer-testdata/options.test
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@
]
]
],
"description": "quote_attr_values=true",
"description": "quote_attr_values='always'",
"options": {
"quote_attr_values": true
"quote_attr_values": "always"
}
},
{
Expand All @@ -64,9 +64,78 @@
]
]
],
"description": "quote_attr_values=true with irrelevant",
"description": "quote_attr_values='always' with irrelevant",
"options": {
"quote_attr_values": true
"quote_attr_values": "always"
}
},
{
"expected": [
"<div class=\"foo\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='always'",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div class=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='legacy'",
"options": {
"quote_attr_values": "legacy"
}
},
{
"expected": [
"<div class=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='spec'",
"options": {
"quote_attr_values": "spec"
}
},
{
Expand Down
31 changes: 31 additions & 0 deletions html5lib/tests/test_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,37 @@ def testComment():
throwsWithLatin1([["Comment", "\u0101"]])


@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"))
def testSpecQuoteAttribute(c):
    """Serialize a one-character attribute value with quote_attr_values="spec".

    Each parametrized character is expected to force quoting of the value.
    The expected markup wraps the value in double quotes, except when the
    value is itself a double quote, where single quotes are expected instead.
    """
    attribute = {"namespace": None, "name": "foo", "value": c}
    tokens = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
               [attribute]]]
    # Pick the wrapper that does not collide with the character under test.
    template = "<span foo='%s'>" if c == '"' else '<span foo="%s">'
    runSerializerTest(tokens, [template % c], {"quote_attr_values": "spec"})


@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000"))
def testLegacyQuoteAttribute(c):
    """Serialize a one-character attribute value with quote_attr_values="legacy".

    The parameter list is the "spec" character set followed by the extra
    characters that only the legacy mode treats as needing quotes. Every
    case expects a quoted value: double quotes normally, single quotes when
    the value is itself a double quote.
    """
    start_tag = ["StartTag", "http://www.w3.org/1999/xhtml", "span",
                 [{"namespace": None, "name": "foo", "value": c}]]
    if c != '"':
        expected = ['<span foo="%s">' % c]
    else:
        # A double-quote value cannot sit inside double quotes.
        expected = ["<span foo='%s'>" % c]
    settings = {"quote_attr_values": "legacy"}
    runSerializerTest([start_tag], expected, settings)


@pytest.fixture
def lxml_parser():
    """Fixture providing an ``etree.XMLParser`` with entity resolution off."""
    parser = etree.XMLParser(resolve_entities=False)
    return parser
Expand Down

0 comments on commit 9b8d8eb

Please sign in to comment.