Fix #11, #12: quote attributes that need escaping in legacy browsers

Such legacy browsers are mostly out of the market now, so this is no longer critically needed; nevertheless, avoiding XSS wherever possible is clearly desirable. This alters the API so that quote_attr_values is now a ternary setting, choosing between legacy-safe behaviour ("legacy"), spec behaviour ("spec"), and always quoting ("always").
html5lib · May 11, 2016 · 9b8d8eb · 9b8d8eb
1 parent 4768c64
commit 9b8d8eb
Show file tree

Hide file tree

Showing 5 changed files with 132 additions and 11 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -33,6 +33,13 @@ Released on XXX
 * **Use scripting disabled by default (as we don't implement
   scripting).**
 
+* **Fix #11, avoiding the XSS bug potentially caused by the serializer
+  emitting attribute values that can be broken out of in old browser
+  versions. The quote_attr_values option on the serializer now takes one
+  of three values: "always" (the old True value), "legacy" (the new
+  option, and the new default), and "spec" (the old False value, and the
+  old default).**
+
 
 0.9999999/1.0b8
 ~~~~~~~~~~~~~~~

diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
@@ -10,7 +10,16 @@
 
 spaceCharacters = "".join(spaceCharacters)
 
-quoteAttributeSpec = re.compile("[" + spaceCharacters + "\"'=<>`]")
+quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
+quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
+quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
+                                  "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
+                                  "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
+                                  "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+                                  "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
+                                  "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+                                  "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
+                                  "\u3000]")
 
 try:
     from codecs import register_error, xmlcharrefreplace_errors
@@ -72,7 +81,7 @@ def htmlentityreplace_errors(exc):
 class HTMLSerializer(object):
 
     # attribute quoting options
-    quote_attr_values = False
+    quote_attr_values = "legacy"  # be secure by default
     quote_char = '"'
     use_best_quote_char = True
 
@@ -108,9 +117,9 @@ def __init__(self, **kwargs):
         inject_meta_charset=True|False
           Whether it insert a meta element to define the character set of the
           document.
-        quote_attr_values=True|False
+        quote_attr_values="legacy"|"spec"|"always"
           Whether to quote attribute values that don't require quoting
-          per HTML5 parsing rules.
+          per legacy browser behaviour, when required by the standard, or always.
         quote_char=u'"'|u"'"
           Use given quote character for attribute quoting. Default is to
           use double quote unless attribute value contains a double quote,
@@ -239,10 +248,15 @@ def serialize(self, treewalker, encoding=None):
                         (k not in booleanAttributes.get(name, tuple()) and
                          k not in booleanAttributes.get("", tuple())):
                         yield self.encodeStrict("=")
-                        if self.quote_attr_values:
+                        if self.quote_attr_values == "always" or len(v) == 0:
                             quote_attr = True
+                        elif self.quote_attr_values == "spec":
+                            quote_attr = quoteAttributeSpec.search(v) is not None
+                        elif self.quote_attr_values == "legacy":
+                            quote_attr = quoteAttributeLegacy.search(v) is not None
                         else:
-                            quote_attr = len(v) == 0 or quoteAttributeSpec.search(v)
+                            raise ValueError("quote_attr_values must be one of: "
+                                             "'always', 'spec', or 'legacy'")
                         v = v.replace("&", "&amp;")
                         if self.escape_lt_in_attrs:
                             v = v.replace("<", "&lt;")

diff --git a/html5lib/tests/serializer-testdata/core.test b/html5lib/tests/serializer-testdata/core.test
@@ -242,7 +242,7 @@
         },
         {
             "expected": [
-                "<span title=foo\u000bbar>"
+                "<span title=\"foo\u000bbar\">"
             ],
             "input": [
                 [

diff --git a/html5lib/tests/serializer-testdata/options.test b/html5lib/tests/serializer-testdata/options.test
@@ -41,9 +41,9 @@
                     ]
                 ]
             ],
-            "description": "quote_attr_values=true",
+            "description": "quote_attr_values='always'",
             "options": {
-                "quote_attr_values": true
+                "quote_attr_values": "always"
             }
         },
         {
@@ -64,9 +64,78 @@
                     ]
                 ]
             ],
-            "description": "quote_attr_values=true with irrelevant",
+            "description": "quote_attr_values='always' with irrelevant",
             "options": {
-                "quote_attr_values": true
+                "quote_attr_values": "always"
+            }
+        },
+        {
+            "expected": [
+                "<div class=\"foo\">"
+            ],
+            "input": [
+                [
+                    "StartTag",
+                    "http://www.w3.org/1999/xhtml",
+                    "div",
+                    [
+                        {
+                            "namespace": null,
+                            "name": "class",
+                            "value": "foo"
+                        }
+                    ]
+                ]
+            ],
+            "description": "non-minimized quote_attr_values='always'",
+            "options": {
+                "quote_attr_values": "always"
+            }
+        },
+        {
+            "expected": [
+                "<div class=foo>"
+            ],
+            "input": [
+                [
+                    "StartTag",
+                    "http://www.w3.org/1999/xhtml",
+                    "div",
+                    [
+                        {
+                            "namespace": null,
+                            "name": "class",
+                            "value": "foo"
+                        }
+                    ]
+                ]
+            ],
+            "description": "non-minimized quote_attr_values='legacy'",
+            "options": {
+                "quote_attr_values": "legacy"
+            }
+        },
+        {
+            "expected": [
+                "<div class=foo>"
+            ],
+            "input": [
+                [
+                    "StartTag",
+                    "http://www.w3.org/1999/xhtml",
+                    "div",
+                    [
+                        {
+                            "namespace": null,
+                            "name": "class",
+                            "value": "foo"
+                        }
+                    ]
+                ]
+            ],
+            "description": "non-minimized quote_attr_values='spec'",
+            "options": {
+                "quote_attr_values": "spec"
             }
         },
         {

diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py
@@ -146,6 +146,37 @@ def testComment():
     throwsWithLatin1([["Comment", "\u0101"]])
 
 
+@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"))
+def testSpecQuoteAttribute(c):
+    input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
+               [{"namespace": None, "name": "foo", "value": c}]]]
+    if c == '"':
+        output_ = ["<span foo='%s'>" % c]
+    else:
+        output_ = ['<span foo="%s">' % c]
+    options_ = {"quote_attr_values": "spec"}
+    runSerializerTest(input_, output_, options_)
+
+
+@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
+                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
+                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
+                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
+                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
+                                   "\u3000"))
+def testLegacyQuoteAttribute(c):
+    input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
+               [{"namespace": None, "name": "foo", "value": c}]]]
+    if c == '"':
+        output_ = ["<span foo='%s'>" % c]
+    else:
+        output_ = ['<span foo="%s">' % c]
+    options_ = {"quote_attr_values": "legacy"}
+    runSerializerTest(input_, output_, options_)
+
+
 @pytest.fixture
 def lxml_parser():
     return etree.XMLParser(resolve_entities=False)