Skip to content

Commit

Permalink
Make parsing of text be non-quadratic.
Browse files Browse the repository at this point in the history
In Python, appending strings is not guaranteed to be constant-time,
since they are documented to be immutable.  In some corner cases,
CPython is able to make these operations constant-time, but reaching
into ETree objects is not such a case.

This leads to parse times being quadratic in the size of the text in
the input in pathological cases where parsing outputs a large number
of adjacent text nodes which must be combined (e.g. HTML-escaped
values).  Specifically, we expect doubling the size of the input to
result in approximately doubling the time to parse; instead, we
observe quadratic behavior:

```
In [1]: import html5lib

In [2]: %timeit -n1 -r5 html5lib.parse("<" * 200000)
2.99 s ± 269 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [3]: %timeit -n1 -r5 html5lib.parse("<" * 400000)
6.7 s ± 242 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [4]: %timeit -n1 -r5 html5lib.parse("<" * 800000)
19.5 s ± 1.48 s per loop (mean ± std. dev. of 5 runs, 1 loop each)
```

Switch from appending to the internal `str` to appending text to a
list of text chunks, since list appends take amortized constant time.
Using `bytearray` is a similar solution, but benchmarks slightly worse
because the strings must be encoded before being appended.

This improves parsing of text documents noticeably:

```
In [1]: import html5lib

In [2]: %timeit -n1 -r5 html5lib.parse("<" * 200000)
2.3 s ± 373 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [3]: %timeit -n1 -r5 html5lib.parse("<" * 400000)
3.85 s ± 29.7 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [4]: %timeit -n1 -r5 html5lib.parse("<" * 800000)
8.04 s ± 317 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
```
  • Loading branch information
alexmv committed Feb 27, 2024
1 parent fd4f032 commit 075cb7c
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 33 deletions.
81 changes: 52 additions & 29 deletions html5lib/treebuilders/etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@
tag_regexp = re.compile("{([^}]*)}(.*)")


class TextBuffer:
    """Append-only text accumulator used in place of a ``str``.

    Text is collected as a list of chunks and joined only on demand, so a
    long run of ``append`` calls does not repeatedly copy the accumulated
    text the way ``str +=`` can.

    The buffer mimics the parts of ``str`` behaviour that the surrounding
    code relies on: it compares equal to a string (or another buffer) with
    the same content, and it is falsy when it holds no text.
    """

    def __init__(self, initial=""):
        # ``chunks`` holds the pieces in insertion order.
        self.chunks = [initial]

    def __str__(self):
        return self.getvalue()

    def __repr__(self):
        return "%s(%r)" % (type(self).__name__, self.getvalue())

    def __bool__(self):
        # An empty buffer must be falsy, like the empty string it replaces;
        # otherwise ``if element.text:`` checks treat "no text" as text.
        # ``any`` stops at the first non-empty chunk, so this stays cheap.
        return any(self.chunks)

    # Python 2 spelling of ``__bool__``.
    __nonzero__ = __bool__

    def getvalue(self):
        """Return the accumulated text as a single string."""
        return "".join(self.chunks)

    def append(self, other):
        """Add the string ``other`` to the end of the buffer."""
        self.chunks.append(other)

    def __eq__(self, other):
        # Compare by content, against either another buffer or a string.
        if isinstance(other, TextBuffer):
            other = other.getvalue()
        return self.getvalue() == other

    def __ne__(self, other):
        # Python 2 does not derive ``__ne__`` from ``__eq__``.
        return not self == other

    # Mutable and compared by value, so instances must not be hashable.
    __hash__ = None


def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
Expand Down Expand Up @@ -110,25 +127,25 @@ def removeChild(self, node):
def insertText(self, data, insertBefore=None):
if not len(self._element):
if not self._element.text:
self._element.text = ""
self._element.text += data
self._element.text = TextBuffer("")
self._element.text.append(data)
elif insertBefore is None:
# Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
self._element[-1].tail = TextBuffer("")
self._element[-1].tail.append(data)
else:
# Insert the text before the specified node
children = list(self._element)
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index - 1].tail:
self._element[index - 1].tail = ""
self._element[index - 1].tail += data
self._element[index - 1].tail = TextBuffer("")
self._element[index - 1].tail.append(data)
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
self._element.text = TextBuffer("")
self._element.text.append(data)

def cloneNode(self):
element = type(self)(self.name, self.namespace)
Expand All @@ -138,36 +155,39 @@ def cloneNode(self):

def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
newParent.childNodes[-1]._element.tail.append(
self._element.text.getvalue()
)
else:
if not newParent._element.text:
newParent._element.text = ""
newParent._element.text = TextBuffer("")
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
newParent._element.text.append(self._element.text.getvalue())
self._element.text = TextBuffer("")
base.Node.reparentChildren(self, newParent)

class Comment(Element):
def __init__(self, data):
# Use the superclass constructor to set all properties on the
# wrapper element
self._element = ElementTree.Comment(data)
self._element.text = TextBuffer(data)
self.parent = None
self._childNodes = []
self._flags = []

def _getData(self):
return self._element.text
return self._element.text.getvalue()

def _setData(self, value):
self._element.text = value
self._element.text = TextBuffer(value)

data = property(_getData, _setData)

class DocumentType(Element):
def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>")
self._element.text = name
self._element.text = TextBuffer(name)
self.publicId = publicId
self.systemId = systemId

Expand Down Expand Up @@ -208,19 +228,19 @@ def serializeElement(element, indent=0):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append("""<!DOCTYPE %s "%s" "%s">""" %
(element.text, publicId, systemId))
(element.text.getvalue(), publicId, systemId))
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
rv.append("<!DOCTYPE %s>" % (element.text.getvalue(),))
elif element.tag == "DOCUMENT_ROOT":
rv.append("#document")
if element.text is not None:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue()))
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
raise TypeError("Document node cannot have attributes")
elif element.tag == ElementTreeCommentType:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text.getvalue()))
else:
assert isinstance(element.tag, text_type), \
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
Expand Down Expand Up @@ -248,13 +268,14 @@ def serializeElement(element, indent=0):

for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
if element.text and element.text.getvalue():
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue()))
indent += 2
for child in element:
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail.getvalue()))

serializeElement(element, 0)

return "\n".join(rv)
Expand All @@ -272,13 +293,15 @@ def serializeElement(element):
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
(element.text, publicId, systemId))
rv.append(
"""<!DOCTYPE %s PUBLIC "%s" "%s">"""
% (element.text.getvalue(), publicId, systemId)
)
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
rv.append("<!DOCTYPE %s>" % (element.text.getvalue(),))
elif element.tag == "DOCUMENT_ROOT":
if element.text is not None:
rv.append(element.text)
rv.append(element.text.getvalue())
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
Expand All @@ -288,7 +311,7 @@ def serializeElement(element):
serializeElement(child)

elif element.tag == ElementTreeCommentType:
rv.append("<!--%s-->" % (element.text,))
rv.append("<!--%s-->" % (element.text.getvalue(),))
else:
# This is assumed to be an ordinary element
if not element.attrib:
Expand All @@ -299,15 +322,15 @@ def serializeElement(element):
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text:
rv.append(element.text)
rv.append(element.text.getvalue())

for child in element:
serializeElement(child)

rv.append("</%s>" % (element.tag,))

if element.tail:
rv.append(element.tail)
rv.append(element.tail.getvalue())

serializeElement(element)

Expand Down
12 changes: 8 additions & 4 deletions html5lib/treewalkers/etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, _, _, flag = node
if flag in ("text", "tail"):
return base.TEXT, getattr(elt, flag)
return base.TEXT, getattr(elt, flag).getvalue()
else:
node = elt

Expand All @@ -44,11 +44,15 @@ def getNodeDetails(self, node):
return (base.DOCUMENT,)

elif node.tag == "<!DOCTYPE>":
return (base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
return (
base.DOCTYPE,
node.text.getvalue(),
node.get("publicId"),
node.get("systemId"),
)

elif node.tag == ElementTreeCommentType:
return base.COMMENT, node.text
return base.COMMENT, node.text.getvalue()

else:
assert isinstance(node.tag, string_types), type(node.tag)
Expand Down

0 comments on commit 075cb7c

Please sign in to comment.