From 9cdf9b8d77d5c2c2a27d15fb68dd3f83cafb45a1 Mon Sep 17 00:00:00 2001
From: Snild Dolkow <snild@sony.com>
Date: Thu, 17 Aug 2023 16:25:26 +0200
Subject: [PATCH] Skip parsing after repeated partials on the same token
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the parse buffer contains the starting bytes of a token but not
all of them, we cannot parse the token to completion. We call this a
partial token.  When this happens, the parse position is reset to the
start of the token, and the parse() call returns. The client is then
expected to provide more data and call parse() again.

In extreme cases, this means that the bytes of a token may be parsed
many times: once for every buffer refill required before the full token
is present in the buffer.

Math:
  Assume there's a token of T bytes
  Assume the client fills the buffer in chunks of X bytes
  We'll try to parse X, 2X, 3X, 4X ... until mX == T (technically >=)
  That's (m²+m)X/2 = (T²/X+T)/2 bytes parsed (arithmetic progression)
  While it is alleviated by larger refills, this amounts to O(T²)

Expat grows its internal buffer by doubling it when necessary, but has
no way to inform the client about how much space is available. Instead,
we add a heuristic that skips parsing when we've repeatedly stopped on
an incomplete token. Specifically:

 * Only try to parse if we have a certain amount of data buffered
 * Every time we stop on an incomplete token, double the threshold
 * As soon as any token completes, the threshold is reset

This means that when we get stuck on an incomplete token, the threshold
grows exponentially, effectively making the client perform larger buffer
fills, limiting how many times we can end up re-parsing the same bytes.

Math:
  Assume there's a token of T bytes
  Assume the client fills the buffer in chunks of X bytes
  We'll try to parse X, 2X, 4X, 8X ... until (2^k)X == T (or larger)
  That's (2^(k+1)-1)X bytes parsed -- e.g. 15X if T = 8X
  This is equal to 2T-X, which amounts to O(T)

We could've chosen a faster growth rate, e.g. 4 or 8. Those seem to
increase performance further, at the cost of further increasing the
risk of growing the buffer more than necessary. This can easily be
adjusted in the future, if desired.

This is all completely transparent to the client, except for:
1. possible delay of some callbacks (when our heuristic overshoots)
2. apps that never do isFinal=XML_TRUE could miss data at the end

For the affected testdata, this change shows a 100-400x speedup.
The recset.xml benchmark shows no clear change either way.

Before:
benchmark -n ../testdata/largefiles/recset.xml 65535 3
  3 loops, with buffer size 65535. Average time per loop: 0.270223
benchmark -n ../testdata/largefiles/aaaaaa_attr.xml 4096 3
  3 loops, with buffer size 4096. Average time per loop: 15.033048
benchmark -n ../testdata/largefiles/aaaaaa_cdata.xml 4096 3
  3 loops, with buffer size 4096. Average time per loop: 0.018027
benchmark -n ../testdata/largefiles/aaaaaa_comment.xml 4096 3
  3 loops, with buffer size 4096. Average time per loop: 11.775362
benchmark -n ../testdata/largefiles/aaaaaa_tag.xml 4096 3
  3 loops, with buffer size 4096. Average time per loop: 11.711414
benchmark -n ../testdata/largefiles/aaaaaa_text.xml 4096 3
  3 loops, with buffer size 4096. Average time per loop: 0.019362

After:
./run.sh benchmark -n ../testdata/largefiles/recset.xml 65535 3
  3 loops, with buffer size 65535. Average time per loop: 0.269030
./run.sh benchmark -n ../testdata/largefiles/aaaaaa_attr.xml 4096 3
  3 loops, with buffer size 4096. Average time per loop: 0.044794
./run.sh benchmark -n ../testdata/largefiles/aaaaaa_cdata.xml 4096 3
  3 loops, with buffer size 4096. Average time per loop: 0.016377
./run.sh benchmark -n ../testdata/largefiles/aaaaaa_comment.xml 4096 3
  3 loops, with buffer size 4096. Average time per loop: 0.027022
./run.sh benchmark -n ../testdata/largefiles/aaaaaa_tag.xml 4096 3
  3 loops, with buffer size 4096. Average time per loop: 0.099360
./run.sh benchmark -n ../testdata/largefiles/aaaaaa_text.xml 4096 3
  3 loops, with buffer size 4096. Average time per loop: 0.017956
---
 expat/lib/xmlparse.c      | 58 ++++++++++++++++++---------
 expat/tests/basic_tests.c | 83 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+), 19 deletions(-)

diff --git a/expat/lib/xmlparse.c b/expat/lib/xmlparse.c
index 0a8703760..407a3bf66 100644
--- a/expat/lib/xmlparse.c
+++ b/expat/lib/xmlparse.c
@@ -89,6 +89,7 @@
 #  endif
 #endif
 
+#include <stdbool.h>
 #include <stddef.h>
 #include <string.h> /* memset(), memcpy() */
 #include <assert.h>
@@ -647,6 +648,7 @@ struct XML_ParserStruct {
 
   XML_Index m_parseEndByteIndex;
   const char *m_parseEndPtr;
+  size_t m_partialTokenBytesBefore; /* used in heuristic to avoid O(n^2) */
   XML_Char *m_dataBuf;
   XML_Char *m_dataBufEnd;
   XML_StartElementHandler m_startElementHandler;
@@ -978,6 +980,32 @@ get_hash_secret_salt(XML_Parser parser) {
   return parser->m_hash_secret_salt;
 }
 
+static enum XML_Error
+callProcessor(XML_Parser parser, const char *start, const char *end,
+              const char **endPtr) {
+  const size_t have_now = EXPAT_SAFE_PTR_DIFF(end, start);
+
+  if (! parser->m_parsingStatus.finalBuffer) {
+    // Heuristic: don't try to parse a partial token again until the amount of
+    // available data has increased significantly.
+    const size_t had_before = parser->m_partialTokenBytesBefore;
+    const bool enough = (have_now >= 2 * had_before);
+
+    if (! enough) {
+      *endPtr = start; // callers may expect this to be set
+      return XML_ERROR_NONE;
+    }
+  }
+  const enum XML_Error ret = parser->m_processor(parser, start, end, endPtr);
+  // if we consumed nothing, remember what we had on this parse attempt.
+  if (*endPtr == start) {
+    parser->m_partialTokenBytesBefore = have_now;
+  } else {
+    parser->m_partialTokenBytesBefore = 0;
+  }
+  return ret;
+}
+
 static XML_Bool /* only valid for root parser */
 startParsing(XML_Parser parser) {
   /* hash functions must be initialized before setContext() is called */
@@ -1159,6 +1187,7 @@ parserInit(XML_Parser parser, const XML_Char *encodingName) {
   parser->m_bufferEnd = parser->m_buffer;
   parser->m_parseEndByteIndex = 0;
   parser->m_parseEndPtr = NULL;
+  parser->m_partialTokenBytesBefore = 0;
   parser->m_declElementType = NULL;
   parser->m_declAttributeId = NULL;
   parser->m_declEntity = NULL;
@@ -1890,29 +1919,20 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
        to detect errors based on that fact.
     */
     parser->m_errorCode
-        = parser->m_processor(parser, parser->m_bufferPtr,
-                              parser->m_parseEndPtr, &parser->m_bufferPtr);
+        = callProcessor(parser, parser->m_bufferPtr, parser->m_parseEndPtr,
+                        &parser->m_bufferPtr);
 
     if (parser->m_errorCode == XML_ERROR_NONE) {
       switch (parser->m_parsingStatus.parsing) {
       case XML_SUSPENDED:
-        /* It is hard to be certain, but it seems that this case
-         * cannot occur.  This code is cleaning up a previous parse
-         * with no new data (since len == 0).  Changing the parsing
-         * state requires getting to execute a handler function, and
-         * there doesn't seem to be an opportunity for that while in
-         * this circumstance.
-         *
-         * Given the uncertainty, we retain the code but exclude it
-         * from coverage tests.
-         *
-         * LCOV_EXCL_START
-         */
+        /* While we added no new data, the finalBuffer flag may have caused
+         * us to parse previously-unparsed data in the internal buffer.
+         * If that triggered a callback to the application, it would have
+         * had an opportunity to suspend parsing. */
         XmlUpdatePosition(parser->m_encoding, parser->m_positionPtr,
                           parser->m_bufferPtr, &parser->m_position);
         parser->m_positionPtr = parser->m_bufferPtr;
         return XML_STATUS_SUSPENDED;
-        /* LCOV_EXCL_STOP */
       case XML_INITIALIZED:
       case XML_PARSING:
         parser->m_parsingStatus.parsing = XML_FINISHED;
@@ -1942,7 +1962,7 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) {
     parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
 
     parser->m_errorCode
-        = parser->m_processor(parser, s, parser->m_parseEndPtr = s + len, &end);
+        = callProcessor(parser, s, parser->m_parseEndPtr = s + len, &end);
 
     if (parser->m_errorCode != XML_ERROR_NONE) {
       parser->m_eventEndPtr = parser->m_eventPtr;
@@ -2044,8 +2064,8 @@ XML_ParseBuffer(XML_Parser parser, int len, int isFinal) {
   parser->m_parseEndByteIndex += len;
   parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal;
 
-  parser->m_errorCode = parser->m_processor(
-      parser, start, parser->m_parseEndPtr, &parser->m_bufferPtr);
+  parser->m_errorCode = callProcessor(parser, start, parser->m_parseEndPtr,
+                                      &parser->m_bufferPtr);
 
   if (parser->m_errorCode != XML_ERROR_NONE) {
     parser->m_eventEndPtr = parser->m_eventPtr;
@@ -2237,7 +2257,7 @@ XML_ResumeParser(XML_Parser parser) {
   }
   parser->m_parsingStatus.parsing = XML_PARSING;
 
-  parser->m_errorCode = parser->m_processor(
+  parser->m_errorCode = callProcessor(
       parser, parser->m_bufferPtr, parser->m_parseEndPtr, &parser->m_bufferPtr);
 
   if (parser->m_errorCode != XML_ERROR_NONE) {
diff --git a/expat/tests/basic_tests.c b/expat/tests/basic_tests.c
index 84a514216..acfdc8061 100644
--- a/expat/tests/basic_tests.c
+++ b/expat/tests/basic_tests.c
@@ -49,6 +49,7 @@
 
 #include <stdio.h>
 #include <string.h>
+#include <time.h>
 
 #if ! defined(__cplusplus)
 #  include <stdbool.h>
@@ -5180,6 +5181,87 @@ START_TEST(test_nested_entity_suspend) {
 }
 END_TEST
 
+/* Regression test for quadratic parsing on large tokens */
+START_TEST(test_big_tokens_take_linear_time) {
+  const struct {
+    const char *pre;
+    const char *post;
+  } text[] = {
+      {"<a>", "</a>"},                      // assumed good, used as baseline
+      {"<b><![CDATA[ value: ", " ]]></b>"}, // CDATA, performed OK before patch
+      {"<c attr='", "'></c>"},              // big attribute, used to be O(N²)
+      {"<d><!-- ", " --></d>"},             // long comment, used to be O(N²)
+      {"<e><", "/></e>"},                   // big elem name, used to be O(N²)
+  };
+  const int num_cases = sizeof(text) / sizeof(text[0]);
+  // For the test we need a <max_slowdown> value that is:
+  // (1) big enough that the test passes reliably (avoiding flaky tests), and
+  // (2) small enough that the test actually catches regressions.
+  const int max_slowdown = 15;
+  char aaaaaa[4096];
+  const int fillsize = (int)sizeof(aaaaaa);
+  const int fillcount = 100;
+
+  memset(aaaaaa, 'a', fillsize);
+
+#if defined(_WIN32)
+  if (CLOCKS_PER_SEC < 100000) {
+    // Skip this test if clock() doesn't have reasonably good resolution.
+    // This workaround is only applied to Windows targets, since XSI requires
+    // the value to be 1 000 000 (10x the condition here), and we want to be
+    // very sure that at least one platform in CI can catch regressions.
+    return;
+  }
+#endif
+
+  clock_t baseline = 0;
+  for (int i = 0; i < num_cases; ++i) {
+    XML_Parser parser = XML_ParserCreate(NULL);
+    assert_true(parser != NULL);
+    enum XML_Status status;
+    set_subtest("%saaaaaa%s", text[i].pre, text[i].post);
+    const clock_t start = clock();
+
+    // parse the start text
+    status = _XML_Parse_SINGLE_BYTES(parser, text[i].pre,
+                                     (int)strlen(text[i].pre), XML_FALSE);
+    if (status != XML_STATUS_OK) {
+      xml_failure(parser);
+    }
+    // parse lots of 'a', failing the test early if it takes too long
+    for (int f = 0; f < fillcount; ++f) {
+      status = _XML_Parse_SINGLE_BYTES(parser, aaaaaa, fillsize, XML_FALSE);
+      if (status != XML_STATUS_OK) {
+        xml_failure(parser);
+      }
+      // i == 0 means we're still calculating the baseline value
+      if (i > 0) {
+        const clock_t now = clock();
+        const clock_t clocks_so_far = now - start;
+        assert_true(clocks_so_far / baseline < max_slowdown);
+      }
+    }
+    // parse the end text
+    status = _XML_Parse_SINGLE_BYTES(parser, text[i].post,
+                                     (int)strlen(text[i].post), XML_TRUE);
+    if (status != XML_STATUS_OK) {
+      xml_failure(parser);
+    }
+
+    // how long did it take in total?
+    const clock_t end = clock();
+    const clock_t taken = end - start;
+    if (i == 0) {
+      assert_true(taken > 0); // just to make sure we don't div-by-0 later
+      baseline = taken;
+    }
+    assert_true(taken / baseline < max_slowdown);
+
+    XML_ParserFree(parser);
+  }
+}
+END_TEST
+
 void
 make_basic_test_case(Suite *s) {
   TCase *tc_basic = tcase_create("basic tests");
@@ -5419,4 +5501,5 @@ make_basic_test_case(Suite *s) {
   tcase_add_test__ifdef_xml_dtd(tc_basic,
                                 test_pool_integrity_with_unfinished_attr);
   tcase_add_test__if_xml_ge(tc_basic, test_nested_entity_suspend);
+  tcase_add_test(tc_basic, test_big_tokens_take_linear_time);
 }