diff --git a/expat/lib/xmlparse.c b/expat/lib/xmlparse.c index 0a8703760..407a3bf66 100644 --- a/expat/lib/xmlparse.c +++ b/expat/lib/xmlparse.c @@ -89,6 +89,7 @@ # endif #endif +#include #include #include /* memset(), memcpy() */ #include @@ -647,6 +648,7 @@ struct XML_ParserStruct { XML_Index m_parseEndByteIndex; const char *m_parseEndPtr; + size_t m_partialTokenBytesBefore; /* used in heuristic to avoid O(n^2) */ XML_Char *m_dataBuf; XML_Char *m_dataBufEnd; XML_StartElementHandler m_startElementHandler; @@ -978,6 +980,32 @@ get_hash_secret_salt(XML_Parser parser) { return parser->m_hash_secret_salt; } +static enum XML_Error +callProcessor(XML_Parser parser, const char *start, const char *end, + const char **endPtr) { + const size_t have_now = EXPAT_SAFE_PTR_DIFF(end, start); + + if (! parser->m_parsingStatus.finalBuffer) { + // Heuristic: don't try to parse a partial token again until the amount of + // available data has increased significantly. + const size_t had_before = parser->m_partialTokenBytesBefore; + const bool enough = (have_now >= 2 * had_before); + + if (! enough) { + *endPtr = start; // callers may expect this to be set + return XML_ERROR_NONE; + } + } + const enum XML_Error ret = parser->m_processor(parser, start, end, endPtr); + // if we consumed nothing, remember what we had on this parse attempt. + if (*endPtr == start) { + parser->m_partialTokenBytesBefore = have_now; + } else { + parser->m_partialTokenBytesBefore = 0; + } + return ret; +} + static XML_Bool /* only valid for root parser */ startParsing(XML_Parser parser) { /* hash functions must be initialized before setContext() is called */ @@ -1159,6 +1187,7 @@ parserInit(XML_Parser parser, const XML_Char *encodingName) { parser->m_bufferEnd = parser->m_buffer; parser->m_parseEndByteIndex = 0; parser->m_parseEndPtr = NULL; + parser->m_partialTokenBytesBefore = 0; parser->m_declElementType = NULL; parser->m_declAttributeId = NULL; parser->m_declEntity = NULL; @@ -1890,29 +1919,20 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) { to detect errors based on that fact. */ parser->m_errorCode - = parser->m_processor(parser, parser->m_bufferPtr, - parser->m_parseEndPtr, &parser->m_bufferPtr); + = callProcessor(parser, parser->m_bufferPtr, parser->m_parseEndPtr, + &parser->m_bufferPtr); if (parser->m_errorCode == XML_ERROR_NONE) { switch (parser->m_parsingStatus.parsing) { case XML_SUSPENDED: - /* It is hard to be certain, but it seems that this case - * cannot occur. This code is cleaning up a previous parse - * with no new data (since len == 0). Changing the parsing - * state requires getting to execute a handler function, and - * there doesn't seem to be an opportunity for that while in - * this circumstance. - * - * Given the uncertainty, we retain the code but exclude it - * from coverage tests. - * - * LCOV_EXCL_START - */ + /* While we added no new data, the finalBuffer flag may have caused + * us to parse previously-unparsed data in the internal buffer. + * If that triggered a callback to the application, it would have + * had an opportunity to suspend parsing. */ XmlUpdatePosition(parser->m_encoding, parser->m_positionPtr, parser->m_bufferPtr, &parser->m_position); parser->m_positionPtr = parser->m_bufferPtr; return XML_STATUS_SUSPENDED; - /* LCOV_EXCL_STOP */ case XML_INITIALIZED: case XML_PARSING: parser->m_parsingStatus.parsing = XML_FINISHED; @@ -1942,7 +1962,7 @@ XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) { parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal; parser->m_errorCode - = parser->m_processor(parser, s, parser->m_parseEndPtr = s + len, &end); + = callProcessor(parser, s, parser->m_parseEndPtr = s + len, &end); if (parser->m_errorCode != XML_ERROR_NONE) { parser->m_eventEndPtr = parser->m_eventPtr; @@ -2044,8 +2064,8 @@ XML_ParseBuffer(XML_Parser parser, int len, int isFinal) { parser->m_parseEndByteIndex += len; parser->m_parsingStatus.finalBuffer = (XML_Bool)isFinal; - parser->m_errorCode = parser->m_processor( - parser, start, parser->m_parseEndPtr, &parser->m_bufferPtr); + parser->m_errorCode = callProcessor(parser, start, parser->m_parseEndPtr, + &parser->m_bufferPtr); if (parser->m_errorCode != XML_ERROR_NONE) { parser->m_eventEndPtr = parser->m_eventPtr; @@ -2237,7 +2257,7 @@ XML_ResumeParser(XML_Parser parser) { } parser->m_parsingStatus.parsing = XML_PARSING; - parser->m_errorCode = parser->m_processor( + parser->m_errorCode = callProcessor( parser, parser->m_bufferPtr, parser->m_parseEndPtr, &parser->m_bufferPtr); if (parser->m_errorCode != XML_ERROR_NONE) { diff --git a/expat/tests/basic_tests.c b/expat/tests/basic_tests.c index 84a514216..acfdc8061 100644 --- a/expat/tests/basic_tests.c +++ b/expat/tests/basic_tests.c @@ -49,6 +49,7 @@ #include #include +#include #if ! defined(__cplusplus) # include @@ -5180,6 +5181,87 @@ START_TEST(test_nested_entity_suspend) { } END_TEST +/* Regression test for quadratic parsing on large tokens */ +START_TEST(test_big_tokens_take_linear_time) { + const struct { + const char *pre; + const char *post; + } text[] = { + {"", ""}, // assumed good, used as baseline + {""}, // CDATA, performed OK before patch + {""}, // big attribute, used to be O(N²) + {""}, // long comment, used to be O(N²) + {"<", "/>"}, // big elem name, used to be O(N²) + }; + const int num_cases = sizeof(text) / sizeof(text[0]); + // For the test we need a value that is: + // (1) big enough that the test passes reliably (avoiding flaky tests), and + // (2) small enough that the test actually catches regressions. + const int max_slowdown = 15; + char aaaaaa[4096]; + const int fillsize = (int)sizeof(aaaaaa); + const int fillcount = 100; + + memset(aaaaaa, 'a', fillsize); + +#if defined(_WIN32) + if (CLOCKS_PER_SEC < 100000) { + // Skip this test if clock() doesn't have reasonably good resolution. + // This workaround is only applied to Windows targets, since XSI requires + // the value to be 1 000 000 (10x the condition here), and we want to be + // very sure that at least one platform in CI can catch regressions. + return; + } +#endif + + clock_t baseline = 0; + for (int i = 0; i < num_cases; ++i) { + XML_Parser parser = XML_ParserCreate(NULL); + assert_true(parser != NULL); + enum XML_Status status; + set_subtest("%saaaaaa%s", text[i].pre, text[i].post); + const clock_t start = clock(); + + // parse the start text + status = _XML_Parse_SINGLE_BYTES(parser, text[i].pre, + (int)strlen(text[i].pre), XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + // parse lots of 'a', failing the test early if it takes too long + for (int f = 0; f < fillcount; ++f) { + status = _XML_Parse_SINGLE_BYTES(parser, aaaaaa, fillsize, XML_FALSE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + // i == 0 means we're still calculating the baseline value + if (i > 0) { + const clock_t now = clock(); + const clock_t clocks_so_far = now - start; + assert_true(clocks_so_far / baseline < max_slowdown); + } + } + // parse the end text + status = _XML_Parse_SINGLE_BYTES(parser, text[i].post, + (int)strlen(text[i].post), XML_TRUE); + if (status != XML_STATUS_OK) { + xml_failure(parser); + } + + // how long did it take in total? + const clock_t end = clock(); + const clock_t taken = end - start; + if (i == 0) { + assert_true(taken > 0); // just to make sure we don't div-by-0 later + baseline = taken; + } + assert_true(taken / baseline < max_slowdown); + + XML_ParserFree(parser); + } +} +END_TEST + void make_basic_test_case(Suite *s) { TCase *tc_basic = tcase_create("basic tests"); @@ -5419,4 +5501,5 @@ make_basic_test_case(Suite *s) { tcase_add_test__ifdef_xml_dtd(tc_basic, test_pool_integrity_with_unfinished_attr); tcase_add_test__if_xml_ge(tc_basic, test_nested_entity_suspend); + tcase_add_test(tc_basic, test_big_tokens_take_linear_time); }