From de97030ff54ee0bd306cbc58bd8093645cc8a5dc Mon Sep 17 00:00:00 2001 From: Csaba Varga Date: Tue, 21 Jan 2020 19:32:03 +0100 Subject: [PATCH] Fix edge cases with delimiters being near a buffer boundary. --- .../org/jsoup/parser/CharacterReader.java | 10 +++- src/main/java/org/jsoup/parser/Tokeniser.java | 5 ++ .../java/org/jsoup/parser/TokeniserState.java | 14 ++++-- .../org/jsoup/parser/CharacterReaderTest.java | 10 +++- .../java/org/jsoup/parser/TokeniserTest.java | 47 +++++++++++++++++-- 5 files changed, 74 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index 515dcc729f..16f742ee07 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -217,8 +217,16 @@ String consumeTo(String seq) { String consumed = cacheString(charBuf, stringCache, bufPos, offset); bufPos += offset; return consumed; - } else { + } else if (bufLength - bufPos < seq.length()) { + // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF return consumeToEnd(); + } else { + // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters + // unread in case they contain the beginning of the search string + int endPos = bufLength - seq.length() + 1; + String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos); + bufPos = endPos; + return consumed; } } diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java index b20dd6b191..c5e4bc203f 100644 --- a/src/main/java/org/jsoup/parser/Tokeniser.java +++ b/src/main/java/org/jsoup/parser/Tokeniser.java @@ -234,6 +234,11 @@ void emitCommentPending() { emit(commentPending); } + void createBogusCommentPending() { + commentPending.reset(); + commentPending.bogus = true; + } + void createDoctypePending() { doctypePending.reset(); } diff --git a/src/main/java/org/jsoup/parser/TokeniserState.java b/src/main/java/org/jsoup/parser/TokeniserState.java index bb21d0f6e5..544c0b5892 100644 --- a/src/main/java/org/jsoup/parser/TokeniserState.java +++ b/src/main/java/org/jsoup/parser/TokeniserState.java @@ -105,6 +105,7 @@ void read(Tokeniser t, CharacterReader r) { t.advanceTransition(EndTagOpen); break; case '?': + t.createBogusCommentPending(); t.advanceTransition(BogusComment); break; default: @@ -134,6 +135,7 @@ void read(Tokeniser t, CharacterReader r) { t.advanceTransition(Data); } else { t.error(this); + t.createBogusCommentPending(); t.advanceTransition(BogusComment); } } @@ -910,12 +912,13 @@ void read(Tokeniser t, CharacterReader r) { // todo: handle bogus comment starting from eof. when does that trigger? // rewind to capture character that lead us here r.unconsume(); - Token.Comment comment = new Token.Comment(); - comment.bogus = true; - comment.data.append(r.consumeTo('>')); + t.commentPending.data.append(r.consumeTo('>')); // todo: replace nullChar with replaceChar - t.emit(comment); - t.advanceTransition(Data); + char next = r.consume(); + if (next == '>' || next == eof) { + t.emitCommentPending(); + t.transition(Data); + } } }, MarkupDeclarationOpen { @@ -933,6 +936,7 @@ void read(Tokeniser t, CharacterReader r) { t.transition(CdataSection); } else { t.error(this); + t.createBogusCommentPending(); t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind } } diff --git a/src/test/java/org/jsoup/parser/CharacterReaderTest.java b/src/test/java/org/jsoup/parser/CharacterReaderTest.java index 9323f54023..1ec8c59167 100644 --- a/src/test/java/org/jsoup/parser/CharacterReaderTest.java +++ b/src/test/java/org/jsoup/parser/CharacterReaderTest.java @@ -133,7 +133,15 @@ public class CharacterReaderTest { assertEquals('T', r.consume()); assertEquals("wo ", r.consumeTo("Two")); assertEquals('T', r.consume()); - assertEquals("wo Four", r.consumeTo("Qux")); + // To handle strings straddling across buffers, consumeTo() may return the + // data in multiple pieces near EOF. + StringBuilder builder = new StringBuilder(); + String part; + do { + part = r.consumeTo("Qux"); + builder.append(part); + } while (!part.isEmpty()); + assertEquals("wo Four", builder.toString()); } @Test public void advance() { diff --git a/src/test/java/org/jsoup/parser/TokeniserTest.java b/src/test/java/org/jsoup/parser/TokeniserTest.java index 2a2aa57401..ca80e30823 100644 --- a/src/test/java/org/jsoup/parser/TokeniserTest.java +++ b/src/test/java/org/jsoup/parser/TokeniserTest.java @@ -1,20 +1,24 @@ package org.jsoup.parser; +import static org.jsoup.parser.CharacterReader.maxBufferLen; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + import java.io.UnsupportedEncodingException; +import java.util.Arrays; + import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.CDataNode; import org.jsoup.nodes.Comment; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; import org.junit.Test; -import static org.jsoup.parser.CharacterReader.maxBufferLen; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; - public class TokeniserTest { @Test public void bufferUpInAttributeVal() { @@ -180,4 +184,37 @@ public void bufferUpInAttributeVal() { assertEquals("At: " + i, s.charAt(0), Tokeniser.win1252Extensions[i]); } } + + @Test public void canParseVeryLongBogusComment() { + StringBuilder commentData = new StringBuilder(maxBufferLen); + do { + commentData.append("blah blah blah blah "); + } while (commentData.length() < maxBufferLen); + String expectedCommentData = commentData.toString(); + String testMarkup = ""; + Parser parser = new Parser(new HtmlTreeBuilder()); + + Document doc = parser.parseInput(testMarkup, ""); + + Node commentNode = doc.body().childNode(0); + assertTrue("Expected comment node", commentNode instanceof Comment); + assertEquals(expectedCommentData, ((Comment)commentNode).getData()); + } + + @Test public void canParseCdataEndingAtEdgeOfBuffer() { + String cdataStart = ""; + int bufLen = maxBufferLen - cdataStart.length() - 1; // also breaks with -2, but not with -3 or 0 + char[] cdataContentsArray = new char[bufLen]; + Arrays.fill(cdataContentsArray, 'x'); + String cdataContents = new String(cdataContentsArray); + String testMarkup = cdataStart + cdataContents + cdataEnd; + Parser parser = new Parser(new HtmlTreeBuilder()); + + Document doc = parser.parseInput(testMarkup, ""); + + Node cdataNode = doc.body().childNode(0); + assertTrue("Expected CDATA node", cdataNode instanceof CDataNode); + assertEquals(cdataContents, ((CDataNode)cdataNode).text()); + } }