Skip to content

Commit

Permalink
Fix edge cases with delimiters being near a buffer boundary.
Browse files Browse the repository at this point in the history
  • Loading branch information
csaboka authored and jhy committed Jan 27, 2020
1 parent 86d69ea commit de97030
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 12 deletions.
10 changes: 9 additions & 1 deletion src/main/java/org/jsoup/parser/CharacterReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,16 @@ String consumeTo(String seq) {
String consumed = cacheString(charBuf, stringCache, bufPos, offset);
bufPos += offset;
return consumed;
} else {
} else if (bufLength - bufPos < seq.length()) {
// nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF
return consumeToEnd();
} else {
// the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters
// unread in case they contain the beginning of the search string
int endPos = bufLength - seq.length() + 1;
String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos);
bufPos = endPos;
return consumed;
}
}

Expand Down
5 changes: 5 additions & 0 deletions src/main/java/org/jsoup/parser/Tokeniser.java
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,11 @@ void emitCommentPending() {
emit(commentPending);
}

void createBogusCommentPending() {
commentPending.reset();
commentPending.bogus = true;
}

void createDoctypePending() {
doctypePending.reset();
}
Expand Down
14 changes: 9 additions & 5 deletions src/main/java/org/jsoup/parser/TokeniserState.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ void read(Tokeniser t, CharacterReader r) {
t.advanceTransition(EndTagOpen);
break;
case '?':
t.createBogusCommentPending();
t.advanceTransition(BogusComment);
break;
default:
Expand Down Expand Up @@ -134,6 +135,7 @@ void read(Tokeniser t, CharacterReader r) {
t.advanceTransition(Data);
} else {
t.error(this);
t.createBogusCommentPending();
t.advanceTransition(BogusComment);
}
}
Expand Down Expand Up @@ -910,12 +912,13 @@ void read(Tokeniser t, CharacterReader r) {
// todo: handle bogus comment starting from eof. when does that trigger?
// rewind to capture character that lead us here
r.unconsume();
Token.Comment comment = new Token.Comment();
comment.bogus = true;
comment.data.append(r.consumeTo('>'));
t.commentPending.data.append(r.consumeTo('>'));
// todo: replace nullChar with replaceChar
t.emit(comment);
t.advanceTransition(Data);
char next = r.consume();
if (next == '>' || next == eof) {
t.emitCommentPending();
t.transition(Data);
}
}
},
MarkupDeclarationOpen {
Expand All @@ -933,6 +936,7 @@ void read(Tokeniser t, CharacterReader r) {
t.transition(CdataSection);
} else {
t.error(this);
t.createBogusCommentPending();
t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
}
}
Expand Down
10 changes: 9 additions & 1 deletion src/test/java/org/jsoup/parser/CharacterReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,15 @@ public class CharacterReaderTest {
assertEquals('T', r.consume());
assertEquals("wo ", r.consumeTo("Two"));
assertEquals('T', r.consume());
assertEquals("wo Four", r.consumeTo("Qux"));
// To handle strings straddling across buffers, consumeTo() may return the
// data in multiple pieces near EOF.
StringBuilder builder = new StringBuilder();
String part;
do {
part = r.consumeTo("Qux");
builder.append(part);
} while (!part.isEmpty());
assertEquals("wo Four", builder.toString());
}

@Test public void advance() {
Expand Down
47 changes: 42 additions & 5 deletions src/test/java/org/jsoup/parser/TokeniserTest.java
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
package org.jsoup.parser;

import static org.jsoup.parser.CharacterReader.maxBufferLen;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.UnsupportedEncodingException;
import java.util.Arrays;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.junit.Test;

import static org.jsoup.parser.CharacterReader.maxBufferLen;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

public class TokeniserTest {
@Test
public void bufferUpInAttributeVal() {
Expand Down Expand Up @@ -180,4 +184,37 @@ public void bufferUpInAttributeVal() {
assertEquals("At: " + i, s.charAt(0), Tokeniser.win1252Extensions[i]);
}
}

@Test public void canParseVeryLongBogusComment() {
StringBuilder commentData = new StringBuilder(maxBufferLen);
do {
commentData.append("blah blah blah blah ");
} while (commentData.length() < maxBufferLen);
String expectedCommentData = commentData.toString();
String testMarkup = "<html><body><!" + expectedCommentData + "></body></html>";
Parser parser = new Parser(new HtmlTreeBuilder());

Document doc = parser.parseInput(testMarkup, "");

Node commentNode = doc.body().childNode(0);
assertTrue("Expected comment node", commentNode instanceof Comment);
assertEquals(expectedCommentData, ((Comment)commentNode).getData());
}

@Test public void canParseCdataEndingAtEdgeOfBuffer() {
String cdataStart = "<![CDATA[";
String cdataEnd = "]]>";
int bufLen = maxBufferLen - cdataStart.length() - 1; // also breaks with -2, but not with -3 or 0
char[] cdataContentsArray = new char[bufLen];
Arrays.fill(cdataContentsArray, 'x');
String cdataContents = new String(cdataContentsArray);
String testMarkup = cdataStart + cdataContents + cdataEnd;
Parser parser = new Parser(new HtmlTreeBuilder());

Document doc = parser.parseInput(testMarkup, "");

Node cdataNode = doc.body().childNode(0);
assertTrue("Expected CDATA node", cdataNode instanceof CDataNode);
assertEquals(cdataContents, ((CDataNode)cdataNode).text());
}
}

0 comments on commit de97030

Please sign in to comment.