Skip to content

Commit

Permalink
In bogusComment, make sure unconsume not called after a potential buf…
Browse files Browse the repository at this point in the history
…fer up

Wasn't able to repro with the supplied test case, but could previously happen.

Audited other uses of unconsume and all are immediately after consume, so safe to call (as there's no path to a bufferup)

Fixes #1612
  • Loading branch information
jhy committed Aug 14, 2021
1 parent eba3e39 commit 42da864
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 7 deletions.
3 changes: 2 additions & 1 deletion src/main/java/org/jsoup/parser/Tokeniser.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Entities;

import javax.annotation.Nullable;
import java.util.Arrays;

/**
Expand Down Expand Up @@ -152,7 +153,7 @@ void advanceTransition(TokeniserState state) {

final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays
final private int[] multipointHolder = new int[2];
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
@Nullable int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
if (reader.isEmpty())
return null;
if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
Expand Down
11 changes: 5 additions & 6 deletions src/main/java/org/jsoup/parser/TokeniserState.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ void read(Tokeniser t, CharacterReader r) {
break;
case '?':
t.createBogusCommentPending();
t.advanceTransition(BogusComment);
t.transition(BogusComment);
break;
default:
if (r.matchesAsciiAlpha()) {
Expand Down Expand Up @@ -136,7 +136,8 @@ void read(Tokeniser t, CharacterReader r) {
} else {
t.error(this);
t.createBogusCommentPending();
t.transition(BogusComment); // reconsume char
t.commentPending.append('/'); // push the / back on that got us here
t.transition(BogusComment);
}
}
},
Expand Down Expand Up @@ -906,11 +907,9 @@ void read(Tokeniser t, CharacterReader r) {
BogusComment {
void read(Tokeniser t, CharacterReader r) {
// todo: handle bogus comment starting from eof. when does that trigger?
// rewind to capture character that lead us here
r.unconsume();
t.commentPending.append(r.consumeTo('>'));
// todo: replace nullChar with replaceChar
char next = r.consume();
char next = r.current();
if (next == '>' || next == eof) {
t.emitCommentPending();
t.transition(Data);
Expand All @@ -933,7 +932,7 @@ void read(Tokeniser t, CharacterReader r) {
} else {
t.error(this);
t.createBogusCommentPending();
t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
t.transition(BogusComment);
}
}
},
Expand Down
15 changes: 15 additions & 0 deletions src/test/java/org/jsoup/integration/FuzzFixesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -205,4 +205,19 @@ public void oob() throws IOException {
Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser());
assertNotNull(docXml);
}

@Test
public void unconsume() throws IOException {
// https://github.com/jhy/jsoup/issues/1612
// I wasn't able to repro this with different ways of loading strings - think somehow the fuzzers input
// buffer is different and the bufferUp() happened at a different point. Regardless, did find an unsafe use
// of unconsume() after a buffer up in bogus comment, so cleaned that up.
File in = ParseTest.getFile("/fuzztests/1612.html.gz");

Document doc = Jsoup.parse(in, "UTF-8");
assertNotNull(doc);

Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser());
assertNotNull(docXml);
}
}
Binary file added src/test/resources/fuzztests/1612.html.gz
Binary file not shown.

0 comments on commit 42da864

Please sign in to comment.