In bogusComment, make sure unconsume not called after a potential buf…

…fer up Wasn't able to repro with the supplied test case, but could previously happen. Audited other uses of unconsume and all are immediately after consume, so safe to call (as there's no path to a bufferup) Fixes #1612
jhy · Aug 14, 2021 · 42da864 · 42da864
1 parent eba3e39
commit 42da864
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 7 deletions.
diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java
@@ -4,6 +4,7 @@
 import org.jsoup.internal.StringUtil;
 import org.jsoup.nodes.Entities;
 
+import javax.annotation.Nullable;
 import java.util.Arrays;
 
 /**
@@ -152,7 +153,7 @@ void advanceTransition(TokeniserState state) {
 
     final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays
     final private int[] multipointHolder = new int[2];
-    int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
+    @Nullable int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
         if (reader.isEmpty())
             return null;
         if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())

diff --git a/src/main/java/org/jsoup/parser/TokeniserState.java b/src/main/java/org/jsoup/parser/TokeniserState.java
@@ -106,7 +106,7 @@ void read(Tokeniser t, CharacterReader r) {
                     break;
                 case '?':
                     t.createBogusCommentPending();
-                    t.advanceTransition(BogusComment);
+                    t.transition(BogusComment);
                     break;
                 default:
                     if (r.matchesAsciiAlpha()) {
@@ -136,7 +136,8 @@ void read(Tokeniser t, CharacterReader r) {
             } else {
                 t.error(this);
                 t.createBogusCommentPending();
-                t.transition(BogusComment); // reconsume char
+                t.commentPending.append('/'); // push the / back on that got us here
+                t.transition(BogusComment);
             }
         }
     },
@@ -906,11 +907,9 @@ void read(Tokeniser t, CharacterReader r) {
     BogusComment {
         void read(Tokeniser t, CharacterReader r) {
             // todo: handle bogus comment starting from eof. when does that trigger?
-            // rewind to capture character that lead us here
-            r.unconsume();
             t.commentPending.append(r.consumeTo('>'));
             // todo: replace nullChar with replaceChar
-            char next = r.consume();
+            char next = r.current();
             if (next == '>' || next == eof) {
                 t.emitCommentPending();
                 t.transition(Data);
@@ -933,7 +932,7 @@ void read(Tokeniser t, CharacterReader r) {
             } else {
                 t.error(this);
                 t.createBogusCommentPending();
-                t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
+                t.transition(BogusComment);
             }
         }
     },

diff --git a/src/test/java/org/jsoup/integration/FuzzFixesTest.java b/src/test/java/org/jsoup/integration/FuzzFixesTest.java
@@ -205,4 +205,19 @@ public void oob() throws IOException {
         Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser());
         assertNotNull(docXml);
     }
+
+    @Test
+    public void unconsume() throws IOException {
+        // https://github.com/jhy/jsoup/issues/1612
+        // I wasn't able to repro this with different ways of loading strings - think somehow the fuzzers input
+        // buffer is different and the bufferUp() happened at a different point. Regardless, did find an unsafe use
+        // of unconsume() after a buffer up in bogus comment, so cleaned that up.
+        File in = ParseTest.getFile("/fuzztests/1612.html.gz");
+
+        Document doc = Jsoup.parse(in, "UTF-8");
+        assertNotNull(doc);
+
+        Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser());
+        assertNotNull(docXml);
+    }
 }
diff --git a/src/test/resources/fuzztests/1612.html.gz b/src/test/resources/fuzztests/1612.html.gz