Check the buffer is fully read if looking for absent content

Fixes #1929. In this case we are testing for a missing `</textarea>`, but if the buffer hasn't been fully read, the closing tag may lie beyond the buffered content and we could never find it. For the normal case that this code is looking for (a missing `</title>` in brief HTML), a best-effort check (assuming the buffer is complete) is sufficient.
jhy · Apr 25, 2023 · f0ae81b · f0ae81b
1 parent 78aeac1
commit f0ae81b
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 1 deletion.
diff --git a/CHANGES b/CHANGES
@@ -41,6 +41,10 @@ Release 1.16.1 [PENDING]
   * Bugfix: <br> tags should be wrap-indented when in block tags (and not when in inline tags).
     <https://github.com/jhy/jsoup/issues/1911>
 
+  * Bugfix: the contents of a sufficiently large <textarea> with un-escaped HTML closing tags may be incorrectly parsed
+    to an empty node.
+    <https://github.com/jhy/jsoup/issues/1929>
+
 Release 1.15.4 [18-Feb-2023]
   * Improvement: added the ability to escape CSS selectors (tags, IDs, classes) to match elements that don't follow
     regular CSS syntax. For example, to match by classname <p class="one.two">, use document.select("p.one\\.two");

diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java
@@ -116,6 +116,11 @@ public int pos() {
         return readerPos + bufPos;
     }
 
+    /** Tests if the buffer has been fully read (returns the {@code readFully} flag); package-private, called by the tokeniser. */
+    boolean readFully() {
+        return readFully;
+    }
+
     /**
      Enables or disables line number tracking. By default, will be <b>off</b>.Tracking line numbers improves the
      legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of

diff --git a/src/main/java/org/jsoup/parser/TokeniserState.java b/src/main/java/org/jsoup/parser/TokeniserState.java
@@ -186,7 +186,7 @@ void read(Tokeniser t, CharacterReader r) {
             if (r.matches('/')) {
                 t.createTempBuffer();
                 t.advanceTransition(RCDATAEndTagOpen);
-            } else if (r.matchesAsciiAlpha() && t.appropriateEndTagName() != null && !r.containsIgnoreCase(t.appropriateEndTagSeq())) {
+            } else if (r.readFully() && r.matchesAsciiAlpha() && t.appropriateEndTagName() != null &&  !r.containsIgnoreCase(t.appropriateEndTagSeq())) {
                 // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than
                 // consuming to EOF; break out here
                 t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName());

diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -1732,4 +1732,20 @@ private boolean didAddElements(String input) {
         //assertEquals("OneTwo", doc.expectFirst("body > div").text());
         System.out.println(doc.html());
     }
+
+    @Test void largeTextareaContents() {
+        // Regression test for https://github.com/jhy/jsoup/issues/1929: a large <textarea> body must keep its text.
+        StringBuilder sb = new StringBuilder();
+        int num = 2000; // presumably enough repetitions that the input is not fully buffered at first - TODO confirm against the reader's buffer size
+        for (int i = 0; i <= num; i++) {
+            sb.append("\n<text>foo</text>\n"); // un-escaped, tag-like text placed inside the textarea
+        }
+        String textContent = sb.toString();
+        String sourceHtml = "<textarea>" + textContent + "</textarea>";
+
+        Document doc = Jsoup.parse(sourceHtml);
+        Element textArea = doc.expectFirst("textarea");
+
+        assertEquals(textContent, textArea.wholeText()); // contents must come back unchanged, not as an empty node
+    }
 }