From dea49696e976b389864cd2926f93c6ff07bdcfa6 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 27 Mar 2023 15:54:02 +1100 Subject: [PATCH] Updated AfterBody and AfterAfterBody to current spec Don't pop stack to close on or , but leave them on the stack. Had to deviate from the spec slightly to allow whitespace to be added to the html or doc elements. (The goal of that is so that when pretty-printing is off, the output more closely resembles the input, by tracking newlines after etc) Fixes #1851 --- CHANGES | 5 +++ .../org/jsoup/parser/HtmlTreeBuilder.java | 22 +++++++++++- .../jsoup/parser/HtmlTreeBuilderState.java | 34 +++++++++++++------ src/main/java/org/jsoup/parser/Tag.java | 3 +- .../java/org/jsoup/parser/HtmlParserTest.java | 29 +++++++++++++++- .../parser/HtmlTreeBuilderStateTest.java | 2 +- 6 files changed, 81 insertions(+), 14 deletions(-) diff --git a/CHANGES b/CHANGES index f09e52bf7d..b4cc272349 100644 --- a/CHANGES +++ b/CHANGES @@ -8,6 +8,11 @@ Release 1.16.1 [PENDING] * Improvement: Calling Node.remove() on a node with no parent is now a no-op, vs a validation error. + * Bugfix: aligned the HTML Tree Builder processing steps for AfterBody and AfterAfterBody to the updated WHATWG + standard, to not pop the stack to close or elements. This prevents an errant closing preceding + structure. Also added appropriate error message outputs in this case. + + * Bugfix: Corrected support for ruby elements (, , , and ) to current spec. diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 785643e5af..9de525b69e 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -301,9 +301,14 @@ void insert(Token.Comment commentToken) { insertNode(comment, commentToken); } + /** Inserts the provided character token into the current element. */ void insert(Token.Character characterToken) { + final Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack) + insert(characterToken, el); + } + + void insert(Token.Character characterToken, Element el) { final Node node; - Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack) final String tagName = el.normalName(); final String data = characterToken.getData(); @@ -317,6 +322,7 @@ else if (isContentForTagData(tagName)) onNodeInserted(node, characterToken); } + /** Inserts the provided character token into the provided element. Use when not going onto stack element */ private void insertNode(Node node, @Nullable Token token) { // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc if (stack.isEmpty()) @@ -632,6 +638,20 @@ boolean inSelectScope(String targetName) { return false; } + /** Tests if there is some element on the stack that is not in the provided set. */ + boolean onStackNot(String[] allowedTags) { + final int bottom = stack.size() -1; + final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; + // don't walk too far up the tree + + for (int pos = bottom; pos >= top; pos--) { + final String elName = stack.get(pos).normalName(); + if (!inSorted(elName, allowedTags)) + return true; + } + return false; + } + void setHeadElement(Element headElement) { this.headElement = headElement; } diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index baa16c4686..99edf8ce87 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -310,7 +310,8 @@ boolean process(Token t, HtmlTreeBuilder tb) { case EOF: if (tb.templateModeSize() > 0) return tb.process(t, InTemplate); - // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html + if (tb.onStackNot(InBodyEndOtherErrors)) + tb.error(this); // stop parsing break; } @@ -726,16 +727,22 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) { tb.error(this); return false; } else { - // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html - anyOtherEndTag(t, tb); + if (tb.onStackNot(InBodyEndOtherErrors)) + tb.error(this); tb.transition(AfterBody); } break; case "html": - boolean notIgnored = tb.processEndTag("body"); - if (notIgnored) - return tb.process(endTag); - break; + if (!tb.onStack("body")) { + tb.error(this); + return false; // ignore + } else { + if (tb.onStackNot(InBodyEndOtherErrors)) + tb.error(this); + tb.transition(AfterBody); + return tb.process(t); // re-process + } + case "form": if (!tb.onStack("template")) { Element currentForm = tb.getFormElement(); @@ -1594,7 +1601,12 @@ else if (name.equals("col")) { AfterBody { boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { - tb.insert(t.asCharacter()); // out of spec - include whitespace. spec would move into body + // spec deviation - currently body is still on stack, but we want this to go to the html node + Element html = tb.getFromStack("html"); + if (html != null) + tb.insert(t.asCharacter(), html); + else + tb.process(t, InBody); // will get into body } else if (t.isComment()) { tb.insert(t.asComment()); // into html node } else if (t.isDoctype()) { @@ -1607,7 +1619,6 @@ boolean process(Token t, HtmlTreeBuilder tb) { tb.error(this); return false; } else { - if (tb.onStack("html")) tb.popStackToClose("html"); tb.transition(AfterAfterBody); } } else if (t.isEOF()) { @@ -1699,7 +1710,9 @@ boolean process(Token t, HtmlTreeBuilder tb) { } else if (t.isDoctype() || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) { return tb.process(t, InBody); } else if (isWhitespace(t)) { - tb.insert(t.asCharacter()); + // spec deviation - body and html still on stack, but want this space to go after + Element doc = tb.getDocument(); + tb.insert(t.asCharacter(), doc); }else if (t.isEOF()) { // nice work chuck } else { @@ -1786,6 +1799,7 @@ static final class Constants { static final String[] InBodyEndClosers = new String[]{"address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", "section", "summary", "ul"}; + static final String[] InBodyEndOtherErrors = new String[] {"body", "dd", "dt", "html", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"}; static final String[] InBodyEndAdoptionFormatters = new String[]{"a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"}; static final String[] InBodyEndTableFosters = new String[]{"table", "tbody", "tfoot", "thead", "tr"}; static final String[] InTableToBody = new String[]{"tbody", "tfoot", "thead"}; diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java index 366bc63e3f..97ed500402 100644 --- a/src/main/java/org/jsoup/parser/Tag.java +++ b/src/main/java/org/jsoup/parser/Tag.java @@ -246,7 +246,8 @@ protected Tag clone() { "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup", "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track", - "data", "bdi", "s", "strike", "nobr" + "data", "bdi", "s", "strike", "nobr", + "rb" // deprecated but still known / special handling }; private static final String[] emptyTags = { "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 26b546d51e..43475ce847 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -1698,11 +1698,38 @@ private boolean didAddElements(String input) { parser.setTrackErrors(10); Document doc = Jsoup.parse(html, parser); ParseErrorList errors = parser.getErrors(); - assertEquals(1, errors.size()); + assertEquals(2, errors.size()); Element ruby = doc.expectFirst("ruby"); assertEquals( "
Hello
", TextUtil.stripNewlines(ruby.outerHtml())); assertEquals("<1:16>: Unexpected StartTag token [] when in state [InBody]", errors.get(0).toString()); } + + @Test void errorOnEofIfOpen() { + String html = "
"; + Parser parser = Parser.htmlParser(); + parser.setTrackErrors(10); + Document doc = Jsoup.parse(html, parser); + ParseErrorList errors = parser.getErrors(); + assertEquals(1, errors.size()); + assertEquals("Unexpected EOF token [] when in state [InBody]", errors.get(0).getErrorMessage()); + } + + @Test void NoErrorOnEofIfBodyOpen() { + String html = ""; + Parser parser = Parser.htmlParser(); + parser.setTrackErrors(10); + Document doc = Jsoup.parse(html, parser); + ParseErrorList errors = parser.getErrors(); + assertEquals(0, errors.size()); + } + + @Test void htmlClose() { + // https://github.com/jhy/jsoup/issues/1851 + String html = "
OneTwo
"; + Document doc = Jsoup.parse(html); + //assertEquals("OneTwo", doc.expectFirst("body > div").text()); + System.out.println(doc.html()); + } } diff --git a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java index c2dbfe3181..55b828e893 100644 --- a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java +++ b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java @@ -45,7 +45,7 @@ static void ensureSorted(List constants) { public void ensureArraysAreSorted() { List constants = findConstantArrays(Constants.class); ensureSorted(constants); - assertEquals(38, constants.size()); + assertEquals(39, constants.size()); } @Test public void ensureTagSearchesAreKnownTags() {