diff --git a/CHANGES b/CHANGES index f09e52bf7d..b4cc272349 100644 --- a/CHANGES +++ b/CHANGES @@ -8,6 +8,11 @@ Release 1.16.1 [PENDING] * Improvement: Calling Node.remove() on a node with no parent is now a no-op, vs a validation error. + * Bugfix: aligned the HTML Tree Builder processing steps for AfterBody and AfterAfterBody to the updated WHATWG + standard, to not pop the stack to close or elements. This prevents an errant closing preceding + structure. Also added appropriate error message outputs in this case. + + * Bugfix: Corrected support for ruby elements (, , , and ) to current spec. diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 785643e5af..9de525b69e 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -301,9 +301,14 @@ void insert(Token.Comment commentToken) { insertNode(comment, commentToken); } + /** Inserts the provided character token into the current element. */ void insert(Token.Character characterToken) { + final Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack) + insert(characterToken, el); + } + + void insert(Token.Character characterToken, Element el) { final Node node; - Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack) final String tagName = el.normalName(); final String data = characterToken.getData(); @@ -317,6 +322,7 @@ else if (isContentForTagData(tagName)) onNodeInserted(node, characterToken); } + /** Inserts the provided character token into the provided element. Use when not going onto stack element */ private void insertNode(Node node, @Nullable Token token) { // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc if (stack.isEmpty()) @@ -632,6 +638,20 @@ boolean inSelectScope(String targetName) { return false; } + /** Tests if there is some element on the stack that is not in the provided set. */ + boolean onStackNot(String[] allowedTags) { + final int bottom = stack.size() -1; + final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; + // don't walk too far up the tree + + for (int pos = bottom; pos >= top; pos--) { + final String elName = stack.get(pos).normalName(); + if (!inSorted(elName, allowedTags)) + return true; + } + return false; + } + void setHeadElement(Element headElement) { this.headElement = headElement; } diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index baa16c4686..99edf8ce87 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -310,7 +310,8 @@ boolean process(Token t, HtmlTreeBuilder tb) { case EOF: if (tb.templateModeSize() > 0) return tb.process(t, InTemplate); - // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html + if (tb.onStackNot(InBodyEndOtherErrors)) + tb.error(this); // stop parsing break; } @@ -726,16 +727,22 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) { tb.error(this); return false; } else { - // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html - anyOtherEndTag(t, tb); + if (tb.onStackNot(InBodyEndOtherErrors)) + tb.error(this); tb.transition(AfterBody); } break; case "html": - boolean notIgnored = tb.processEndTag("body"); - if (notIgnored) - return tb.process(endTag); - break; + if (!tb.onStack("body")) { + tb.error(this); + return false; // ignore + } else { + if (tb.onStackNot(InBodyEndOtherErrors)) + tb.error(this); + tb.transition(AfterBody); + return tb.process(t); // re-process + } + case "form": if (!tb.onStack("template")) { Element currentForm = tb.getFormElement(); @@ -1594,7 +1601,12 @@ else if (name.equals("col")) { AfterBody { boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { - tb.insert(t.asCharacter()); // out of spec - include whitespace. spec would move into body + // spec deviation - currently body is still on stack, but we want this to go to the html node + Element html = tb.getFromStack("html"); + if (html != null) + tb.insert(t.asCharacter(), html); + else + tb.process(t, InBody); // will get into body } else if (t.isComment()) { tb.insert(t.asComment()); // into html node } else if (t.isDoctype()) { @@ -1607,7 +1619,6 @@ boolean process(Token t, HtmlTreeBuilder tb) { tb.error(this); return false; } else { - if (tb.onStack("html")) tb.popStackToClose("html"); tb.transition(AfterAfterBody); } } else if (t.isEOF()) { @@ -1699,7 +1710,9 @@ boolean process(Token t, HtmlTreeBuilder tb) { } else if (t.isDoctype() || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) { return tb.process(t, InBody); } else if (isWhitespace(t)) { - tb.insert(t.asCharacter()); + // spec deviation - body and html still on stack, but want this space to go after + Element doc = tb.getDocument(); + tb.insert(t.asCharacter(), doc); }else if (t.isEOF()) { // nice work chuck } else { @@ -1786,6 +1799,7 @@ static final class Constants { static final String[] InBodyEndClosers = new String[]{"address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", "section", "summary", "ul"}; + static final String[] InBodyEndOtherErrors = new String[] {"body", "dd", "dt", "html", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"}; static final String[] InBodyEndAdoptionFormatters = new String[]{"a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"}; static final String[] InBodyEndTableFosters = new String[]{"table", "tbody", "tfoot", "thead", "tr"}; static final String[] InTableToBody = new String[]{"tbody", "tfoot", "thead"}; diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java index 366bc63e3f..97ed500402 100644 --- a/src/main/java/org/jsoup/parser/Tag.java +++ b/src/main/java/org/jsoup/parser/Tag.java @@ -246,7 +246,8 @@ protected Tag clone() { "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup", "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track", - "data", "bdi", "s", "strike", "nobr" + "data", "bdi", "s", "strike", "nobr", + "rb" // deprecated but still known / special handling }; private static final String[] emptyTags = { "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 26b546d51e..43475ce847 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -1698,11 +1698,38 @@ private boolean didAddElements(String input) { parser.setTrackErrors(10); Document doc = Jsoup.parse(html, parser); ParseErrorList errors = parser.getErrors(); - assertEquals(1, errors.size()); + assertEquals(2, errors.size()); Element ruby = doc.expectFirst("ruby"); assertEquals( "
Hello
", TextUtil.stripNewlines(ruby.outerHtml())); assertEquals("<1:16>: Unexpected StartTag token [] when in state [InBody]", errors.get(0).toString()); } + + @Test void errorOnEofIfOpen() { + String html = "
"; + Parser parser = Parser.htmlParser(); + parser.setTrackErrors(10); + Document doc = Jsoup.parse(html, parser); + ParseErrorList errors = parser.getErrors(); + assertEquals(1, errors.size()); + assertEquals("Unexpected EOF token [] when in state [InBody]", errors.get(0).getErrorMessage()); + } + + @Test void NoErrorOnEofIfBodyOpen() { + String html = ""; + Parser parser = Parser.htmlParser(); + parser.setTrackErrors(10); + Document doc = Jsoup.parse(html, parser); + ParseErrorList errors = parser.getErrors(); + assertEquals(0, errors.size()); + } + + @Test void htmlClose() { + // https://github.com/jhy/jsoup/issues/1851 + String html = "
OneTwo
"; + Document doc = Jsoup.parse(html); + //assertEquals("OneTwo", doc.expectFirst("body > div").text()); + System.out.println(doc.html()); + } } diff --git a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java index c2dbfe3181..55b828e893 100644 --- a/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java +++ b/src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java @@ -45,7 +45,7 @@ static void ensureSorted(List constants) { public void ensureArraysAreSorted() { List constants = findConstantArrays(Constants.class); ensureSorted(constants); - assertEquals(38, constants.size()); + assertEquals(39, constants.size()); } @Test public void ensureTagSearchesAreKnownTags() {