Skip to content

Commit

Permalink
Updated AfterBody and AfterAfterBody to current spec
Browse files Browse the repository at this point in the history
Don't pop stack to close on </body> or </html>, but leave them on the stack.

Had to deviate from the spec slightly to allow whitespace to be added to the html or doc elements. (The goal of that is so that when pretty-printing is off, the output more closely resembles the input, by tracking newlines after </body> etc)

Fixes #1851
  • Loading branch information
jhy committed Mar 27, 2023
1 parent 21aac91 commit dea4969
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 14 deletions.
5 changes: 5 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ Release 1.16.1 [PENDING]
* Improvement: Calling Node.remove() on a node with no parent is now a no-op, vs a validation error.
<https://github.com/jhy/jsoup/issues/1898>

* Bugfix: aligned the HTML Tree Builder processing steps for AfterBody and AfterAfterBody to the updated WHATWG
standard, to not pop the stack to close <body> or <html> elements. This prevents an errant </html> closing preceding
structure. Also added appropriate error message outputs in this case.
<https://github.com/jhy/jsoup/issues/1851>

* Bugfix: Corrected support for ruby elements (<ruby>, <rp>, <rt>, and <rtc>) to current spec.
<https://github.com/jhy/jsoup/issues/1294>

Expand Down
22 changes: 21 additions & 1 deletion src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -301,9 +301,14 @@ void insert(Token.Comment commentToken) {
insertNode(comment, commentToken);
}

/** Inserts the provided character token into the current element. */
void insert(Token.Character characterToken) {
final Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack)
insert(characterToken, el);
}

void insert(Token.Character characterToken, Element el) {
final Node node;
Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack)
final String tagName = el.normalName();
final String data = characterToken.getData();

Expand All @@ -317,6 +322,7 @@ else if (isContentForTagData(tagName))
onNodeInserted(node, characterToken);
}

/** Inserts the provided character token into the provided element. Use when not going onto stack element */
private void insertNode(Node node, @Nullable Token token) {
// if the stack hasn't been set up yet, elements (doctype, comments) go into the doc
if (stack.isEmpty())
Expand Down Expand Up @@ -632,6 +638,20 @@ boolean inSelectScope(String targetName) {
return false;
}

/** Tests if there is some element on the stack that is not in the provided set. */
boolean onStackNot(String[] allowedTags) {
final int bottom = stack.size() -1;
final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
// don't walk too far up the tree

for (int pos = bottom; pos >= top; pos--) {
final String elName = stack.get(pos).normalName();
if (!inSorted(elName, allowedTags))
return true;
}
return false;
}

void setHeadElement(Element headElement) {
this.headElement = headElement;
}
Expand Down
34 changes: 24 additions & 10 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,8 @@ boolean process(Token t, HtmlTreeBuilder tb) {
case EOF:
if (tb.templateModeSize() > 0)
return tb.process(t, InTemplate);
// todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html
if (tb.onStackNot(InBodyEndOtherErrors))
tb.error(this);
// stop parsing
break;
}
Expand Down Expand Up @@ -726,16 +727,22 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) {
tb.error(this);
return false;
} else {
// todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html
anyOtherEndTag(t, tb);
if (tb.onStackNot(InBodyEndOtherErrors))
tb.error(this);
tb.transition(AfterBody);
}
break;
case "html":
boolean notIgnored = tb.processEndTag("body");
if (notIgnored)
return tb.process(endTag);
break;
if (!tb.onStack("body")) {
tb.error(this);
return false; // ignore
} else {
if (tb.onStackNot(InBodyEndOtherErrors))
tb.error(this);
tb.transition(AfterBody);
return tb.process(t); // re-process
}

case "form":
if (!tb.onStack("template")) {
Element currentForm = tb.getFormElement();
Expand Down Expand Up @@ -1594,7 +1601,12 @@ else if (name.equals("col")) {
AfterBody {
boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
tb.insert(t.asCharacter()); // out of spec - include whitespace. spec would move into body
// spec deviation - currently body is still on stack, but we want this to go to the html node
Element html = tb.getFromStack("html");
if (html != null)
tb.insert(t.asCharacter(), html);
else
tb.process(t, InBody); // will get into body
} else if (t.isComment()) {
tb.insert(t.asComment()); // into html node
} else if (t.isDoctype()) {
Expand All @@ -1607,7 +1619,6 @@ boolean process(Token t, HtmlTreeBuilder tb) {
tb.error(this);
return false;
} else {
if (tb.onStack("html")) tb.popStackToClose("html");
tb.transition(AfterAfterBody);
}
} else if (t.isEOF()) {
Expand Down Expand Up @@ -1699,7 +1710,9 @@ boolean process(Token t, HtmlTreeBuilder tb) {
} else if (t.isDoctype() || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
return tb.process(t, InBody);
} else if (isWhitespace(t)) {
tb.insert(t.asCharacter());
// spec deviation - body and html still on stack, but want this space to go after </html>
Element doc = tb.getDocument();
tb.insert(t.asCharacter(), doc);
}else if (t.isEOF()) {
// nice work chuck
} else {
Expand Down Expand Up @@ -1786,6 +1799,7 @@ static final class Constants {
static final String[] InBodyEndClosers = new String[]{"address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div",
"dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu",
"nav", "ol", "pre", "section", "summary", "ul"};
static final String[] InBodyEndOtherErrors = new String[] {"body", "dd", "dt", "html", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"};
static final String[] InBodyEndAdoptionFormatters = new String[]{"a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"};
static final String[] InBodyEndTableFosters = new String[]{"table", "tbody", "tfoot", "thead", "tr"};
static final String[] InTableToBody = new String[]{"tbody", "tfoot", "thead"};
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/jsoup/parser/Tag.java
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,8 @@ protected Tag clone() {
"sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup",
"option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
"summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track",
"data", "bdi", "s", "strike", "nobr"
"data", "bdi", "s", "strike", "nobr",
"rb" // deprecated but still known / special handling
};
private static final String[] emptyTags = {
"meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
Expand Down
29 changes: 28 additions & 1 deletion src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1698,11 +1698,38 @@ private boolean didAddElements(String input) {
parser.setTrackErrors(10);
Document doc = Jsoup.parse(html, parser);
ParseErrorList errors = parser.getErrors();
assertEquals(1, errors.size());
assertEquals(2, errors.size());
Element ruby = doc.expectFirst("ruby");
assertEquals(
"<ruby><div><rp>Hello</rp></div></ruby>",
TextUtil.stripNewlines(ruby.outerHtml()));
assertEquals("<1:16>: Unexpected StartTag token [<rp>] when in state [InBody]", errors.get(0).toString());
}

@Test void errorOnEofIfOpen() {
String html = "<div>";
Parser parser = Parser.htmlParser();
parser.setTrackErrors(10);
Document doc = Jsoup.parse(html, parser);
ParseErrorList errors = parser.getErrors();
assertEquals(1, errors.size());
assertEquals("Unexpected EOF token [] when in state [InBody]", errors.get(0).getErrorMessage());
}

@Test void NoErrorOnEofIfBodyOpen() {
String html = "<body>";
Parser parser = Parser.htmlParser();
parser.setTrackErrors(10);
Document doc = Jsoup.parse(html, parser);
ParseErrorList errors = parser.getErrors();
assertEquals(0, errors.size());
}

@Test void htmlClose() {
// https://github.com/jhy/jsoup/issues/1851
String html = "<body><div>One</html>Two</div></body>";
Document doc = Jsoup.parse(html);
//assertEquals("OneTwo", doc.expectFirst("body > div").text());
System.out.println(doc.html());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ static void ensureSorted(List<Object[]> constants) {
public void ensureArraysAreSorted() {
List<Object[]> constants = findConstantArrays(Constants.class);
ensureSorted(constants);
assertEquals(38, constants.size());
assertEquals(39, constants.size());
}

@Test public void ensureTagSearchesAreKnownTags() {
Expand Down

0 comments on commit dea4969

Please sign in to comment.