Skip to content

Commit

Permalink
Fix an issue where text order was incorrect when parsing pre-document…
Browse files Browse the repository at this point in the history
… HTML.

Fixes #23
  • Loading branch information
jhy committed Jul 2, 2011
1 parent 77add79 commit 27a52f9
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 5 deletions.
5 changes: 5 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
jsoup changelog

*** Release 1.1.2 (pending)
* Fix an issue where text order was incorrect when parsing pre-document
HTML.
<http://github.com/jhy/jsoup/issues/issue/23>

*** Release 1.1.1
* Added selector support for :eq, :lt, and :gt
<http://github.com/jhy/jsoup/issues/issue/16>
Expand Down
12 changes: 7 additions & 5 deletions src/main/java/org/jsoup/nodes/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,16 @@ public Document normalise() {
if (body() == null)
select("html").first().appendElement("body");

normalise(this);
normalise(select("html").first());
// pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care
// of. do in inverse order to maintain text order.
normalise(head());
normalise(select("html").first());
normalise(this);

return this;
}

// does not recurse. the result order isn't great here (not intuitive); they are in the body though.
// does not recurse.
private void normalise(Element element) {
List<Node> toMove = new ArrayList<Node>();
for (Node node: element.childNodes) {
Expand All @@ -120,8 +122,8 @@ private void normalise(Element element) {

for (Node node: toMove) {
element.removeChild(node);
body().appendChild(new TextNode(" ", ""));
body().appendChild(node);
body().prependChild(node);
body().prependChild(new TextNode(" ", ""));
}
}

Expand Down
9 changes: 9 additions & 0 deletions src/test/java/org/jsoup/parser/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,15 @@ public class ParserTest {
assertEquals("Hello world", body.children().get(0).text());
}

@Test public void createsStructureFromBodySnippet() {
// Regression test for text ordering when the input is a bare body snippet.
// the "bar" and "baz" content naturally goes into the body, but the leading 'foo' goes into the root, and the
// normalisation routine needs to move it to the start of the body so the text stays in source order.
String html = "foo <b>bar</b> baz";
Document doc = Jsoup.parse(html);
// the document text must read in the same order as the input
assertEquals ("foo bar baz", doc.text());

}

@Test public void handlesEscapedData() {
String html = "<div title='Surf &amp; Turf'>Reef &amp; Beef</div>";
Document doc = Jsoup.parse(html);
Expand Down

0 comments on commit 27a52f9

Please sign in to comment.