Skip to content

Commit

Permalink
Output polyglot commented CData sections in XML mode
Browse files Browse the repository at this point in the history
And only if the data does not already contain a (pseudo) CData section.

Fixes #2078
  • Loading branch information
jhy committed Dec 17, 2023
1 parent 74a1f6b commit 8a4bdae
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 16 deletions.
3 changes: 3 additions & 0 deletions CHANGES.md
Expand Up @@ -21,6 +21,9 @@
`parent [attr=va]`, causing incorrect selections. The fix includes an EvaluatorDebug class that generates a sexpr
to represent the query, allowing simpler and more thorough query parse
tests. [2073](https://github.com/jhy/jsoup/issues/2073)
* When generating XML-syntax output from parsed HTML, script nodes containing (pseudo) CData sections would have an
extraneous CData section added, causing script execution errors. Now, the data content is emitted in an HTML/XML/XHTML
polyglot format, if the data is not already within a CData section. [2078](https://github.com/jhy/jsoup/issues/2078)

---
Older changes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27) may be found in
Expand Down
20 changes: 13 additions & 7 deletions src/main/java/org/jsoup/nodes/DataNode.java
@@ -1,6 +1,8 @@
package org.jsoup.nodes;

import java.io.IOException;
import java.util.regex.Pattern;

import org.jsoup.nodes.Entities.EscapeMode;

/**
Expand Down Expand Up @@ -41,14 +43,18 @@ public DataNode setWholeData(String data) {

// Appends this data node's content to accum, choosing the form from the output syntax.
//
// accum - destination the serialized data is appended to
// depth - not read anywhere in the visible body
// out   - output settings; only out.syntax() is consulted here
// Declared to throw IOException, which the Appendable.append calls may raise.
//
// NOTE(review): this listing appears to be a rendered diff with pre-change and post-change lines
// interleaved (the unconditional CDATA append at the top and the first of the two "In HTML" comments look
// like the removed lines) - confirm against the real file before relying on it.
@Override
void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
if (out.syntax() == Document.OutputSettings.Syntax.xml) {
// In XML mode, output data nodes as CDATA, so can parse as XML
accum
.append("<![CDATA[")
.append(getWholeData())
.append("]]>");
/* For XML output, escape the DataNode in a CData section. The data may contain pseudo-CData content if it was
parsed as HTML, so don't double up CData. Output in polyglot HTML / XHTML / XML format. */
final String data = getWholeData();
// Only wrap when the data does not already contain a CData opening marker.
if (out.syntax() == Document.OutputSettings.Syntax.xml && !data.contains("<![CDATA[")) {
// Under a script parent, the CData markers are placed behind line-comment prefixes, each on its own line.
if (hasParent() && parentNode.normalName().equals("script"))
accum.append("//<![CDATA[\n").append(data).append("\n//]]>");
// Under a style parent, the CData markers are enclosed in block-comment delimiters instead.
else if (hasParent() && parentNode.normalName().equals("style"))
accum.append("/*<![CDATA[*/\n").append(data).append("\n/*]]>*/");
// Any other parent, or no parent at all: a plain CData section with no comment guards.
else
accum.append("<![CDATA[").append(data).append("]]>");
} else {
// In HTML, data is not escaped in return from data nodes, so " in script, style is plain
// In HTML, data is not escaped in the output of data nodes, so < and & in script, style is OK
accum.append(getWholeData());
}
}
Expand Down
13 changes: 9 additions & 4 deletions src/test/java/org/jsoup/helper/W3CDomTest.java
Expand Up @@ -351,12 +351,17 @@ public void canOutputHtmlWithoutNamespace() {
org.jsoup.nodes.Document jdoc = Jsoup.parse(html);
jdoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
String xml = jdoc.body().html();
assertTrue(xml.contains("<script><![CDATA[")); // as asserted in ElementTest
assertTrue(xml.contains("<script>//<![CDATA[\n1 && 2\n//]]></script>")); // as asserted in ElementTest
Document doc = parseXml(xml, false);
NodeList list = xpath(doc, "//script");
assertEquals(1, list.getLength());
Node script = list.item(0); // will be the cdata node
assertEquals("1 && 2", script.getTextContent());
assertEquals(2, list.getLength());
Node scriptComment = list.item(0); // will be the cdata node
assertEquals("//", scriptComment.getTextContent());
Node script = list.item(1);
assertEquals("\n" +
"1 && 2\n" +
"//", script.getTextContent());

}

@Test public void handlesEmptyDoctype() {
Expand Down
73 changes: 73 additions & 0 deletions src/test/java/org/jsoup/nodes/DataNodeTest.java
@@ -0,0 +1,73 @@
package org.jsoup.nodes;

import org.junit.jupiter.api.Test;

import java.io.IOException;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Checks what {@link DataNode#outerHtmlHead} writes when the output syntax is XML, for data placed under
 * a {@code script}, a {@code style}, some other element, or no parent at all, both with and without a CData
 * marker already present in the data.
 */
public class DataNodeTest {

    /**
     * Builds a {@link DataNode} around {@code data}, optionally attaches it to a new element named
     * {@code parentTag}, and returns what {@code outerHtmlHead} appends under XML output syntax.
     *
     * @param data the whole data held by the node
     * @param parentTag tag name of the parent element to attach, or {@code null} to leave the node orphaned
     * @return the text appended by {@code outerHtmlHead}
     * @throws IOException declared by {@code outerHtmlHead}
     */
    private static String renderAsXml(String data, String parentTag) throws IOException {
        DataNode dataNode = new DataNode(data);
        if (parentTag != null) {
            dataNode.parentNode = new Element(parentTag);
        }
        Document.OutputSettings xmlSettings =
            new Document.OutputSettings().syntax(Document.OutputSettings.Syntax.xml);
        StringBuilder rendered = new StringBuilder();
        dataNode.outerHtmlHead(rendered, 0, xmlSettings);
        return rendered.toString();
    }

    @Test
    public void xmlOutputScriptWithCData() throws IOException {
        // Data that already carries a CData marker is expected back unchanged.
        String alreadyWrapped = "//<![CDATA[\nscript && <> data]]>";
        assertEquals(alreadyWrapped, renderAsXml(alreadyWrapped, "script"));
    }

    @Test
    public void xmlOutputScriptWithoutCData() throws IOException {
        String rendered = renderAsXml("script && <> data", "script");
        assertEquals("//<![CDATA[\nscript && <> data\n//]]>", rendered);
    }

    @Test
    public void xmlOutputStyleWithCData() throws IOException {
        // Data that already carries a CData marker is expected back unchanged.
        String alreadyWrapped = "/*<![CDATA[*/\nstyle && <> data]]>";
        assertEquals(alreadyWrapped, renderAsXml(alreadyWrapped, "style"));
    }

    @Test
    public void xmlOutputStyleWithoutCData() throws IOException {
        String rendered = renderAsXml("style && <> data", "style");
        assertEquals("/*<![CDATA[*/\nstyle && <> data\n/*]]>*/", rendered);
    }

    @Test
    public void xmlOutputOtherWithCData() throws IOException {
        // Data that already carries a CData marker is expected back unchanged.
        String alreadyWrapped = "<![CDATA[other && <> data]]>";
        assertEquals(alreadyWrapped, renderAsXml(alreadyWrapped, "other"));
    }

    @Test
    public void xmlOutputOtherWithoutCData() throws IOException {
        String rendered = renderAsXml("other && <> data", "other");
        assertEquals("<![CDATA[other && <> data]]>", rendered);
    }

    @Test
    public void xmlOutputOrphanWithoutCData() throws IOException {
        // No parent is attached here, so the node is serialized as an orphan.
        String rendered = renderAsXml("other && <> data", null);
        assertEquals("<![CDATA[other && <> data]]>", rendered);
    }

}
50 changes: 45 additions & 5 deletions src/test/java/org/jsoup/nodes/ElementTest.java
Expand Up @@ -2755,16 +2755,56 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
assertEquals("1 && 2", scriptDataNode.getWholeData());

doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
String xml = doc.body().html();
Element p = doc.expectFirst("p");
String xml = p.html();
assertEquals(
"<script>//<![CDATA[\n" +
"1 && 2\n" +
"//]]></script>\n" +
"<style>/*<![CDATA[*/\n" +
"3 && 4\n" +
"/*]]>*/</style> 5 &amp;&amp; 6",
xml);

Document xmlDoc = Jsoup.parse(xml, Parser.xmlParser());
assertEquals(xml, xmlDoc.html());
Element scriptXmlEl = xmlDoc.expectFirst("script");
TextNode scriptText = (TextNode) scriptXmlEl.childNode(0);
assertEquals("//", scriptText.getWholeText());
CDataNode scriptCdata = (CDataNode) scriptXmlEl.childNode(1);
assertEquals("\n1 && 2\n//", scriptCdata.text());
}

// Script and style data that already carries comment-guarded CData markers is expected to be emitted
// unchanged in XML syntax (no second CData wrapper), and the result is expected to survive an XML re-parse.
// NOTE(review): this listing appears to be a rendered diff with some pre-change lines left in place
// (flagged below) - confirm against the real file.
@Test void datanodesOutputExistingCdataInXhtml() {
String html = "<p><script>//<![CDATA[\n1 && 2\n//]]></script><style>\n/*<![CDATA[*/3 && 4\n/*]]>*/</style> 5 &amp;&amp; 6</p>";;
Document doc = Jsoup.parse(html); // parsed as HTML
String out = TextUtil.normalizeSpaces(doc.body().html());
assertEquals("<p><script>//<![CDATA[1 && 2//]]></script><style>/*<![CDATA[*/3 && 4/*]]>*/</style> 5 &amp;&amp; 6</p>", out);
Element scriptEl = doc.expectFirst("script");
DataNode scriptDataNode = (DataNode) scriptEl.childNode(0);
// The script's data node is expected to hold the markers verbatim, newlines included.
assertEquals("//<![CDATA[\n" +
"1 && 2\n" +
"//]]>", scriptDataNode.getWholeData());

// Switch the document to XML output syntax and serialize the contents of the <p>.
doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
Element p = doc.expectFirst("p");
String xml = p.html();
assertEquals(
// NOTE(review): the next two lines look like the pre-change expectation (they do not fit the argument
// list that follows) - presumably removed lines from the diff; confirm.
"<p><script><![CDATA[1 && 2]]></script><style><![CDATA[3 && 4]]></style> 5 &amp;&amp; 6</p>",
TextUtil.normalizeSpaces(xml));
"<script>//<![CDATA[\n" +
"1 && 2\n" +
"//]]></script>\n" +
"<style>\n" +
"/*<![CDATA[*/3 && 4\n" +
"/*]]>*/</style> 5 &amp;&amp; 6",
xml);

// Re-parse the XML-syntax output with the XML parser and check that it serializes back to the same text.
Document xmlDoc = Jsoup.parse(xml, Parser.xmlParser());
assertEquals(xml, xmlDoc.html());
Element scriptXmlEl = xmlDoc.expectFirst("script");
// NOTE(review): the next two lines declare scriptCdata, which is declared again further down - presumably
// removed lines from the diff; confirm.
CDataNode scriptCdata = (CDataNode) scriptXmlEl.childNode(0);
assertEquals(scriptCdata.text(), scriptDataNode.getWholeData());
// Expected shape after the XML parse: a text node holding the leading line-comment prefix, then the CData node.
TextNode scriptText = (TextNode) scriptXmlEl.childNode(0);
assertEquals("//", scriptText.getWholeText());
CDataNode scriptCdata = (CDataNode) scriptXmlEl.childNode(1);
assertEquals("\n1 && 2\n//", scriptCdata.text());
}

@Test void outerHtmlAppendable() {
Expand Down

0 comments on commit 8a4bdae

Please sign in to comment.