From 8a4bdaef7aeaaa62d4016b890cdca87b1a701b7f Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sun, 17 Dec 2023 12:06:44 +1100 Subject: [PATCH] Output polyglot commented CData sections in XML mode And only if the data does not already contain a (pseudo) CData section. Fixes #2078 --- CHANGES.md | 3 + src/main/java/org/jsoup/nodes/DataNode.java | 20 +++-- .../java/org/jsoup/helper/W3CDomTest.java | 13 +++- .../java/org/jsoup/nodes/DataNodeTest.java | 73 +++++++++++++++++++ .../java/org/jsoup/nodes/ElementTest.java | 50 +++++++++++-- 5 files changed, 143 insertions(+), 16 deletions(-) create mode 100644 src/test/java/org/jsoup/nodes/DataNodeTest.java diff --git a/CHANGES.md b/CHANGES.md index 3c1b3f1850..dca3f9b23c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -21,6 +21,9 @@ `parent [attr=va]`, causing incorrect selections. The fix includes a EvaluatorDebug class that generates a sexpr to represent the query, allowing simpler and more thorough query parse tests. [2073](https://github.com/jhy/jsoup/issues/2073) +* When generating XML-syntax output from parsed HTML, script nodes containing (pseudo) CData sections would have an + extraneous CData section added, causing script execution errors. Now, the data content is emitted in a HTML/XML/XHTML + polyglot format, if the data is not already within a CData section. 
[2078](https://github.com/jhy/jsoup/issues/2078) --- Older changes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27) may be found in diff --git a/src/main/java/org/jsoup/nodes/DataNode.java b/src/main/java/org/jsoup/nodes/DataNode.java index 4a0cf434f2..a357285528 100644 --- a/src/main/java/org/jsoup/nodes/DataNode.java +++ b/src/main/java/org/jsoup/nodes/DataNode.java @@ -1,6 +1,8 @@ package org.jsoup.nodes; import java.io.IOException; +import java.util.regex.Pattern; + import org.jsoup.nodes.Entities.EscapeMode; /** @@ -41,14 +43,18 @@ public DataNode setWholeData(String data) { @Override void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException { - if (out.syntax() == Document.OutputSettings.Syntax.xml) { - // In XML mode, output data nodes as CDATA, so can parse as XML - accum - .append(""); + /* For XML output, escape the DataNode in a CData section. The data may contain pseudo-CData content if it was + parsed as HTML, so don't double up CData. Output in polyglot HTML / XHTML / XML format. 
*/ + final String data = getWholeData(); + if (out.syntax() == Document.OutputSettings.Syntax.xml && !data.contains(""); + else if (hasParent() && parentNode.normalName().equals("style")) + accum.append("/**/"); + else + accum.append(""); } else { - // In HTML, data is not escaped in return from data nodes, so " in script, style is plain + // In HTML, data is not escaped in the output of data nodes, so < and & in script, style is OK accum.append(getWholeData()); } } diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java index fe8379aea8..de0ce16694 100644 --- a/src/test/java/org/jsoup/helper/W3CDomTest.java +++ b/src/test/java/org/jsoup/helper/W3CDomTest.java @@ -351,12 +351,17 @@ public void canOutputHtmlWithoutNamespace() { org.jsoup.nodes.Document jdoc = Jsoup.parse(html); jdoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml); String xml = jdoc.body().html(); - assertTrue(xml.contains("")); // as asserted in ElementTest Document doc = parseXml(xml, false); NodeList list = xpath(doc, "//script"); - assertEquals(1, list.getLength()); - Node script = list.item(0); // will be the cdata node - assertEquals("1 && 2", script.getTextContent()); + assertEquals(2, list.getLength()); + Node scriptComment = list.item(0); // will be the cdata node + assertEquals("//", scriptComment.getTextContent()); + Node script = list.item(1); + assertEquals("\n" + + "1 && 2\n" + + "//", script.getTextContent()); + } @Test public void handlesEmptyDoctype() { diff --git a/src/test/java/org/jsoup/nodes/DataNodeTest.java b/src/test/java/org/jsoup/nodes/DataNodeTest.java new file mode 100644 index 0000000000..a7b626dee2 --- /dev/null +++ b/src/test/java/org/jsoup/nodes/DataNodeTest.java @@ -0,0 +1,73 @@ +package org.jsoup.nodes; + +import org.junit.jupiter.api.Test; + +import java.io.IOException; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class DataNodeTest { + + @Test + public void 
xmlOutputScriptWithCData() throws IOException { + DataNode node = new DataNode("// data]]>"); + node.parentNode = new Element("script"); + StringBuilder accum = new StringBuilder(); + node.outerHtmlHead(accum, 0, new Document.OutputSettings().syntax(Document.OutputSettings.Syntax.xml)); + assertEquals("// data]]>", accum.toString()); + } + + @Test + public void xmlOutputScriptWithoutCData() throws IOException { + DataNode node = new DataNode("script && <> data"); + node.parentNode = new Element("script"); + StringBuilder accum = new StringBuilder(); + node.outerHtmlHead(accum, 0, new Document.OutputSettings().syntax(Document.OutputSettings.Syntax.xml)); + assertEquals("// data\n//]]>", accum.toString()); + } + + @Test + public void xmlOutputStyleWithCData() throws IOException { + DataNode node = new DataNode("/* data]]>"); + node.parentNode = new Element("style"); + StringBuilder accum = new StringBuilder(); + node.outerHtmlHead(accum, 0, new Document.OutputSettings().syntax(Document.OutputSettings.Syntax.xml)); + assertEquals("/* data]]>", accum.toString()); + } + + @Test + public void xmlOutputStyleWithoutCData() throws IOException { + DataNode node = new DataNode("style && <> data"); + node.parentNode = new Element("style"); + StringBuilder accum = new StringBuilder(); + node.outerHtmlHead(accum, 0, new Document.OutputSettings().syntax(Document.OutputSettings.Syntax.xml)); + assertEquals("/* data\n/*]]>*/", accum.toString()); + } + + @Test + public void xmlOutputOtherWithCData() throws IOException { + DataNode node = new DataNode(" data]]>"); + node.parentNode = new Element("other"); + StringBuilder accum = new StringBuilder(); + node.outerHtmlHead(accum, 0, new Document.OutputSettings().syntax(Document.OutputSettings.Syntax.xml)); + assertEquals(" data]]>", accum.toString()); + } + + @Test + public void xmlOutputOtherWithoutCData() throws IOException { + DataNode node = new DataNode("other && <> data"); + node.parentNode = new Element("other"); + StringBuilder 
accum = new StringBuilder(); + node.outerHtmlHead(accum, 0, new Document.OutputSettings().syntax(Document.OutputSettings.Syntax.xml)); + assertEquals(" data]]>", accum.toString()); + } + + @Test + public void xmlOutputOrphanWithoutCData() throws IOException { + DataNode node = new DataNode("other && <> data"); + StringBuilder accum = new StringBuilder(); + node.outerHtmlHead(accum, 0, new Document.OutputSettings().syntax(Document.OutputSettings.Syntax.xml)); + assertEquals(" data]]>", accum.toString()); + } + +} diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java index b4d3747754..71a43b7898 100644 --- a/src/test/java/org/jsoup/nodes/ElementTest.java +++ b/src/test/java/org/jsoup/nodes/ElementTest.java @@ -2755,16 +2755,56 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) { assertEquals("1 && 2", scriptDataNode.getWholeData()); doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml); - String xml = doc.body().html(); + Element p = doc.expectFirst("p"); + String xml = p.html(); + assertEquals( + "\n" + + " 5 && 6", + xml); + + Document xmlDoc = Jsoup.parse(xml, Parser.xmlParser()); + assertEquals(xml, xmlDoc.html()); + Element scriptXmlEl = xmlDoc.expectFirst("script"); + TextNode scriptText = (TextNode) scriptXmlEl.childNode(0); + assertEquals("//", scriptText.getWholeText()); + CDataNode scriptCdata = (CDataNode) scriptXmlEl.childNode(1); + assertEquals("\n1 && 2\n//", scriptCdata.text()); + } + + @Test void datanodesOutputExistingCdataInXhtml() { + String html = "

5 && 6

";; + Document doc = Jsoup.parse(html); // parsed as HTML + String out = TextUtil.normalizeSpaces(doc.body().html()); + assertEquals("

5 && 6

", out); + Element scriptEl = doc.expectFirst("script"); + DataNode scriptDataNode = (DataNode) scriptEl.childNode(0); + assertEquals("//", scriptDataNode.getWholeData()); + + doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml); + Element p = doc.expectFirst("p"); + String xml = p.html(); assertEquals( - "

5 && 6

", - TextUtil.normalizeSpaces(xml)); + "\n" + + " 5 && 6", + xml); Document xmlDoc = Jsoup.parse(xml, Parser.xmlParser()); assertEquals(xml, xmlDoc.html()); Element scriptXmlEl = xmlDoc.expectFirst("script"); - CDataNode scriptCdata = (CDataNode) scriptXmlEl.childNode(0); - assertEquals(scriptCdata.text(), scriptDataNode.getWholeData()); + TextNode scriptText = (TextNode) scriptXmlEl.childNode(0); + assertEquals("//", scriptText.getWholeText()); + CDataNode scriptCdata = (CDataNode) scriptXmlEl.childNode(1); + assertEquals("\n1 && 2\n//", scriptCdata.text()); } @Test void outerHtmlAppendable() {