Merge branch 'pr/574'

jhy · Jul 6, 2015 · e6514a0 · e6514a0
2 parents 2736346 + 5236f0b
commit e6514a0
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 7 deletions.
diff --git a/CHANGES b/CHANGES
@@ -3,6 +3,9 @@ jsoup changelog
 *** Release 1.8.3 [PENDING]
  * Added support for custom boolean attributes.
    <https://github.com/jhy/jsoup/pull/555>
+
+ * When fetching XML URLs, automatically switch to the XML parser instead of the HTML parser.
+   <https://github.com/jhy/jsoup/pull/574>
 
  * Fixed an issue in Element.getElementSiblingIndex (and related methods) where sibling elements with the same content
    would incorrectly have the same sibling index.

diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java
@@ -1,6 +1,7 @@
 package org.jsoup;
 
 import org.jsoup.nodes.Document;
+import org.jsoup.parser.HtmlTreeBuilder;
 import org.jsoup.parser.Parser;
 
 import java.io.IOException;
@@ -210,7 +211,8 @@ public final boolean hasBody() {
     Connection cookies(Map<String, String> cookies);
 
     /**
-     * Provide an alternate parser to use when parsing the response to a Document.
+     * Provide an alternate parser to use when parsing the response to a Document. If not set, defaults to the HTML
+     * parser, unless the response content-type is XML, in which case the XML parser is used.
      * @param parser alternate parser
      * @return this Connection, for chaining
      */

diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java
@@ -357,6 +357,7 @@ public static class Request extends HttpConnection.Base<Connection.Request> impl
         private boolean ignoreHttpErrors = false;
         private boolean ignoreContentType = false;
         private Parser parser;
+        private boolean parserDefined = false; // called parser(...) vs initialized in ctor
         private boolean validateTSLCertificates = true;
         private String postDataCharset = DataUtil.defaultCharset;
 
@@ -437,6 +438,7 @@ public Collection<Connection.KeyVal> data() {
 
         public Request parser(Parser parser) {
             this.parser = parser;
+            parserDefined = true;
             return this;
         }
 
@@ -470,11 +472,9 @@ public static class Response extends HttpConnection.Base<Connection.Response> im
         private Connection.Request req;
 
         /*
-         * For example {@code application/atom+xml;charset=utf-8}.
-         * Stepping through it: start with {@code "application/"}, follow with word
-         * characters up to a {@code "+xml"}, and then maybe more ({@code .*}).
+         * Matches XML content types (like text/xml, application/xhtml+xml;charset=UTF8, etc)
          */
-        private static final Pattern xmlContentTypeRxp = Pattern.compile("application/\\w+\\+xml.*");
+        private static final Pattern xmlContentTypeRxp = Pattern.compile("(application|text)/\\w*\\+?xml.*");
 
         Response() {
             super();
@@ -541,12 +541,19 @@ static Response execute(Connection.Request req, Response previousResponse) throw
                 if (contentType != null
                         && !req.ignoreContentType()
                         && !contentType.startsWith("text/")
-                        && !contentType.startsWith("application/xml")
                         && !xmlContentTypeRxp.matcher(contentType).matches()
                         )
                     throw new UnsupportedMimeTypeException("Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml",
                             contentType, req.url().toString());
 
+                // switch to the XML parser if content type is xml and the parser was not explicitly set
+                if (contentType != null && xmlContentTypeRxp.matcher(contentType).matches()) {
+                    // only flip it if req is an HttpConnection.Request (i.e. don't presume other impls want it):
+                    if (req instanceof HttpConnection.Request && !((Request) req).parserDefined) {
+                        req.parser(Parser.xmlParser());
+                    }
+                }
+
                 res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it
                 if (conn.getContentLength() != 0) { // -1 means unknown, chunked. sun throws an IO exception on 500 response with no content when trying to read body
                     InputStream bodyStream = null;

diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
@@ -11,7 +11,7 @@
 /**
  * HTML Tree Builder; creates a DOM from Tokens.
  */
-class HtmlTreeBuilder extends TreeBuilder {
+public class HtmlTreeBuilder extends TreeBuilder {
     // tag searches
     private static final String[] TagsScriptStyle = new String[]{"script", "style"};
     public static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"};

diff --git a/src/test/java/org/jsoup/integration/UrlConnectTest.java b/src/test/java/org/jsoup/integration/UrlConnectTest.java
@@ -4,9 +4,13 @@
 import org.jsoup.HttpStatusException;
 import org.jsoup.Jsoup;
 import org.jsoup.UnsupportedMimeTypeException;
+import org.jsoup.helper.StringUtil;
 import org.jsoup.helper.W3CDom;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.FormElement;
+import org.jsoup.parser.HtmlTreeBuilder;
+import org.jsoup.parser.Parser;
+import org.jsoup.parser.XmlTreeBuilder;
 import org.junit.Ignore;
 import org.junit.Test;
 
@@ -494,4 +498,26 @@ public void fetchToW3c() throws IOException {
         assertTrue(html.contains("jsoup"));
     }
 
+    @Test
+    public void fetchHandlesXml() throws IOException {
+        // should auto-detect xml and use XML parser, unless explicitly requested the html parser
+        String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml";
+        Connection con = Jsoup.connect(xmlUrl);
+        Document doc = con.get();
+        Connection.Request req = con.request();
+        assertTrue(req.parser().getTreeBuilder() instanceof XmlTreeBuilder);
+        assertEquals("<xml> <link> one </link> <table> Two </table> </xml>", StringUtil.normaliseWhitespace(doc.outerHtml()));
+    }
+
+    @Test
+    public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
+        // should keep the explicitly requested HTML parser, even though the fetched content is XML
+        String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml";
+        Connection con = Jsoup.connect(xmlUrl).parser(Parser.htmlParser());
+        Document doc = con.get();
+        Connection.Request req = con.request();
+        assertTrue(req.parser().getTreeBuilder() instanceof HtmlTreeBuilder);
+        assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml()));
+    }
+
 }