diff --git a/CHANGES b/CHANGES index 2f0446f321..2af9dabe70 100644 --- a/CHANGES +++ b/CHANGES @@ -3,6 +3,9 @@ jsoup changelog *** Release 1.8.3 [PENDING] * Added support for custom boolean attributes. + + * When fetching XML URLs, automatically switch to the XML parser instead of the HTML parser. + * Fixed an issue in Element.getElementSiblingIndex (and related methods) where sibling elements with the same content would incorrectly have the same sibling index. diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java index d0639bbb5d..89ab899d99 100644 --- a/src/main/java/org/jsoup/Connection.java +++ b/src/main/java/org/jsoup/Connection.java @@ -1,6 +1,7 @@ package org.jsoup; import org.jsoup.nodes.Document; +import org.jsoup.parser.HtmlTreeBuilder; import org.jsoup.parser.Parser; import java.io.IOException; @@ -210,7 +211,8 @@ public final boolean hasBody() { Connection cookies(Map cookies); /** - * Provide an alternate parser to use when parsing the response to a Document. + * Provide an alternate parser to use when parsing the response to a Document. If not set, defaults to the HTML + * parser, unless the response content-type is XML, in which case the XML parser is used. * @param parser alternate parser * @return this Connection, for chaining */ diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index b6e7964841..284614b896 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -357,6 +357,7 @@ public static class Request extends HttpConnection.Base impl private boolean ignoreHttpErrors = false; private boolean ignoreContentType = false; private Parser parser; + private boolean parserDefined = false; // called parser(...) 
vs initialized in ctor private boolean validateTSLCertificates = true; private String postDataCharset = DataUtil.defaultCharset; @@ -437,6 +438,7 @@ public Collection data() { public Request parser(Parser parser) { this.parser = parser; + parserDefined = true; return this; } @@ -470,11 +472,9 @@ public static class Response extends HttpConnection.Base im private Connection.Request req; /* - * For example {@code application/atom+xml;charset=utf-8}. - * Stepping through it: start with {@code "application/"}, follow with word - * characters up to a {@code "+xml"}, and then maybe more ({@code .*}). + * Matches XML content types (like text/xml, application/xhtml+xml;charset=UTF8, etc) */ - private static final Pattern xmlContentTypeRxp = Pattern.compile("application/\\w+\\+xml.*"); + private static final Pattern xmlContentTypeRxp = Pattern.compile("(application|text)/\\w*\\+?xml.*"); Response() { super(); @@ -541,12 +541,19 @@ static Response execute(Connection.Request req, Response previousResponse) throw if (contentType != null && !req.ignoreContentType() && !contentType.startsWith("text/") - && !contentType.startsWith("application/xml") && !xmlContentTypeRxp.matcher(contentType).matches() ) throw new UnsupportedMimeTypeException("Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml", contentType, req.url().toString()); + // switch to the XML parser if content type is xml and the parser was not explicitly set + if (contentType != null && xmlContentTypeRxp.matcher(contentType).matches()) { + // only flip it if a HttpConnection.Request (i.e. don't presume other impls want it): + if (req instanceof HttpConnection.Request && !((Request) req).parserDefined) { + req.parser(Parser.xmlParser()); + } + } + res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it if (conn.getContentLength() != 0) { // -1 means unknown, chunked. 
sun throws an IO exception on 500 response with no content when trying to read body InputStream bodyStream = null; diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 9272d0896c..4a58fdf219 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -11,7 +11,7 @@ /** * HTML Tree Builder; creates a DOM from Tokens. */ -class HtmlTreeBuilder extends TreeBuilder { +public class HtmlTreeBuilder extends TreeBuilder { // tag searches private static final String[] TagsScriptStyle = new String[]{"script", "style"}; public static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}; diff --git a/src/test/java/org/jsoup/integration/UrlConnectTest.java b/src/test/java/org/jsoup/integration/UrlConnectTest.java index dfee15c4b9..03b654adc2 100644 --- a/src/test/java/org/jsoup/integration/UrlConnectTest.java +++ b/src/test/java/org/jsoup/integration/UrlConnectTest.java @@ -4,9 +4,13 @@ import org.jsoup.HttpStatusException; import org.jsoup.Jsoup; import org.jsoup.UnsupportedMimeTypeException; +import org.jsoup.helper.StringUtil; import org.jsoup.helper.W3CDom; import org.jsoup.nodes.Document; import org.jsoup.nodes.FormElement; +import org.jsoup.parser.HtmlTreeBuilder; +import org.jsoup.parser.Parser; +import org.jsoup.parser.XmlTreeBuilder; import org.junit.Ignore; import org.junit.Test; @@ -494,4 +498,26 @@ public void fetchToW3c() throws IOException { assertTrue(html.contains("jsoup")); } + @Test + public void fetchHandlesXml() throws IOException { + // should auto-detect xml and use XML parser, unless explicitly requested the html parser + String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml"; + Connection con = Jsoup.connect(xmlUrl); + Document doc = con.get(); + Connection.Request req = con.request(); + assertTrue(req.parser().getTreeBuilder() instanceof 
XmlTreeBuilder); + assertEquals(" one Two
", StringUtil.normaliseWhitespace(doc.outerHtml())); + } + + @Test + public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException { + // should auto-detect xml and use XML parser, unless explicitly requested the html parser + String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml"; + Connection con = Jsoup.connect(xmlUrl).parser(Parser.htmlParser()); + Document doc = con.get(); + Connection.Request req = con.request(); + assertTrue(req.parser().getTreeBuilder() instanceof HtmlTreeBuilder); + assertEquals(" one Two
", StringUtil.normaliseWhitespace(doc.outerHtml())); + } + }