Skip to content

Commit

Permalink
Merge branch 'pr/574'
Browse files Browse the repository at this point in the history
  • Loading branch information
jhy committed Jul 6, 2015
2 parents 2736346 + 5236f0b commit e6514a0
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 7 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Expand Up @@ -3,6 +3,9 @@ jsoup changelog
*** Release 1.8.3 [PENDING]
* Added support for custom boolean attributes.
<https://github.com/jhy/jsoup/pull/555>

* When fetching XML URLs, automatically switch to the XML parser instead of the HTML parser.
<https://github.com/jhy/jsoup/pull/574>

* Fixed an issue in Element.getElementSiblingIndex (and related methods) where sibling elements with the same content
would incorrectly have the same sibling index.
Expand Down
4 changes: 3 additions & 1 deletion src/main/java/org/jsoup/Connection.java
@@ -1,6 +1,7 @@
package org.jsoup;

import org.jsoup.nodes.Document;
import org.jsoup.parser.HtmlTreeBuilder;
import org.jsoup.parser.Parser;

import java.io.IOException;
Expand Down Expand Up @@ -210,7 +211,8 @@ public final boolean hasBody() {
Connection cookies(Map<String, String> cookies);

/**
* Provide an alternate parser to use when parsing the response to a Document.
* Provide an alternate parser to use when parsing the response to a Document. If not set, defaults to the HTML
* parser, unless the response content-type is XML, in which case the XML parser is used.
* @param parser alternate parser
* @return this Connection, for chaining
*/
Expand Down
17 changes: 12 additions & 5 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Expand Up @@ -357,6 +357,7 @@ public static class Request extends HttpConnection.Base<Connection.Request> impl
private boolean ignoreHttpErrors = false;
private boolean ignoreContentType = false;
private Parser parser;
private boolean parserDefined = false; // called parser(...) vs initialized in ctor
private boolean validateTSLCertificates = true;
private String postDataCharset = DataUtil.defaultCharset;

Expand Down Expand Up @@ -437,6 +438,7 @@ public Collection<Connection.KeyVal> data() {

/**
 * Sets the parser stored on this request and records that it was supplied through this
 * method, by raising the {@code parserDefined} flag.
 *
 * @param parser the parser to store on this request
 * @return this Request, for chaining
 */
public Request parser(Parser parser) {
    // Raise the flag first; the two assignments are independent of each other.
    this.parserDefined = true;
    this.parser = parser;
    return this;
}

Expand Down Expand Up @@ -470,11 +472,9 @@ public static class Response extends HttpConnection.Base<Connection.Response> im
private Connection.Request req;

/*
* For example {@code application/atom+xml;charset=utf-8}.
* Stepping through it: start with {@code "application/"}, follow with word
* characters up to a {@code "+xml"}, and then maybe more ({@code .*}).
* Matches XML content types (like text/xml, application/xhtml+xml;charset=UTF8, etc)
*/
private static final Pattern xmlContentTypeRxp = Pattern.compile("application/\\w+\\+xml.*");
private static final Pattern xmlContentTypeRxp = Pattern.compile("(application|text)/\\w*\\+?xml.*");

Response() {
super();
Expand Down Expand Up @@ -541,12 +541,19 @@ static Response execute(Connection.Request req, Response previousResponse) throw
if (contentType != null
&& !req.ignoreContentType()
&& !contentType.startsWith("text/")
&& !contentType.startsWith("application/xml")
&& !xmlContentTypeRxp.matcher(contentType).matches()
)
throw new UnsupportedMimeTypeException("Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml",
contentType, req.url().toString());

// switch to the XML parser if the content type is XML and a parser was not explicitly set
if (contentType != null && xmlContentTypeRxp.matcher(contentType).matches()) {
// only flip it if a HttpConnection.Request (i.e. don't presume other impls want it):
if (req instanceof HttpConnection.Request && !((Request) req).parserDefined) {
req.parser(Parser.xmlParser());
}
}

res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it
if (conn.getContentLength() != 0) { // -1 means unknown, chunked. sun throws an IO exception on 500 response with no content when trying to read body
InputStream bodyStream = null;
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Expand Up @@ -11,7 +11,7 @@
/**
* HTML Tree Builder; creates a DOM from Tokens.
*/
class HtmlTreeBuilder extends TreeBuilder {
public class HtmlTreeBuilder extends TreeBuilder {
// tag searches
private static final String[] TagsScriptStyle = new String[]{"script", "style"};
public static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"};
Expand Down
26 changes: 26 additions & 0 deletions src/test/java/org/jsoup/integration/UrlConnectTest.java
Expand Up @@ -4,9 +4,13 @@
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.UnsupportedMimeTypeException;
import org.jsoup.helper.StringUtil;
import org.jsoup.helper.W3CDom;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.FormElement;
import org.jsoup.parser.HtmlTreeBuilder;
import org.jsoup.parser.Parser;
import org.jsoup.parser.XmlTreeBuilder;
import org.junit.Ignore;
import org.junit.Test;

Expand Down Expand Up @@ -494,4 +498,26 @@ public void fetchToW3c() throws IOException {
assertTrue(html.contains("jsoup"));
}

/**
 * Fetches an XML URL without configuring a parser and checks that the request ends up
 * using the XML tree builder, and that the document serialises as plain XML.
 */
@Test
public void fetchHandlesXml() throws IOException {
    final Connection connection = Jsoup.connect("http://direct.infohound.net/tools/parse-xml.xml");
    final Document fetched = connection.get();

    // Inspect the request only after the fetch has completed.
    final boolean usesXmlBuilder = connection.request().parser().getTreeBuilder() instanceof XmlTreeBuilder;
    assertTrue(usesXmlBuilder);

    final String normalised = StringUtil.normaliseWhitespace(fetched.outerHtml());
    assertEquals("<xml> <link> one </link> <table> Two </table> </xml>", normalised);
}

/**
 * Fetches an XML URL with the HTML parser explicitly requested and checks that the request
 * keeps the HTML tree builder, so the XML content is parsed into an HTML document shell.
 */
@Test
public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
    final Connection connection = Jsoup
        .connect("http://direct.infohound.net/tools/parse-xml.xml")
        .parser(Parser.htmlParser());
    final Document fetched = connection.get();

    // The explicitly supplied HTML parser must still be in place after the fetch.
    final boolean usesHtmlBuilder = connection.request().parser().getTreeBuilder() instanceof HtmlTreeBuilder;
    assertTrue(usesHtmlBuilder);

    final String normalised = StringUtil.normaliseWhitespace(fetched.outerHtml());
    assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", normalised);
}

}

0 comments on commit e6514a0

Please sign in to comment.