diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java index b32d86af5d..44a4db39bc 100644 --- a/src/main/java/org/jsoup/helper/DataUtil.java +++ b/src/main/java/org/jsoup/helper/DataUtil.java @@ -4,6 +4,7 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.XmlDeclaration; import org.jsoup.parser.Parser; +import org.jsoup.select.Elements; import java.io.ByteArrayOutputStream; import java.io.File; @@ -101,16 +102,20 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba // look for or HTML5 docData = Charset.forName(defaultCharset).decode(byteData).toString(); doc = parser.parseInput(docData, baseUri); - Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first(); + Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]"); String foundCharset = null; // if not found, will keep utf-8 as best attempt - if (meta != null) { + for (Element meta : metaElements) { if (meta.hasAttr("http-equiv")) { foundCharset = getCharsetFromContentType(meta.attr("content")); } if (foundCharset == null && meta.hasAttr("charset")) { foundCharset = meta.attr("charset"); } + if (foundCharset != null) { + break; + } } + // look for if (foundCharset == null && doc.childNodeSize() > 0 && doc.childNode(0) instanceof XmlDeclaration) { XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0); diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java index 5555ea1158..94003b4912 100644 --- a/src/test/java/org/jsoup/helper/DataUtilTest.java +++ b/src/test/java/org/jsoup/helper/DataUtilTest.java @@ -101,6 +101,30 @@ public void wrongMetaCharsetFallback() { } } + @Test + public void secondMetaElementWithContentTypeContainsCharsetParameter() throws Exception { + ByteBuffer inBuffer = ByteBuffer.wrap(("" + + "" + + "" + + "한국어").getBytes("euc-kr")); + + Document doc = DataUtil.parseByteData(inBuffer, null, "http://example.com", Parser.htmlParser()); + + assertEquals("한국어", doc.body().text()); + } + + @Test + public void firstMetaElementWithCharsetShouldBeUsedForDecoding() throws Exception { + ByteBuffer inBuffer = ByteBuffer.wrap(("" + + "" + + "" + + "Übergrößenträger").getBytes("iso-8859-1")); + + Document doc = DataUtil.parseByteData(inBuffer, null, "http://example.com", Parser.htmlParser()); + + assertEquals("Übergrößenträger", doc.body().text()); + } + @Test public void supportsBOMinFiles() throws IOException { // test files from http://www.i18nl10n.com/korean/utftest/