From 6dcbf7104240e6de0898f6d97e91ee8ebb68f0a2 Mon Sep 17 00:00:00 2001
From: cketti
Date: Wed, 22 Feb 2017 09:58:21 +0100
Subject: [PATCH 1/2] Add failing test for document with two meta elements with
 content type

When attempting to retrieve the charset only the first meta element with
content type information will be looked at. No further meta elements are
considered even when the first one doesn't contain a charset parameter.
---
 src/test/java/org/jsoup/helper/DataUtilTest.java | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
index 5555ea1158..43125b20f8 100644
--- a/src/test/java/org/jsoup/helper/DataUtilTest.java
+++ b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -101,6 +101,18 @@ public void wrongMetaCharsetFallback() {
         }
     }
 
+    @Test
+    public void secondMetaElementWithContentTypeContainsCharsetParameter() throws Exception {
+        ByteBuffer inBuffer = ByteBuffer.wrap(("<html><head>" +
+                "<meta http-equiv=\"Content-Type\" content=\"text/html\">" +
+                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=euc-kr\">" +
+                "</head><body>한국어</body></html>").getBytes("euc-kr"));
+
+        Document doc = DataUtil.parseByteData(inBuffer, null, "http://example.com", Parser.htmlParser());
+
+        assertEquals("한국어", doc.body().text());
+    }
+
     @Test
     public void supportsBOMinFiles() throws IOException {
         // test files from http://www.i18nl10n.com/korean/utftest/

From a5266b14a2b7457e076d5f6429c2197baf5412e4 Mon Sep 17 00:00:00 2001
From: cketti
Date: Wed, 22 Feb 2017 10:03:40 +0100
Subject: [PATCH 2/2] Consider all meta elements until a charset is found

---
 src/main/java/org/jsoup/helper/DataUtil.java     |  9 +++++++--
 src/test/java/org/jsoup/helper/DataUtilTest.java | 12 ++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
index b32d86af5d..44a4db39bc 100644
--- a/src/main/java/org/jsoup/helper/DataUtil.java
+++ b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -4,6 +4,7 @@
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.XmlDeclaration;
 import org.jsoup.parser.Parser;
+import org.jsoup.select.Elements;
 
 import java.io.ByteArrayOutputStream;
 import java.io.File;
@@ -101,16 +102,20 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
             // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
             docData = Charset.forName(defaultCharset).decode(byteData).toString();
             doc = parser.parseInput(docData, baseUri);
-            Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
+            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
             String foundCharset = null; // if not found, will keep utf-8 as best attempt
-            if (meta != null) {
+            for (Element meta : metaElements) {
                 if (meta.hasAttr("http-equiv")) {
                     foundCharset = getCharsetFromContentType(meta.attr("content"));
                 }
                 if (foundCharset == null && meta.hasAttr("charset")) {
                     foundCharset = meta.attr("charset");
                 }
+                if (foundCharset != null) {
+                    break;
+                }
             }
+
             // look for <?xml encoding='ISO-8859-1'?>
             if (foundCharset == null && doc.childNodeSize() > 0 && doc.childNode(0) instanceof XmlDeclaration) {
                 XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
index 43125b20f8..94003b4912 100644
--- a/src/test/java/org/jsoup/helper/DataUtilTest.java
+++ b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -113,6 +113,18 @@ public void secondMetaElementWithContentTypeContainsCharsetParameter() throws Ex
         assertEquals("한국어", doc.body().text());
     }
 
+    @Test
+    public void firstMetaElementWithCharsetShouldBeUsedForDecoding() throws Exception {
+        ByteBuffer inBuffer = ByteBuffer.wrap(("<html><head>" +
+                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">" +
+                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=koi8-u\">" +
+                "</head><body>Übergrößenträger</body></html>").getBytes("iso-8859-1"));
+
+        Document doc = DataUtil.parseByteData(inBuffer, null, "http://example.com", Parser.htmlParser());
+
+        assertEquals("Übergrößenträger", doc.body().text());
+    }
+
     @Test
     public void supportsBOMinFiles() throws IOException {
         // test files from http://www.i18nl10n.com/korean/utftest/