Skip to content

Commit

Permalink
Removed character range check (>= 0xD800 && <= 0xDFFF)
Browse files Browse the repository at this point in the history
That was preventing the valid decode of &#55357;&#56495; to 💯.

This rule must have been in the spec when initially implemented but I can't find a reference to it now. I'm assuming that the range had since been added, but can't immediately identify why it was explicitly excluded originally.

Fixes #2047
  • Loading branch information
jhy committed Dec 3, 2023
1 parent e39b9b9 commit 954c46a
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 6 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
correctly. [2067](https://github.com/jhy/jsoup/issues/2067)
* When tracking the source position of a body fragment parse, a null pointer exception was
thrown. [2068](https://github.com/jhy/jsoup/issues/2068)
* A multi-point encoded emoji entity may be incorrectly decoded to the replacement
character. [2074](https://github.com/jhy/jsoup/issues/2074)

---
Older changes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27) may be found in
Expand Down
7 changes: 5 additions & 2 deletions src/main/java/org/jsoup/parser/Tokeniser.java
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,11 @@ void advanceTransition(TokeniserState newState) {
int base = isHexMode ? 16 : 10;
charval = Integer.valueOf(numRef, base);
} catch (NumberFormatException ignored) {
} // skip
if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
// skip
}
// todo: check for extra illegal unicode points as parse errors - described https://html.spec.whatwg.org/multipage/syntax.html#character-references and in Infra
// The numeric character reference forms described above are allowed to reference any code point excluding U+000D CR, noncharacters, and controls other than ASCII whitespace.
if (charval == -1 || charval > 0x10FFFF) {
characterReferenceError("character [%s] outside of valid range", charval);
codeRef[0] = replacementChar;
} else {
Expand Down
10 changes: 10 additions & 0 deletions src/test/java/org/jsoup/nodes/EntitiesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -175,4 +175,14 @@ public class EntitiesTest {
String escaped2 = assertDoesNotThrow(() -> Entities.escape(text, clone2));
assertEquals(escaped1, escaped2);
}

/**
 * Unescapes an emoji written as two consecutive decimal character references and expects the
 * result to be the UTF-16 surrogate pair for U+1F4AF.
 */
@Test void parseHtmlEncodedEmojiMultipoint() {
// 55357 == 0xD83D (high surrogate) and 56495 == 0xDCAF (low surrogate)
String emoji = Parser.unescapeEntities("&#55357;&#56495;", false); // 💯
assertEquals("\uD83D\uDCAF", emoji);
}

/**
 * Unescapes an emoji written as a single decimal character reference and expects the result to
 * be the UTF-16 surrogate pair for U+1F4AF.
 */
@Test void parseHtmlEncodedEmoji() {
// 128175 == 0x1F4AF, a supplementary code point, so the Java string holds two chars
String emoji = Parser.unescapeEntities("&#128175;", false); // 💯
assertEquals("\uD83D\uDCAF", emoji);
}
}
22 changes: 18 additions & 4 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -851,7 +851,7 @@ private static Stream<Arguments> dupeAttributeData() {
}

@Test public void tracksErrorsWhenRequested() {
String html = "<p>One</p href='no'>\n<!DOCTYPE html>\n&arrgh;<font />&#33 &amp &#xD800;<br /></div><foo";
String html = "<p>One</p href='no'>\n<!DOCTYPE html>\n&arrgh;<font />&#33 &amp &#x110000;<br /></div><foo";
Parser parser = Parser.htmlParser().setTrackErrors(500);
Document doc = Jsoup.parse(html, "http://example.com", parser);

Expand All @@ -863,9 +863,9 @@ private static Stream<Arguments> dupeAttributeData() {
assertEquals("<3:16>: Tag [font] cannot be self closing; not a void tag", errors.get(3).toString());
assertEquals("<3:20>: Invalid character reference: missing semicolon on [&#33]", errors.get(4).toString());
assertEquals("<3:25>: Invalid character reference: missing semicolon on [&amp]", errors.get(5).toString());
assertEquals("<3:34>: Invalid character reference: character [55296] outside of valid range", errors.get(6).toString());
assertEquals("<3:46>: Unexpected EndTag token [</div>] when in state [InBody]", errors.get(7).toString());
assertEquals("<3:51>: Unexpectedly reached end of file (EOF) in input state [TagName]", errors.get(8).toString());
assertEquals("<3:36>: Invalid character reference: character [1114112] outside of valid range", errors.get(6).toString());
assertEquals("<3:48>: Unexpected EndTag token [</div>] when in state [InBody]", errors.get(7).toString());
assertEquals("<3:53>: Unexpectedly reached end of file (EOF) in input state [TagName]", errors.get(8).toString());
}

@Test public void tracksLimitedErrorsWhenRequested() {
Expand Down Expand Up @@ -1874,4 +1874,18 @@ private static void assertMathNamespace(Element el) {
assertMathNamespace(doc4.expectFirst("annotation-xml"));
assertHtmlNamespace(doc4.expectFirst("divv"));
}

/**
 * Parses an img tag whose attributes encode U+1F4AF three ways (a pair of decimal surrogate
 * references, one decimal reference, one hex reference) and expects all three to decode to the
 * same string and to serialize identically.
 */
@Test void parseEmojiFromMultipointEncoded() {
// multi: 55357/56495 are 0xD83D/0xDCAF; single: 128175 == 0x1F4AF; hexsingle: the same code point in hex
String html = "<img multi='&#55357;&#56495;' single='&#128175;' hexsingle='&#x1f4af;'>";
Document document = Jsoup.parse(html);
Element img = document.expectFirst("img");
// every attribute value should be the surrogate pair \uD83D\uDCAF
assertEquals("\uD83D\uDCAF", img.attr("multi"));
assertEquals("\uD83D\uDCAF", img.attr("single"));
assertEquals("\uD83D\uDCAF", img.attr("hexsingle"));

// with the document's initial output settings, the emoji is expected to be emitted as-is
assertEquals("<img multi=\"\uD83D\uDCAF\" single=\"\uD83D\uDCAF\" hexsingle=\"\uD83D\uDCAF\">", img.outerHtml());

// after switching the output charset to ascii, each value is expected as one hex reference to the full code point
img.ownerDocument().outputSettings().charset("ascii");
assertEquals("<img multi=\"&#x1f4af;\" single=\"&#x1f4af;\" hexsingle=\"&#x1f4af;\">", img.outerHtml());
}
}

0 comments on commit 954c46a

Please sign in to comment.