From 6ccd158754e21a749b2390ca27282a8dda7fb4f6 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Fri, 20 Oct 2023 12:30:46 +1100 Subject: [PATCH] Escape supplemental characters correctly --- CHANGES | 3 +++ src/main/java/org/jsoup/helper/UrlBuilder.java | 1 + src/test/java/org/jsoup/helper/HttpConnectionTest.java | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/CHANGES b/CHANGES index bc120729e0..9f05522ca6 100644 --- a/CHANGES +++ b/CHANGES @@ -49,6 +49,9 @@ Release 1.16.2 [PENDING] ASCII. + * Bugfix: in Jsoup.connect(url), strings containing supplementary characters (e.g. emoji) were not URL-escaped + correctly. + * Bugfix: in Jsoup.connect(url), the ConstrainableInputStream would clear Thread interrupts when reading the body. This precluded callers from spawning a thread, running a number of requests for a length of time, then joining that thread after interrupting it. diff --git a/src/main/java/org/jsoup/helper/UrlBuilder.java b/src/main/java/org/jsoup/helper/UrlBuilder.java index 4deda367ae..3ef9c56870 100644 --- a/src/main/java/org/jsoup/helper/UrlBuilder.java +++ b/src/main/java/org/jsoup/helper/UrlBuilder.java @@ -90,6 +90,7 @@ private static void appendToAscii(String s, boolean spaceAsPlus, StringBuilder s } else if (c > 127) { // out of ascii range sb.append(URLEncoder.encode(new String(Character.toChars(c)), UTF_8.name())); // ^^ is a bit heavy-handed - if perf critical, we could optimize + if (Character.charCount(c) == 2) i++; // advance past supplemental } else { sb.append((char) c); } diff --git a/src/test/java/org/jsoup/helper/HttpConnectionTest.java b/src/test/java/org/jsoup/helper/HttpConnectionTest.java index 9444c2da80..8df0f80397 100644 --- a/src/test/java/org/jsoup/helper/HttpConnectionTest.java +++ b/src/test/java/org/jsoup/helper/HttpConnectionTest.java @@ -260,6 +260,12 @@ public void caseInsensitiveHeaders(Locale locale) { assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag", url2.toExternalForm()); } + @Test 
public void encodeUrlSupplementary() throws MalformedURLException { + URL url1 = new URL("https://example.com/tools/test💩.html"); // = "/tools/test\uD83D\uDCA9.html" + URL url2 = new UrlBuilder(url1).build(); + assertEquals("https://example.com/tools/test%F0%9F%92%A9.html", url2.toExternalForm()); + } + @Test void encodedUrlDoesntDoubleEncode() throws MalformedURLException { URL url1 = new URL("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag%20ment"); URL url2 = new UrlBuilder(url1).build();