diff --git a/CHANGES b/CHANGES index 52b3acec32..6b3e0a9b4b 100644 --- a/CHANGES +++ b/CHANGES @@ -3,7 +3,7 @@ jsoup changelog Release 1.16.1 [PENDING] * Improvement: Calling Node.remove() on a node with no parent is now a no-op, vs a validation error. - + * Bugfix: Corrected support for ruby elements (<ruby>, <rp>, <rt>, and <rtc>) to current spec. @@ -11,6 +11,10 @@ Release 1.16.1 [PENDING] the incoming node may be inserted into the wrong relative location. + * Bugfix: In Jsoup.connect(url), if the input URL had components that were already % escaped, they would be escaped + again, causing errors when fetched. + + Release 1.15.4 [18-Feb-2023] * Improvement: added the ability to escape CSS selectors (tags, IDs, classes) to match elements that don't follow regular CSS syntax. For example, to match by classname <p class="one.two">

, use document.select("p.one\\.two"); diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index d183a521b4..88595c3ffc 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -20,6 +20,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; import java.net.CookieManager; import java.net.CookieStore; import java.net.HttpURLConnection; @@ -30,6 +31,7 @@ import java.net.URI; import java.net.URISyntaxException; import java.net.URL; +import java.net.URLDecoder; import java.net.URLEncoder; import java.nio.Buffer; import java.nio.ByteBuffer; @@ -127,14 +129,31 @@ static URL encodeUrl(URL u) { u = punyUrl(u); try { // run the URL through URI, so components are encoded - URI uri = new URI(u.getProtocol(), u.getUserInfo(), u.getHost(), u.getPort(), u.getPath(), u.getQuery(), u.getRef()); + /* decode each part first so existing % escapes are not escaped again; only the query reads '+' as a space */ + URI uri = new URI( + u.getProtocol(), decodePart(u.getUserInfo(), false), u.getHost(), u.getPort(), + decodePart(u.getPath(), false), decodePart(u.getQuery(), true), decodePart(u.getRef(), false)); return uri.toURL(); - } catch (URISyntaxException | MalformedURLException e) { + } catch (URISyntaxException | MalformedURLException | UnsupportedEncodingException | IllegalArgumentException e) { // give up and return the original input return u; } } + /** + Decodes the % escapes in one URL component, so that the URI constructor can re-encode it exactly once. + @param encoded the component as found in the URL, may be null + @param plusIsSpace true to read '+' as an encoded space (query), false to keep it as a literal plus + @return the decoded component, or null if encoded was null + @throws IllegalArgumentException if encoded contains a malformed % escape + */ + @Nullable private static String decodePart(@Nullable String encoded, boolean plusIsSpace) throws UnsupportedEncodingException { + if (encoded == null) return null; + /* URLDecoder applies form decoding and turns '+' into a space, so escape literal pluses before decoding */ + if (!plusIsSpace) encoded = encoded.replace("+", "%2B"); + return URLDecoder.decode(encoded, UTF_8.name()); + } + /** Convert an International URL to a Punycode URL. 
@param url input URL that may include an international hostname diff --git a/src/test/java/org/jsoup/helper/HttpConnectionTest.java b/src/test/java/org/jsoup/helper/HttpConnectionTest.java index f7618fb947..b655f8ff68 100644 --- a/src/test/java/org/jsoup/helper/HttpConnectionTest.java +++ b/src/test/java/org/jsoup/helper/HttpConnectionTest.java @@ -260,6 +260,20 @@ public void caseInsensitiveHeaders(Locale locale) { assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white%20space#frag", url2.toExternalForm()); } + @Test void encodedUrlDoesntDoubleEncode() throws MalformedURLException { + final String expected = "https://test.com/foo%20bar/%5BOne%5D?q=white%20space#frag%20ment"; + URL raw = new URL("https://test.com/foo bar/[One]?q=white space#frag ment"); + URL encodedOnce = HttpConnection.encodeUrl(raw); /* first pass escapes the spaces and brackets */ + URL encodedTwice = HttpConnection.encodeUrl(encodedOnce); /* second pass is fed already-escaped input */ + assertEquals(expected, encodedOnce.toExternalForm()); + assertEquals(expected, encodedTwice.toExternalForm()); /* re-encoding must not add another layer of escapes */ + } + + @Test void connectToEncodedUrl() { + URL requestUrl = Jsoup.connect("https://example.com/a%20b%20c?query+string").request().url(); + assertEquals("https://example.com/a%20b%20c?query%20string", requestUrl.toExternalForm()); /* path escapes are kept; '+' in the query comes back as %20 */ + } + @Test public void noUrlThrowsValidationError() throws IOException { HttpConnection con = new HttpConnection(); boolean threw = false;