Skip to content

Commit

Permalink
Escape non-ascii characters in URL query and anchor
Browse files Browse the repository at this point in the history
Improves #1914
  • Loading branch information
jhy committed Mar 26, 2023
1 parent 5afef3e commit 0121311
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 17 deletions.
3 changes: 2 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
jsoup changelog

Release 1.16.1 [PENDING]
* Improvement: in Jsoup.connect(url), support URLs with Unicode characters in the path
* Improvement: in Jsoup.connect(url), natively support URLs with Unicode characters in the path or query string,
without having to be escaped by the caller.
<https://github.com/jhy/jsoup/issues/1914>

* Improvement: Calling Node.remove() on a node with no parent is now a no-op, vs a validation error.
Expand Down
34 changes: 20 additions & 14 deletions src/main/java/org/jsoup/helper/UrlBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

/**
A utility class to normalize input URLs. jsoup internal; API subject to change.
<p>Normalization includes puny-coding the host, and encoding non-ascii path components. The query-string
is left mostly as-is, to avoid inadvertently/incorrectly decoding a desired '+' literal ('%2B') as a ' '.</p>
<p>Normalization includes puny-coding the host, and encoding non-ascii path components. Any non-ascii characters in
the query string (or the fragment/anchor) are escaped, but any existing escapes in those components are preserved.</p>
*/
final class UrlBuilder {
URL u;
Expand Down Expand Up @@ -47,19 +47,20 @@ URL build() {
StringBuilder sb = StringUtil.borrowBuilder().append(normUrl);
if (q != null) {
sb.append('?');
sb.append(normalizeQuery(StringUtil.releaseBuilder(q)));
appendToAscii(StringUtil.releaseBuilder(q), true, sb);
}
if (u.getRef() != null) {
sb.append('#');
sb.append(normalizeRef(u.getRef()));
appendToAscii(u.getRef(), false, sb);
}
normUrl = StringUtil.releaseBuilder(sb);
}
u = new URL(normUrl);
return u;
} catch (MalformedURLException | URISyntaxException e) {
} catch (MalformedURLException | URISyntaxException | UnsupportedEncodingException e) {
// we assert here so that any incomplete normalization issues can be caught in devel. but in practice,
// the remote end will be able to handle it, so in prod we just pass the original URL
// the remote end will be able to handle it, so in prod we just pass the original URL.
// An UnsupportedEncodingException can never actually happen here, as the charset is always UTF-8.
assert Validate.assertFail(e.toString());
return u;
}
Expand All @@ -84,14 +85,19 @@ private static String decodePart(String encoded) {
}
}

/**
 Applies the minimal normalization to a query string: each literal space becomes a {@code '+'}. Every other
 character is returned exactly as supplied.
 @param q the raw query string (without the leading {@code '?'})
 @return the query string with spaces replaced by {@code '+'}
 */
private static String normalizeQuery(String q) {
    // nothing to rewrite when there is no space; hand the input straight back
    if (q.indexOf(' ') < 0) {
        return q;
    }
    final String plusForSpace = q.replace(' ', '+');
    return plusForSpace;
}

private static String normalizeRef(String r) {
// minimal space normal; other characters left as supplied
return r.replace(" ", "%20");
/**
 Appends {@code s} to {@code sb} as ASCII-only text. Spaces are written as {@code '+'} or {@code "%20"}, code points
 above 127 are percent-encoded as UTF-8, and all other (ASCII) characters are copied unchanged, so any escapes
 already present in the input (such as {@code %2B}) are preserved.
 <p>The input is walked by Unicode code point, so a supplementary character (a surrogate pair, e.g. an emoji) is
 encoded once as a single four-byte UTF-8 sequence.</p>
 @param s the query string or fragment to normalize
 @param spaceAsPlus if true a space is written as {@code '+'} (query string form); otherwise as {@code "%20"}
 @param sb the builder that receives the normalized output
 @throws UnsupportedEncodingException declared by {@link URLEncoder#encode(String, String)}; the charset name passed
 is always the one from {@code UTF_8}
 */
private static void appendToAscii(String s, boolean spaceAsPlus, StringBuilder sb) throws UnsupportedEncodingException {
    // minimal normalization of Unicode -> Ascii, and space normal. Existing escapes are left as-is.
    for (int i = 0; i < s.length(); ) {
        final int c = s.codePointAt(i);
        // advance by the number of chars this code point occupies, so the low half of a surrogate pair is not
        // revisited (and wrongly encoded as a stray '?') on the next iteration
        i += Character.charCount(c);
        if (c == ' ') {
            sb.append(spaceAsPlus ? "+" : "%20");
        } else if (c > 127) { // out of ascii range
            sb.append(URLEncoder.encode(new String(Character.toChars(c)), UTF_8.name()));
            // ^^ is a bit heavy-handed - if perf critical, we could optimize
        } else {
            sb.append((char) c);
        }
    }
}


Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/internal/StringUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ public static boolean in(final String needle, final String... haystack) {
final int len = haystack.length;
for (int i = 0; i < len; i++) {
if (haystack[i].equals(needle))
return true;
return true;
}
return false;
}
Expand Down
5 changes: 4 additions & 1 deletion src/test/java/org/jsoup/integration/ConnectTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.helper.DataUtil;
import org.jsoup.helper.W3CDom;
import org.jsoup.integration.servlets.*;
import org.jsoup.internal.StringUtil;
Expand All @@ -20,6 +21,7 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.List;
import java.util.Map;

Expand Down Expand Up @@ -751,10 +753,11 @@ public void maxBodySizeInReadToByteBuffer() throws IOException {
}

@Test void fetchUnicodeUrl() throws IOException {
String url = EchoServlet.Url + "/✔/?%E9%8D%B5=%E5%80%A4"; // encoded 鍵=値
String url = EchoServlet.Url + "/✔/?鍵=値";
Document doc = Jsoup.connect(url).get();

assertEquals("/✔/", ihVal("Path Info", doc));
assertEquals("%E9%8D%B5=%E5%80%A4", ihVal("Query String", doc));
assertEquals("鍵=値", URLDecoder.decode(ihVal("Query String", doc), DataUtil.UTF_8.name()));
}
}

0 comments on commit 0121311

Please sign in to comment.