Skip to content

Commit

Permalink
Refactored URL normalization
Browse files Browse the repository at this point in the history
Introduced a simple UrlBuilder that encodes Unicode in paths in addition to existing normalizations (puny code etc)

Consistently normalize input URLs, both user-supplied and internally constructed.

Encode ' ' as '+' in query strings (vs %20).

Fixes #1914
  • Loading branch information
jhy committed Mar 26, 2023
1 parent 8b31aaa commit 6e71f35
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 91 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
jsoup changelog

Release 1.16.1 [PENDING]
* Improvement: in Jsoup.connect(url), support URLs with Unicode characters in the path
<https://github.com/jhy/jsoup/issues/1914>

* Improvement: Calling Node.remove() on a node with no parent is now a no-op, vs a validation error.
<https://github.com/jhy/jsoup/issues/1898>

Expand Down
93 changes: 9 additions & 84 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,13 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.CookieManager;
import java.net.CookieStore;
import java.net.HttpURLConnection;
import java.net.IDN;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.Buffer;
import java.nio.ByteBuffer;
Expand All @@ -49,6 +44,7 @@
import java.util.zip.InflaterInputStream;

import static org.jsoup.Connection.Method.HEAD;
import static org.jsoup.helper.DataUtil.UTF_8;
import static org.jsoup.internal.Normalizer.lowerCase;

/**
Expand All @@ -70,7 +66,6 @@ public class HttpConnection implements Connection {
public static final String FORM_URL_ENCODED = "application/x-www-form-urlencoded";
private static final int HTTP_TEMP_REDIR = 307; // http/1.1 temporary redirect, not in Java's set.
private static final String DefaultUploadType = "application/octet-stream";
private static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, not in Android API 10.
private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");

/**
Expand Down Expand Up @@ -111,57 +106,6 @@ copied. All other settings (proxy, parser, cookies, etc) are copied.
req = new Request(copy);
}

/**
* Encodes the input URL into a safe ASCII URL string
* @param url unescaped URL
* @return escaped URL
*/
private static String encodeUrl(String url) {
try {
URL u = new URL(url);
return encodeUrl(u).toExternalForm();
} catch (Exception e) {
return url;
}
}

static URL encodeUrl(URL u) {
u = punyUrl(u);
try {
// run the URL through URI, so components are encoded
URI uri = new URI(
u.getProtocol(), decodePart(u.getUserInfo()), u.getHost(), u.getPort(),
decodePart(u.getPath()), decodePart(u.getQuery()), decodePart(u.getRef()));
return uri.toURL();
} catch (URISyntaxException | MalformedURLException | UnsupportedEncodingException e) {
// give up and return the original input
return u;
}
}

@Nullable private static String decodePart(@Nullable String encoded) throws UnsupportedEncodingException {
if (encoded == null) return null;
return URLDecoder.decode(encoded, UTF_8.name());
}

/**
Convert an International URL to a Punycode URL.
@param url input URL that may include an international hostname
@return a punycode URL if required, or the original URL
*/
private static URL punyUrl(URL url) {
if (!StringUtil.isAscii(url.getHost())) {
try {
String puny = IDN.toASCII(url.getHost());
url = new URL(url.getProtocol(), puny, url.getPort(), url.getFile()); // file will include ref, query if any
} catch (MalformedURLException e) {
// if passed a valid URL initially, cannot happen
throw new IllegalArgumentException(e);
}
}
return url;
}

private static String encodeMimeName(String val) {
return val.replace("\"", "%22");
}
Expand Down Expand Up @@ -189,7 +133,7 @@ public Connection url(URL url) {
public Connection url(String url) {
Validate.notEmptyParam(url, "url");
try {
req.url(new URL(encodeUrl(url)));
req.url(new URL(url));
} catch (MalformedURLException e) {
throw new IllegalArgumentException(String.format("The supplied URL, '%s', is malformed. Make sure it is an absolute URL, and starts with 'http://' or 'https://'. See https://jsoup.org/cookbook/extracting-data/working-with-urls", url), e);
}
Expand Down Expand Up @@ -441,7 +385,7 @@ public URL url() {

public T url(URL url) {
Validate.notNullParam(url, "url");
this.url = punyUrl(url); // if calling url(url) directly, does not go through encodeUrl, so we punycode it explicitly. todo - should we encode here as well?
this.url = new UrlBuilder(url).build();
return (T) this;
}

Expand Down Expand Up @@ -889,7 +833,7 @@ else if (methodHasBody)
if (location.startsWith("http:/") && location.charAt(6) != '/') // fix broken Location: http:/temp/AAG_New/en/index.php
location = location.substring(6);
URL redir = StringUtil.resolve(req.url(), location);
req.url(encodeUrl(redir));
req.url(redir);

req.executing = false;
return execute(req, res);
Expand Down Expand Up @@ -994,7 +938,7 @@ public String body() {
prepareByteData();
Validate.notNull(byteData);
// charset gets set from header on execute, and from meta-equiv on parse. parse may not have happened yet
String body = (charset == null ? DataUtil.UTF_8 : Charset.forName(charset))
String body = (charset == null ? UTF_8 : Charset.forName(charset))
.decode(byteData).toString();
((Buffer)byteData).rewind(); // cast to avoid covariant return type change in jdk9
return body;
Expand Down Expand Up @@ -1228,32 +1172,13 @@ private static void writePost(final Connection.Request req, final OutputStream o

// for get url reqs, serialise the data map into the url
private static void serialiseRequestUrl(Connection.Request req) throws IOException {
URL in = req.url();
StringBuilder url = StringUtil.borrowBuilder();
boolean first = true;
// reconstitute the query, ready for appends
url
.append(in.getProtocol())
.append("://")
.append(in.getAuthority()) // includes host, port
.append(in.getPath())
.append("?");
if (in.getQuery() != null) {
url.append(in.getQuery());
first = false;
}
UrlBuilder in = new UrlBuilder(req.url());

for (Connection.KeyVal keyVal : req.data()) {
Validate.isFalse(keyVal.hasInputStream(), "InputStream data not supported in URL query string.");
if (!first)
url.append('&');
else
first = false;
url
.append(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharsetName))
.append('=')
.append(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharsetName));
in.appendKeyVal(keyVal);
}
req.url(new URL(StringUtil.releaseBuilder(url)));
req.url(in.build());
req.data().clear(); // moved into url as get params
}
}
Expand Down
98 changes: 98 additions & 0 deletions src/main/java/org/jsoup/helper/UrlBuilder.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package org.jsoup.helper;

import org.jsoup.Connection;
import org.jsoup.internal.StringUtil;

import javax.annotation.Nullable;
import java.io.UnsupportedEncodingException;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;

import static org.jsoup.helper.DataUtil.UTF_8;

/**
A utility class to normalize input URLs. jsoup internal; API subject to change.
<p>Normalization includes puny-coding the host, and encoding non-ascii path components. The query-string
is left mostly as-is, to avoid inadvertently/incorrectly decoding a desired '+' literal ('%2B') as a ' '.</p>
*/
final class UrlBuilder {
URL u;
@Nullable StringBuilder q;

UrlBuilder(URL inputUrl) {
this.u = inputUrl;
if (u.getQuery() != null)
q = StringUtil.borrowBuilder().append(u.getQuery());
}

URL build() {
try {
// use the URI class to encode non-ascii in path
URI uri = new URI(
u.getProtocol(),
u.getUserInfo(),
IDN.toASCII(decodePart(u.getHost())), // puny-code
u.getPort(),
decodePart(u.getPath()),
null, null // query and fragment appended later so as not to encode
);

String normUrl = uri.toASCIIString();
if (q != null || u.getRef() != null) {
StringBuilder sb = StringUtil.borrowBuilder().append(normUrl);
if (q != null) {
sb.append('?');
sb.append(normalizeQuery(StringUtil.releaseBuilder(q)));
}
if (u.getRef() != null) {
sb.append('#');
sb.append(normalizeRef(u.getRef()));
}
normUrl = StringUtil.releaseBuilder(sb);
}
u = new URL(normUrl);
return u;
} catch (MalformedURLException | URISyntaxException e) {
// we assert here so that any incomplete normalization issues can be caught in devel. but in practise,
// the remote end will be able to handle it, so in prod we just pass the original URL
assert Validate.assertFail(e.toString());
return u;
}
}

void appendKeyVal(Connection.KeyVal kv) throws UnsupportedEncodingException {
if (q == null)
q = StringUtil.borrowBuilder();
else
q.append('&');
q
.append(URLEncoder.encode(kv.key(), UTF_8.name()))
.append('=')
.append(URLEncoder.encode(kv.value(), UTF_8.name()));
}

private static String decodePart(String encoded) {
try {
return URLDecoder.decode(encoded, UTF_8.name());
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e); // wtf!
}
}

private static String normalizeQuery(String q) {
// minimal space normal; other characters left as supplied - if generated from jsoup data, will be encoded
return q.replace(' ', '+');
}

private static String normalizeRef(String r) {
// minimal space normal; other characters left as supplied
return r.replace(" ", "%20");
}


}
11 changes: 11 additions & 0 deletions src/main/java/org/jsoup/helper/Validate.java
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,17 @@ public static void fail(String msg) {
throw new ValidationException(msg);
}

/**
Cause a failure, but return false so it can be used in an assert statement.
@param msg message to output.
@return false, always
@throws IllegalStateException if we reach this state
*/
static boolean assertFail(String msg) {
fail(msg);
return false;
}

/**
Cause a failure.
@param msg message to output.
Expand Down
26 changes: 19 additions & 7 deletions src/test/java/org/jsoup/helper/HttpConnectionTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -256,22 +256,22 @@ public void caseInsensitiveHeaders(Locale locale) {

@Test public void encodeUrl() throws MalformedURLException {
URL url1 = new URL("https://test.com/foo bar/[One]?q=white space#frag");
URL url2 = HttpConnection.encodeUrl(url1);
assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white%20space#frag", url2.toExternalForm());
URL url2 = new UrlBuilder(url1).build();
assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag", url2.toExternalForm());
}

@Test void encodedUrlDoesntDoubleEncode() throws MalformedURLException {
URL url1 = new URL("https://test.com/foo bar/[One]?q=white space#frag ment");
URL url2 = HttpConnection.encodeUrl(url1);
URL url3 = HttpConnection.encodeUrl(url2);
assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white%20space#frag%20ment", url2.toExternalForm());
assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white%20space#frag%20ment", url3.toExternalForm());
URL url2 = new UrlBuilder(url1).build();
URL url3 = new UrlBuilder(url2).build();
assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag%20ment", url2.toExternalForm());
assertEquals("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag%20ment", url3.toExternalForm());
}

@Test void connectToEncodedUrl() {
Connection connect = Jsoup.connect("https://example.com/a%20b%20c?query+string");
URL url = connect.request().url();
assertEquals("https://example.com/a%20b%20c?query%20string", url.toExternalForm());
assertEquals("https://example.com/a%20b%20c?query+string", url.toExternalForm());
}

@Test public void noUrlThrowsValidationError() throws IOException {
Expand Down Expand Up @@ -303,6 +303,18 @@ public void caseInsensitiveHeaders(Locale locale) {
assertEquals(puny, req.url().toExternalForm());
}

@Test void supportsIdnWithPort() throws MalformedURLException {
String idn = "https://www.测试.测试:9001/foo.html?bar";
String puny = "https://www.xn--0zwm56d.xn--0zwm56d:9001/foo.html?bar";

Connection con = Jsoup.connect(idn);
assertEquals(puny, con.request().url().toExternalForm());

HttpConnection.Request req = new HttpConnection.Request();
req.url(new URL(idn));
assertEquals(puny, req.url().toExternalForm());
}

@Test public void validationErrorsOnExecute() throws IOException {
Connection con = new HttpConnection();
boolean urlThrew = false;
Expand Down
8 changes: 8 additions & 0 deletions src/test/java/org/jsoup/integration/ConnectTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -749,4 +749,12 @@ public void maxBodySizeInReadToByteBuffer() throws IOException {
Document doc = Jsoup.connect(url).get();
assertEquals(url, doc.location());
}

@Test void fetchUnicodeUrl() throws IOException {
String url = EchoServlet.Url + "/✔/?鍵=値";
Document doc = Jsoup.connect(url).get();

assertEquals("/✔/", ihVal("Path Info", doc));
assertEquals("鍵=値", ihVal("Query String", doc));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ private void doIt(HttpServletRequest req, HttpServletResponse res) throws IOExce
// some get items
write(w, "Method", req.getMethod());
write(w, "Request URI", req.getRequestURI());
write(w, "Path Info", req.getPathInfo());
write(w, "Query String", req.getQueryString());

// request headers (why is it an enumeration?)
Expand Down

0 comments on commit 6e71f35

Please sign in to comment.