Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

Fix decoding of invalid %encodings in URLs #535

Merged
merged 3 commits into from

3 participants

@Jonty

Currently a URL with invalid %encodings will cause an exception to be thrown in unquote_unreserved.

This patch reimplements unquote_unreserved to be almost identical to urllib.unquote, thus fixing the problem and simplifying the code.

@kennethreitz

Awesome, thanks!

Looks like there's a merge conflict now. Would you mind bringing it back up to date?

Jonty added some commits
@Jonty Jonty Reenable test_session_with_escaped_url test 3550ac7
@Jonty Jonty Add failing tests for invalid %encodings e28c1c9
@Jonty Jonty Rewrite unquote_unreserved based on urllib.unquote
This is almost entirely taken from the unquote implementation in urllib,
slightly modified for the case in hand. It now deals with invalid %encodings
rather than exploding, and the code is somewhat simpler.
06e4971
@Jonty

Rebased and fixed, should merge cleanly now.

@kennethreitz

You're the best. Thanks!

@kennethreitz kennethreitz merged commit 8a055c5 into kennethreitz:develop
@bluemoon

Python 3.x doesn't support .encode('hex')

Would using binascii be acceptable instead?

I was planning to change this to use binascii.b2a_hex tomorrow morning (which is all .encode('hex') does anyway), expect pushing!

Awesome! I'm not too picky, as long as it works properly :)

@kennethreitz

Reverting for now.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Apr 11, 2012
  1. @Jonty
  2. @Jonty
  3. @Jonty

    Rewrite unquote_unreserved based on urllib.unquote

    Jonty authored
    This is almost entirely taken from the unquote implementation in urllib,
    slightly modified for the case in hand. It now deals with invalid %encodings
    rather than exploding, and the code is somewhat simpler.
This page is out of date. Refresh to see the latest.
Showing with 84 additions and 79 deletions.
  1. +2 −1  AUTHORS.rst
  2. +10 −13 requests/utils.py
  3. +72 −65 tests/test_requests.py
View
3  AUTHORS.rst
@@ -93,4 +93,5 @@ Patches and Suggestions
- Jiri Machalek
- Steve Pulec
- Michael Kelly
-- Michael Newman <newmaniese@gmail.com>
+- Michael Newman <newmaniese@gmail.com>
+- Jonty Wareing <jonty@jonty.co.uk>
View
23 requests/utils.py
@@ -441,28 +441,25 @@ def stream_untransfer(gen, resp):
# The unreserved URI characters (RFC 3986)
-UNRESERVED_SET = frozenset(
+_unreserved_set = frozenset(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+ "0123456789-._~")
+_unreserved_hextochr = dict((c.encode('hex'), c) for c in _unreserved_set)
def unquote_unreserved(uri):
"""Un-escape any percent-escape sequences in a URI that are unreserved
characters.
This leaves all reserved, illegal and non-ASCII bytes encoded.
"""
- parts = uri.split('%')
- for i in range(1, len(parts)):
- h = parts[i][0:2]
- if len(h) == 2:
- c = chr(int(h, 16))
- if c in UNRESERVED_SET:
- parts[i] = c + parts[i][2:]
- else:
- parts[i] = '%' + parts[i]
- else:
- parts[i] = '%' + parts[i]
- return ''.join(parts)
+ res = uri.split('%')
+ s = res[0]
+ for item in res[1:]:
+ try:
+ s += _unreserved_hextochr[item[:2].lower()] + item[2:]
+ except (KeyError, UnicodeDecodeError):
+ s += '%' + item
+ return s
def requote_uri(uri):
View
137 tests/test_requests.py
@@ -117,71 +117,78 @@ def test_HTTP_200_OK_GET_WITH_MIXED_PARAMS(self):
# self.assertRaises(UnicodeEncodeError, requests.get,
# url=httpbin('get'), headers=heads)
- # def test_session_with_escaped_url(self):
- # # Test a URL that contains percent-escaped characters
- # # This URL should not be modified (double-escaped)
- # # Tests:
- # # - Quoted illegal characters ("%20" (' '), "%3C" ('<'), "%3E" ('>'))
- # # - Quoted reserved characters ("%25" ('%'), "%23" ('#'), "%2F" ('/'))
- # # - Quoted non-ASCII characters ("%C3%98", "%C3%A5")
- # path_fully_escaped = '%3Ca%25b%23c%2Fd%3E/%C3%98%20%C3%A5'
- # url = httpbin('get/' + path_fully_escaped)
- # response = get(url)
- # self.assertEqual(response.url, httpbin('get/' + path_fully_escaped))
-
- # # Test that illegal characters in a path get properly percent-escaped
- # # Tests:
- # # - Bare illegal characters (space, '<')
- # # - Bare non-ASCII characters ('\u00d8')
- # path = u'<a%25b%23c%2Fd%3E/\u00d8 %C3%A5'
- # url = httpbin('get/' + path)
- # response = get(url)
- # self.assertEqual(response.url, httpbin('get/' + path_fully_escaped))
-
- # # Test that reserved characters in a path do not get percent-escaped
- # # Tests:
- # # - All reserved characters (RFC 3986), except '?', '#', '[' and ']',
- # # which are not allowed in the path, and ';' which delimits
- # # parameters.
- # # All such characters must be allowed bare in path, and must not be
- # # encoded.
- # # - Special unreserved characters (RFC 3986), which should not be
- # # encoded (even though it wouldn't hurt).
- # path_reserved = '!$&\'()*+,/:=@-._~'
- # url = httpbin('get/' + path_reserved)
- # response = get(url)
- # self.assertEqual(response.url, httpbin('get/' + path_reserved))
-
- # # Test that percent-encoded unreserved characters in a path get
- # # normalised to their un-encoded forms.
- # path_unreserved = 'ABCDwxyz1234-._~'
- # path_unreserved_escaped = '%41%42%43%44%77%78%79%7A%31%32%33%34%2D%2E%5F%7E'
- # url = httpbin('get/' + path_unreserved_escaped)
- # response = get(url)
- # self.assertEqual(response.url, httpbin('get/' + path_unreserved))
-
- # # Re-run all of the same tests on the query part of the URI
- # query_fully_escaped = '%3Ca%25b%23c%2Fd%3E=%C3%98%20%C3%A5'
- # url = httpbin('get/?' + query_fully_escaped)
- # response = get(url)
- # self.assertEqual(response.url, httpbin('get/?' + query_fully_escaped))
-
- # query = u'<a%25b%23c%2Fd%3E=\u00d8 %C3%A5'
- # url = httpbin('get/?' + query)
- # response = get(url)
- # self.assertEqual(response.url, httpbin('get/?' + query_fully_escaped))
-
- # # The legal characters in query happens to be the same as in path
- # query_reserved = '!$&\'()*+,/:=@-._~'
- # url = httpbin('get/?' + query_reserved)
- # response = get(url)
- # self.assertEqual(response.url, httpbin('get/?' + query_reserved))
-
- # query_unreserved = 'ABCDwxyz=1234-._~'
- # query_unreserved_escaped = '%41%42%43%44%77%78%79%7A=%31%32%33%34%2D%2E%5F%7E'
- # url = httpbin('get/?' + query_unreserved_escaped)
- # response = get(url)
- # self.assertEqual(response.url, httpbin('get/?' + query_unreserved))
+ def test_session_with_escaped_url(self):
+ # Test a URL that contains percent-escaped characters
+ # This URL should not be modified (double-escaped)
+ # Tests:
+ # - Quoted illegal characters ("%20" (' '), "%3C" ('<'), "%3E" ('>'))
+ # - Quoted reserved characters ("%25" ('%'), "%23" ('#'), "%2F" ('/'))
+ # - Quoted non-ASCII characters ("%C3%98", "%C3%A5")
+ path_fully_escaped = '%3Ca%25b%23c%2Fd%3E/%C3%98%20%C3%A5'
+ url = httpbin('get/' + path_fully_escaped)
+ response = get(url)
+ self.assertEqual(response.url, httpbin('get/' + path_fully_escaped))
+
+ # Test that illegal characters in a path get properly percent-escaped
+ # Tests:
+ # - Bare illegal characters (space, '<')
+ # - Bare non-ASCII characters ('\u00d8')
+ path = u'<a%25b%23c%2Fd%3E/\u00d8 %C3%A5'
+ url = httpbin('get/' + path)
+ response = get(url)
+ self.assertEqual(response.url, httpbin('get/' + path_fully_escaped))
+
+ # Test that reserved characters in a path do not get percent-escaped
+ # Tests:
+ # - All reserved characters (RFC 3986), except '?', '#', '[' and ']',
+ # which are not allowed in the path, and ';' which delimits
+ # parameters.
+ # All such characters must be allowed bare in path, and must not be
+ # encoded.
+ # - Special unreserved characters (RFC 3986), which should not be
+ # encoded (even though it wouldn't hurt).
+ path_reserved = '!$&\'()*+,/:=@-._~'
+ url = httpbin('get/' + path_reserved)
+ response = get(url)
+ self.assertEqual(response.url, httpbin('get/' + path_reserved))
+
+ # Test that percent-encoded unreserved characters in a path get
+ # normalised to their un-encoded forms.
+ path_unreserved = 'ABCDwxyz1234-._~'
+ path_unreserved_escaped = '%41%42%43%44%77%78%79%7A%31%32%33%34%2D%2E%5F%7E'
+ url = httpbin('get/' + path_unreserved_escaped)
+ response = get(url)
+ self.assertEqual(response.url, httpbin('get/' + path_unreserved))
+
+ # Re-run all of the same tests on the query part of the URI
+ query_fully_escaped = '%3Ca%25b%23c%2Fd%3E=%C3%98%20%C3%A5'
+ url = httpbin('get/?' + query_fully_escaped)
+ response = get(url)
+ self.assertEqual(response.url, httpbin('get/?' + query_fully_escaped))
+
+ query = u'<a%25b%23c%2Fd%3E=\u00d8 %C3%A5'
+ url = httpbin('get/?' + query)
+ response = get(url)
+ self.assertEqual(response.url, httpbin('get/?' + query_fully_escaped))
+
+ # The legal characters in query happens to be the same as in path
+ query_reserved = '!$&\'()*+,/:=@-._~'
+ url = httpbin('get/?' + query_reserved)
+ response = get(url)
+ self.assertEqual(response.url, httpbin('get/?' + query_reserved))
+
+ query_unreserved = 'ABCDwxyz=1234-._~'
+ query_unreserved_escaped = '%41%42%43%44%77%78%79%7A=%31%32%33%34%2D%2E%5F%7E'
+ url = httpbin('get/?' + query_unreserved_escaped)
+ response = get(url)
+ self.assertEqual(response.url, httpbin('get/?' + query_unreserved))
+
+ # We should ignore invalid %encodings
+ request = requests.Request("http://0.0.0.0/get/%TEST")
+ self.assertEqual(request.path_url, "/get/%TEST")
+
+ request = requests.Request("http://0.0.0.0/get/%%TEST%%")
+ self.assertEqual(request.path_url, "/get/%%TEST%%")
def test_user_agent_transfers(self):
"""Issue XX"""
Something went wrong with that request. Please try again.