diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 96b396510794b4..35b329a58329de 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -324,8 +324,9 @@ or on combining URL components into a URL string. ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is decomposed before parsing, no error will be raised. - Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline - ``\n``, ``\r`` and tab ``\t`` characters are stripped from the URL. + Following the `WHATWG spec`_ that updates RFC 3986, leading and trailing C0 + control and space characters are stripped from the URL. ``\n``, ``\r`` and + tab ``\t`` characters are removed from the URL at any position. .. versionchanged:: 3.6 Out-of-range port numbers now raise :exc:`ValueError`, instead of @@ -338,6 +339,10 @@ or on combining URL components into a URL string. .. versionchanged:: 3.10 ASCII newline and tab characters are stripped from the URL. + .. versionchanged:: 3.12 + Leading and trailing C0 control and space characters are stripped from + the URL. + .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser .. 
function:: urlunsplit(parts) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 80fb9e5cd2a445..c522f75fb581b9 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -649,6 +649,44 @@ def test_urlsplit_remove_unsafe_bytes(self): self.assertEqual(p.scheme, "http") self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment") + def test_urlsplit_strip_url(self): + noise = bytes(range(0, 0x20 + 1)) + base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" + + url = noise.decode() + base_url + noise.decode() + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, "http") + self.assertEqual(p.netloc, "User:Pass@www.python.org:080") + self.assertEqual(p.path, "/doc/") + self.assertEqual(p.query, "query=yes") + self.assertEqual(p.fragment, "frag") + self.assertEqual(p.username, "User") + self.assertEqual(p.password, "Pass") + self.assertEqual(p.hostname, "www.python.org") + self.assertEqual(p.port, 80) + self.assertEqual(p.geturl(), base_url) + + url = noise + base_url.encode() + noise + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, b"http") + self.assertEqual(p.netloc, b"User:Pass@www.python.org:080") + self.assertEqual(p.path, b"/doc/") + self.assertEqual(p.query, b"query=yes") + self.assertEqual(p.fragment, b"frag") + self.assertEqual(p.username, b"User") + self.assertEqual(p.password, b"Pass") + self.assertEqual(p.hostname, b"www.python.org") + self.assertEqual(p.port, 80) + self.assertEqual(p.geturl(), base_url.encode()) + + # with scheme as cache-key + url = "//www.python.org/" + scheme = noise.decode() + "https" + noise.decode() + for _ in range(2): + p = urllib.parse.urlsplit(url, scheme=scheme) + self.assertEqual(p.scheme, "https") + self.assertEqual(p.geturl(), "https://www.python.org/") + def test_attributes_bad_port(self): """Check handling of invalid ports.""" for bytes in (False, True): @@ -656,7 +694,7 @@ def
test_attributes_bad_port(self): for port in ("foo", "1.5", "-1", "0x10", "-0", "1_1", " 1", "1 ", "рем"): with self.subTest(bytes=bytes, parse=parse, port=port): netloc = "www.example.net:" + port - url = "http://" + netloc + url = "http://" + netloc + "/" if bytes: if netloc.isascii() and port.isascii(): netloc = netloc.encode("ascii") diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 5f95c5ff7f9c1c..fb4b57bdba4a99 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -79,6 +79,9 @@ '0123456789' '+-.') +# Leading and trailing C0 control and space to be stripped per WHATWG spec +_URL_CHARS_TO_STRIP = "".join([*(chr(i) for i in range(0, 0x1f + 1)), " "]) + # Unsafe bytes to be removed per WHATWG spec _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] @@ -452,6 +455,8 @@ def urlsplit(url, scheme='', allow_fragments=True): """ url, scheme, _coerce_result = _coerce_args(url, scheme) + url = url.strip(_URL_CHARS_TO_STRIP) + scheme = scheme.strip(_URL_CHARS_TO_STRIP) for b in _UNSAFE_URL_BYTES_TO_REMOVE: url = url.replace(b, "") diff --git a/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst b/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst new file mode 100644 index 00000000000000..97652398a0fd70 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst @@ -0,0 +1,3 @@ +:func:`urllib.parse.urlsplit` now strips leading and trailing C0 control and +space characters, following the controlling specification for URLs defined by +WHATWG, in response to CVE-2023-24329. Patch by Illia Volochii.