Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog.d/20250527_twm_srcset.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Added
-----

* Resolve relative URLs in ``srcset`` attributes and pass through ``srcset`` when sanitizing.
1 change: 1 addition & 0 deletions feedparser/sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ class HTMLSanitizer(BaseHTMLProcessor):
"size",
"span",
"src",
"srcset",
"start",
"step",
"style",
Expand Down
73 changes: 66 additions & 7 deletions feedparser/urls.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Copyright 2025 Tom Most <twm@freecog.net>
# Copyright 2010-2024 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
Expand Down Expand Up @@ -116,6 +117,56 @@ def make_safe_absolute_uri(base, rel=None):
return uri


# Matches one image candidate string within a srcset attribute value, as
# described in https://html.spec.whatwg.org/multipage/images.html#srcset-attributes
#
# Group 1 is the URL. Group 2 is the descriptor including its leading
# whitespace, or the empty string when the candidate has no descriptor.
_srcset_candidate = re.compile(
    r"""
    # ASCII whitespace: https://infra.spec.whatwg.org/#ascii-whitespace
    [\t\n\f\r ]*
    (
        # URL that doesn't start or end with a comma
        (?!,)
        [^\t\n\f\r ]+
        (?<!,)
    )
    (
        # Width descriptor like "1234w"
        # https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#non-negative-integers
        [\t\n\f\r ]+
        \d+w
    |
        # Pixel density descriptor like "2.0x" or ".5x" (the integer part
        # of a valid floating-point number is optional)
        # https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#valid-floating-point-number
        [\t\n\f\r ]+
        (?:\d+(?:\.\d+)?|\.\d+)(?:[eE][-+]?\d+)?x
    |
    )
    [\t\n\f\r ]*
    (?:,|\Z)
    """,
    re.VERBOSE | re.ASCII,
)

# The characters HTML treats as ASCII whitespace; str.strip() with no
# argument would also strip other Unicode whitespace.
_SRCSET_WHITESPACE = "\t\n\f\r "


def srcset_candidates(value: str) -> list[tuple[str, str]]:
    """
    Split a ``srcset`` attribute value into ``(url, descriptor)`` candidates:

    >>> srcset_candidates("/foo.jpg, /foo.2x.jpg 2x")
    [('/foo.jpg', ''), ('/foo.2x.jpg', '2x')]

    The descriptor is the empty string when a candidate has none.

    This doesn't validate the URLs, nor check for duplicate or conflicting
    descriptors. It returns an empty list when parsing fails, including when
    only a leading part of *value* can be parsed.
    """
    pos = 0
    candidates = []
    # Every match consumes at least one URL character, so the loop always
    # advances and terminates.
    while m := _srcset_candidate.match(value, pos):
        candidates.append((m[1], m[2].strip(_SRCSET_WHITESPACE)))
        pos = m.end(0)
    # Anything other than whitespace left unconsumed means a candidate was
    # malformed; reject the whole value rather than return a partial list.
    if value[pos:].strip(_SRCSET_WHITESPACE):
        return []
    return candidates


class RelativeURIResolver(BaseHTMLProcessor):
relative_uris = {
("a", "href"),
Expand Down Expand Up @@ -156,15 +207,23 @@ def __init__(self, baseuri, encoding, _type):
def resolve_uri(self, uri):
    """Resolve *uri* against ``self.baseuri``.

    Surrounding whitespace is stripped from *uri* before it is passed,
    together with the base URI, to ``make_safe_absolute_uri``; that
    function's result is returned unchanged.
    """
    base = self.baseuri
    return make_safe_absolute_uri(base, uri.strip())

def resolve_srcset(self, srcset):
    """Resolve every URL in a ``srcset`` attribute value.

    The value is split with ``srcset_candidates``, each URL is passed
    through ``self.resolve_uri``, and the candidates are re-joined with
    ``", "``. A descriptor, when present, is kept after its URL separated
    by a single space.

    Candidates whose URL resolves to an empty value are omitted.
    """
    resolved = []
    for uri, desc in srcset_candidates(srcset):
        uri = self.resolve_uri(uri)
        if not uri:
            # NOTE(review): presumably resolve_uri returns "" for URLs it
            # rejects -- confirm against make_safe_absolute_uri. Emitting
            # such a candidate would leave a blank entry, or a bare
            # descriptor (" 2x") that a srcset parser reads as a URL.
            continue
        resolved.append(f"{uri} {desc}" if desc else uri)
    return ", ".join(resolved)

def unknown_starttag(self, tag, attrs):
    """Resolve URL-bearing attributes, then defer to the parent handler.

    Attributes listed in ``self.relative_uris`` for this tag are passed
    through ``resolve_uri``; a ``srcset`` attribute on ``img`` or
    ``source`` is passed through ``resolve_srcset``. All other attributes
    are forwarded unchanged.
    """
    resolved = []
    # One pass over the normalized attributes. A fresh list is built so
    # this works whatever sequence type normalize_attrs returns, and each
    # value is resolved exactly once.
    for key, value in self.normalize_attrs(attrs):
        if (tag, key) in self.relative_uris:
            value = self.resolve_uri(value)
        elif tag in {"img", "source"} and key == "srcset":
            value = self.resolve_srcset(value)
        resolved.append((key, value))
    super().unknown_starttag(tag, resolved)


Expand Down
63 changes: 63 additions & 0 deletions tests/test_srcset_candidates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import pytest

from feedparser.urls import srcset_candidates


def test_empty():
    """Empty and whitespace-only values produce no candidates."""
    for blank in ("", " \n"):
        assert srcset_candidates(blank) == []


def test_default():
    """A bare URL is a single candidate with an empty descriptor."""
    expected = [("/1x.jpg", "")]
    assert srcset_candidates("/1x.jpg") == expected


def test_pixel_density_descriptor_one():
    """A single candidate carrying a pixel density descriptor."""
    expected = [("/1x.jpg", "1x")]
    assert srcset_candidates("/1x.jpg 1x") == expected


def test_pixel_density_descriptor_two():
    """Two candidates with no space after the comma and a tab separator."""
    parsed = srcset_candidates("/1x.jpg 1x,/2x.jpg\t2.0x")
    first, second = ("/1x.jpg", "1x"), ("/2x.jpg", "2.0x")
    assert parsed == [first, second]


def test_pixel_density_descriptor_three():
    """Three candidates with whitespace around commas and at the end."""
    parsed = srcset_candidates("/1x.jpg, /2x.jpg 2x , /3x.jpg 3x ")
    expected = [
        ("/1x.jpg", ""),
        ("/2x.jpg", "2x"),
        ("/3x.jpg", "3x"),
    ]
    assert parsed == expected


@pytest.mark.parametrize(
    "pd",
    ["1x", "1.0x", "9.5x", "36x", "39.95x", "100x", "1e1x", "2E2x"],
)
def test_pixel_density_descriptor_floats(pd):
    """Each of these float spellings is kept as the descriptor verbatim."""
    parsed = srcset_candidates(f"/foo.jpg {pd}")
    assert [("/foo.jpg", pd)] == parsed


def test_url_comma():
    """Commas inside a URL do not split the candidate."""
    parsed = srcset_candidates(" /,.jpg 6x,\n /,,,,.webp \t1e100x")
    expected = [
        ("/,.jpg", "6x"),
        ("/,,,,.webp", "1e100x"),
    ]
    assert parsed == expected


def test_width_one():
    """A single candidate carrying a width descriptor."""
    expected = [("/a.png", "600w")]
    assert srcset_candidates("/a.png 600w") == expected


def test_width_two():
    """Two candidates, each carrying a width descriptor."""
    parsed = srcset_candidates("a.jpg 123w, b.jpg 1234w")
    first, second = ("a.jpg", "123w"), ("b.jpg", "1234w")
    assert parsed == [first, second]


@pytest.mark.parametrize(
    "pd",
    ["1.5w", "9000X", "-23w", "-60x"],
)
def test_invalid(pd):
    """A malformed descriptor makes the whole value unparseable."""
    parsed = srcset_candidates(f"/x.gif {pd}")
    assert parsed == []
10 changes: 10 additions & 0 deletions tests/wellformed/base/http_entry_content_base_srcset.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!--
Description: entry content srcset relative to document URI
Expect: not bozo and entries[0]['content'][0]['value'] == '<img srcset="http://127.0.0.1:8097/rel/img.png, http://127.0.0.1:8097/rel/img.2x.png 2x" />'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
<entry>
<content type="text/html" mode="escaped">&lt;img srcset="/rel/img.png, /rel/img.2x.png 2x"&gt;</content>
</entry>
</feed>

Loading