kurtmckee · twm · May 28, 2025 · May 28, 2025 · May 28, 2025 · May 28, 2025
diff --git a/changelog.d/20250527_twm_srcset.rst b/changelog.d/20250527_twm_srcset.rst
@@ -0,0 +1,4 @@
+Added
+-----
+
+*   Resolve relative URLs in ``srcset`` attributes and pass through ``srcset`` when sanitizing.
diff --git a/feedparser/sanitizer.py b/feedparser/sanitizer.py
@@ -259,6 +259,7 @@ class HTMLSanitizer(BaseHTMLProcessor):
         "size",
         "span",
         "src",
+        "srcset",
         "start",
         "step",
         "style",

diff --git a/feedparser/urls.py b/feedparser/urls.py
@@ -1,3 +1,4 @@
+# Copyright 2025 Tom Most <twm@freecog.net>
 # Copyright 2010-2024 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
@@ -116,6 +117,56 @@ def make_safe_absolute_uri(base, rel=None):
     return uri
 
 
+# Matches image candidate strings within a srcset attribute value as
+# described in https://html.spec.whatwg.org/multipage/images.html#srcset-attributes
+_srcset_candidate = re.compile(
+    r"""
+    # ASCII whitespace: https://infra.spec.whatwg.org/#ascii-whitespace
+    [\t\n\f\r ]*
+    (
+        # URL that doesn't start or end with a comma
+        (?!,)
+        [^\t\n\f\r ]+
+        (?<!,)
+    )
+    (
+        # Width descriptor like "1234w"
+        # https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#non-negative-integers
+        [\t\n\f\r ]+
+        \d+w
+        |
+        # Pixel density descriptor like "2.0x"
+        # https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#valid-floating-point-number
+        [\t\n\f\r ]+
+        \d+(?:\.\d+)?(?:[eE][-+]?\d+)?x
+        |
+    )
+    [\t\n\f\r ]*
+    (?:,|\Z)
+    """,
+    re.VERBOSE | re.ASCII,
+)
+
+
+def srcset_candidates(value: str) -> list[tuple[str, str]]:
+    """
+    Split a ``srcset`` attribute value into candidates:
+
+    >>> srcset_candidates("/foo.jpg, /foo.2x.jpg 2x")
+    [('/foo.jpg', ''), ('/foo.2x.jpg', '2x')]
+
+    No URL validation or duplicate/conflicting descriptor checks are done.
+    Parsing stops at the first malformed candidate; earlier ones are kept.
+    """
+    pos = 0
+    candidates = []
+    while m := _srcset_candidate.match(value, pos):
+        desc = m[2].strip("\t\n\f\r ")
+        candidates.append((m[1], desc))
+        pos = m.end(0)
+    return candidates
+
+
 class RelativeURIResolver(BaseHTMLProcessor):
     relative_uris = {
         ("a", "href"),
@@ -156,15 +207,23 @@ def __init__(self, baseuri, encoding, _type):
     def resolve_uri(self, uri):
         return make_safe_absolute_uri(self.baseuri, uri.strip())
 
+    def resolve_srcset(self, srcset):
+        candidates = []
+        for uri, desc in srcset_candidates(srcset):
+            uri = self.resolve_uri(uri)
+            if desc:
+                candidates.append(f"{uri} {desc}")
+            else:
+                candidates.append(uri)
+        return ", ".join(candidates)
+
     def unknown_starttag(self, tag, attrs):
         attrs = self.normalize_attrs(attrs)
-        attrs = [
-            (
-                key,
-                ((tag, key) in self.relative_uris) and self.resolve_uri(value) or value,
-            )
-            for key, value in attrs
-        ]
+        for i, (key, value) in enumerate(attrs):
+            if (tag, key) in self.relative_uris:
+                attrs[i] = (key, self.resolve_uri(value))
+            elif tag in {"img", "source"} and key == "srcset":
+                attrs[i] = (key, self.resolve_srcset(value))
         super().unknown_starttag(tag, attrs)
 
 

diff --git a/tests/test_srcset_candidates.py b/tests/test_srcset_candidates.py
@@ -0,0 +1,63 @@
+import pytest
+
+from feedparser.urls import srcset_candidates
+
+
+def test_empty():
+    assert srcset_candidates("") == []
+    assert srcset_candidates("    \n") == []
+
+
+def test_default():
+    assert srcset_candidates("/1x.jpg") == [("/1x.jpg", "")]
+
+
+def test_pixel_density_descriptor_one():
+    assert srcset_candidates("/1x.jpg 1x") == [("/1x.jpg", "1x")]
+
+
+def test_pixel_density_descriptor_two():
+    assert srcset_candidates("/1x.jpg 1x,/2x.jpg\t2.0x") == [
+        ("/1x.jpg", "1x"),
+        ("/2x.jpg", "2.0x"),
+    ]
+
+
+def test_pixel_density_descriptor_three():
+    assert srcset_candidates("/1x.jpg, /2x.jpg  2x  , /3x.jpg 3x  ") == [
+        ("/1x.jpg", ""),
+        ("/2x.jpg", "2x"),
+        ("/3x.jpg", "3x"),
+    ]
+
+
+@pytest.mark.parametrize(
+    "pd", ["1x", "1.0x", "9.5x", "36x", "39.95x", "100x", "1e1x", "2E2x"]
+)
+def test_pixel_density_descriptor_floats(pd):
+    """A pixel density descriptor accepts integer, decimal, and exponent forms."""
+    assert [("/foo.jpg", pd)] == srcset_candidates("/foo.jpg " + pd)
+
+
+def test_url_comma():
+    """A URL containing a comma is not broken."""
+    assert srcset_candidates(" /,.jpg 6x,\n /,,,,.webp \t1e100x") == [
+        ("/,.jpg", "6x"),
+        ("/,,,,.webp", "1e100x"),
+    ]
+
+
+def test_width_one():
+    assert srcset_candidates("/a.png 600w") == [("/a.png", "600w")]
+
+
+def test_width_two():
+    assert srcset_candidates("a.jpg 123w, b.jpg 1234w") == [
+        ("a.jpg", "123w"),
+        ("b.jpg", "1234w"),
+    ]
+
+
+@pytest.mark.parametrize("pd", ["1.5w", "9000X", "-23w", "-60x"])
+def test_invalid(pd):
+    assert srcset_candidates("/x.gif " + pd) == []
diff --git a/tests/wellformed/base/http_entry_content_base_srcset.xml b/tests/wellformed/base/http_entry_content_base_srcset.xml
@@ -0,0 +1,10 @@
+<!--
+Description: entry content srcset relative to document URI
+Expect:      not bozo and entries[0]['content'][0]['value'] == '<img srcset="http://127.0.0.1:8097/rel/img.png, http://127.0.0.1:8097/rel/img.2x.png 2x" />'
+-->
+<feed version="0.3" xmlns="http://purl.org/atom/ns#">
+<entry>
+    <content type="text/html" mode="escaped">&lt;img srcset="/rel/img.png, /rel/img.2x.png 2x"&gt;</content>
+</entry>
+</feed>
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -259,6 +259,7 @@ class HTMLSanitizer(BaseHTMLProcessor): @@
             "size",
             "span",
             "src",
+            "srcset",
             "start",
             "step",
             "style",
@@ Expand Down @@