Skip to content

Commit

Permalink
Hakyll.Web.Html: fix processing of srcset attribute (#891)
Browse files Browse the repository at this point in the history
We used to assume that `srcset` is just like `src`, and contains only
one URL. As the name of the attribute might suggest, this assumption is
wrong.

The attribute's value has a somewhat-complex grammar, with optional
fields and different separators, so I implemented a Parsec parser for
(the most of) it. The parser expects URLs to be a contiguous string of
anything but space or comma (`srcset`'s separators); I hope it will be
sufficient.

This also adds tests for the affected functions, and documentation for
`getUrls`.

Fixes #889.
  • Loading branch information
Minoru committed Oct 11, 2021
1 parent eb09e8f commit 05070e8
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 4 deletions.
1 change: 1 addition & 0 deletions hakyll.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ Test-suite hakyll-tests
bytestring >= 0.9 && < 0.11,
containers >= 0.3 && < 0.7,
filepath >= 1.0 && < 1.5,
tagsoup >= 0.13.1 && < 0.15,
text >= 0.11 && < 1.3,
unordered-containers >= 0.2 && < 0.3,
yaml >= 0.8.11 && < 0.12
Expand Down
110 changes: 106 additions & 4 deletions lib/Hakyll/Web/Html.hs
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,17 @@ module Hakyll.Web.Html
--------------------------------------------------------------------------------
import Data.Char (digitToInt, intToDigit,
isDigit, toLower)
import Data.List (isPrefixOf)
import Data.Either (fromRight)
import Data.List (isPrefixOf, intercalate)
import Data.Maybe (fromMaybe)
import qualified Data.Set as S
import Control.Monad (void)
import System.FilePath (joinPath, splitPath,
takeDirectory)
import Text.Blaze.Html (toHtml)
import Text.Blaze.Html.Renderer.String (renderHtml)
import qualified Text.Parsec as P
import qualified Text.Parsec.Char as PC
import qualified Text.HTML.TagSoup as TS
import Network.URI (isUnreserved, escapeURIString)

Expand Down Expand Up @@ -71,12 +76,21 @@ demoteHeadersBy amount

--------------------------------------------------------------------------------
isUrlAttribute :: String -> Bool
isUrlAttribute = (`elem` ["src", "href", "data", "poster", "srcset"])
isUrlAttribute = (`elem` ["src", "href", "data", "poster"])


--------------------------------------------------------------------------------
-- | Extract URLs from tags' attributes. Those would be the same URLs on which
-- `withUrls` would act.
getUrls :: [TS.Tag String] -> [String]
getUrls tags = [v | TS.TagOpen _ as <- tags, (k, v) <- as, isUrlAttribute k]
getUrls tags = [u | TS.TagOpen _ as <- tags, (k, v) <- as, u <- extractUrls k v]
where
extractUrls "srcset" value =
let srcset = fmap unSrcset $ P.parse srcsetParser "" value
in map srcsetImageCandidateUrl $ fromRight [] srcset
extractUrls key value
| isUrlAttribute key = [value]
| otherwise = []


--------------------------------------------------------------------------------
Expand All @@ -86,6 +100,14 @@ withUrls f = withTags tag
where
tag (TS.TagOpen s a) = TS.TagOpen s $ map attr a
tag x = x

attr input@("srcset", v) =
case fmap unSrcset $ P.parse srcsetParser "" v of
Right srcset ->
let srcset' = map (\i -> i { srcsetImageCandidateUrl = f $ srcsetImageCandidateUrl i }) srcset
srcset'' = show $ Srcset srcset'
in ("srcset", srcset'')
Left _ -> input
attr (k, v) = (k, if isUrlAttribute k then f v else v)


Expand Down Expand Up @@ -142,7 +164,7 @@ toUrl url = case (removeWinPathSeparator url) of
--------------------------------------------------------------------------------
-- | Get the relative url to the site root, for a given (absolute) url
toSiteRoot :: String -> String
toSiteRoot = removeWinPathSeparator . emptyException . joinPath
toSiteRoot = removeWinPathSeparator . emptyException . joinPath
. map parent . filter relevant . splitPath . takeDirectory
where
parent = const ".."
Expand Down Expand Up @@ -198,3 +220,83 @@ stripTags (x : xs) = x : stripTags xs
-- > "Me &amp; Dean"
escapeHtml :: String -> String
escapeHtml = renderHtml . toHtml


--------------------------------------------------------------------------------
data Srcset = Srcset {
unSrcset :: [SrcsetImageCandidate]
}


--------------------------------------------------------------------------------
instance Show Srcset where
show set = intercalate ", " $ map show $ unSrcset set


--------------------------------------------------------------------------------
data SrcsetImageCandidate = SrcsetImageCandidate {
srcsetImageCandidateUrl :: String
, srcsetImageCandidateDescriptor :: Maybe String
}


--------------------------------------------------------------------------------
instance Show SrcsetImageCandidate where
show candidate =
let url = srcsetImageCandidateUrl candidate
in case srcsetImageCandidateDescriptor candidate of
Just desc -> concat [url, " ", desc]
Nothing -> url


--------------------------------------------------------------------------------
-- HTML spec: https://html.spec.whatwg.org/#srcset-attributes
srcsetParser :: P.Parsec String () Srcset
srcsetParser = do
result <- candidate `P.sepBy1` (PC.char ',')
P.eof
return $ Srcset result
where
candidate :: P.Parsec String () SrcsetImageCandidate
candidate = do
P.skipMany ascii_whitespace
u <- url
P.skipMany ascii_whitespace
desc <- P.optionMaybe $ P.choice $ fmap P.try [width_descriptor, px_density_descriptor]
P.skipMany ascii_whitespace
return $ SrcsetImageCandidate {
srcsetImageCandidateUrl = u
, srcsetImageCandidateDescriptor = desc
}

-- This is an over-simplification, but should be good enough for our purposes
url :: P.Parsec String () String
url = P.many1 $ PC.noneOf " ,"

ascii_whitespace :: P.Parsec String () ()
ascii_whitespace = void $ P.oneOf "\x09\x0A\x0C\x0D\x20"

width_descriptor :: P.Parsec String () String
width_descriptor = do
number <- P.many1 PC.digit
void $ PC.char 'w'
return $ concat [number, "w"]

px_density_descriptor :: P.Parsec String () String
px_density_descriptor = do
sign <- P.optionMaybe $ PC.char '-'
int <- P.many1 PC.digit
frac <- P.optionMaybe $ do
void $ PC.char '.'
frac <- P.many1 PC.digit
return $ concat [".", frac]
expon <- P.optionMaybe $ do
letter <- P.oneOf "eE"
e_sign <- P.optionMaybe $ PC.oneOf "-+"
number <- P.many1 PC.digit
return $ concat [[letter], mb $ fmap show e_sign, number]
void $ PC.char 'x'
return $ concat [mb $ fmap show sign, int, mb frac, mb expon, "x"]

mb :: Maybe String -> String
mb = fromMaybe ""
2 changes: 2 additions & 0 deletions tests/Hakyll/Web/Html/RelativizeUrls/Tests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,6 @@ tests = testGroup "Hakyll.Web.Html.RelativizeUrls.Tests" $
, "<script src=\"//ajax.googleapis.com/jquery.min.js\"></script>" @=?
relativizeUrlsWith "../.."
"<script src=\"//ajax.googleapis.com/jquery.min.js\"></script>"
, "<img srcset=\"./image.png 200w, ./image2.png 400w\" />" @=?
relativizeUrlsWith "." "<img srcset=\"/image.png 200w, /image2.png 400w\" />"
]
28 changes: 28 additions & 0 deletions tests/Hakyll/Web/Html/Tests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ module Hakyll.Web.Html.Tests
import Data.Char (toUpper)
import Test.Tasty (TestTree, testGroup)
import Test.Tasty.HUnit ((@=?))
import qualified Text.HTML.TagSoup as TS


--------------------------------------------------------------------------------
Expand All @@ -34,6 +35,23 @@ tests = testGroup "Hakyll.Web.Html.Tests" $ concat
demoteHeadersBy 0 "<h4>A h4 title</h4>" -- Assert that a demotion of @N < 1@ is a no-op.
]

, fromAssertions "getUrls"
[ ["/image1.png", "/image2.jpeg", "https://example.com", "/game.swf", "/poster.jpeg"] @=?
getUrls [
TS.TagOpen "img" [("src", "/image1.png")]
, TS.TagOpen "img" [("src", "/image2.jpeg")]
, TS.TagOpen "a" [("href", "https://example.com")]
, TS.TagOpen "object" [("data", "/game.swf")]
, TS.TagOpen "video" [("poster", "/poster.jpeg")]
]
, ["/image1.png", "/image2.jpeg", "/image3.bmp"] @=?
getUrls [ TS.TagOpen "img" [("srcset", "/image1.png 10w, /image2.jpeg, /image3.bmp 1.3x")] ]

-- Invalid srcset specification means no URLs are extracted
, [] @=?
getUrls [ TS.TagOpen "img" [("srcset", "/image1.png 10wide, /image2.jpeg, /image3.bmp 1.3px")] ]
]

, fromAssertions "withUrls"
[ "<a href=\"FOO\">bar</a>" @=?
withUrls (map toUpper) "<a href=\"foo\">bar</a>"
Expand All @@ -51,6 +69,16 @@ tests = testGroup "Hakyll.Web.Html.Tests" $ concat
-- Test minimizing elements
, "<meta bar=\"foo\" />" @=?
withUrls id "<meta bar=\"foo\" />"

-- Test that URLs are extracted from img's srcset
, "<img srcset=\"foo 200w\" />" @=?
withUrls (const "foo") "<img srcset=\"/path/to/image.png 200w\" />"
, "<img srcset=\"bar 200w, bar 400w\" />" @=?
withUrls (const "bar") "<img srcset=\"/small.jpeg 200w, /img/large.jpeg 400w\" />"

-- Invalid srcsets are left unchanged
, "<img srcset=\"/image1.png 200px\" />" @=?
withUrls (const "bar") "<img srcset=\"/image1.png 200px\" />"
]

, fromAssertions "toUrl"
Expand Down

0 comments on commit 05070e8

Please sign in to comment.