Skip to content

Commit

Permalink
SelfContained: properly handle data URIs in css urls.
Browse files Browse the repository at this point in the history
Also use a proper css parser (adds dependency on text-css).

Closes #2129.
  • Loading branch information
jgm committed May 4, 2015
1 parent 7979db0 commit 1b44acf
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 32 deletions.
1 change: 1 addition & 0 deletions pandoc.cabal
Expand Up @@ -268,6 +268,7 @@ Library
temporary >= 1.1 && < 1.3,
blaze-html >= 0.5 && < 0.9,
blaze-markup >= 0.5.1 && < 0.8,
css-text >= 0.1.2 && < 0.3,
yaml >= 0.8.8.2 && < 0.9,
scientific >= 0.2 && < 0.4,
vector >= 0.10 && < 0.11,
Expand Down
80 changes: 48 additions & 32 deletions src/Text/Pandoc/SelfContained.hs
Expand Up @@ -43,9 +43,17 @@ import qualified Data.ByteString.Lazy as L
import Text.Pandoc.Shared (renderTags', err, fetchItem')
import Text.Pandoc.MediaBag (MediaBag)
import Text.Pandoc.MIME (MimeType)
import Text.Pandoc.UTF8 (toString, fromString)
import Text.Pandoc.UTF8 (toString)
import Text.Pandoc.Options (WriterOptions(..))
import Data.List (isPrefixOf)
import Control.Applicative
import Text.CSS.Parse (parseNestedBlocks, NestedBlock(..))
import Text.CSS.Render (renderNestedBlocks)
import Data.Text.Encoding (decodeUtf8, encodeUtf8)
import qualified Data.Text as T
import Data.Text (Text)
import Data.Text.Lazy (toStrict)
import Data.Text.Lazy.Builder (toLazyText)

isOk :: Char -> Bool
isOk c = isAscii c && isAlphaNum c
Expand All @@ -69,54 +77,62 @@ convertTag media sourceURL t@(TagOpen tagname as)
where processAttribute (x,y) =
if x == "src" || x == "href" || x == "poster"
then do
(raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) y
let enc = makeDataURI mime raw
enc <- getDataURI media sourceURL (fromAttrib "type" t) y
return (x, enc)
else return (x,y)
convertTag media sourceURL t@(TagOpen "script" as) =
case fromAttrib "src" t of
[] -> return t
src -> do
(raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) src
let enc = makeDataURI mime raw
enc <- getDataURI media sourceURL (fromAttrib "type" t) src
return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
convertTag media sourceURL t@(TagOpen "link" as) =
case fromAttrib "href" t of
[] -> return t
src -> do
(raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) src
let enc = makeDataURI mime raw
enc <- getDataURI media sourceURL (fromAttrib "type" t) src
return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"])
convertTag _ _ t = return t

-- NOTE: This is really crude, it doesn't respect CSS comments.
cssURLs :: MediaBag -> Maybe String -> FilePath -> ByteString
-> IO ByteString
cssURLs media sourceURL d orig =
case B.breakSubstring "url(" orig of
(x,y) | B.null y -> return orig
| otherwise -> do
let (u,v) = B.breakSubstring ")" $ B.drop 4 y
rest <- cssURLs media sourceURL d v
let url = toString
$ case B.take 1 u of
"\"" -> B.takeWhile (/='"') $ B.drop 1 u
"'" -> B.takeWhile (/='\'') $ B.drop 1 u
_ -> u
case url of
'#':_ -> return $ x `B.append` rest
_ -> do
let url' = if isURI url
then url
else d </> url
(raw, mime) <- getRaw media sourceURL "" url'
let enc = fromString $ makeDataURI mime raw
return $ x `B.append` "url(" `B.append` enc
`B.append` rest
cssURLs media sourceURL d orig = do
case parseNestedBlocks (decodeUtf8 orig) of
Left _err -> return orig
Right bs -> (encodeUtf8 . toStrict . toLazyText . renderNestedBlocks)
<$> mapM (handleCSSUrls media sourceURL d) bs

getRaw :: MediaBag -> Maybe String -> MimeType -> String
-> IO (ByteString, MimeType)
getRaw media sourceURL mimetype src = do
handleCSSUrls :: MediaBag -> Maybe String -> FilePath -> NestedBlock
-> IO NestedBlock
handleCSSUrls media sourceURL d (NestedBlock t bs) =
NestedBlock t <$> mapM (handleCSSUrls media sourceURL d) bs
handleCSSUrls media sourceURL d (LeafBlock (selector, attrs)) = do
attrs' <- mapM (handleCSSAttr media sourceURL d) attrs
return (LeafBlock (selector, attrs'))

handleCSSAttr :: MediaBag -> Maybe String -> FilePath -> (Text, Text)
-> IO (Text, Text)
handleCSSAttr media sourceURL d (key, val) =
if "url(" `T.isPrefixOf` val
then do
let url = T.unpack $ dropParens $ T.drop 3 val
case url of
'#':_ -> return (key, val)
'd':'a':'t':'a':':':_ -> return (key, val)
_ -> do
let url' = if isURI url then url else d </> url
enc <- getDataURI media sourceURL "" url'
return (key, T.pack enc)
else return (key, val)

dropParens :: Text -> Text
dropParens = T.dropAround (`elem` ['(',')','"','\'',' ','\t','\n','\r'])

getDataURI :: MediaBag -> Maybe String -> MimeType -> String
-> IO String
getDataURI _ _ _ src@('d':'a':'t':'a':':':_) = return src -- already data: uri
getDataURI media sourceURL mimetype src = do
let ext = map toLower $ takeExtension src
fetchResult <- fetchItem' media sourceURL src
(raw, respMime) <- case fetchResult of
Expand All @@ -142,7 +158,7 @@ getRaw media sourceURL mimetype src = do
result <- if mime == "text/css"
then cssURLs media cssSourceURL (takeDirectory src) raw'
else return raw'
return (result, mime)
return $ makeDataURI mime result

-- | Convert HTML into self-contained HTML, incorporating images,
-- scripts, and CSS using data: URIs.
Expand Down

0 comments on commit 1b44acf

Please sign in to comment.