From 0d27947b96f7066edeed8a63c196e045ac8822b5 Mon Sep 17 00:00:00 2001 From: Seth Speaks Date: Sat, 16 Sep 2023 09:21:01 -0700 Subject: [PATCH] HTML reader: parse task lists using input elements (#9066) Allow the HTML reader to parse task lists of the sort produced by pandoc. Closes #9047 --- src/Text/Pandoc/Readers/HTML.hs | 19 ++++++++++- src/Text/Pandoc/Readers/HTML/Types.hs | 3 +- test/command/9047.md | 47 +++++++++++++++++++++++++++ test/command/tasklist.md | 2 +- 4 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 test/command/9047.md diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 5e4c8d3158bd..18441633d945 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -129,6 +129,10 @@ setInChapter = local (\s -> s {inChapter = True}) setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInPlain = local (\s -> s {inPlain = True}) +-- Some items should be handled differently when in a list item tag, e.g. 
checkbox +setInListItem :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a +setInListItem = local (\s -> s {inListItem = True}) + pHtml :: PandocMonad m => TagParser m Blocks pHtml = do (TagOpen "html" attr) <- lookAhead pAny @@ -334,7 +338,7 @@ pBulletList = try $ do return $ B.bulletList $ map (fixPlains True) items pListItem :: PandocMonad m => TagParser m Blocks -pListItem = do +pListItem = setInListItem $ do TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" []) let attr = toStringAttr attr' let addId ident bs = case B.toList bs of @@ -344,6 +348,16 @@ pListItem = do maybe id addId (lookup "id" attr) <$> pInTags "li" block +pCheckbox :: PandocMonad m => TagParser m Inlines +pCheckbox = do + TagOpen _ attr' <- pSatisfy $ matchTagOpen "input" [("type","checkbox")] + TagClose _ <- pSatisfy (matchTagClose "input") + let attr = toStringAttr attr' + let isChecked = isJust $ lookup "checked" attr + let escapeSequence = B.str $ if isChecked then "\9746" else "\9744" + return $ escapeSequence <> B.space + + -- | Parses a list item just like 'pListItem', but allows sublists outside of -- @li@ tags to be treated as items. pListItem' :: PandocMonad m => TagParser m a -> TagParser m Blocks @@ -673,6 +687,9 @@ inline = pTagText <|> do "var" -> pCodeWithClass "var" "variable" "span" -> pSpan "math" -> pMath False + "input" + | lookup "type" attr == Just "checkbox" + -> asks inListItem >>= guard >> pCheckbox "script" | Just x <- lookup "type" attr , "math/tex" `T.isPrefixOf` x -> pScriptMath diff --git a/src/Text/Pandoc/Readers/HTML/Types.hs b/src/Text/Pandoc/Readers/HTML/Types.hs index f03454c3bfb6..a16773949d12 100644 --- a/src/Text/Pandoc/Readers/HTML/Types.hs +++ b/src/Text/Pandoc/Readers/HTML/Types.hs @@ -60,6 +60,7 @@ data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext , inChapter :: Bool -- ^ Set if in chapter section , inPlain :: Bool -- ^ Set if in pPlain + , inListItem :: Bool -- ^ Set if in
  <li> tag } @@ -91,7 +92,7 @@ instance HasMeta HTMLState where deleteMeta s st = st {parserState = deleteMeta s $ parserState st} instance Default HTMLLocal where - def = HTMLLocal NoQuote False False + def = HTMLLocal NoQuote False False False instance HasLastStrPosition HTMLState where setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} diff --git a/test/command/9047.md b/test/command/9047.md new file mode 100644 index 000000000000..84c405e07ffc --- /dev/null +++ b/test/command/9047.md @@ -0,0 +1,47 @@ +tests meant to test the fixes of bug [#9047](https://github.com/jgm/pandoc/issues/9047) + +``` +% pandoc -f html -t html + +^D + +``` + +w/ rawHTML extension a checkbox by itself is kept +``` +% pandoc -f html+raw_html -t html+raw_html + +^D + + +``` + +w/ rawHTML extension, a checkbox in an `li` is handled properly +``` +% pandoc -f html+raw_html -t html+raw_html + +^D + +``` + +w/o rawHTML extension, a checkbox outside of an `li` is properly ignored +``` +% pandoc -f html -t html +

    foo

    +^D +foo +``` diff --git a/test/command/tasklist.md b/test/command/tasklist.md index b1c6491ccb5c..9aaa0a8aaeab 100644 --- a/test/command/tasklist.md +++ b/test/command/tasklist.md @@ -102,4 +102,4 @@ round trip: ^D - [ ] foo - [x] bar -``` +``` \ No newline at end of file