Skip to content

Commit

Permalink
HTML reader: parse task lists using input elements (#9066)
Browse files Browse the repository at this point in the history
Allow the HTML reader to parse task lists of the sort produced by pandoc.
Closes #9047
  • Loading branch information
sspeaks committed Sep 16, 2023
1 parent 1386131 commit 0d27947
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 3 deletions.
19 changes: 18 additions & 1 deletion src/Text/Pandoc/Readers/HTML.hs
Expand Up @@ -129,6 +129,10 @@ setInChapter = local (\s -> s {inChapter = True})
-- | Run a parser with the 'inPlain' flag of the local reader environment
-- set to 'True' (applied via 'local').
setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
setInPlain = local markPlain
  where
    markPlain env = env {inPlain = True}

-- | Run a parser with the 'inListItem' flag of the local reader environment
-- set to 'True' (applied via 'local'). Some elements, e.g. checkboxes, are
-- handled differently when they occur inside a list item.
setInListItem :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
setInListItem = local markListItem
  where
    markListItem env = env {inListItem = True}

pHtml :: PandocMonad m => TagParser m Blocks
pHtml = do
(TagOpen "html" attr) <- lookAhead pAny
Expand Down Expand Up @@ -334,7 +338,7 @@ pBulletList = try $ do
return $ B.bulletList $ map (fixPlains True) items

pListItem :: PandocMonad m => TagParser m Blocks
pListItem = do
pListItem = setInListItem $ do
TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" [])
let attr = toStringAttr attr'
let addId ident bs = case B.toList bs of
Expand All @@ -344,6 +348,16 @@ pListItem = do
maybe id addId (lookup "id" attr) <$>
pInTags "li" block

-- | Parse an @input@ element whose @type@ attribute is @checkbox@,
-- immediately followed by its closing @input@ tag.
--
-- The result is a single marker character followed by a space:
-- U+2612 (ballot box with X) when the element carries a @checked@
-- attribute, and U+2610 (empty ballot box) otherwise.
pCheckbox :: PandocMonad m => TagParser m Inlines
pCheckbox = do
  TagOpen _ rawAttrs <- pSatisfy $ matchTagOpen "input" [("type","checkbox")]
  -- The opening tag must be directly followed by a closing tag;
  -- otherwise the pattern match fails and so does the parser.
  TagClose _ <- pSatisfy (matchTagClose "input")
  let checked = isJust (lookup "checked" (toStringAttr rawAttrs))
      marker = B.str $ if checked then "\9746" else "\9744"
  return $ marker <> B.space


-- | Parses a list item just like 'pListItem', but allows sublists outside of
-- @li@ tags to be treated as items.
pListItem' :: PandocMonad m => TagParser m a -> TagParser m Blocks
Expand Down Expand Up @@ -673,6 +687,9 @@ inline = pTagText <|> do
"var" -> pCodeWithClass "var" "variable"
"span" -> pSpan
"math" -> pMath False
"input"
| lookup "type" attr == Just "checkbox"
-> asks inListItem >>= guard >> pCheckbox
"script"
| Just x <- lookup "type" attr
, "math/tex" `T.isPrefixOf` x -> pScriptMath
Expand Down
3 changes: 2 additions & 1 deletion src/Text/Pandoc/Readers/HTML/Types.hs
Expand Up @@ -60,6 +60,7 @@ data HTMLLocal = HTMLLocal
{ quoteContext :: QuoteContext
, inChapter :: Bool -- ^ Set if in chapter section
, inPlain :: Bool -- ^ Set if in pPlain
, inListItem :: Bool -- ^ Set if in <li> tag
}


Expand Down Expand Up @@ -91,7 +92,7 @@ instance HasMeta HTMLState where
deleteMeta s st = st {parserState = deleteMeta s $ parserState st}

instance Default HTMLLocal where
def = HTMLLocal NoQuote False False
def = HTMLLocal NoQuote False False False

instance HasLastStrPosition HTMLState where
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
Expand Down
47 changes: 47 additions & 0 deletions test/command/9047.md
@@ -0,0 +1,47 @@
Tests for the fix of bug [#9047](https://github.com/jgm/pandoc/issues/9047).

```
% pandoc -f html -t html
<ul class="task-list">
<li><label><input type="checkbox" />foo</label></li>
<li><label><input type="checkbox" checked="" />bar</label></li>
<li><label><input type="button" checked="" />foobar</label></li>
<li><input id="hello" type="checkbox" checked/><label for="hello">hello</label></li>
</ul>
^D
<ul>
<li><label><input type="checkbox" />foo</label></li>
<li><label><input type="checkbox" checked="" />bar</label></li>
<li>foobar</li>
<li><label><input type="checkbox" checked="" />hello</label></li>
</ul>
```

With the `raw_html` extension, a checkbox by itself is kept:
```
% pandoc -f html+raw_html -t html+raw_html
<input type="checkbox" checked="" />
^D
<input type="checkbox" checked>
</input>
```

With the `raw_html` extension, a checkbox in an `li` is handled properly:
```
% pandoc -f html+raw_html -t html+raw_html
<ul>
<li><input type="checkbox" checked/>foo</li>
</ul>
^D
<ul class="task-list">
<li><label><input type="checkbox" checked="" />foo</label></li>
</ul>
```

Without the `raw_html` extension, a checkbox outside of an `li` is properly ignored:
```
% pandoc -f html -t html
<p><input type="checkbox" checked/>foo</p>
^D
foo
```
2 changes: 1 addition & 1 deletion test/command/tasklist.md
Expand Up @@ -102,4 +102,4 @@ round trip:
^D
- [ ] foo
- [x] bar
```
```

0 comments on commit 0d27947

Please sign in to comment.