Skip to content

Commit

Permalink
Web.Pandoc: refactor reader selection
Browse files Browse the repository at this point in the history
  • Loading branch information
mmirate committed Aug 24, 2019
1 parent e8ea8cd commit da4445a
Showing 1 changed file with 133 additions and 17 deletions.
150 changes: 133 additions & 17 deletions lib/Hakyll/Web/Pandoc.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ module Hakyll.Web.Pandoc
( -- * The basic building blocks
readPandoc
, readPandocWith
, readPandocLBS
, readPandocLBSWith
, writePandoc
, writePandocWith
, renderPandoc
Expand All @@ -23,14 +25,93 @@ module Hakyll.Web.Pandoc

--------------------------------------------------------------------------------
import qualified Data.Text as T
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TLE
import Text.Pandoc
--import Text.Pandoc.App.FormatHeuristics (formatFromFilePaths)
import Text.Pandoc.Highlighting (pygments)
import Data.Char (toLower)
import Data.Bifunctor (second)
import System.FilePath (takeExtension)


--------------------------------------------------------------------------------
import Hakyll.Core.Compiler
import Hakyll.Core.Identifier
import Hakyll.Core.Item
import Hakyll.Web.Pandoc.FileType
-- import Hakyll.Web.Pandoc.FileType


--------------------------------------------------------------------------------
-- | Determine format based on file extension.
-- Adapted from Pandoc, pending the public importability of
-- Text.Pandoc.App.FormatHeuristics.
formatFromFilePath :: String -> FilePath -> String
formatFromFilePath fallback x =
case takeExtension (map toLower x) of
".adoc" -> "asciidoc"
".asciidoc" -> "asciidoc"
".context" -> "context"
".ctx" -> "context"
".db" -> "docbook"
".doc" -> "doc" -- so we get an "unknown reader" error
".docx" -> "docx"
".dokuwiki" -> "dokuwiki"
".epub" -> "epub"
".fb2" -> "fb2"
".htm" -> "html"
".html" -> "html"
".icml" -> "icml"
".json" -> "json"
".latex" -> "latex"
".lhs" -> "markdown+lhs"
".ltx" -> "latex"
".markdown" -> "markdown"
".md" -> "markdown"
".ms" -> "ms"
".muse" -> "muse"
".native" -> "native"
".odt" -> "odt"
".opml" -> "opml"
".org" -> "org"
".pdf" -> "pdf" -- so we get an "unknown reader" error
".pptx" -> "pptx"
".roff" -> "ms"
".rst" -> "rst"
".rtf" -> "rtf"
".s5" -> "s5"
".t2t" -> "t2t"
".tei" -> "tei"
".tei.xml" -> "tei"
".tex" -> "latex"
".texi" -> "texinfo"
".texinfo" -> "texinfo"
".text" -> "markdown"
".textile" -> "textile"
".txt" -> "markdown"
".wiki" -> "mediawiki"
".xhtml" -> "html"
".ipynb" -> "ipynb"
['.',y] | y `elem` ['1'..'9'] -> "man"
_ -> fallback


--------------------------------------------------------------------------------
defaultReaderName :: Identifier -> String
defaultReaderName = formatFromFilePath "markdown" . toFilePath


--------------------------------------------------------------------------------
getReaderForIdentifier :: Identifier
-> (String, Reader PandocPure, Extensions)
getReaderForIdentifier ident =
let readerName = defaultReaderName ident in
case getReader readerName of
Left _ -> error $
"Hakyll.Web.readPandocWith: I don't know how to read a file of " ++
"the type " ++ show readerName ++ " for: " ++ show ident
Right (x, e) -> (readerName, x, e)


--------------------------------------------------------------------------------
Expand All @@ -41,33 +122,55 @@ readPandoc
readPandoc = readPandocWith defaultHakyllReaderOptions


--------------------------------------------------------------------------------
-- | Read a bytestring using pandoc, with the default options
readPandocLBS
:: Item BL.ByteString -- ^ String to read
-> Compiler (Item Pandoc) -- ^ Resulting document
readPandocLBS = readPandocLBSWith defaultHakyllReaderOptions


--------------------------------------------------------------------------------
-- | Read a string using pandoc, with the supplied options
readPandocWith
:: ReaderOptions -- ^ Parser options
-> Item String -- ^ String to read
-> Compiler (Item Pandoc) -- ^ Resulting document
readPandocWith ropt item =
case runPure $ traverse (reader ropt (itemFileType item)) (fmap T.pack item) of
case runPure $ traverse (reader ropt (itemIdentifier item)) (T.pack <$> item) of
Left err -> fail $
"Hakyll.Web.Pandoc.readPandocWith: parse failed: " ++ show err
Right item' -> return item'
where
reader ro t = case t of
DocBook -> readDocBook ro
Html -> readHtml ro
LaTeX -> readLaTeX ro
LiterateHaskell t' -> reader (addExt ro Ext_literate_haskell) t'
Markdown -> readMarkdown ro
MediaWiki -> readMediaWiki ro
OrgMode -> readOrg ro
Rst -> readRST ro
Textile -> readTextile ro
_ -> error $
"Hakyll.Web.readPandocWith: I don't know how to read a file of " ++
"the type " ++ show t ++ " for: " ++ show (itemIdentifier item)
reader ro i = case getReaderForIdentifier i of
(_, TextReader r, es) -> r (addExts ro es)
(ext, ByteStringReader _, _) -> error $
"Hakyll.Web.readPandocWith: files of the type " ++ (show ext) ++
" must be read by the ByteString-capable reader, for: " ++ (show i)
-- graceful degradation is impossible, as the UTF8 encoding mangles
-- any invalid characters; at best, could try rereading the file?

addExts ro es = ro {readerExtensions = es <> (readerExtensions ro)}

--------------------------------------------------------------------------------
-- | Read a bytestring using pandoc, with the supplied options; gracefully
-- decodes the bytestring into a String whenever accidentally used for a
-- textual input format
readPandocLBSWith
:: ReaderOptions -- ^ Parser options
-> Item BL.ByteString -- ^ ByteString to read
-> Compiler (Item Pandoc) -- ^ Resulting document
readPandocLBSWith ropt item =
case runPure $ traverse (reader ropt (itemIdentifier item)) item of
Left err -> fail $
"Hakyll.Web.Pandoc.readPandocLBSWith: parse failed: " ++ show err
Right item' -> return item'
where
reader ro i = case getReaderForIdentifier i of
(_, TextReader r, es) -> r (addExts ro es) . TL.toStrict . TLE.decodeUtf8
(_, ByteStringReader r, es) -> r (addExts ro es)

addExt ro e = ro {readerExtensions = enableExtension e $ readerExtensions ro}
addExts ro es = ro {readerExtensions = es <> (readerExtensions ro)}


--------------------------------------------------------------------------------
Expand Down Expand Up @@ -139,7 +242,20 @@ pandocCompilerWithTransformM :: ReaderOptions -> WriterOptions
-> Compiler (Item String)
pandocCompilerWithTransformM ropt wopt f =
writePandocWith wopt <$>
(traverse f =<< readPandocWith ropt =<< getResourceBody)
(traverse f =<< useUnderlyingReader)
where
getUnderlyingReader :: Compiler (Either String (Reader PandocPure, Extensions))
getUnderlyingReader = getReader . defaultReaderName <$> getUnderlying
addExts ro es = ro {readerExtensions = es <> (readerExtensions ro)}
innerRead :: Either String (Reader PandocPure, Extensions) -> Compiler (Item Pandoc)
innerRead (Left _) =
readPandocLBSWith ropt =<< getResourceLBS
innerRead (Right (ByteStringReader _, es)) =
readPandocLBSWith (addExts ropt es) =<< getResourceLBS
innerRead (Right (TextReader _, es)) =
readPandocWith (addExts ropt es) =<< getResourceBody
useUnderlyingReader :: Compiler (Item Pandoc)
useUnderlyingReader = innerRead =<< getUnderlyingReader


--------------------------------------------------------------------------------
Expand Down

0 comments on commit da4445a

Please sign in to comment.