-
Notifications
You must be signed in to change notification settings - Fork 48
/
BlazeFromHtml.hs
320 lines (290 loc) · 11.1 KB
/
BlazeFromHtml.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
-- | A module for conversion from HTML to BlazeHtml Haskell code.
--
module Main where
import Control.Monad (forM_, when)
import Control.Applicative ((<$>))
import Data.Maybe (listToMaybe, fromMaybe)
import Data.Char (toLower, isSpace)
import Control.Arrow (first)
import System.Environment (getArgs)
import System.FilePath (dropExtension)
import qualified Data.Map as M
import System.Console.GetOpt
import Text.HTML.TagSoup
import Util.Sanitize (sanitize)
import Util.GenerateHtmlCombinators hiding (main)
-- | Simple type to represent attributes.
--
type Attributes = [(String, String)]
-- | Intermediate tree representation. This representation contains several
-- constructors aimed at pretty-printing.
--
data Html = Parent String Attributes Html
| Block [Html]
| Text String
| Comment String
| Doctype
deriving (Show)
-- | Different combinator types.
--
data CombinatorType = ParentCombinator
| LeafCombinator
| UnknownCombinator
deriving (Eq, Show)
-- | Traverse the list of tags to produce an intermediate representation of the
-- HTML tree.
--
makeTree :: HtmlVariant -- ^ HTML variant used
-> Bool -- ^ Should ignore errors
-> [String] -- ^ Stack of open tags
-> [Tag String] -- ^ Tags to parse
-> (Html, [Tag String]) -- ^ (Result, unparsed part)
makeTree _ ignore stack []
| null stack || ignore = (Block [], [])
| otherwise = error $ "Error: tags left open at the end: " ++ show stack
makeTree variant ignore stack (TagPosition row _ : x : xs) = case x of
TagOpen tag attrs -> if toLower' tag == "!doctype"
then addHtml Doctype xs
else let tag' = toLower' tag
(inner, t) = case combinatorType variant tag' of
LeafCombinator -> (Block [], xs)
_ -> makeTree variant ignore (tag' : stack) xs
p = Parent tag' (map (first toLower') attrs) inner
in addHtml p t
-- The closing tag must match the stack. If it is a closing leaf, we can
-- ignore it
TagClose tag ->
let isLeafCombinator = combinatorType variant tag == LeafCombinator
matchesStack = listToMaybe stack == Just (toLower' tag)
in case (isLeafCombinator, matchesStack, ignore) of
-- It's a leaf combinator, don't care about this element
(True, _, _) -> makeTree variant ignore stack xs
-- It's a parent and the stack doesn't match
(False, False, False) -> error $
"Line " ++ show row ++ ": " ++ show tag ++ " closed but "
++ show stack ++ " should be closed instead."
-- Stack might not match but we ignore it anyway
(False, _, _) -> (Block [], xs)
TagText text -> addHtml (Text text) xs
TagComment comment -> addHtml (Comment comment) xs
_ -> makeTree variant ignore stack xs
where
addHtml html xs' = let (Block l, r) = makeTree variant ignore stack xs'
in (Block (html : l), r)
toLower' = map toLower
makeTree _ _ _ _ = error "TagSoup error"
-- | Remove empty text from the HTML.
--
removeEmptyText :: Html -> Html
removeEmptyText (Block b) = Block $ map removeEmptyText $ flip filter b $ \h ->
case h of Text text -> any (not . isSpace) text
_ -> True
removeEmptyText (Parent tag attrs inner) =
Parent tag attrs $ removeEmptyText inner
removeEmptyText x = x
-- | Try to eliminiate Block constructors as much as possible.
--
minimizeBlocks :: Html -> Html
minimizeBlocks (Parent t a (Block [x])) = minimizeBlocks $ Parent t a x
minimizeBlocks (Parent t a x) = Parent t a $ minimizeBlocks x
minimizeBlocks (Block x) = Block $ map minimizeBlocks x
minimizeBlocks x = x
-- | Get the type of a combinator, using a given variant.
--
combinatorType :: HtmlVariant -> String -> CombinatorType
combinatorType variant combinator
| combinator == "docTypeHtml" = ParentCombinator
| combinator `elem` parents variant = ParentCombinator
| combinator `elem` leafs variant = LeafCombinator
| otherwise = UnknownCombinator
-- | Create a special @<html>@ parent that includes the docype.
--
joinHtmlDoctype :: Html -> Html
joinHtmlDoctype (Block (Doctype : Parent "html" attrs inner : xs)) =
Block $ Parent "docTypeHtml" attrs inner : xs
joinHtmlDoctype x = x
-- | Produce the Blaze code from the HTML. The result is a list of lines.
--
fromHtml :: HtmlVariant -- ^ Used HTML variant
-> Bool -- ^ Should ignore errors
-> Html -- ^ HTML tree
-> [String] -- ^ Resulting lines of code
fromHtml _ _ Doctype = ["docType"]
fromHtml _ _ (Text text) = ["\"" ++ concatMap escape (trim text) ++ "\""]
where
-- Remove whitespace on both ends of a string
trim = reverse . dropWhile isSpace . reverse . dropWhile isSpace
-- Escape a number of characters
escape '"' = "\\\""
escape '\n' = "\\n"
escape x = [x]
fromHtml _ _ (Comment comment) = map ("-- " ++) $ lines comment
fromHtml variant ignore (Block block) =
concatMap (fromHtml variant ignore) block
fromHtml variant ignore (Parent tag attrs inner) =
case combinatorType variant tag of
-- Actual parent tags
ParentCombinator -> case inner of
(Block ls) -> if null ls
then [combinator ++
(if null attrs then " " else " $ ") ++ "mempty"]
else (combinator ++ " $ do") :
indent (fromHtml variant ignore inner)
-- We join non-block parents for better readability.
x -> let ls = fromHtml variant ignore x
apply = if dropApply x then " " else " $ "
in case ls of (y : ys) -> (combinator ++ apply ++ y) : ys
[] -> [combinator]
-- Leaf tags
LeafCombinator -> [combinator]
-- Unknown tag
UnknownCombinator -> if ignore
then fromHtml variant ignore inner
else error $ "Tag " ++ tag ++ " is illegal in "
++ show variant
where
combinator = qualifiedSanitize "H." tag ++ attributes'
attributes' = attrs >>= \(k, v) -> if k `elem` attributes variant
then " ! " ++ qualifiedSanitize "A." k ++ " " ++ show v
else if ignore
then ""
else error $ "Attribute " ++ k ++ " is illegal in " ++ show variant
-- Qualifies a tag with the given qualifier if needed, and sanitizes it.
qualifiedSanitize qualifier tag' =
(if isNameClash variant tag' then qualifier else "") ++ sanitize tag'
-- Check if we can drop the apply operator ($), for readability reasons.
-- This would change:
--
-- > p $ "Some text"
--
-- Into
--
-- > p "Some text"
--
dropApply (Parent _ _ _) = False
dropApply (Block _) = False
dropApply _ = null attrs
-- | Produce the code needed for initial imports.
--
getImports :: HtmlVariant -> [String]
getImports variant =
[ "{-# LANGUAGE OverloadedStrings #-}"
, ""
, import_ "Prelude"
, qualify "Prelude" "P"
, import_ "Data.Monoid (mempty)"
, ""
, import_ h
, qualify h "H"
, import_ a
, qualify a "A"
]
where
import_ = ("import " ++)
qualify name short = "import qualified " ++ name ++ " as " ++ short
h = getModuleName variant
a = getAttributeModuleName variant
-- | Convert the HTML to blaze code.
--
blazeFromHtml :: HtmlVariant -- ^ Variant to use
-> Bool -- ^ Produce standalone code
-> Bool -- ^ Should we ignore errors
-> String -- ^ Template name
-> String -- ^ HTML code
-> String -- ^ Resulting code
blazeFromHtml variant standalone ignore name =
unlines . addSignature . fromHtml variant ignore
. joinHtmlDoctype . minimizeBlocks
. removeEmptyText . fst . makeTree variant ignore []
. parseTagsOptions parseOptions { optTagPosition = True }
where
addSignature body = if standalone then [ name ++ " :: Html"
, name ++ " = do"
] ++ indent body
else body
-- | Indent block of code.
--
indent :: [String] -> [String]
indent = map (" " ++)
-- | Main function
--
main :: IO ()
main = do
args <- getOpt Permute options <$> getArgs
case args of
(o, n, []) -> let v = getVariant o
s = standalone' o
i = ignore' o
in do imports' v o
main' v s i n
(_, _, _) -> putStr help
where
-- No files given, work with stdin
main' variant standalone ignore [] = interact $
blazeFromHtml variant standalone ignore "template"
-- Handle all files
main' variant standalone ignore files = forM_ files $ \file -> do
body <- readFile file
putStrLn $ blazeFromHtml variant standalone ignore
(dropExtension file) body
-- Print imports if needed
imports' variant opts = when (standalone' opts) $
putStrLn $ unlines $ getImports variant
-- Should we produce standalone code?
standalone' opts = ArgStandalone `elem` opts
-- Should we ignore errors?
ignore' opts = ArgIgnoreErrors `elem` opts
-- Get the variant from the options
getVariant opts = fromMaybe defaultHtmlVariant $ listToMaybe $
flip concatMap opts $ \o -> case o of (ArgHtmlVariant x) -> [x]
_ -> []
-- | Help information.
--
help :: String
help = unlines $
[ "This is a tool to convert HTML code to BlazeHtml code. It is still"
, "experimental and the results might need to be edited manually."
, ""
, "USAGE"
, ""
, " blaze-from-html [OPTIONS...] [FILES ...]"
, ""
, "When no files are given, it works as a filter."
, ""
, "EXAMPLE"
, ""
, " blaze-from-html -v html4-strict index.html"
, ""
, "This converts the index.html file to Haskell code, writing to stdout."
, ""
, "OPTIONS"
, usageInfo "" options
, "VARIANTS"
, ""
] ++
map ((" " ++) . fst) (M.toList htmlVariants) ++
[ ""
, "By default, " ++ show defaultHtmlVariant ++ " is used."
]
-- | Options for the CLI program
--
data Arg = ArgHtmlVariant HtmlVariant
| ArgStandalone
| ArgIgnoreErrors
deriving (Show, Eq)
-- | A description of the options
--
options :: [OptDescr Arg]
options =
[ Option "v" ["html-variant"] htmlVariantOption "HTML variant to use"
, Option "s" ["standalone"] (NoArg ArgStandalone) "Produce standalone code"
, Option "e" ["ignore-errors"] (NoArg ArgIgnoreErrors) "Ignore most errors"
]
where
htmlVariantOption = flip ReqArg "VARIANT" $ \name -> ArgHtmlVariant $
fromMaybe (error $ "No HTML variant called " ++ name ++ " found.")
(M.lookup name htmlVariants)
-- | The default HTML variant
--
defaultHtmlVariant :: HtmlVariant
defaultHtmlVariant = html5