-
-
Notifications
You must be signed in to change notification settings - Fork 31
/
Tag.hs
228 lines (210 loc) · 8.54 KB
/
Tag.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
{-# LANGUAGE OverloadedStrings #-}
module Commonmark.Tag
( htmlTag
, htmlOpenTag
, htmlClosingTag
, htmlAttributeName
, htmlAttributeValue
, htmlDoubleQuotedAttributeValue
, Enders
, defaultEnders )
where
import Commonmark.Tokens
import Commonmark.TokParsers
import Control.Monad (liftM2, guard)
import Control.Monad.Trans.State.Strict
import Control.Monad.Trans.Class (lift)
import Unicode.Char (isAscii, isAlpha)
import qualified Data.Text as T
import Text.Parsec hiding (State)
data Enders =
Enders
{ scannedForCDATA :: !(Maybe SourcePos)
, scannedForProcessingInstruction :: !(Maybe SourcePos)
, scannedForDeclaration :: !(Maybe SourcePos)
} deriving Show
defaultEnders :: Enders
defaultEnders = Enders { scannedForCDATA = Nothing
, scannedForProcessingInstruction = Nothing
, scannedForDeclaration = Nothing }
(.&&.) :: (a -> Bool) -> (a -> Bool) -> (a -> Bool)
(.&&.) = liftM2 (&&)
-- A tag name consists of an ASCII letter followed by zero or more ASCII
-- letters, digits, or hyphens (-).
htmlTagName :: Monad m => ParsecT [Tok] s m [Tok]
htmlTagName = try $ do
let isTagText = T.all isAscii
let startsWithLetter t' = not (T.null t') && isAlpha (T.head t')
t <- satisfyWord (isTagText .&&. startsWithLetter)
rest <- many (symbol '-' <|> satisfyWord isTagText)
return (t:rest)
-- An attribute name consists of an ASCII letter, _, or :, followed by
-- zero or more ASCII letters, digits, _, ., :, or -. (Note: This is
-- the XML specification restricted to ASCII. HTML5 is laxer.)
htmlAttributeName :: Monad m => ParsecT [Tok] s m [Tok]
htmlAttributeName = try $ do
let isTagText t' = T.all isAscii t'
let startsWithLetter t' = not (T.null t') && isAlpha (T.head t')
t <- satisfyWord (startsWithLetter .&&. isTagText) <|>
symbol '_' <|>
symbol ':'
rest <- many $ satisfyWord isTagText
<|> symbol '_'
<|> symbol '-'
<|> symbol '.'
<|> symbol ':'
return (t:rest)
-- An attribute value specification consists of optional whitespace,
-- a = character, optional whitespace, and an attribute value.
htmlAttributeValueSpec :: Monad m => ParsecT [Tok] s m [Tok]
htmlAttributeValueSpec = try $ do
sps1 <- option [] whitespace
eq <- symbol '='
sps2 <- option [] whitespace
val <- htmlAttributeValue
return $ sps1 ++ [eq] ++ sps2 ++ val
-- An attribute value consists of an unquoted attribute value,
-- a single-quoted attribute value, or a double-quoted attribute value.
htmlAttributeValue :: Monad m => ParsecT [Tok] s m [Tok]
htmlAttributeValue =
htmlUnquotedAttributeValue <|>
htmlSingleQuotedAttributeValue <|>
htmlDoubleQuotedAttributeValue
-- An attribute consists of whitespace, an attribute name, and an optional
-- attribute value specification.
htmlAttribute :: Monad m => ParsecT [Tok] s m [Tok]
htmlAttribute = try $ do
sps <- whitespace
n <- htmlAttributeName
val <- option [] htmlAttributeValueSpec
return $ sps ++ n ++ val
-- An unquoted attribute value is a nonempty string of characters not
-- including spaces, ", ', =, <, >, or `.
htmlUnquotedAttributeValue :: Monad m => ParsecT [Tok] s m [Tok]
htmlUnquotedAttributeValue =
many1 $ noneOfToks [Spaces, LineEnd, Symbol '<', Symbol '>',
Symbol '=', Symbol '`', Symbol '\'', Symbol '"']
-- A single-quoted attribute value consists of ', zero or more characters
-- not including ', and a final '.
htmlSingleQuotedAttributeValue :: Monad m => ParsecT [Tok] s m [Tok]
htmlSingleQuotedAttributeValue = try $ do
op <- symbol '\''
contents <- many (satisfyTok (not . hasType (Symbol '\'')))
cl <- symbol '\''
return $ op : contents ++ [cl]
-- A double-quoted attribute value consists of ", zero or more characters
-- not including ", and a final ".
htmlDoubleQuotedAttributeValue :: Monad m => ParsecT [Tok] s m [Tok]
htmlDoubleQuotedAttributeValue = try $ do
op <- symbol '"'
contents <- many (satisfyTok (not . hasType (Symbol '"')))
cl <- symbol '"'
return $ op : contents ++ [cl]
-- | An open tag consists of a @<@ character, a tag name, zero or more
-- attributes, optional whitespace, an optional @/@ character, and a
-- @>@ character. This parses assumes that the opening @<@ has already
-- been parsed.
htmlOpenTag :: Monad m => ParsecT [Tok] s m [Tok]
htmlOpenTag = try $ do
-- assume < has already been parsed
n <- htmlTagName
attrs <- concat <$> many htmlAttribute
sps <- option [] whitespace
sl <- option [] $ (:[]) <$> symbol '/'
cl <- symbol '>'
return $ n ++ attrs ++ sps ++ sl ++ [cl]
-- | A closing tag consists of the string @</@, a tag name, optional
-- whitespace, and the character @>@. This parser assumes that the
-- opening @<@ has already been parsed.
htmlClosingTag :: Monad m => ParsecT [Tok] s m [Tok]
htmlClosingTag = try $ do
-- assume < has already been parsed
op <- symbol '/'
n <- htmlTagName
sps <- option [] whitespace
cl <- symbol '>'
return $ op : n ++ sps ++ [cl]
-- An HTML comment consists of `<!-->`, `<!--->`, or `<!--`, a string of
-- characters not including the string `-->`, and `-->`.
-- (See the HTML5 spec.)
htmlComment :: Monad m => ParsecT [Tok] s m [Tok]
htmlComment = try $ do
-- assume < has already been parsed
op <- sequence [ symbol '!'
, symbol '-'
, symbol '-' ]
let getContent =
try (sequence [ symbol '-', symbol '-', symbol '>' ])
<|> try ((++) <$> many1 (satisfyTok (not . hasType (Symbol '-')))
<*> getContent)
<|> try ((:) <$> symbol '-' <*> getContent)
(op ++) <$>
( ((:[]) <$> symbol '>')
<|> try (sequence [ symbol '-', symbol '>' ])
<|> getContent
)
-- A processing instruction consists of the string <?, a string of
-- characters not including the string ?>, and the string ?>.
htmlProcessingInstruction :: Monad m
=> ParsecT [Tok] s (StateT Enders m) [Tok]
htmlProcessingInstruction = try $ do
-- assume < has already been parsed
let questionmark = symbol '?'
op <- questionmark
pos <- getPosition
alreadyScanned <- lift $ gets scannedForProcessingInstruction
guard $ maybe True (< pos) alreadyScanned
contents <- many $ satisfyTok (not . hasType (Symbol '?'))
<|> try (questionmark <*
notFollowedBy (symbol '>'))
pos' <- getPosition
lift $ modify $ \st -> st{ scannedForProcessingInstruction = Just pos' }
cl <- sequence [ questionmark
, symbol '>' ]
return $ op : contents ++ cl
-- A declaration consists of the string <!, a name consisting of one or
-- more uppercase ASCII letters, whitespace, a string of characters not
-- including the character >, and the character >.
htmlDeclaration :: Monad m => ParsecT [Tok] s (StateT Enders m) [Tok]
htmlDeclaration = try $ do
-- assume < has already been parsed
op <- symbol '!'
pos <- getPosition
alreadyScanned <- lift $ gets scannedForDeclaration
guard $ maybe True (< pos) alreadyScanned
let isDeclName t = not (T.null t) && T.all (isAscii .&&. isAlpha) t
name <- satisfyWord isDeclName
ws <- whitespace
contents <- many (satisfyTok (not . hasType (Symbol '>')))
pos' <- getPosition
lift $ modify $ \st -> st{ scannedForDeclaration = Just pos' }
cl <- symbol '>'
return $ op : name : ws ++ contents ++ [cl]
-- A CDATA section consists of the string <![CDATA[, a string of characters
-- not including the string ]]>, and the string ]]>.
htmlCDATASection :: Monad m => ParsecT [Tok] s (StateT Enders m) [Tok]
htmlCDATASection = try $ do
-- assume < has already been parsed
op <- sequence [ symbol '!'
, symbol '['
, satisfyWord (== "CDATA")
, symbol '[' ]
pos <- getPosition
alreadyScanned <- lift $ gets scannedForCDATA
guard $ maybe True (< pos) alreadyScanned
let ender = try $ sequence [ symbol ']'
, symbol ']'
, symbol '>' ]
contents <- many $ do
notFollowedBy ender
anyTok
pos' <- getPosition
lift $ modify $ \st -> st{ scannedForCDATA = Just pos' }
cl <- ender
return $ op ++ contents ++ cl
-- An HTML tag consists of an open tag, a closing tag, an HTML comment,
-- a processing instruction, a declaration, or a CDATA section.
-- Assumes @<@ has already been parsed.
htmlTag :: Monad m => ParsecT [Tok] s (StateT Enders m) [Tok]
htmlTag = htmlOpenTag <|> htmlClosingTag <|> htmlComment <|>
htmlProcessingInstruction <|> htmlDeclaration <|> htmlCDATASection