/
LenientHtmlParser.purs
160 lines (136 loc) · 4.32 KB
/
LenientHtmlParser.purs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
module LenientHtmlParser where
import Prelude
import Control.Alt ((<|>))
import Data.Array (fromFoldable)
import Data.Either (Either)
import Data.Foldable (class Foldable)
import Data.Generic.Rep (class Generic)
import Data.Generic.Rep.Show (genericShow)
import Data.List (List, elem)
import Data.String (trim)
import Data.String.CodeUnits (dropRight, fromCharArray)
import Text.Parsing.StringParser (Parser, ParseError, runParser, fail)
import Text.Parsing.StringParser.Combinators (fix, many, many1, manyTill)
import Text.Parsing.StringParser.String (anyChar, char, eof, noneOf, regex, satisfy, string)
newtype TagName = TagName String
derive instance eqTagName :: Eq TagName
derive instance genericRepTagName :: Generic TagName _
instance showTagName :: Show TagName where show = genericShow
type Attributes = List Attribute
newtype Name = Name String
derive instance eqName :: Eq Name
derive instance genericRepName :: Generic Name _
instance showName :: Show Name where show = genericShow
newtype Value = Value String
derive instance eqValue :: Eq Value
derive instance genericRepValue :: Generic Value _
instance showValue :: Show Value where show = genericShow
data Attribute = Attribute Name Value
derive instance eqAttribute :: Eq Attribute
derive instance genericRepAttribute :: Generic Attribute _
instance showAttribute :: Show Attribute where show = genericShow
data Tag
= TagOpen TagName Attributes
| TagSingle TagName Attributes
| TagClose TagName
| TNode String
| TScript Attributes String
derive instance eqTag :: Eq Tag
derive instance genericRepTag :: Generic Tag _
instance showTag :: Show Tag where show = genericShow
flattenChars :: forall f. Foldable f => f Char -> String
flattenChars = trim <<< fromCharArray <<< fromFoldable
comment :: Parser Unit
comment = do
_ <- string "<!--"
_ <- manyTill anyChar $ string "-->"
pure unit
doctype :: Parser Unit
doctype = do
_ <- string "<!DOCTYPE" <|> string "<!doctype"
_ <- regex "[^>]*"
_ <- char '>'
pure unit
skipSpace :: Parser Unit
skipSpace = fix \_ ->
(comment *> skipSpace)
<|> (doctype *> skipSpace)
<|> (many1 ws *> skipSpace)
<|> pure unit
where
ws = satisfy \c ->
c == '\n' ||
c == '\r' ||
c == '\t' ||
c == ' '
lexeme :: forall p. Parser p -> Parser p
lexeme p = p <* skipSpace
validNameString :: Parser String
validNameString =
flattenChars
<$> many1 (noneOf ['=', ' ', '<', '>', '/', '"'])
attribute :: Parser Attribute
attribute = lexeme do
name <- validNameString
value <- (flattenChars <$> getValue) <|> pure ""
pure $ Attribute (Name name) (Value value)
where
termini = ['"', '>', ' ']
getValue = do
_ <- char '='
content <- withQuotes <|> withoutQuotes
pure content
withQuotes = do
_ <- char '"'
manyTill anyChar $ void (char '"') <|> eof
withoutQuotes = do
content <- many $ satisfy (not flip elem ['>', ' '])
_ <- void (char ' ') <|> eof <|> pure unit
pure content
tagOpenOrSingleOrClose :: Parser Tag
tagOpenOrSingleOrClose = lexeme $
char '<' *> (closeTag <|> tagOpenOrSingle)
closeTag :: Parser Tag
closeTag = lexeme do
_ <- char '/'
name <- validNameString
_ <- char '>'
pure $ TagClose (TagName name)
tagOpenOrSingle :: Parser Tag
tagOpenOrSingle = lexeme do
tagName <- lexeme $ TagName <$> validNameString
attrs <- many attribute <|> pure mempty
let spec' = spec tagName attrs
closeTagOpen spec'
<|> closeTagSingle spec'
<|> fail "no closure in sight for tag opening"
where
spec tagName attrs constructor =
constructor tagName attrs
closeTagOpen f =
char '>' *> pure (f TagOpen)
closeTagSingle f =
string "/>" *> pure (f TagSingle)
tnode :: Parser Tag
tnode = lexeme do
TNode <$> regex "[^<]+" <|> slow
where
slow = fix \_ ->
TNode <<< flattenChars <$> many1 (satisfy ((/=) '<'))
scriptTag :: Parser Tag
scriptTag = lexeme do
_ <- lexeme $ string "<script"
attrs <- manyTill attribute (char '>')
content <- dropRight 9 <$> regex "[\\s\\S]*</script>"
pure $ TScript attrs content
tag :: Parser Tag
tag = lexeme do
scriptTag <|> tagOpenOrSingleOrClose <|> tnode
tags :: Parser (List Tag)
tags = do
skipSpace
many tag
parse :: forall a. Parser a -> String -> Either ParseError a
parse p s = runParser p s
parseTags :: String -> Either ParseError (List Tag)
parseTags s = parse tags s