-
Notifications
You must be signed in to change notification settings - Fork 19
/
Match.lhs
278 lines (246 loc) · 9.05 KB
/
Match.lhs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
\begin{code}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE MonoLocalBinds #-}
\end{code}
\begin{code}
module Text.RE.ZeInternals.Types.Match
( Match(..)
, noMatch
, emptyMatchArray
, matched
, matchedText
, matchCapture
, matchCaptures
, (!$$)
, captureText
, (!$$?)
, captureTextMaybe
, (!$)
, capture
, (!$?)
, captureMaybe
, RegexFix(..)
, convertMatchText
) where
\end{code}
\begin{code}
import Data.Array
import Data.Bits
import qualified Data.ByteString as BW
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as LBS
import qualified Data.ByteString.UTF8 as B
import Data.Maybe
import qualified Data.Sequence as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as LT
import Data.Typeable
import Data.Word
import Text.RE.ZeInternals.Types.Capture
import Text.RE.ZeInternals.Types.CaptureID
import Text.Regex.Base
import qualified Text.Regex.PCRE as PCRE
import qualified Text.Regex.TDFA as TDFA
infixl 9 !$, !$$
\end{code}
\begin{code}
-- | the result of matching a RE to a text once (with @?=~@), retaining
-- the text that was matched against
data Match a =
Match
{ matchSource :: !a -- ^ the whole source text
, captureNames :: !CaptureNames -- ^ the RE's capture names
, matchArray :: !(Array CaptureOrdinal (Capture a))
-- ^ 0..n-1 captures,
-- starting with the
-- text matched by the
-- whole RE
}
deriving (Show,Eq,Typeable)
\end{code}
\begin{code}
-- | Construct a Match that does not match anything.
noMatch :: a -> Match a
noMatch t = Match t noCaptureNames emptyMatchArray
-- | an empty array of Capture
emptyMatchArray :: Array CaptureOrdinal (Capture a)
emptyMatchArray = listArray (CaptureOrdinal 0,CaptureOrdinal $ -1) []
\end{code}
\begin{code}
instance Functor Match where
fmap f Match{..} =
Match
{ matchSource = f matchSource
, captureNames = captureNames
, matchArray = fmap (fmap f) matchArray
}
\end{code}
\begin{code}
-- | tests whether the RE matched the source text at all
matched :: Match a -> Bool
matched = isJust . matchCapture
-- | yields the text matched by the RE, Nothing if no match
matchedText :: Match a -> Maybe a
matchedText = fmap capturedText . matchCapture
-- | the top-level capture if the source text matched the RE,
-- Nothing otherwise
matchCapture :: Match a -> Maybe (Capture a)
matchCapture = fmap fst . matchCaptures
-- | the main top-level capture (capture \'0'') and the sub captures
-- if the text matched the RE, @Nothing@ otherwise
matchCaptures :: Match a -> Maybe (Capture a,[Capture a])
matchCaptures Match{..} = case rangeSize (bounds matchArray) == 0 of
True -> Nothing
False -> Just (matchArray!0,drop 1 $ elems matchArray)
-- | an alternative for captureText
(!$$) :: Match a -> CaptureID -> a
(!$$) = flip captureText
-- | look up the text of the nth capture, 0 being the match of the whole
-- RE against the source text, 1, the first bracketed sub-expression to
-- be matched and so on
captureText :: CaptureID -> Match a -> a
captureText cid mtch = capturedText $ capture cid mtch
-- | an alternative for captureTextMaybe
(!$$?) :: Match a -> CaptureID -> Maybe a
(!$$?) = flip captureTextMaybe
-- | look up the text of the nth capture (0 being the match of the
-- whole), returning Nothing if the Match doesn't contain the capture
captureTextMaybe :: CaptureID -> Match a -> Maybe a
captureTextMaybe cid mtch = do
cap <- mtch !$? cid
case hasCaptured cap of
True -> Just $ capturedText cap
False -> Nothing
-- | an alternative for capture
(!$) :: Match a -> CaptureID -> Capture a
(!$) = flip capture
-- | look up the nth capture, 0 being the match of the whole RE against
-- the source text, 1, the first bracketed sub-expression to be matched
-- and so on
capture :: CaptureID -> Match a -> Capture a
capture cid mtch = fromMaybe oops $ mtch !$? cid
where
oops = error $ "capture: out of bounds (" ++ show cid ++ ")"
-- | an alternative for capture captureMaybe
(!$?) :: Match a -> CaptureID -> Maybe (Capture a)
(!$?) = flip captureMaybe
-- | look up the nth capture, 0 being the match of the whole RE against
-- the source text, 1, the first bracketed sub-expression to be matched
-- and so on, returning Nothing if there is no such capture, or if the
-- capture failed to capture anything (being in a failed alternate)
captureMaybe :: CaptureID -> Match a -> Maybe (Capture a)
captureMaybe cid mtch@Match{..} = do
i <- lookupCaptureID cid mtch
cap <- case bounds matchArray `inRange` i of
True -> Just $ matchArray ! i
False -> Nothing
case hasCaptured cap of
True -> Just cap
False -> Nothing
lookupCaptureID :: CaptureID -> Match a -> Maybe CaptureOrdinal
lookupCaptureID cid Match{..} =
either (const Nothing) Just $ findCaptureID cid captureNames
\end{code}
\begin{code}
-- | this instance hooks 'Match' into regex-base: regex consumers need
-- not worry about any of this
instance
( RegexContext regex source (AllTextSubmatches (Array Int) (source,(Int,Int)))
, RegexLike regex source
, RegexFix regex source
) =>
RegexContext regex source (Match source) where
match r s = convertMatchText r s $ getAllTextSubmatches $ match r s
matchM r s = do
y <- matchM r s
return $ convertMatchText r s $ getAllTextSubmatches y
\end{code}
\begin{code}
-- | convert a regex-base native MatchText into a regex Match type
convertMatchText :: RegexFix regex source
=> regex
-> source
-> MatchText source
-> Match source
convertMatchText re hay arr =
Match
{ matchSource = hay
, captureNames = noCaptureNames
, matchArray =
ixmap (CaptureOrdinal lo,CaptureOrdinal hi) getCaptureOrdinal $
fmap f arr
}
where
(lo,hi) = bounds arr
f (ndl,(off_,len_)) =
Capture
{ captureSource = hay
, capturedText = ndl
, captureOffset = off
, captureLength = len
}
where
CharRange off len = utf8_correct re hay off_ len_
\end{code}
\begin{code}
data CharRange = CharRange !Int !Int
deriving (Show)
class RegexFix regex source where
utf8_correct :: regex -> source -> Int -> Int -> CharRange
utf8_correct _ _ = CharRange
instance RegexFix TDFA.Regex [Char] where
instance RegexFix TDFA.Regex B.ByteString where
instance RegexFix TDFA.Regex LBS.ByteString where
instance RegexFix TDFA.Regex T.Text where
instance RegexFix TDFA.Regex LT.Text where
instance RegexFix TDFA.Regex (S.Seq Char) where
instance RegexFix PCRE.Regex [Char] where
utf8_correct _ = utf8_correct_bs . B.fromString
instance RegexFix PCRE.Regex B.ByteString where
instance RegexFix PCRE.Regex LBS.ByteString where
instance RegexFix PCRE.Regex T.Text where
utf8_correct _ = utf8_correct_bs . T.encodeUtf8
instance RegexFix PCRE.Regex LT.Text where
utf8_correct _ = utf8_correct_bs . T.encodeUtf8 . LT.toStrict
instance RegexFix PCRE.Regex (S.Seq Char) where
-- convert a byte offset+length in a UTF-8-encoded ByteString
-- into a character offset+length
utf8_correct_bs :: B.ByteString -> Int -> Int -> CharRange
utf8_correct_bs bs ix0 ln0 = case ix0+ln0 > BW.length bs of
True -> error "utf8_correct_bs: index+length out of range"
False -> skip 0 0 -- BW.index calls below should not fail
where
skip ix di = case compare ix ix0 of
GT -> error "utf8_correct_bs: UTF-8 decoding error"
EQ -> count ix di 0 ln0
LT -> case u8_width $ BW.index bs ix of
Single -> skip (ix+1) di
Double -> skip (ix+2) $ di+1
Triple -> skip (ix+3) $ di+2
Quadruple -> skip (ix+4) $ di+3
count ix di dl c = case compare c 0 of
LT -> error "utf8_correct_bs: length ends inside character"
EQ -> CharRange (ix0-di) (ln0-dl)
GT -> case u8_width $ BW.index bs ix of
Single -> count (ix+1) di dl $ c-1
Double -> count (ix+2) di (dl+1) $ c-2
Triple -> count (ix+3) di (dl+2) $ c-3
Quadruple -> count (ix+4) di (dl+3) $ c-4
data UTF8Size = Single | Double | Triple | Quadruple
deriving (Show)
u8_width :: Word8 -> UTF8Size
u8_width w8 = case w8 .&. 0x80 == 0x00 of
True -> Single
False -> case w8 .&. 0xE0 == 0xC0 of
True -> Double
False -> case w8 .&. 0xF0 == 0xE0 of
True -> Triple
False -> case w8 .&. 0xF8 == 0xF0 of
True -> Quadruple
False -> error "u8_width: UTF-8 decoding error"
\end{code}