Permalink
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
279 lines (246 sloc) 9.05 KB
\begin{code}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE MonoLocalBinds #-}
\end{code}
\begin{code}
module Text.RE.ZeInternals.Types.Match
( Match(..)
, noMatch
, emptyMatchArray
, matched
, matchedText
, matchCapture
, matchCaptures
, (!$$)
, captureText
, (!$$?)
, captureTextMaybe
, (!$)
, capture
, (!$?)
, captureMaybe
, RegexFix(..)
, convertMatchText
) where
\end{code}
\begin{code}
import Data.Array
import Data.Bits
import qualified Data.ByteString as BW
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as LBS
import qualified Data.ByteString.UTF8 as B
import Data.Maybe
import qualified Data.Sequence as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as LT
import Data.Typeable
import Data.Word
import Text.RE.ZeInternals.Types.Capture
import Text.RE.ZeInternals.Types.CaptureID
import Text.Regex.Base
import qualified Text.Regex.PCRE as PCRE
import qualified Text.Regex.TDFA as TDFA
infixl 9 !$, !$$
\end{code}
\begin{code}
-- | the result of matching a RE to a text once (with @?=~@), retaining
-- the text that was matched against
data Match a =
Match
{ matchSource :: !a -- ^ the whole source text
, captureNames :: !CaptureNames -- ^ the RE's capture names
, matchArray :: !(Array CaptureOrdinal (Capture a))
-- ^ 0..n-1 captures,
-- starting with the
-- text matched by the
-- whole RE
}
deriving (Show,Eq,Typeable)
\end{code}
\begin{code}
-- | Construct a Match that does not match anything.
noMatch :: a -> Match a
noMatch t = Match t noCaptureNames emptyMatchArray
-- | an empty array of Capture
emptyMatchArray :: Array CaptureOrdinal (Capture a)
emptyMatchArray = listArray (CaptureOrdinal 0,CaptureOrdinal $ -1) []
\end{code}
\begin{code}
instance Functor Match where
fmap f Match{..} =
Match
{ matchSource = f matchSource
, captureNames = captureNames
, matchArray = fmap (fmap f) matchArray
}
\end{code}
\begin{code}
-- | tests whether the RE matched the source text at all
matched :: Match a -> Bool
matched = isJust . matchCapture
-- | yields the text matched by the RE, Nothing if no match
matchedText :: Match a -> Maybe a
matchedText = fmap capturedText . matchCapture
-- | the top-level capture if the source text matched the RE,
-- Nothing otherwise
matchCapture :: Match a -> Maybe (Capture a)
matchCapture = fmap fst . matchCaptures
-- | the main top-level capture (capture \'0'') and the sub captures
-- if the text matched the RE, @Nothing@ otherwise
matchCaptures :: Match a -> Maybe (Capture a,[Capture a])
matchCaptures Match{..} = case rangeSize (bounds matchArray) == 0 of
True -> Nothing
False -> Just (matchArray!0,drop 1 $ elems matchArray)
-- | an alternative for captureText
(!$$) :: Match a -> CaptureID -> a
(!$$) = flip captureText
-- | look up the text of the nth capture, 0 being the match of the whole
-- RE against the source text, 1, the first bracketed sub-expression to
-- be matched and so on
captureText :: CaptureID -> Match a -> a
captureText cid mtch = capturedText $ capture cid mtch
-- | an alternative for captureTextMaybe
(!$$?) :: Match a -> CaptureID -> Maybe a
(!$$?) = flip captureTextMaybe
-- | look up the text of the nth capture (0 being the match of the
-- whole), returning Nothing if the Match doesn't contain the capture
captureTextMaybe :: CaptureID -> Match a -> Maybe a
captureTextMaybe cid mtch = do
cap <- mtch !$? cid
case hasCaptured cap of
True -> Just $ capturedText cap
False -> Nothing
-- | an alternative for capture
(!$) :: Match a -> CaptureID -> Capture a
(!$) = flip capture
-- | look up the nth capture, 0 being the match of the whole RE against
-- the source text, 1, the first bracketed sub-expression to be matched
-- and so on
capture :: CaptureID -> Match a -> Capture a
capture cid mtch = fromMaybe oops $ mtch !$? cid
where
oops = error $ "capture: out of bounds (" ++ show cid ++ ")"
-- | an alternative for capture captureMaybe
(!$?) :: Match a -> CaptureID -> Maybe (Capture a)
(!$?) = flip captureMaybe
-- | look up the nth capture, 0 being the match of the whole RE against
-- the source text, 1, the first bracketed sub-expression to be matched
-- and so on, returning Nothing if there is no such capture, or if the
-- capture failed to capture anything (being in a failed alternate)
captureMaybe :: CaptureID -> Match a -> Maybe (Capture a)
captureMaybe cid mtch@Match{..} = do
i <- lookupCaptureID cid mtch
cap <- case bounds matchArray `inRange` i of
True -> Just $ matchArray ! i
False -> Nothing
case hasCaptured cap of
True -> Just cap
False -> Nothing
lookupCaptureID :: CaptureID -> Match a -> Maybe CaptureOrdinal
lookupCaptureID cid Match{..} =
either (const Nothing) Just $ findCaptureID cid captureNames
\end{code}
\begin{code}
-- | this instance hooks 'Match' into regex-base: regex consumers need
-- not worry about any of this
instance
( RegexContext regex source (AllTextSubmatches (Array Int) (source,(Int,Int)))
, RegexLike regex source
, RegexFix regex source
) =>
RegexContext regex source (Match source) where
match r s = convertMatchText r s $ getAllTextSubmatches $ match r s
matchM r s = do
y <- matchM r s
return $ convertMatchText r s $ getAllTextSubmatches y
\end{code}
\begin{code}
-- | convert a regex-base native MatchText into a regex Match type
convertMatchText :: RegexFix regex source
=> regex
-> source
-> MatchText source
-> Match source
convertMatchText re hay arr =
Match
{ matchSource = hay
, captureNames = noCaptureNames
, matchArray =
ixmap (CaptureOrdinal lo,CaptureOrdinal hi) getCaptureOrdinal $
fmap f arr
}
where
(lo,hi) = bounds arr
f (ndl,(off_,len_)) =
Capture
{ captureSource = hay
, capturedText = ndl
, captureOffset = off
, captureLength = len
}
where
CharRange off len = utf8_correct re hay off_ len_
\end{code}
\begin{code}
data CharRange = CharRange !Int !Int
deriving (Show)
class RegexFix regex source where
utf8_correct :: regex -> source -> Int -> Int -> CharRange
utf8_correct _ _ = CharRange
instance RegexFix TDFA.Regex [Char] where
instance RegexFix TDFA.Regex B.ByteString where
instance RegexFix TDFA.Regex LBS.ByteString where
instance RegexFix TDFA.Regex T.Text where
instance RegexFix TDFA.Regex LT.Text where
instance RegexFix TDFA.Regex (S.Seq Char) where
instance RegexFix PCRE.Regex [Char] where
utf8_correct _ = utf8_correct_bs . B.fromString
instance RegexFix PCRE.Regex B.ByteString where
instance RegexFix PCRE.Regex LBS.ByteString where
instance RegexFix PCRE.Regex T.Text where
utf8_correct _ = utf8_correct_bs . T.encodeUtf8
instance RegexFix PCRE.Regex LT.Text where
utf8_correct _ = utf8_correct_bs . T.encodeUtf8 . LT.toStrict
instance RegexFix PCRE.Regex (S.Seq Char) where
-- convert a byte offset+length in a UTF-8-encoded ByteString
-- into a character offset+length
utf8_correct_bs :: B.ByteString -> Int -> Int -> CharRange
utf8_correct_bs bs ix0 ln0 = case ix0+ln0 > BW.length bs of
True -> error "utf8_correct_bs: index+length out of range"
False -> skip 0 0 -- BW.index calls below should not fail
where
skip ix di = case compare ix ix0 of
GT -> error "utf8_correct_bs: UTF-8 decoding error"
EQ -> count ix di 0 ln0
LT -> case u8_width $ BW.index bs ix of
Single -> skip (ix+1) di
Double -> skip (ix+2) $ di+1
Triple -> skip (ix+3) $ di+2
Quadruple -> skip (ix+4) $ di+3
count ix di dl c = case compare c 0 of
LT -> error "utf8_correct_bs: length ends inside character"
EQ -> CharRange (ix0-di) (ln0-dl)
GT -> case u8_width $ BW.index bs ix of
Single -> count (ix+1) di dl $ c-1
Double -> count (ix+2) di (dl+1) $ c-2
Triple -> count (ix+3) di (dl+2) $ c-3
Quadruple -> count (ix+4) di (dl+3) $ c-4
data UTF8Size = Single | Double | Triple | Quadruple
deriving (Show)
u8_width :: Word8 -> UTF8Size
u8_width w8 = case w8 .&. 0x80 == 0x00 of
True -> Single
False -> case w8 .&. 0xE0 == 0xC0 of
True -> Double
False -> case w8 .&. 0xF0 == 0xE0 of
True -> Triple
False -> case w8 .&. 0xF8 == 0xF0 of
True -> Quadruple
False -> error "u8_width: UTF-8 decoding error"
\end{code}