Text/RE/ZeInternals/Types/Match.lhs

\begin{code}
{-# LANGUAGE RecordWildCards            #-}
{-# LANGUAGE FlexibleInstances          #-}
{-# LANGUAGE UndecidableInstances       #-}
{-# LANGUAGE MultiParamTypeClasses      #-}
{-# LANGUAGE DeriveDataTypeable         #-}
{-# LANGUAGE MonoLocalBinds             #-}
\end{code}

\begin{code}
module Text.RE.ZeInternals.Types.Match
  ( Match(..)
  , noMatch
  , emptyMatchArray
  , matched
  , matchedText
  , matchCapture
  , matchCaptures
  , (!$$)
  , captureText
  , (!$$?)
  , captureTextMaybe
  , (!$)
  , capture
  , (!$?)
  , captureMaybe
  , RegexFix(..)
  , convertMatchText
  ) where
\end{code}

\begin{code}
import           Data.Array
import           Data.Bits
import qualified Data.ByteString                as BW
import qualified Data.ByteString.Char8          as B
import qualified Data.ByteString.Lazy.Char8     as LBS
import qualified Data.ByteString.UTF8           as B
import           Data.Maybe
import qualified Data.Sequence                  as S
import qualified Data.Text                      as T
import qualified Data.Text.Encoding             as T
import qualified Data.Text.Lazy                 as LT
import           Data.Typeable
import           Data.Word
import           Text.RE.ZeInternals.Types.Capture
import           Text.RE.ZeInternals.Types.CaptureID
import           Text.Regex.Base
import qualified Text.Regex.PCRE                as PCRE
import qualified Text.Regex.TDFA                as TDFA

infixl 9 !$, !$$
\end{code}

\begin{code}
-- | the result of matching a RE to a text once (with @?=~@), retaining
-- the text that was matched against
data Match a =
  Match
    { matchSource  :: !a                -- ^ the whole source text
    , captureNames :: !CaptureNames     -- ^ the RE's capture names
    , matchArray   :: !(Array CaptureOrdinal (Capture a))
                                        -- ^ 0..n-1 captures,
                                        -- starting with the
                                        -- text matched by the
                                        -- whole RE
    }
  deriving (Show,Eq,Typeable)
\end{code}

\begin{code}
-- | Construct a Match that does not match anything.
noMatch :: a -> Match a
noMatch t = Match t noCaptureNames emptyMatchArray

-- | an empty array of Capture
emptyMatchArray :: Array CaptureOrdinal (Capture a)
emptyMatchArray = listArray (CaptureOrdinal 0,CaptureOrdinal $ -1) []
\end{code}

\begin{code}
instance Functor Match where
  fmap f Match{..} =
    Match
      { matchSource  = f matchSource
      , captureNames = captureNames
      , matchArray   = fmap (fmap f) matchArray
      }
\end{code}

\begin{code}
-- | tests whether the RE matched the source text at all
matched :: Match a -> Bool
matched = isJust . matchCapture

-- | yields the text matched by the RE, Nothing if no match
matchedText :: Match a -> Maybe a
matchedText = fmap capturedText . matchCapture

-- | the top-level capture if the source text matched the RE,
-- Nothing otherwise
matchCapture :: Match a -> Maybe (Capture a)
matchCapture = fmap fst . matchCaptures

-- | the main top-level capture (capture \'0'') and the sub captures
-- if the text matched the RE, @Nothing@ otherwise
matchCaptures :: Match a -> Maybe (Capture a,[Capture a])
matchCaptures Match{..} = case rangeSize (bounds matchArray) == 0 of
  True  -> Nothing
  False -> Just (matchArray!0,drop 1 $ elems matchArray)

-- | an alternative for captureText
(!$$) :: Match a -> CaptureID -> a
(!$$) = flip captureText

-- | look up the text of the nth capture, 0 being the match of the whole
-- RE against the source text, 1, the first bracketed sub-expression to
-- be matched and so on
captureText :: CaptureID -> Match a -> a
captureText cid mtch = capturedText $ capture cid mtch

-- | an alternative for captureTextMaybe
(!$$?) :: Match a -> CaptureID -> Maybe a
(!$$?) = flip captureTextMaybe

-- | look up the text of the nth capture (0 being the match of the
-- whole), returning Nothing if the Match doesn't contain the capture
captureTextMaybe :: CaptureID -> Match a -> Maybe a
captureTextMaybe cid mtch = do
    cap <- mtch !$? cid
    case hasCaptured cap of
      True  -> Just $ capturedText cap
      False -> Nothing

-- | an alternative for capture
(!$) :: Match a -> CaptureID -> Capture a
(!$) = flip capture

-- | look up the nth capture, 0 being the match of the whole RE against
-- the source text, 1, the first bracketed sub-expression to be matched
-- and so on
capture :: CaptureID -> Match a -> Capture a
capture cid mtch = fromMaybe oops $ mtch !$? cid
  where
    oops = error $ "capture: out of bounds (" ++ show cid ++ ")"

-- | an alternative for capture captureMaybe
(!$?) :: Match a -> CaptureID -> Maybe (Capture a)
(!$?) = flip captureMaybe

-- | look up the nth capture, 0 being the match of the whole RE against
-- the source text, 1, the first bracketed sub-expression to be matched
-- and so on, returning Nothing if there is no such capture, or if the
-- capture failed to capture anything (being in a failed alternate)
captureMaybe :: CaptureID -> Match a -> Maybe (Capture a)
captureMaybe cid mtch@Match{..} = do
  i   <- lookupCaptureID cid mtch
  cap <- case bounds matchArray `inRange` i of
    True  -> Just $ matchArray ! i
    False -> Nothing
  case hasCaptured cap of
    True  -> Just cap
    False -> Nothing

lookupCaptureID :: CaptureID -> Match a -> Maybe CaptureOrdinal
lookupCaptureID cid Match{..} =
    either (const Nothing) Just $ findCaptureID cid captureNames
\end{code}


\begin{code}
-- | this instance hooks 'Match' into regex-base: regex consumers need
-- not worry about any of this
instance
    ( RegexContext regex source (AllTextSubmatches (Array Int) (source,(Int,Int)))
    , RegexLike    regex source
    , RegexFix     regex source
    ) =>
  RegexContext regex source (Match source) where
    match  r s = convertMatchText r s $ getAllTextSubmatches $ match r s
    matchM r s = do
      y <- matchM r s
      return $ convertMatchText r s $ getAllTextSubmatches y
\end{code}

\begin{code}
-- | convert a regex-base native MatchText into a regex Match type
convertMatchText :: RegexFix regex source
                 => regex
                 -> source
                 -> MatchText source
                 -> Match source
convertMatchText re hay arr =
    Match
      { matchSource  = hay
      , captureNames = noCaptureNames
      , matchArray   =
          ixmap (CaptureOrdinal lo,CaptureOrdinal hi) getCaptureOrdinal $
            fmap f arr
      }
  where
    (lo,hi) = bounds arr

    f (ndl,(off_,len_)) =
      Capture
        { captureSource = hay
        , capturedText  = ndl
        , captureOffset = off
        , captureLength = len
        }
      where
        CharRange off len = utf8_correct re hay off_ len_
\end{code}

\begin{code}
data CharRange = CharRange !Int !Int
  deriving (Show)

class RegexFix regex source where
  utf8_correct :: regex -> source -> Int -> Int -> CharRange
  utf8_correct _ _ = CharRange

instance RegexFix TDFA.Regex [Char]         where
instance RegexFix TDFA.Regex B.ByteString   where
instance RegexFix TDFA.Regex LBS.ByteString where
instance RegexFix TDFA.Regex T.Text         where
instance RegexFix TDFA.Regex LT.Text        where
instance RegexFix TDFA.Regex (S.Seq Char)   where

instance RegexFix PCRE.Regex [Char]         where
  utf8_correct _ = utf8_correct_bs . B.fromString
instance RegexFix PCRE.Regex B.ByteString   where
instance RegexFix PCRE.Regex LBS.ByteString where
instance RegexFix PCRE.Regex T.Text         where
  utf8_correct _ = utf8_correct_bs . T.encodeUtf8
instance RegexFix PCRE.Regex LT.Text        where
  utf8_correct _ = utf8_correct_bs . T.encodeUtf8 . LT.toStrict
instance RegexFix PCRE.Regex (S.Seq Char)   where

-- convert a byte offset+length in a UTF-8-encoded ByteString
-- into a character offset+length
utf8_correct_bs :: B.ByteString -> Int -> Int -> CharRange
utf8_correct_bs bs ix0 ln0 = case ix0+ln0 > BW.length bs of
    True  -> error "utf8_correct_bs: index+length out of range"
    False -> skip 0 0     -- BW.index calls below should not fail
  where
    skip ix di = case compare ix ix0 of
      GT -> error "utf8_correct_bs: UTF-8 decoding error"
      EQ -> count ix di 0 ln0
      LT -> case u8_width $ BW.index bs ix of
        Single    -> skip (ix+1)   di
        Double    -> skip (ix+2) $ di+1
        Triple    -> skip (ix+3) $ di+2
        Quadruple -> skip (ix+4) $ di+3

    count ix di dl c = case compare c 0 of
      LT -> error "utf8_correct_bs: length ends inside character"
      EQ -> CharRange (ix0-di) (ln0-dl)
      GT -> case u8_width $ BW.index bs ix of
        Single    -> count (ix+1) di  dl    $ c-1
        Double    -> count (ix+2) di (dl+1) $ c-2
        Triple    -> count (ix+3) di (dl+2) $ c-3
        Quadruple -> count (ix+4) di (dl+3) $ c-4

data UTF8Size = Single | Double | Triple | Quadruple
  deriving (Show)

u8_width :: Word8 -> UTF8Size
u8_width w8 = case   w8 .&. 0x80 == 0x00 of
  True  ->       Single
  False -> case      w8 .&. 0xE0 == 0xC0 of
    True  ->     Double
    False -> case    w8 .&. 0xF0 == 0xE0 of
      True  ->   Triple
      False -> case  w8 .&. 0xF8 == 0xF0 of
        True  -> Quadruple
        False -> error "u8_width: UTF-8 decoding error"
\end{code}