Skip to content

Commit

Permalink
1.0.1.1: Fix 1.0.1.0 release bug and provisionally fix UTF8/PCRE inte…
Browse files Browse the repository at this point in the history
…rworking

  * regex-pcre not working properly with UTF-8 text (fixes #141)
  * ZeInternals/SearchReplace (fixes #140)
  • Loading branch information
cdornan committed Jun 4, 2017
1 parent d6aca72 commit c45f1e9
Show file tree
Hide file tree
Showing 20 changed files with 310 additions and 146 deletions.
4 changes: 2 additions & 2 deletions Text/RE/ZeInternals/SearchReplace/PCRE/Text.hs
Expand Up @@ -24,8 +24,8 @@ import Language.Haskell.TH
import Language.Haskell.TH.Quote
import Text.RE.REOptions
import Text.RE.Tools.IsRegex
import Text.RE.ZeInternals.SearchReplace.TDFAEdPrime
import Text.RE.ZeInternals.TDFA
import Text.RE.ZeInternals.PCRE
import Text.RE.ZeInternals.SearchReplace.PCREEdPrime

-- | @[ed| ... \/\/\/ ... |]@, is equivalent to @[edMultilineSensitive| ... \/\/\/ ... |]@,
-- compiling a case-sensitive, multi-line 'SearchReplace'
Expand Down
4 changes: 2 additions & 2 deletions Text/RE/ZeInternals/SearchReplace/PCRE/Text/Lazy.hs
Expand Up @@ -24,8 +24,8 @@ import Language.Haskell.TH
import Language.Haskell.TH.Quote
import Text.RE.REOptions
import Text.RE.Tools.IsRegex
import Text.RE.ZeInternals.SearchReplace.TDFAEdPrime
import Text.RE.ZeInternals.TDFA
import Text.RE.ZeInternals.PCRE
import Text.RE.ZeInternals.SearchReplace.PCREEdPrime

-- | @[ed| ... \/\/\/ ... |]@, is equivalent to @[edMultilineSensitive| ... \/\/\/ ... |]@,
-- compiling a case-sensitive, multi-line 'SearchReplace'
Expand Down
95 changes: 90 additions & 5 deletions Text/RE/ZeInternals/Types/Match.lhs
Expand Up @@ -23,17 +23,30 @@ module Text.RE.ZeInternals.Types.Match
, capture
, (!$?)
, captureMaybe
, RegexFix(..)
, convertMatchText
) where
\end{code}

\begin{code}
import Data.Array
import Data.Bits
import qualified Data.ByteString as BW
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as LBS
import qualified Data.ByteString.UTF8 as B
import Data.Maybe
import qualified Data.Sequence as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as LT
import Data.Typeable
import Data.Word
import Text.RE.ZeInternals.Types.Capture
import Text.RE.ZeInternals.Types.CaptureID
import Text.Regex.Base
import qualified Text.Regex.PCRE as PCRE
import qualified Text.Regex.TDFA as TDFA
infixl 9 !$, !$$
\end{code}
Expand Down Expand Up @@ -160,18 +173,23 @@ lookupCaptureID cid Match{..} =
instance
( RegexContext regex source (AllTextSubmatches (Array Int) (source,(Int,Int)))
, RegexLike regex source
, RegexFix regex source
) =>
RegexContext regex source (Match source) where
match r s = convertMatchText s $ getAllTextSubmatches $ match r s
match r s = convertMatchText r s $ getAllTextSubmatches $ match r s
matchM r s = do
y <- matchM r s
return $ convertMatchText s $ getAllTextSubmatches y
return $ convertMatchText r s $ getAllTextSubmatches y
\end{code}

\begin{code}
-- | convert a regex-base native MatchText into a regex Match type
convertMatchText :: source -> MatchText source -> Match source
convertMatchText hay arr =
convertMatchText :: RegexFix regex source
=> regex
-> source
-> MatchText source
-> Match source
convertMatchText re hay arr =
Match
{ matchSource = hay
, captureNames = noCaptureNames
Expand All @@ -182,11 +200,78 @@ convertMatchText hay arr =
where
(lo,hi) = bounds arr
f (ndl,(off,len)) =
f (ndl,(off_,len_)) =
Capture
{ captureSource = hay
, capturedText = ndl
, captureOffset = off
, captureLength = len
}
where
CharRange off len = utf8_correct re hay off_ len_
\end{code}

\begin{code}
data CharRange = CharRange !Int !Int
deriving (Show)
class RegexFix regex source where
utf8_correct :: regex -> source -> Int -> Int -> CharRange
utf8_correct _ _ = CharRange
instance RegexFix TDFA.Regex [Char] where
instance RegexFix TDFA.Regex B.ByteString where
instance RegexFix TDFA.Regex LBS.ByteString where
instance RegexFix TDFA.Regex T.Text where
instance RegexFix TDFA.Regex LT.Text where
instance RegexFix TDFA.Regex (S.Seq Char) where
instance RegexFix PCRE.Regex [Char] where
utf8_correct _ = utf8_correct_bs . B.fromString
instance RegexFix PCRE.Regex B.ByteString where
instance RegexFix PCRE.Regex LBS.ByteString where
instance RegexFix PCRE.Regex T.Text where
utf8_correct _ = utf8_correct_bs . T.encodeUtf8
instance RegexFix PCRE.Regex LT.Text where
utf8_correct _ = utf8_correct_bs . T.encodeUtf8 . LT.toStrict
instance RegexFix PCRE.Regex (S.Seq Char) where
-- convert a byte offset+length in a UTF-8-encoded ByteString
-- into a character offset+length
utf8_correct_bs :: B.ByteString -> Int -> Int -> CharRange
utf8_correct_bs bs ix0 ln0 = case ix0+ln0 > BW.length bs of
True -> error "utf8_correct_bs: index+length out of range"
False -> skip 0 0 -- BW.index calls below should not fail
where
skip ix di = case compare ix ix0 of
GT -> error "utf8_correct_bs: UTF-8 decoding error"
EQ -> count ix di 0 ln0
LT -> case u8_width $ BW.index bs ix of
Single -> skip (ix+1) di
Double -> skip (ix+2) $ di+1
Triple -> skip (ix+3) $ di+2
Quadruple -> skip (ix+4) $ di+3
count ix di dl c = case compare c 0 of
LT -> error "utf8_correct_bs: length ends inside character"
EQ -> CharRange (ix0-di) (ln0-dl)
GT -> case u8_width $ BW.index bs ix of
Single -> count (ix+1) di dl $ c-1
Double -> count (ix+2) di (dl+1) $ c-2
Triple -> count (ix+3) di (dl+2) $ c-3
Quadruple -> count (ix+4) di (dl+3) $ c-4
data UTF8Size = Single | Double | Triple | Quadruple
deriving (Show)
u8_width :: Word8 -> UTF8Size
u8_width w8 = case w8 .&. 0x80 == 0x00 of
True -> Single
False -> case w8 .&. 0xE0 == 0xC0 of
True -> Double
False -> case w8 .&. 0xF0 == 0xE0 of
True -> Triple
False -> case w8 .&. 0xF8 == 0xF0 of
True -> Quadruple
False -> error "u8_width: UTF-8 decoding error"
\end{code}
5 changes: 3 additions & 2 deletions Text/RE/ZeInternals/Types/Matches.lhs
Expand Up @@ -71,10 +71,11 @@ mainCaptures ac = [ capture c0 cs | cs<-allMatches ac ]
instance
( RegexContext regex source [MatchText source]
, RegexLike regex source
, RegexFix regex source
) =>
RegexContext regex source (Matches source) where
match r s = Matches s $ map (convertMatchText s) $ match r s
match r s = Matches s $ map (convertMatchText r s) $ match r s
matchM r s = do
y <- matchM r s
return $ Matches s $ map (convertMatchText s) y
return $ Matches s $ map (convertMatchText r s) y
\end{code}
6 changes: 5 additions & 1 deletion changelog
@@ -1,6 +1,10 @@
-*-change-log-*-

1.0.1.0 Chris Dornan <chris.dornan@irisconnect.co.uk> 2017-06-05
1.0.1.1 Chris Dornan <chris.dornan@irisconnect.co.uk> 2017-06-04
* regex-pcre not working properly with UTF-8 text (#141)
* ZeInternals/SearchReplace (#140)

1.0.1.0 Chris Dornan <chris.dornan@irisconnect.co.uk> 2017-06-03
* Text.RE.PCRE.Text[.Lazy] (#58)
* Update LSTHaskell versions (#136)
* Add re-top example (#137)
Expand Down
126 changes: 94 additions & 32 deletions examples/re-tests.lhs
Expand Up @@ -28,6 +28,8 @@ import Control.Monad
import Data.Array
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as LBS
import qualified Data.ByteString.Lazy.UTF8 as LBS
import qualified Data.ByteString.UTF8 as B
import Data.Char
import qualified Data.Foldable as F
import qualified Data.HashMap.Strict as HM
Expand Down Expand Up @@ -377,18 +379,33 @@ search_replace_tests = testGroup "SearchReplace"
, testCase "TDFA.ed/LT" $ test LT.pack tdfa_eds
, testCase "TDFA.ed/T(d)" $ test T.pack tdfa_eds'
, testCase "PCRE.ed/LBS(d)" $ test LBS.pack pcre_eds'
, testg "TDFA.op/String" (T_ST.?=~/) (T_ST.*=~/) tdfa_sr
, testg "PCRE.op/String" (P_ST.?=~/) (P_ST.*=~/) pcre_sr
, testg "TDFA.op/B" (T_BS.?=~/) (T_BS.*=~/) tdfa_sr
, testg "PCRE.op/B" (P_BS.?=~/) (P_BS.*=~/) pcre_sr
, testg "TDFA.op/LBS" (TLBS.?=~/) (TLBS.*=~/) tdfa_sr
, testg "PCRE.op/LBS" (PLBS.?=~/) (PLBS.*=~/) pcre_sr
, testg "TDFA.op/T" (T_TX.?=~/) (T_TX.*=~/) tdfa_sr
, testg "PCRE.op/T" (P_TX.?=~/) (P_TX.*=~/) pcre_sr
, testg "TDFA.op/LT" (TLTX.?=~/) (TLTX.*=~/) tdfa_sr
, testg "PCRE.op/LT" (PLTX.?=~/) (PLTX.*=~/) pcre_sr
, testG "TDFA.op/S" (T_SQ.?=~/) (T_SQ.*=~/) tdfa_sr
, testG "PCRE.op/S" (P_SQ.?=~/) (P_SQ.*=~/) pcre_sr
, testg "TDFA.op" (T_ST.?=~/) (T_ST.*=~/) tdfa_sr
, testg "PCRE.op" (P_ST.?=~/) (P_ST.*=~/) pcre_sr
, testg "TDFA.op/String" (T_ST.?=~/) (T_ST.*=~/) tdfa_sr_str
, testg "PCRE.op/String" (P_ST.?=~/) (P_ST.*=~/) pcre_sr_str
, testg "TDFA.op/B" (T_BS.?=~/) (T_BS.*=~/) tdfa_sr_b
, testg "PCRE.op/B" (P_BS.?=~/) (P_BS.*=~/) pcre_sr_b
, testg "TDFA.op/LBS" (TLBS.?=~/) (TLBS.*=~/) tdfa_sr_lbs
, testg "PCRE.op/LBS" (PLBS.?=~/) (PLBS.*=~/) pcre_sr_lbs
, testg "TDFA.op/T" (T_TX.?=~/) (T_TX.*=~/) tdfa_sr_t
, testg "PCRE.op/T" (P_TX.?=~/) (P_TX.*=~/) pcre_sr_t
, testg "TDFA.op/LT" (TLTX.?=~/) (TLTX.*=~/) tdfa_sr_lt
, testg "PCRE.op/LT" (PLTX.?=~/) (PLTX.*=~/) pcre_sr_lt
, testG "TDFA.op/S" (T_SQ.?=~/) (T_SQ.*=~/) tdfa_sr_s
, testG "PCRE.op/S" (P_SQ.?=~/) (P_SQ.*=~/) pcre_sr_s
, testu "PCRE.U/String" id (P_ST.*=~/) [P_ST.ed|scientist///boffin|] (P_ST.*=~) [P_ST.re|λ-|]
, testu "PCRE.U/B" B.fromString (P_BS.*=~/) [P_BS.ed|scientist///boffin|] (P_BS.*=~) [P_BS.re|λ-|]
, testu "PCRE.U/LBS" LBS.fromString (PLBS.*=~/) [PLBS.ed|scientist///boffin|] (PLBS.*=~) [PLBS.re|λ-|]
, testu "PCRE.U/T" T.pack (P_TX.*=~/) [P_TX.ed|scientist///boffin|] (P_TX.*=~) [P_TX.re|λ-|]
, testu "PCRE.U/LT" LT.pack (PLTX.*=~/) [PLTX.ed|scientist///boffin|] (PLTX.*=~) [PLTX.re|λ-|]
, testu "PCRE.U/S" S.fromList (P_SQ.*=~/) [P_SQ.ed|scientist///boffin|] (P_SQ.*=~) [P_SQ.re|burble|]
, testu "TDFA.U/String" id (T_ST.*=~/) [T_ST.ed|scientist///boffin|] (T_ST.*=~) [T_ST.re|λ-|]
, testu "TDFA.U/B" B.fromString (T_BS.*=~/) [T_BS.ed|scientist///boffin|] (T_BS.*=~) [T_BS.re|burble|]
, testu "TDFA.U/LBS" LBS.fromString (TLBS.*=~/) [TLBS.ed|scientist///boffin|] (TLBS.*=~) [TLBS.re|burble|]
, testu "TDFA.U/T" T.pack (T_TX.*=~/) [T_TX.ed|scientist///boffin|] (T_TX.*=~) [T_TX.re|λ-|]
, testu "TDFA.U/LT" LT.pack (TLTX.*=~/) [TLTX.ed|scientist///boffin|] (TLTX.*=~) [TLTX.re|λ-|]
, testu "TDFA.U/S" S.fromList (T_SQ.*=~/) [T_SQ.ed|scientist///boffin|] (T_SQ.*=~) [T_SQ.re|λ-|]
]
where
test :: IsRegex re a => (String->a) -> Edits Identity re a -> Assertion
Expand All @@ -404,39 +421,84 @@ search_replace_tests = testGroup "SearchReplace"
, testCase "*=~/" $ S.fromList rsm @=? S.fromList inp `opm` sr
]
testu lab inj op sr qop rex = testGroup lab
[ testCase "*=~/" $ inj unr @=? inj uni `op` sr
, testCase "*=~" $ 1 @=? countMatches (inj uni `qop` rex)
]
inp, rs1, rsm :: IsString a => a
inp = "16/03/2017 01/01/2000\n"
rs1 = "2017-03-16 01/01/2000\n"
rsm = "2017-03-16 2000-01-01\n"
tdfa_eds :: IsRegex TDFA.RE a => Edits Identity TDFA.RE a
tdfa_eds = Select [Template tdfa_sr]
uni, unr :: String
uni = "\x2070E-\8364-\955-scientist-burble"
unr = "\x2070E-\8364-\955-boffin-burble"
tdfa_eds :: IsRegex TDFA.RE a => Edits Identity TDFA.RE a
tdfa_eds = Select [Template tdfa_sr]
pcre_eds :: IsRegex PCRE.RE a => Edits Identity PCRE.RE a
pcre_eds = Select [Template pcre_sr]
tdfa_sr :: IsRegex TDFA.RE a => SearchReplace TDFA.RE a
tdfa_sr = [TDFA.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
pcre_sr :: IsRegex PCRE.RE a => SearchReplace PCRE.RE a
pcre_sr = [PCRE.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
tdfa_sr_str :: SearchReplace TDFA.RE String
tdfa_sr_str = [T_ST.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
pcre_sr_str :: SearchReplace PCRE.RE String
pcre_sr_str = [P_ST.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
tdfa_sr_b :: SearchReplace TDFA.RE B.ByteString
tdfa_sr_b = [T_BS.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
pcre_sr_b :: SearchReplace PCRE.RE B.ByteString
pcre_sr_b = [P_BS.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
tdfa_sr_lbs :: SearchReplace TDFA.RE LBS.ByteString
tdfa_sr_lbs = [TLBS.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
pcre_sr_lbs :: SearchReplace PCRE.RE LBS.ByteString
pcre_sr_lbs = [PLBS.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
tdfa_sr_t :: SearchReplace TDFA.RE T.Text
tdfa_sr_t = [T_TX.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
pcre_sr_t :: SearchReplace PCRE.RE T.Text
pcre_sr_t = [P_TX.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
tdfa_sr_lt :: SearchReplace TDFA.RE LT.Text
tdfa_sr_lt = [TLTX.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
pcre_eds :: IsRegex PCRE.RE a => Edits Identity PCRE.RE a
pcre_eds = Select [Template pcre_sr]
pcre_sr_lt :: SearchReplace PCRE.RE LT.Text
pcre_sr_lt = [PLTX.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
tdfa_sr :: IsRegex TDFA.RE a => SearchReplace TDFA.RE a
tdfa_sr = [TDFA.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
tdfa_sr_s :: SearchReplace TDFA.RE (S.Seq Char)
tdfa_sr_s = [T_SQ.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
pcre_sr :: IsRegex PCRE.RE a => SearchReplace PCRE.RE a
pcre_sr = [PCRE.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
pcre_sr_s :: SearchReplace PCRE.RE (S.Seq Char)
pcre_sr_s = [P_SQ.ed|${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})///${y}-${m}-${d}|]
tdfa_eds' :: IsRegex TDFA.RE a => Edits Identity TDFA.RE a
tdfa_eds' = Select [Template $ tdfa_csr "${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})" "${y}-${m}-${d}"]
tdfa_eds' :: IsRegex TDFA.RE a => Edits Identity TDFA.RE a
tdfa_eds' = Select [Template $ tdfa_csr "${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})" "${y}-${m}-${d}"]
pcre_eds' :: IsRegex PCRE.RE a => Edits Identity PCRE.RE a
pcre_eds' = Select [Template $ pcre_csr "${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})" "${y}-${m}-${d}"]
pcre_eds' :: IsRegex PCRE.RE a => Edits Identity PCRE.RE a
pcre_eds' = Select [Template $ pcre_csr "${d}([0-9]{2})/${m}([0-9]{2})/${y}([0-9]{4})" "${y}-${m}-${d}"]
tdfa_csr :: IsRegex TDFA.RE s
=> String
-> String
-> SearchReplace TDFA.RE s
tdfa_csr :: IsRegex TDFA.RE s
=> String
-> String
-> SearchReplace TDFA.RE s
tdfa_csr re_s = either error id . TDFA.compileSearchReplace re_s
pcre_csr :: IsRegex PCRE.RE s
=> String
-> String
-> SearchReplace PCRE.RE s
pcre_csr :: IsRegex PCRE.RE s
=> String
-> String
-> SearchReplace PCRE.RE s
pcre_csr re_s = either error id . PCRE.compileSearchReplace re_s
\end{code}

Expand Down
1 change: 1 addition & 0 deletions lib/cabal-masters/constraints-incl.cabal
Expand Up @@ -29,3 +29,4 @@
%- time-locale-compat == 0.1.*
%- transformers >= 0.2.2 && < 0.6
%- unordered-containers == 0.2.*
%- utf8-string == 1.0.*
2 changes: 1 addition & 1 deletion lib/cabal-masters/executables-incl.cabal
Expand Up @@ -66,7 +66,7 @@
Other-Modules:
TestKit

%build-depends-prog regex regex-with-pcre array base base-compat bytestring containers directory filepath heredoc regex-base regex-tdfa regex-tdfa-text regex-pcre-builtin shelly smallcheck tasty tasty-hunit tasty-smallcheck template-haskell text unordered-containers
%build-depends-prog regex regex-with-pcre array base base-compat bytestring containers directory filepath heredoc regex-base regex-tdfa regex-tdfa-text regex-pcre-builtin shelly smallcheck tasty tasty-hunit tasty-smallcheck template-haskell text unordered-containers utf8-string

%test-exe re-top
Hs-Source-Dirs: examples
Expand Down
2 changes: 1 addition & 1 deletion lib/cabal-masters/mega-regex.cabal
Expand Up @@ -38,7 +38,7 @@ Source-Repository this

%include "lib/cabal-masters/library-incl.cabal"

%build-depends-lib array bytestring base base-compat containers hashable regex-base regex-tdfa regex-tdfa-text regex-pcre-builtin regex-pcre-text template-haskell text time time-locale-compat transformers unordered-containers
%build-depends-lib array bytestring base base-compat containers hashable regex-base regex-tdfa regex-tdfa-text regex-pcre-builtin regex-pcre-text template-haskell text time time-locale-compat transformers unordered-containers utf8-string

%include "lib/cabal-masters/executables-incl.cabal"

Expand Down
2 changes: 1 addition & 1 deletion lib/cabal-masters/regex.cabal
Expand Up @@ -3,6 +3,6 @@ Name: regex
%include "lib/cabal-masters/constraints-incl.cabal"
%include "lib/cabal-masters/library-incl.cabal" exclude "PCRE"

%build-depends-lib array base base-compat bytestring containers hashable regex-base regex-tdfa regex-tdfa-text template-haskell text time time-locale-compat transformers unordered-containers
%build-depends-lib array base base-compat bytestring containers hashable regex-base regex-pcre-builtin regex-tdfa regex-tdfa-text template-haskell text time time-locale-compat transformers unordered-containers utf8-string

-- Generated with re-gen-cabals
1 change: 1 addition & 0 deletions lib/md/roadmap-incl.md
@@ -1,3 +1,4 @@
- [X] 2017-04-10 v1.0.0.0 [First stable release](https://github.com/iconnect/regex/milestone/3)
- [X] 2017-06-03 v1.0.1.0 [PCRE.Text, strict PVP, Update Stackage vrns, add re-top](https://github.com/iconnect/regex/milestone/19)
- [X] 2017-06-04 v1.0.1.1 [Fix 1.0.1.0 release bug and provisionally fix UTF8/PCRE interworking](https://github.com/iconnect/regex/milestone/20)
- [ ] 2017-08-31 v2.0.0.0 [Fast text replacement with benchmarks](https://github.com/iconnect/regex/milestone/4)

0 comments on commit c45f1e9

Please sign in to comment.