haskell-foundation · vincenthz · Jan 13, 2018 · Dec 6, 2017 · Dec 8, 2017 · Dec 8, 2017
diff --git a/basement/Basement/String.hs b/basement/Basement/String.hs
@@ -115,6 +115,7 @@ import           Basement.FinalPtr
 import           Basement.IntegralConv
 import           Basement.Floating
 import           Basement.MutableBuilder
+import           Basement.String.CaseMapping (upperMapping, lowerMapping)
 import           Basement.UTF8.Table
 import           Basement.UTF8.Helper
 import           Basement.UTF8.Base
@@ -1317,15 +1318,68 @@ decimalDigitsPtr startAcc ptr !endOfs !startOfs = loop startAcc startOfs
 {-# SPECIALIZE decimalDigitsPtr :: Int -> Addr# -> Offset Word8 -> Offset Word8 -> (# Int, Bool, Offset Word8 #) #-}
 {-# SPECIALIZE decimalDigitsPtr :: Word -> Addr# -> Offset Word8 -> Offset Word8 -> (# Word, Bool, Offset Word8 #) #-}
 
+-- | A unicode string size may vary during a case conversion operation.
+--   This function calculates the new buffer size for a case conversion.
+--   Returns Nothing if no case conversion is needed.
+caseConvertNBuff :: (Char -> CM) -> String -> Maybe (CountOf Word8)
+caseConvertNBuff op s@(String ba) = runST $ Vec.unsafeIndexer ba go
+  where
+    !sz = size s
+    !end = azero `offsetPlusE` sz
+    go :: (Offset Word8 -> Word8) -> ST st (Maybe (CountOf Word8))
+    go getIdx = loop (Offset 0) 0 False
+      where
+        !nextI = nextWithIndexer getIdx
+        eSize !e = if e == '\0' 
+                      then 0
+                      else charToBytes (fromEnum e)
+        loop !idx ns changed 
+            | idx == end = if changed
+                             then return $ Just ns
+                             else return Nothing
+            | otherwise = do
+                let !(c, idx') = nextI idx
+                    !cm@(CM c1 c2 c3) = op c 
+                    !cSize = if c2 == '\0' -- if c2 is empty, c3 will be empty as well.
+                              then charToBytes (fromEnum c1) 
+                              else eSize c1 + eSize c2 + eSize c3
+                    !nchanged = changed || c1 /= c || c2 /= '\0'
+                loop idx' (ns + cSize) nchanged
+
+-- | Convert a 'String' 'Char' by 'Char' using a case mapping function. 
+caseConvert :: (Char -> CM) -> String -> String
+caseConvert op s@(String ba) 
+  = case nBuff of
+      Nothing -> s
+      Just nLen -> runST $ unsafeCopyFrom s nLen go
+  where
+    !nBuff = caseConvertNBuff op s
+    go :: String -> Offset Char -> Offset8 -> MutableString s -> Offset8 -> ST s (Offset8, Offset8)
+    go src' srcI srcIdx dst dstIdx = do
+      let !(CM c1 c2 c3) = op c 
+      dstIdx <- write dst dstIdx c1
+      nextDstIdx <- 
+        if c2 == '\0' -- We don't want to check C3 if C2 is empty.
+          then return dstIdx
+          else do
+            dstIdx  <- writeValidChar c2 dstIdx
+            writeValidChar c3 dstIdx
+      return (nextSrcIdx, nextDstIdx)
+          where
+            !(Step c nextSrcIdx) = next src' srcIdx
+            writeValidChar cc wIdx =
+                if cc == '\0'
+                    then return wIdx 
+                else do
+                    write dst wIdx cc
+
 -- | Convert a 'String' to the upper-case equivalent.
---   Does not properly support multicharacter Unicode conversions.
 upper :: String -> String
-upper = charMap toUpper
+upper = caseConvert upperMapping
 
 -- | Convert a 'String' to the upper-case equivalent.
---   Does not properly support multicharacter Unicode conversions.
 lower :: String -> String
-lower = charMap toLower
+lower = caseConvert lowerMapping
 
 -- | Check whether the first string is a prefix of the second string.
 isPrefixOf :: String -> String -> Bool

diff --git a/basement/Basement/String/CaseMapping.hs b/basement/Basement/String/CaseMapping.hs
diff --git a/basement/Basement/UTF8/Types.hs b/basement/Basement/UTF8/Types.hs
@@ -9,6 +9,8 @@ module Basement.UTF8.Types
     , isValidStepDigit
     -- * Unicode Errors
     , ValidationFailure(..)
+    -- * Case Conversion
+    , CM (..)
     ) where
 
 import           Basement.Compat.Base
@@ -34,6 +36,9 @@ newtype StepDigit = StepDigit Word8
 -- | Step when processing ASCII character
 newtype StepASCII = StepASCII Word8
 
+-- | Specialized tuple used for case mapping.
+data CM = CM {-# UNPACK #-} !Char {-# UNPACK #-} !Char {-# UNPACK #-} !Char deriving (Eq)
+
 isValidStepASCII :: StepASCII -> Bool
 isValidStepASCII (StepASCII w) = w < 0x80
 

diff --git a/basement/basement.cabal b/basement/basement.cabal
@@ -124,6 +124,7 @@ library
 
                      Basement.UArray.Base
 
+                     Basement.String.CaseMapping
                      Basement.String.Encoding.Encoding
                      Basement.String.Encoding.UTF16
                      Basement.String.Encoding.UTF32

diff --git a/scripts/caseMapping/CaseFolding.hs b/scripts/caseMapping/CaseFolding.hs
@@ -0,0 +1,52 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module CaseFolding
+    (
+      CaseFolding(..)
+    , Fold(..)
+    , parseCF
+    , mapCF
+    ) where
+
+import           Foundation 
+import           Foundation.IO
+import qualified Foundation.Parser as P 
+import qualified Foundation.String as S (lower, fromBytesUnsafe)
+import           Foundation.VFS.FilePath
+
+import UnicodeParsers
+
+data Fold = Fold {
+      code :: String
+    , status :: Char
+    , mapping :: [String]
+    , name :: String
+    } deriving (Eq, Ord, Show)
+
+data CaseFolding = CF { cfComments :: [Comment], cfFolding :: [Fold] }
+                 deriving (Show)
+
+entries :: P.Parser String CaseFolding
+entries = CF <$> P.many comment <*> P.some entry
+  where
+    entry = Fold <$> unichar <* semiCol
+                 <*> oneOf "CFST" <* P.string ";"
+                 <*> unichars <* semiCol
+                 <*> (P.string "# " *> P.takeWhile (/= '\n')) <* P.string "\n"
+
+parseCF :: FilePath -> IO (Either (P.ParseError String) CaseFolding)
+parseCF name = P.parseOnly entries . S.fromBytesUnsafe <$> readFile name
+
+mapCF :: (String -> String) -> CaseFolding -> [String]
+mapCF twiddle (CF _ ms) = typ <> (fmap nice . filter p $ ms) <> [last]
+    where
+      typ    = ["foldMapping :: Char -> CM",
+                "{-# NOINLINE foldMapping #-}"]
+      last   = "foldMapping c = CM (toLower c) '\\0' '\\0'"
+      p f    = status f `elem` ("CF" :: String) &&
+               mapping f /= [twiddle (code f)]
+      nice c = "-- " <> name c <> "\n" <>
+               "foldMapping " <> niceMap (code c) <> " = CM " <> x <> " " <> y <> " " <> z
+           where pMap = (niceMap <$> mapping c) <> ["'\\0'","'\\0'","'\\0'"]
+                 niceMap x = "'\\x" <> x <> "'" 
+                 [x,y,z] = take (CountOf 3) pMap
diff --git a/scripts/caseMapping/CaseMapping.hs b/scripts/caseMapping/CaseMapping.hs
@@ -0,0 +1,42 @@
+{-# LANGUAGE OverloadedStrings #-}
+module CaseMapping (
+  main
+) where
+
+import qualified Basement.String as BS (charMap)
+import           Data.Char (toUpper, toLower)
+import           Foundation
+import           Foundation.IO 
+import qualified Foundation.String as S
+
+import CaseFolding
+import SpecialCasing
+
+main = do
+  psc <- parseSC "SpecialCasing.txt"
+  pcf <- parseCF "CaseFolding.txt"
+  scs <- case psc of
+           Left err -> putStrLn (show err) >> undefined
+           Right sc -> return sc
+  cfs <- case pcf of
+           Left err -> putStrLn (show err) >> undefined
+           Right cf -> return cf
+  h <- openFile ("../../basement/Basement/String/CaseMapping.hs") WriteMode
+  let comments = ("--" <>) <$>
+                 take 2 (cfComments cfs) <> take 2 (scComments scs)
+  (hPut h) . S.toBytes S.UTF8  . intercalate "\n" $
+                      ["{-# LANGUAGE Rank2Types #-}"
+                      ,"-- AUTOMATICALLY GENERATED - DO NOT EDIT"
+                      ,"-- Generated by scripts/caseMapping/generateCaseMapping.sh"] 
+                      <> comments <>
+                      [""
+                      ,"module Basement.String.CaseMapping where"
+                      ,""
+                      ,"import Data.Char"
+                      ,"import Basement.UTF8.Types"
+                      ,"",""]
+  (hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ (mapSC "upper" upper) (BS.charMap toUpper) scs
+  (hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ (mapSC "lower" lower) (BS.charMap toLower) scs
+  (hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ (mapSC "title" title) (BS.charMap toUpper) scs
+  (hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ mapCF (BS.charMap toLower) cfs
+  closeFile h
diff --git a/scripts/caseMapping/SpecialCasing.hs b/scripts/caseMapping/SpecialCasing.hs
@@ -0,0 +1,70 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module SpecialCasing
+    (
+      SpecialCasing(..)
+    , Case(..)
+    , parseSC
+    , mapSC
+    ) where
+
+import qualified Data.Char as C (toUpper)  
+
+import           Foundation 
+import           Foundation.IO
+import qualified Foundation.Parser as P 
+import           Foundation.VFS.FilePath
+import           Foundation.Collection   (nonEmpty_)
+import qualified Foundation.String as S 
+
+import UnicodeParsers
+
+data SpecialCasing = SC {scComments :: [Comment], scCasing :: [Case]}
+                        deriving (Show)
+
+data Case = Case {
+    code       :: String
+  , lower      :: [String]
+  , title      :: [String]
+  , upper      :: [String]
+  , conditions :: String
+  , name       :: String
+} deriving (Eq, Ord, Show)
+
+
+entries :: P.Parser String SpecialCasing
+entries = SC <$> P.many comment <*> P.many (entry <* P.many comment)
+  where
+    entry = Case <$> unichar  <* P.string ";"
+                 <*> unichars <* P.string ";"
+                 <*> unichars <* P.string ";"
+                 <*> unichars <* P.string "; " 
+                 <*> (P.takeWhile (/= '#') <* P.string "# ")
+                 <*> P.takeWhile (/= '\n') <* P.string "\n" 
+
+parseSC :: FilePath -> IO (Either (P.ParseError String) SpecialCasing)
+parseSC name = P.parseOnly entries . S.fromBytesUnsafe <$> readFile name
+
+mapSC :: String -> (Case -> [String]) -> (String -> String) -> SpecialCasing -> [String]
+mapSC wich access twiddle (SC _ ms) =
+    typ `mappend` (fmap nice . filter p $ ms) `mappend` last
+  where
+    typ    = [wich <>  "Mapping :: Char -> CM",
+              "{-# NOINLINE " <> wich <> "Mapping #-}"]
+    last   = [wich <> "Mapping c = CM (to" <> ucFst wich 
+              <> " c) '\\0' '\\0'","",""]
+    p c = [k] /= a && a /= [twiddle k] && null (conditions c)
+        where a = access c
+              k = code c
+    nice c = "-- " <> name c <> "\n" <>
+             wich <> "Mapping " <> pHex(code c) <> " = CM " 
+             <> x <> " " <> y <> " " <> z
+        where pMap = (pHex <$> access c) <> ["'\\0'","'\\0'","'\\0'"]
+              pHex x = "'\\x" <> x <> "'" 
+              [x,y,z] = take (CountOf 3) pMap
+
+ucFst :: String -> String
+ucFst s 
+  | null s  = "" 
+  | otherwise = (fromString [C.toUpper (head neS)]) <> tail neS
+  where neS = nonEmpty_ s
diff --git a/scripts/caseMapping/UnicodeParsers.hs b/scripts/caseMapping/UnicodeParsers.hs
@@ -0,0 +1,31 @@
+{-# LANGUAGE OverloadedStrings #-}
+module UnicodeParsers where
+
+import Foundation 
+import qualified Foundation.Parser as P
+import Foundation.String as S
+import Foundation.Collection (Element)
+
+type Comment = String
+
+hexDigits :: String
+hexDigits = "1234567890ABCDEF"
+
+comment :: P.Parser String Comment
+comment = (P.string "#" *> P.takeWhile (/= '\n') <* P.string "\n") <|> (P.string "\n" *> pure "")
+
+unichar :: P.Parser String String
+unichar = P.takeWhile (`elem` hexDigits)
+
+unichars :: P.Parser String [String]
+unichars = P.some elemz
+    where elemz = P.string " " *> unichar
+
+semiCol :: P.Parser String ()
+semiCol = P.string "; "
+
+oneOf :: String -> P.Parser String Char
+oneOf s = P.satisfy_ (`elem` s)
+
+spaces :: P.Parser String ()
+spaces = P.skipWhile (== ' ')
diff --git a/scripts/caseMapping/generateCaseMapping.sh b/scripts/caseMapping/generateCaseMapping.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+# This script will generate the unicode case mappings tables using unicode's 
+# CaseFolding <https://unicode.org/Public/UNIDATA/CaseFolding.txt> 
+# and SpecialCasing.txt <https://unicode.org/Public/UNIDATA/SpecialCasing.txt> files.
+#
+# Those two files should be downloaded and placed in the same directory as this script.
+#
+stack runghc -- -XNoImplicitPrelude -XRebindableSyntax -XTypeFamilies -XBangPatterns -XDeriveDataTypeable CaseMapping.hs
diff --git a/tests/Test/Foundation/String.hs b/tests/Test/Foundation/String.hs
@@ -59,6 +59,21 @@ testStringCases =
               "this is only a simple string but quite longer than the 64 bytes used in the modified UTF8 parser"
               "this is only a simple string but quite longer than the 64 bytes used in the modified UTF8 parser"
         ]
+    , Group "CaseMapping" 
+         [ Property "upper . upper == upper" $ \l ->
+             let s = fromList l
+              in upper (upper s) === upper s
+         , CheckPlan "a should capitalize to A" $ validate "a" $ upper "a" == "A"
+         , CheckPlan "b should capitalize to B" $ validate "b" $ upper "b" == "B"
+         , CheckPlan "B should not capitalize" $ validate "B" $ upper "B" == "B"
+         , CheckPlan "é should capitalize to É" $ validate "é" $ upper "é" == "É"
+         , CheckPlan "ß should capitalize to SS" $ validate "ß" $ upper "ß" == "SS"
+         , CheckPlan "ﬄ should capitalize to FFL" $ validate "ﬄ" $ upper "ﬄ" == "FFL"
+         , CheckPlan "0a should capitalize to 0A" $ validate "0a" $ upper "\0a" == "\0A"
+         , CheckPlan "0a should capitalize to 0A" $ validate "0a" $ upper "a\0a" == "A\0A"
+         , CheckPlan "0a should capitalize to 0A" $ validate "0a" $ upper "\0\0" == "\0\0"
+         , CheckPlan "00 should not capitalize" $ validate "00" $ upper "00" == "00"
+        ]
     {-
     , testGroup "replace" [
           testCase "indices '' 'bb' should raise an error" $ do