Skip to content
This repository was archived by the owner on Sep 20, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 58 additions & 4 deletions basement/Basement/String.hs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ import Basement.FinalPtr
import Basement.IntegralConv
import Basement.Floating
import Basement.MutableBuilder
import Basement.String.CaseMapping (upperMapping, lowerMapping)
import Basement.UTF8.Table
import Basement.UTF8.Helper
import Basement.UTF8.Base
Expand Down Expand Up @@ -1317,15 +1318,68 @@ decimalDigitsPtr startAcc ptr !endOfs !startOfs = loop startAcc startOfs
{-# SPECIALIZE decimalDigitsPtr :: Int -> Addr# -> Offset Word8 -> Offset Word8 -> (# Int, Bool, Offset Word8 #) #-}
{-# SPECIALIZE decimalDigitsPtr :: Word -> Addr# -> Offset Word8 -> Offset Word8 -> (# Word, Bool, Offset Word8 #) #-}

-- | A unicode string size may vary during a case conversion operation.
-- This function calculates the new buffer size for a case conversion.
-- Returns Nothing if no case conversion is needed.
caseConvertNBuff :: (Char -> CM) -> String -> Maybe (CountOf Word8)
caseConvertNBuff op s@(String ba) = runST $ Vec.unsafeIndexer ba go
where
!sz = size s
!end = azero `offsetPlusE` sz
go :: (Offset Word8 -> Word8) -> ST st (Maybe (CountOf Word8))
go getIdx = loop (Offset 0) 0 False
where
!nextI = nextWithIndexer getIdx
eSize !e = if e == '\0'
then 0
Copy link
Contributor

@ndmitchell ndmitchell Jan 12, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is 0 of size 0 and not 1? Scratch that, I understand now

else charToBytes (fromEnum e)
loop !idx ns changed
| idx == end = if changed
then return $ Just ns
else return Nothing
| otherwise = do
let !(c, idx') = nextI idx
!cm@(CM c1 c2 c3) = op c
!cSize = if c2 == '\0' -- if c2 is empty, c3 will be empty as well.
then charToBytes (fromEnum c1)
else eSize c1 + eSize c2 + eSize c3
!nchanged = changed || c1 /= c || c2 /= '\0'
loop idx' (ns + cSize) nchanged

-- | Convert a 'String' 'Char' by 'Char' using a case mapping function.
caseConvert :: (Char -> CM) -> String -> String
caseConvert op s@(String ba)
= case nBuff of
Nothing -> s
Just nLen -> runST $ unsafeCopyFrom s nLen go
where
!nBuff = caseConvertNBuff op s
go :: String -> Offset Char -> Offset8 -> MutableString s -> Offset8 -> ST s (Offset8, Offset8)
go src' srcI srcIdx dst dstIdx = do
let !(CM c1 c2 c3) = op c
dstIdx <- write dst dstIdx c1
nextDstIdx <-
if c2 == '\0' -- We don't want to check C3 if C2 is empty.
then return dstIdx
else do
dstIdx <- writeValidChar c2 dstIdx
writeValidChar c3 dstIdx
return (nextSrcIdx, nextDstIdx)
where
!(Step c nextSrcIdx) = next src' srcIdx
writeValidChar cc wIdx =
if cc == '\0'
then return wIdx
else do
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style nitpick: the usual style is:

if x
    then a
    else b

write dst wIdx cc

-- | Convert a 'String' to the upper-case equivalent.
-- Does not properly support multicharacter Unicode conversions.
upper :: String -> String
upper = charMap toUpper
upper = caseConvert upperMapping

-- | Convert a 'String' to the upper-case equivalent.
-- Does not properly support multicharacter Unicode conversions.
lower :: String -> String
lower = charMap toLower
lower = caseConvert lowerMapping

-- | Check whether the first string is a prefix of the second string.
isPrefixOf :: String -> String -> Bool
Expand Down
3,240 changes: 3,240 additions & 0 deletions basement/Basement/String/CaseMapping.hs

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions basement/Basement/UTF8/Types.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ module Basement.UTF8.Types
, isValidStepDigit
-- * Unicode Errors
, ValidationFailure(..)
-- * Case Conversion
, CM (..)
) where

import Basement.Compat.Base
Expand All @@ -34,6 +36,9 @@ newtype StepDigit = StepDigit Word8
-- | Step when processing ASCII character
newtype StepASCII = StepASCII Word8

-- | Specialized tuple used for case mapping.
data CM = CM {-# UNPACK #-} !Char {-# UNPACK #-} !Char {-# UNPACK #-} !Char deriving (Eq)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these be Char or Word8?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please document the invariant that it's isomorphic to [Char] and that all trailing Char must be \0.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggest instead using CM Int Int Int with -1 as the sentinel, since the rest of the foundation stuff is Int not Char for the UTFsize etc.


isValidStepASCII :: StepASCII -> Bool
isValidStepASCII (StepASCII w) = w < 0x80

Expand Down
1 change: 1 addition & 0 deletions basement/basement.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ library

Basement.UArray.Base

Basement.String.CaseMapping
Basement.String.Encoding.Encoding
Basement.String.Encoding.UTF16
Basement.String.Encoding.UTF32
Expand Down
52 changes: 52 additions & 0 deletions scripts/caseMapping/CaseFolding.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{-# LANGUAGE OverloadedStrings #-}

module CaseFolding
(
CaseFolding(..)
, Fold(..)
, parseCF
, mapCF
) where

import Foundation
import Foundation.IO
import qualified Foundation.Parser as P
import qualified Foundation.String as S (lower, fromBytesUnsafe)
import Foundation.VFS.FilePath

import UnicodeParsers

data Fold = Fold {
code :: String
, status :: Char
, mapping :: [String]
, name :: String
} deriving (Eq, Ord, Show)

data CaseFolding = CF { cfComments :: [Comment], cfFolding :: [Fold] }
deriving (Show)

entries :: P.Parser String CaseFolding
entries = CF <$> P.many comment <*> P.some entry
where
entry = Fold <$> unichar <* semiCol
<*> oneOf "CFST" <* P.string ";"
<*> unichars <* semiCol
<*> (P.string "# " *> P.takeWhile (/= '\n')) <* P.string "\n"

parseCF :: FilePath -> IO (Either (P.ParseError String) CaseFolding)
parseCF name = P.parseOnly entries . S.fromBytesUnsafe <$> readFile name

mapCF :: (String -> String) -> CaseFolding -> [String]
mapCF twiddle (CF _ ms) = typ <> (fmap nice . filter p $ ms) <> [last]
where
typ = ["foldMapping :: Char -> CM",
"{-# NOINLINE foldMapping #-}"]
last = "foldMapping c = CM (toLower c) '\\0' '\\0'"
p f = status f `elem` ("CF" :: String) &&
mapping f /= [twiddle (code f)]
nice c = "-- " <> name c <> "\n" <>
"foldMapping " <> niceMap (code c) <> " = CM " <> x <> " " <> y <> " " <> z
where pMap = (niceMap <$> mapping c) <> ["'\\0'","'\\0'","'\\0'"]
niceMap x = "'\\x" <> x <> "'"
[x,y,z] = take (CountOf 3) pMap
42 changes: 42 additions & 0 deletions scripts/caseMapping/CaseMapping.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{-# LANGUAGE OverloadedStrings #-}
module CaseMapping (
main
) where

import qualified Basement.String as BS (charMap)
import Data.Char (toUpper, toLower)
import Foundation
import Foundation.IO
import qualified Foundation.String as S

import CaseFolding
import SpecialCasing

main = do
psc <- parseSC "SpecialCasing.txt"
pcf <- parseCF "CaseFolding.txt"
scs <- case psc of
Left err -> putStrLn (show err) >> undefined
Right sc -> return sc
cfs <- case pcf of
Left err -> putStrLn (show err) >> undefined
Right cf -> return cf
h <- openFile ("../../basement/Basement/String/CaseMapping.hs") WriteMode
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might be easier to have the haskell program write to stdout and redirect the stdout to the file when wanting to rewrite

let comments = ("--" <>) <$>
take 2 (cfComments cfs) <> take 2 (scComments scs)
(hPut h) . S.toBytes S.UTF8 . intercalate "\n" $
["{-# LANGUAGE Rank2Types #-}"
,"-- AUTOMATICALLY GENERATED - DO NOT EDIT"
,"-- Generated by scripts/caseMapping/generateCaseMapping.sh"]
<> comments <>
[""
,"module Basement.String.CaseMapping where"
,""
,"import Data.Char"
,"import Basement.UTF8.Types"
,"",""]
(hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ (mapSC "upper" upper) (BS.charMap toUpper) scs
(hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ (mapSC "lower" lower) (BS.charMap toLower) scs
(hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ (mapSC "title" title) (BS.charMap toUpper) scs
(hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ mapCF (BS.charMap toLower) cfs
closeFile h
70 changes: 70 additions & 0 deletions scripts/caseMapping/SpecialCasing.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{-# LANGUAGE OverloadedStrings #-}

module SpecialCasing
(
SpecialCasing(..)
, Case(..)
, parseSC
, mapSC
) where

import qualified Data.Char as C (toUpper)

import Foundation
import Foundation.IO
import qualified Foundation.Parser as P
import Foundation.VFS.FilePath
import Foundation.Collection (nonEmpty_)
import qualified Foundation.String as S

import UnicodeParsers

data SpecialCasing = SC {scComments :: [Comment], scCasing :: [Case]}
deriving (Show)

data Case = Case {
code :: String
, lower :: [String]
, title :: [String]
, upper :: [String]
, conditions :: String
, name :: String
} deriving (Eq, Ord, Show)


entries :: P.Parser String SpecialCasing
entries = SC <$> P.many comment <*> P.many (entry <* P.many comment)
where
entry = Case <$> unichar <* P.string ";"
<*> unichars <* P.string ";"
<*> unichars <* P.string ";"
<*> unichars <* P.string "; "
<*> (P.takeWhile (/= '#') <* P.string "# ")
<*> P.takeWhile (/= '\n') <* P.string "\n"

parseSC :: FilePath -> IO (Either (P.ParseError String) SpecialCasing)
parseSC name = P.parseOnly entries . S.fromBytesUnsafe <$> readFile name

mapSC :: String -> (Case -> [String]) -> (String -> String) -> SpecialCasing -> [String]
mapSC wich access twiddle (SC _ ms) =
typ `mappend` (fmap nice . filter p $ ms) `mappend` last
where
typ = [wich <> "Mapping :: Char -> CM",
"{-# NOINLINE " <> wich <> "Mapping #-}"]
last = [wich <> "Mapping c = CM (to" <> ucFst wich
<> " c) '\\0' '\\0'","",""]
p c = [k] /= a && a /= [twiddle k] && null (conditions c)
where a = access c
k = code c
nice c = "-- " <> name c <> "\n" <>
wich <> "Mapping " <> pHex(code c) <> " = CM "
<> x <> " " <> y <> " " <> z
where pMap = (pHex <$> access c) <> ["'\\0'","'\\0'","'\\0'"]
pHex x = "'\\x" <> x <> "'"
[x,y,z] = take (CountOf 3) pMap

ucFst :: String -> String
ucFst s
| null s = ""
| otherwise = (fromString [C.toUpper (head neS)]) <> tail neS
where neS = nonEmpty_ s
31 changes: 31 additions & 0 deletions scripts/caseMapping/UnicodeParsers.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{-# LANGUAGE OverloadedStrings #-}
module UnicodeParsers where

import Foundation
import qualified Foundation.Parser as P
import Foundation.String as S
import Foundation.Collection (Element)

type Comment = String

hexDigits :: String
hexDigits = "1234567890ABCDEF"

comment :: P.Parser String Comment
comment = (P.string "#" *> P.takeWhile (/= '\n') <* P.string "\n") <|> (P.string "\n" *> pure "")

unichar :: P.Parser String String
unichar = P.takeWhile (`elem` hexDigits)

unichars :: P.Parser String [String]
unichars = P.some elemz
where elemz = P.string " " *> unichar

semiCol :: P.Parser String ()
semiCol = P.string "; "

oneOf :: String -> P.Parser String Char
oneOf s = P.satisfy_ (`elem` s)

spaces :: P.Parser String ()
spaces = P.skipWhile (== ' ')
9 changes: 9 additions & 0 deletions scripts/caseMapping/generateCaseMapping.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/sh

# This script will generate the unicode case mappings tables using unicode's
# CaseFolding <https://unicode.org/Public/UNIDATA/CaseFolding.txt>
# and SpecialCasing.txt <https://unicode.org/Public/UNIDATA/SpecialCasing.txt> files.
#
# Those two files should be downloaded and placed in the same directory as this script.
#
stack runghc -- -XNoImplicitPrelude -XRebindableSyntax -XTypeFamilies -XBangPatterns -XDeriveDataTypeable CaseMapping.hs
15 changes: 15 additions & 0 deletions tests/Test/Foundation/String.hs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,21 @@ testStringCases =
"this is only a simple string but quite longer than the 64 bytes used in the modified UTF8 parser"
"this is only a simple string but quite longer than the 64 bytes used in the modified UTF8 parser"
]
, Group "CaseMapping"
[ Property "upper . upper == upper" $ \l ->
let s = fromList l
in upper (upper s) === upper s
, CheckPlan "a should capitalize to A" $ validate "a" $ upper "a" == "A"
, CheckPlan "b should capitalize to B" $ validate "b" $ upper "b" == "B"
, CheckPlan "B should not capitalize" $ validate "B" $ upper "B" == "B"
, CheckPlan "é should capitalize to É" $ validate "é" $ upper "é" == "É"
, CheckPlan "ß should capitalize to SS" $ validate "ß" $ upper "ß" == "SS"
, CheckPlan "ffl should capitalize to FFL" $ validate "ffl" $ upper "ffl" == "FFL"
, CheckPlan "0a should capitalize to 0A" $ validate "0a" $ upper "\0a" == "\0A"
, CheckPlan "0a should capitalize to 0A" $ validate "0a" $ upper "a\0a" == "A\0A"
, CheckPlan "0a should capitalize to 0A" $ validate "0a" $ upper "\0\0" == "\0\0"
, CheckPlan "00 should not capitalize" $ validate "00" $ upper "00" == "00"
]
{-
, testGroup "replace" [
testCase "indices '' 'bb' should raise an error" $ do
Expand Down