-
Notifications
You must be signed in to change notification settings - Fork 93
Unicode Case Mapping #461
Unicode Case Mapping #461
Changes from all commits
2e2f3f9
bac11eb
f9ead6c
219bc8a
dd5084b
175f5a8
18e80d6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -115,6 +115,7 @@ import Basement.FinalPtr | |
import Basement.IntegralConv | ||
import Basement.Floating | ||
import Basement.MutableBuilder | ||
import Basement.String.CaseMapping (upperMapping, lowerMapping) | ||
import Basement.UTF8.Table | ||
import Basement.UTF8.Helper | ||
import Basement.UTF8.Base | ||
|
@@ -1317,15 +1318,68 @@ decimalDigitsPtr startAcc ptr !endOfs !startOfs = loop startAcc startOfs | |
{-# SPECIALIZE decimalDigitsPtr :: Int -> Addr# -> Offset Word8 -> Offset Word8 -> (# Int, Bool, Offset Word8 #) #-} | ||
{-# SPECIALIZE decimalDigitsPtr :: Word -> Addr# -> Offset Word8 -> Offset Word8 -> (# Word, Bool, Offset Word8 #) #-} | ||
|
||
-- | A unicode string size may vary during a case conversion operation. | ||
-- This function calculates the new buffer size for a case conversion. | ||
-- Returns Nothing if no case conversion is needed. | ||
caseConvertNBuff :: (Char -> CM) -> String -> Maybe (CountOf Word8) | ||
caseConvertNBuff op s@(String ba) = runST $ Vec.unsafeIndexer ba go | ||
where | ||
!sz = size s | ||
!end = azero `offsetPlusE` sz | ||
go :: (Offset Word8 -> Word8) -> ST st (Maybe (CountOf Word8)) | ||
go getIdx = loop (Offset 0) 0 False | ||
where | ||
!nextI = nextWithIndexer getIdx | ||
eSize !e = if e == '\0' | ||
then 0 | ||
else charToBytes (fromEnum e) | ||
loop !idx ns changed | ||
| idx == end = if changed | ||
then return $ Just ns | ||
else return Nothing | ||
| otherwise = do | ||
let !(c, idx') = nextI idx | ||
!cm@(CM c1 c2 c3) = op c | ||
!cSize = if c2 == '\0' -- if c2 is empty, c3 will be empty as well. | ||
then charToBytes (fromEnum c1) | ||
else eSize c1 + eSize c2 + eSize c3 | ||
!nchanged = changed || c1 /= c || c2 /= '\0' | ||
loop idx' (ns + cSize) nchanged | ||
|
||
-- | Convert a 'String' 'Char' by 'Char' using a case mapping function. | ||
caseConvert :: (Char -> CM) -> String -> String | ||
caseConvert op s@(String ba) | ||
= case nBuff of | ||
Nothing -> s | ||
Just nLen -> runST $ unsafeCopyFrom s nLen go | ||
where | ||
!nBuff = caseConvertNBuff op s | ||
go :: String -> Offset Char -> Offset8 -> MutableString s -> Offset8 -> ST s (Offset8, Offset8) | ||
go src' srcI srcIdx dst dstIdx = do | ||
let !(CM c1 c2 c3) = op c | ||
dstIdx <- write dst dstIdx c1 | ||
nextDstIdx <- | ||
if c2 == '\0' -- We don't want to check C3 if C2 is empty. | ||
then return dstIdx | ||
else do | ||
dstIdx <- writeValidChar c2 dstIdx | ||
writeValidChar c3 dstIdx | ||
return (nextSrcIdx, nextDstIdx) | ||
where | ||
!(Step c nextSrcIdx) = next src' srcIdx | ||
writeValidChar cc wIdx = | ||
if cc == '\0' | ||
then return wIdx | ||
else do | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. style nitpick: the usual style is: if x
then a
else b |
||
write dst wIdx cc | ||
|
||
-- | Convert a 'String' to the upper-case equivalent. | ||
-- Does not properly support multicharacter Unicode conversions. | ||
upper :: String -> String | ||
upper = charMap toUpper | ||
upper = caseConvert upperMapping | ||
|
||
-- | Convert a 'String' to the upper-case equivalent. | ||
-- Does not properly support multicharacter Unicode conversions. | ||
lower :: String -> String | ||
lower = charMap toLower | ||
lower = caseConvert lowerMapping | ||
|
||
-- | Check whether the first string is a prefix of the second string. | ||
isPrefixOf :: String -> String -> Bool | ||
|
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,8 @@ module Basement.UTF8.Types | |
, isValidStepDigit | ||
-- * Unicode Errors | ||
, ValidationFailure(..) | ||
-- * Case Conversion | ||
, CM (..) | ||
) where | ||
|
||
import Basement.Compat.Base | ||
|
@@ -34,6 +36,9 @@ newtype StepDigit = StepDigit Word8 | |
-- | Step when processing ASCII character | ||
newtype StepASCII = StepASCII Word8 | ||
|
||
-- | Specialized tuple used for case mapping. | ||
data CM = CM {-# UNPACK #-} !Char {-# UNPACK #-} !Char {-# UNPACK #-} !Char deriving (Eq) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should these be Char or Word8? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please document the invariant that it's isomorphic to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggest instead using |
||
|
||
isValidStepASCII :: StepASCII -> Bool | ||
isValidStepASCII (StepASCII w) = w < 0x80 | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
{-# LANGUAGE OverloadedStrings #-} | ||
|
||
module CaseFolding | ||
( | ||
CaseFolding(..) | ||
, Fold(..) | ||
, parseCF | ||
, mapCF | ||
) where | ||
|
||
import Foundation | ||
import Foundation.IO | ||
import qualified Foundation.Parser as P | ||
import qualified Foundation.String as S (lower, fromBytesUnsafe) | ||
import Foundation.VFS.FilePath | ||
|
||
import UnicodeParsers | ||
|
||
data Fold = Fold { | ||
code :: String | ||
, status :: Char | ||
, mapping :: [String] | ||
, name :: String | ||
} deriving (Eq, Ord, Show) | ||
|
||
data CaseFolding = CF { cfComments :: [Comment], cfFolding :: [Fold] } | ||
deriving (Show) | ||
|
||
entries :: P.Parser String CaseFolding | ||
entries = CF <$> P.many comment <*> P.some entry | ||
where | ||
entry = Fold <$> unichar <* semiCol | ||
<*> oneOf "CFST" <* P.string ";" | ||
<*> unichars <* semiCol | ||
<*> (P.string "# " *> P.takeWhile (/= '\n')) <* P.string "\n" | ||
|
||
parseCF :: FilePath -> IO (Either (P.ParseError String) CaseFolding) | ||
parseCF name = P.parseOnly entries . S.fromBytesUnsafe <$> readFile name | ||
|
||
mapCF :: (String -> String) -> CaseFolding -> [String] | ||
mapCF twiddle (CF _ ms) = typ <> (fmap nice . filter p $ ms) <> [last] | ||
where | ||
typ = ["foldMapping :: Char -> CM", | ||
"{-# NOINLINE foldMapping #-}"] | ||
last = "foldMapping c = CM (toLower c) '\\0' '\\0'" | ||
p f = status f `elem` ("CF" :: String) && | ||
mapping f /= [twiddle (code f)] | ||
nice c = "-- " <> name c <> "\n" <> | ||
"foldMapping " <> niceMap (code c) <> " = CM " <> x <> " " <> y <> " " <> z | ||
where pMap = (niceMap <$> mapping c) <> ["'\\0'","'\\0'","'\\0'"] | ||
niceMap x = "'\\x" <> x <> "'" | ||
[x,y,z] = take (CountOf 3) pMap |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
{-# LANGUAGE OverloadedStrings #-} | ||
module CaseMapping ( | ||
main | ||
) where | ||
|
||
import qualified Basement.String as BS (charMap) | ||
import Data.Char (toUpper, toLower) | ||
import Foundation | ||
import Foundation.IO | ||
import qualified Foundation.String as S | ||
|
||
import CaseFolding | ||
import SpecialCasing | ||
|
||
main = do | ||
psc <- parseSC "SpecialCasing.txt" | ||
pcf <- parseCF "CaseFolding.txt" | ||
scs <- case psc of | ||
Left err -> putStrLn (show err) >> undefined | ||
Right sc -> return sc | ||
cfs <- case pcf of | ||
Left err -> putStrLn (show err) >> undefined | ||
Right cf -> return cf | ||
h <- openFile ("../../basement/Basement/String/CaseMapping.hs") WriteMode | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. might be easier to have the haskell program write to stdout and redirect the stdout to the file when wanting to rewrite |
||
let comments = ("--" <>) <$> | ||
take 2 (cfComments cfs) <> take 2 (scComments scs) | ||
(hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ | ||
["{-# LANGUAGE Rank2Types #-}" | ||
,"-- AUTOMATICALLY GENERATED - DO NOT EDIT" | ||
,"-- Generated by scripts/caseMapping/generateCaseMapping.sh"] | ||
<> comments <> | ||
["" | ||
,"module Basement.String.CaseMapping where" | ||
,"" | ||
,"import Data.Char" | ||
,"import Basement.UTF8.Types" | ||
,"",""] | ||
(hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ (mapSC "upper" upper) (BS.charMap toUpper) scs | ||
(hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ (mapSC "lower" lower) (BS.charMap toLower) scs | ||
(hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ (mapSC "title" title) (BS.charMap toUpper) scs | ||
(hPut h) . S.toBytes S.UTF8 . intercalate "\n" $ mapCF (BS.charMap toLower) cfs | ||
closeFile h |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
{-# LANGUAGE OverloadedStrings #-} | ||
|
||
module SpecialCasing | ||
( | ||
SpecialCasing(..) | ||
, Case(..) | ||
, parseSC | ||
, mapSC | ||
) where | ||
|
||
import qualified Data.Char as C (toUpper) | ||
|
||
import Foundation | ||
import Foundation.IO | ||
import qualified Foundation.Parser as P | ||
import Foundation.VFS.FilePath | ||
import Foundation.Collection (nonEmpty_) | ||
import qualified Foundation.String as S | ||
|
||
import UnicodeParsers | ||
|
||
data SpecialCasing = SC {scComments :: [Comment], scCasing :: [Case]} | ||
deriving (Show) | ||
|
||
data Case = Case { | ||
code :: String | ||
, lower :: [String] | ||
, title :: [String] | ||
, upper :: [String] | ||
, conditions :: String | ||
, name :: String | ||
} deriving (Eq, Ord, Show) | ||
|
||
|
||
entries :: P.Parser String SpecialCasing | ||
entries = SC <$> P.many comment <*> P.many (entry <* P.many comment) | ||
where | ||
entry = Case <$> unichar <* P.string ";" | ||
<*> unichars <* P.string ";" | ||
<*> unichars <* P.string ";" | ||
<*> unichars <* P.string "; " | ||
<*> (P.takeWhile (/= '#') <* P.string "# ") | ||
<*> P.takeWhile (/= '\n') <* P.string "\n" | ||
|
||
parseSC :: FilePath -> IO (Either (P.ParseError String) SpecialCasing) | ||
parseSC name = P.parseOnly entries . S.fromBytesUnsafe <$> readFile name | ||
|
||
mapSC :: String -> (Case -> [String]) -> (String -> String) -> SpecialCasing -> [String] | ||
mapSC wich access twiddle (SC _ ms) = | ||
typ `mappend` (fmap nice . filter p $ ms) `mappend` last | ||
where | ||
typ = [wich <> "Mapping :: Char -> CM", | ||
"{-# NOINLINE " <> wich <> "Mapping #-}"] | ||
last = [wich <> "Mapping c = CM (to" <> ucFst wich | ||
<> " c) '\\0' '\\0'","",""] | ||
p c = [k] /= a && a /= [twiddle k] && null (conditions c) | ||
where a = access c | ||
k = code c | ||
nice c = "-- " <> name c <> "\n" <> | ||
wich <> "Mapping " <> pHex(code c) <> " = CM " | ||
<> x <> " " <> y <> " " <> z | ||
where pMap = (pHex <$> access c) <> ["'\\0'","'\\0'","'\\0'"] | ||
pHex x = "'\\x" <> x <> "'" | ||
[x,y,z] = take (CountOf 3) pMap | ||
|
||
ucFst :: String -> String | ||
ucFst s | ||
| null s = "" | ||
| otherwise = (fromString [C.toUpper (head neS)]) <> tail neS | ||
where neS = nonEmpty_ s |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
{-# LANGUAGE OverloadedStrings #-} | ||
module UnicodeParsers where | ||
|
||
import Foundation | ||
import qualified Foundation.Parser as P | ||
import Foundation.String as S | ||
import Foundation.Collection (Element) | ||
|
||
type Comment = String | ||
|
||
hexDigits :: String | ||
hexDigits = "1234567890ABCDEF" | ||
|
||
comment :: P.Parser String Comment | ||
comment = (P.string "#" *> P.takeWhile (/= '\n') <* P.string "\n") <|> (P.string "\n" *> pure "") | ||
|
||
unichar :: P.Parser String String | ||
unichar = P.takeWhile (`elem` hexDigits) | ||
|
||
unichars :: P.Parser String [String] | ||
unichars = P.some elemz | ||
where elemz = P.string " " *> unichar | ||
|
||
semiCol :: P.Parser String () | ||
semiCol = P.string "; " | ||
|
||
oneOf :: String -> P.Parser String Char | ||
oneOf s = P.satisfy_ (`elem` s) | ||
|
||
spaces :: P.Parser String () | ||
spaces = P.skipWhile (== ' ') |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/bin/sh | ||
|
||
# This script will generate the unicode case mappings tables using unicode's | ||
# CaseFolding <https://unicode.org/Public/UNIDATA/CaseFolding.txt> | ||
# and SpecialCasing.txt <https://unicode.org/Public/UNIDATA/SpecialCasing.txt> files. | ||
# | ||
# Those two files should be downloaded and placed in the same directory as this script. | ||
# | ||
stack runghc -- -XNoImplicitPrelude -XRebindableSyntax -XTypeFamilies -XBangPatterns -XDeriveDataTypeable CaseMapping.hs |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is 0 of size 0 and not 1? Scratch that, I understand now