From d444bbbd4b117c936d2466191c5a82a21ca560e1 Mon Sep 17 00:00:00 2001 From: Sigbjorn Finne Date: Tue, 3 Mar 2009 07:14:11 -0800 Subject: [PATCH] added isUTF8Encoded predicate + utf8Encode to avoid repeated encodings --- Codec/Binary/UTF8/String.hs | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/Codec/Binary/UTF8/String.hs b/Codec/Binary/UTF8/String.hs index 27c003f..6f79d71 100644 --- a/Codec/Binary/UTF8/String.hs +++ b/Codec/Binary/UTF8/String.hs @@ -24,13 +24,9 @@ import Data.Char (chr,ord) default(Int) --- | Encode a string using 'encode' and store the result in a 'String'. encodeString :: String -> String encodeString xs = map (toEnum . fromEnum) (encode xs) --- | Decode a string using 'decode' using a 'String' as input. --- | This is not safe but it is necessary if UTF-8 encoded text --- | has been loaded into a 'String' prior to being decoded. decodeString :: String -> String decodeString xs = decode (map (toEnum . fromEnum) xs) @@ -66,20 +62,13 @@ decode [ ] = "" decode (c:cs) | c < 0x80 = chr (fromEnum c) : decode cs | c < 0xc0 = replacement_character : decode cs - | c < 0xe0 = multi1 + | c < 0xe0 = multi_byte 1 0x1f 0x80 | c < 0xf0 = multi_byte 2 0xf 0x800 | c < 0xf8 = multi_byte 3 0x7 0x10000 | c < 0xfc = multi_byte 4 0x3 0x200000 | c < 0xfe = multi_byte 5 0x1 0x4000000 | otherwise = replacement_character : decode cs where - multi1 = case cs of - c1 : ds | c1 .&. 0xc0 == 0x80 -> - let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f) - in if d >= 0x000080 then toEnum d : decode ds - else replacement_character : decode ds - _ -> replacement_character : decode cs - multi_byte :: Int -> Word8 -> Int -> [Char] multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask)) where