Skip to content

Commit

Permalink
added isUTF8Encoded predicate + utf8Encode to avoid repeated encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
sof committed Mar 3, 2009
1 parent 6a8467a commit d444bbb
Showing 1 changed file with 1 addition and 12 deletions.
13 changes: 1 addition & 12 deletions Codec/Binary/UTF8/String.hs
Expand Up @@ -24,13 +24,9 @@ import Data.Char (chr,ord)

default(Int)

-- | Run 'encode' over the input and repackage every element of its
-- output as a 'Char' (via 'fromEnum' \/ 'toEnum'), yielding a 'String'.
encodeString :: String -> String
encodeString xs = [ toEnum (fromEnum unit) | unit <- encode xs ]

-- | Decode a string with 'decode', taking a 'String' as input.
--
-- Every 'Char' of the input is first converted (via 'fromEnum' \/ 'toEnum')
-- to the element type that 'decode' consumes. This is not safe, but it is
-- necessary if UTF-8 encoded text has been loaded into a 'String' prior
-- to being decoded.
decodeString :: String -> String
decodeString = decode . map (toEnum . fromEnum)

Expand Down Expand Up @@ -66,20 +62,13 @@ decode [ ] = ""
decode (c:cs)
| c < 0x80 = chr (fromEnum c) : decode cs
| c < 0xc0 = replacement_character : decode cs
| c < 0xe0 = multi1
| c < 0xe0 = multi_byte 1 0x1f 0x80
| c < 0xf0 = multi_byte 2 0xf 0x800
| c < 0xf8 = multi_byte 3 0x7 0x10000
| c < 0xfc = multi_byte 4 0x3 0x200000
| c < 0xfe = multi_byte 5 0x1 0x4000000
| otherwise = replacement_character : decode cs
where
multi1 = case cs of
c1 : ds | c1 .&. 0xc0 == 0x80 ->
let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f)
in if d >= 0x000080 then toEnum d : decode ds
else replacement_character : decode ds
_ -> replacement_character : decode cs

multi_byte :: Int -> Word8 -> Int -> [Char]
multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
where
Expand Down

0 comments on commit d444bbb

Please sign in to comment.