forked from meiersi/blaze-builder
/
Utf8.hs
130 lines (113 loc) · 4.69 KB
/
Utf8.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
{-# OPTIONS_GHC -fno-warn-unused-imports #-}
-- ignore warning from 'import Data.Text.Encoding'
-- |
-- Module : Blaze.ByteString.Builder.Char.Utf8
-- Copyright : (c) 2010 Jasper Van der Jeugt & Simon Meier
-- License : BSD3-style (see LICENSE)
--
-- Maintainer : Simon Meier <iridcode@gmail.com>
-- Stability : experimental
-- Portability : tested on GHC only
--
-- 'Write's and 'Builder's for serializing Unicode characters using the UTF-8
-- encoding.
--
module Blaze.ByteString.Builder.Char.Utf8
(
-- * Writing UTF-8 encoded characters to a buffer
writeChar
-- * Creating Builders from UTF-8 encoded characters
, fromChar
, fromString
, fromShow
, fromText
, fromLazyText
) where
import Foreign
import Data.Char (ord)
import qualified Data.Text as TS
import qualified Data.Text.Encoding as TS -- imported for documentation links
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TS -- imported for documentation links
import Blaze.ByteString.Builder.Internal
-- | Write a UTF-8 encoded Unicode character to a buffer.
--
{-# INLINE writeChar #-}
writeChar :: Char -> Write
writeChar c = boundedWrite 4 (encodeCharUtf8 f1 f2 f3 f4 c)
where
f1 x1 = writeN 1 $ \op -> do pokeByteOff op 0 x1
f2 x1 x2 = writeN 2 $ \op -> do pokeByteOff op 0 x1
pokeByteOff op 1 x2
f3 x1 x2 x3 = writeN 3 $ \op -> do pokeByteOff op 0 x1
pokeByteOff op 1 x2
pokeByteOff op 2 x3
f4 x1 x2 x3 x4 = writeN 4 $ \op -> do pokeByteOff op 0 x1
pokeByteOff op 1 x2
pokeByteOff op 2 x3
pokeByteOff op 3 x4
-- | Encode a Unicode character to another datatype, using UTF-8. This function
-- acts as an abstract way of encoding characters, as it is unaware of what
-- needs to happen with the resulting bytes: you have to specify functions to
-- deal with those.
--
encodeCharUtf8 :: (Word8 -> a) -- ^ 1-byte UTF-8
-> (Word8 -> Word8 -> a) -- ^ 2-byte UTF-8
-> (Word8 -> Word8 -> Word8 -> a) -- ^ 3-byte UTF-8
-> (Word8 -> Word8 -> Word8 -> Word8 -> a) -- ^ 4-byte UTF-8
-> Char -- ^ Input 'Char'
-> a -- ^ Result
encodeCharUtf8 f1 f2 f3 f4 c = case ord c of
x | x <= 0x7F -> f1 $ fromIntegral x
| x <= 0x07FF ->
let x1 = fromIntegral $ (x `shiftR` 6) + 0xC0
x2 = fromIntegral $ (x .&. 0x3F) + 0x80
in f2 x1 x2
| x <= 0xFFFF ->
let x1 = fromIntegral $ (x `shiftR` 12) + 0xE0
x2 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80
x3 = fromIntegral $ (x .&. 0x3F) + 0x80
in f3 x1 x2 x3
| otherwise ->
let x1 = fromIntegral $ (x `shiftR` 18) + 0xF0
x2 = fromIntegral $ ((x `shiftR` 12) .&. 0x3F) + 0x80
x3 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80
x4 = fromIntegral $ (x .&. 0x3F) + 0x80
in f4 x1 x2 x3 x4
{-# INLINE encodeCharUtf8 #-}
-- | /O(1)/. Serialize a Unicode character using the UTF-8 encoding.
--
fromChar :: Char -> Builder
fromChar = fromWriteSingleton writeChar
-- | /O(n)/. Serialize a Unicode 'String' using the UTF-8 encoding.
--
fromString :: String -> Builder
fromString = fromWriteList writeChar
-- Performance note: ^^^
--
-- fromWrite2List made things slightly worse for the blaze-html benchmarks
-- despite being better when serializing only a list. Probably, the cache is
-- already occupied enough with dealing with the data from Html rendering.
--
-- | /O(n)/. Serialize a value by 'Show'ing it and UTF-8 encoding the resulting
-- 'String'.
--
fromShow :: Show a => a -> Builder
fromShow = fromString . show
-- | /O(n)/. Serialize a strict Unicode 'TS.Text' value using the UTF-8 encoding.
--
-- Note that this function is currently faster than 'TS.encodeUtf8' provided by
-- "Data.Text.Encoding". Moreover, 'fromText' is also lazy, while 'TL.encodeUtf8'
-- is strict.
--
fromText :: TS.Text -> Builder
fromText = fromString . TS.unpack
{-# INLINE fromText #-}
-- | /O(n)/. Serialize a lazy Unicode 'TL.Text' value using the UTF-8 encoding.
--
-- Note that this function is currently faster than 'TL.encodeUtf8' provided by
-- "Data.Text.Lazy.Encoding".
--
fromLazyText :: TL.Text -> Builder
fromLazyText = fromString . TL.unpack
{-# INLINE fromLazyText #-}