-
Notifications
You must be signed in to change notification settings - Fork 0
/
Main.hs
93 lines (76 loc) · 2.86 KB
/
Main.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import BasePrelude
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Data.Attoparsec.ByteString
import Data.List (words)
readByte :: String -> Word8
readByte = foldl step 0
where
step byte char = consumeBit char (shiftL byte 1)
consumeBit '0' = id
consumeBit '1' = (.|. 1)
readBytes :: [String] -> ByteString
readBytes = BS.pack . fmap readByte
test :: [String] -> IO ()
test = print . parseUtf8 . readBytes
main :: IO ()
main = do
test [ "01101110", "01100001" -- na
, "01101001", "11001100", "10001000" -- ï
, "01110110", "01100101", "01110100" -- vet
, "11000011", "10101001" -- é
]
test ["11000000", "10000001"] -- overlong
test ["11000000"] -- not enough continuation bits
test ["10010010"] -- leading continuation bit
test ["11010111", "10000000", "10001010"] -- too many continuation bits
parseUtf8 :: ByteString -> Either String [Word32]
parseUtf8 = parseOnly utf8Parser
utf8Parser :: Parser [Word32]
utf8Parser = manyTill' codePointParser endOfInput
codePointParser :: Parser Word32
codePointParser =
byteSequence ["0xxxxxxx"] <|>
overlong 0x7F (multibyte "110xxxxx" 1) <|>
overlong 0x7FF (multibyte "1110xxxx" 2) <|>
checkedParser withMax (not . isSurrogate) "illegal surrogate pair"
where
multibyte leader count =
byteSequence (leader : replicate count "10xxxxxx")
fourByteParser = (overlong 0xFFFF (multibyte "11110xxx" 3))
withMax = checkedParser fourByteParser (< 0x110000) "illegal codepoint over 0x10FFFF"
isSurrogate w = w >= 0xD800 && w <= 0xDFFF
overlong :: Word32 -> Parser Word32 -> Parser Word32
overlong m parser = checkedParser parser (> m) "illegal overlong codepoint!"
byteSequence :: [String] -> Parser Word32
byteSequence patterns = do
subBytes <- mapM bitPattern patterns
return (foldl mergeSubByte 0 subBytes)
mergeSubByte :: Word32 -> SubByte -> Word32
mergeSubByte whole (byte, bits) =
shiftL whole bits .|. fromIntegral byte
type SubByte = (Word8, Int)
subZero :: SubByte
subZero = (0, 0)
pushBit :: Bool -> SubByte -> SubByte
pushBit True (b, n) = (setBit (shiftL b 1) 0, n + 1)
pushBit False (b, n) = (shiftL b 1, n + 1)
bitPattern :: String -> Parser SubByte
bitPattern pattern = satisfyMaybe (matchByte pattern)
matchByte :: String -> Word8 -> Maybe SubByte
matchByte pattern byte = foldM (flip check) subZero (zip pattern bits)
where
check ('1', True) = Just
check ('0', False) = Just
check ('x', bit) = Just . pushBit bit
check _ = const Nothing
bits = testBit byte <$> [7, 6 .. 0]
satisfyMaybe :: (Word8 -> Maybe a) -> Parser a
satisfyMaybe f = do
byte <- anyWord8
maybe (fail "maybe not satisfied") return (f byte)
checkedParser :: Parser a -> (a -> Bool) -> String -> Parser a
checkedParser parser predicate msg = do
word <- parser
unless (predicate word) (fail msg)
return word