diff --git a/Network/URI.hs b/Network/URI.hs index b826bfa2..1270f013 100644 --- a/Network/URI.hs +++ b/Network/URI.hs @@ -970,9 +970,50 @@ escapeURIString p s = concatMap (escapeURIChar p) s -- unEscapeString :: String -> String unEscapeString [] = "" -unEscapeString ('%':x1:x2:s) | isHexDigit x1 && isHexDigit x2 = - chr (digitToInt x1 * 16 + digitToInt x2) : unEscapeString s -unEscapeString (c:s) = c : unEscapeString s +unEscapeString s@(c:cs) = case unEscapeByte s of + Just (byte, rest) -> unEscapeUtf8 byte rest + Nothing -> c : unEscapeString cs + +unEscapeByte :: String -> Maybe (Int, String) +unEscapeByte ('%':x1:x2:s) | isHexDigit x1 && isHexDigit x2 = + Just (digitToInt x1 * 16 + digitToInt x2, s) +unEscapeByte _ = Nothing + +-- Adapted from http://hackage.haskell.org/package/utf8-string +-- by Eric Mertens, BSD3 +unEscapeUtf8 :: Int -> String -> String +unEscapeUtf8 c rest + | c < 0x80 = chr c : unEscapeString rest + | c < 0xc0 = replacement_character : unEscapeString rest + | c < 0xe0 = multi1 + | c < 0xf0 = multi_byte 2 0xf 0x800 + | c < 0xf8 = multi_byte 3 0x7 0x10000 + | c < 0xfc = multi_byte 4 0x3 0x200000 + | c < 0xfe = multi_byte 5 0x1 0x4000000 + | otherwise = replacement_character : unEscapeString rest + where + replacement_character = '\xfffd' + multi1 = case unEscapeByte rest of + Just (c1, ds) | c1 .&. 0xc0 == 0x80 -> + let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f) + in if d >= 0x000080 then toEnum d : unEscapeString ds + else replacement_character : unEscapeString ds + _ -> replacement_character : unEscapeString rest + + multi_byte i mask overlong = + aux i rest (unEscapeByte rest) (c .&. mask) + where + aux 0 rs _ acc + | overlong <= acc && acc <= 0x10ffff && + (acc < 0xd800 || 0xdfff < acc) && + (acc < 0xfffe || 0xffff < acc) = chr acc : unEscapeString rs + | otherwise = replacement_character : unEscapeString rs + + aux n _ (Just (r, rs)) acc + | r .&. 0xc0 == 0x80 = aux (n-1) rs (unEscapeByte rs) + $! shiftL acc 6 .|. (r .&. 0x3f) + + aux _ rs _ _ = replacement_character : unEscapeString rs ------------------------------------------------------------ -- Resolving a relative URI relative to a base URI diff --git a/network.cabal b/network.cabal index f6d0f876..2ccd4b85 100644 --- a/network.cabal +++ b/network.cabal @@ -80,7 +80,8 @@ test-suite uri HUnit, network, test-framework, - test-framework-hunit + test-framework-hunit, + test-framework-quickcheck2 source-repository head type: git diff --git a/tests/uri001.hs b/tests/uri001.hs index 831fd5ad..1078c984 100644 --- a/tests/uri001.hs +++ b/tests/uri001.hs @@ -56,6 +56,7 @@ import Data.Maybe (fromJust) import System.IO (openFile, IOMode(WriteMode), hClose) import qualified Test.Framework as TF import qualified Test.Framework.Providers.HUnit as TF +import qualified Test.Framework.Providers.QuickCheck2 as TF -- Test supplied string for valid URI reference syntax -- isValidURIRef :: String -> Bool @@ -1103,6 +1104,12 @@ testEscapeURIString06 = testEq "testEscapeURIString06" "hello%C3%B8%C2%A9%E6%97%A5%E6%9C%AC" (escapeURIString isUnescapedInURIComponent "helloø©日本") +propEscapeUnEscapeLoop :: String -> Bool +propEscapeUnEscapeLoop s = s == (unEscapeString $! escaped) + where + escaped = escapeURIString (const False) s + {-# NOINLINE escaped #-} + testEscapeURIString = TF.testGroup "testEscapeURIString" [ TF.testCase "testEscapeURIString01" testEscapeURIString01 , TF.testCase "testEscapeURIString02" testEscapeURIString02 @@ -1110,6 +1117,7 @@ testEscapeURIString = TF.testGroup "testEscapeURIString" , TF.testCase "testEscapeURIString04" testEscapeURIString04 , TF.testCase "testEscapeURIString05" testEscapeURIString05 , TF.testCase "testEscapeURIString06" testEscapeURIString06 + , TF.testProperty "propEscapeUnEscapeLoop" propEscapeUnEscapeLoop ] -- URI string normalization tests