Skip to content

Commit

Permalink
Check for invalid UTF8 when parsing .cabal files
Browse files Browse the repository at this point in the history
This assumes a permissive UTF8 decoder has inserted '\xfffd' as a replacement
character. We should check strictly for errors when we decode instead, though
it's nice to do that where we have some kind of error reporting infrastructure
which is why I've added it to the parser for the moment. The current error
message is not too bad, as it reports the line number.
  • Loading branch information
dcoutts committed Feb 25, 2008
1 parent d8b76cc commit 6a89033
Showing 1 changed file with 17 additions and 9 deletions.
26 changes: 17 additions & 9 deletions Distribution/ParseUtils.hs
Expand Up @@ -90,6 +90,7 @@ type LineNo = Int
data PError = AmbigousParse String LineNo
| NoParse String LineNo
| TabsError LineNo
| UTFError LineNo
| FromString String (Maybe LineNo)
deriving Show

Expand Down Expand Up @@ -129,6 +130,7 @@ locatedErrorMsg :: PError -> (Maybe LineNo, String)
locatedErrorMsg (AmbigousParse f n) = (Just n, "Ambiguous parse in field '"++f++"'")
locatedErrorMsg (NoParse f n) = (Just n, "Parse of field '"++f++"' failed: ")
locatedErrorMsg (TabsError n) = (Just n, "Tab used as indentation.")
locatedErrorMsg (UTFError n) = (Just n, "Invalid UTF-8 text.")
locatedErrorMsg (FromString s n) = (n, s)

syntaxError :: LineNo -> String -> ParseResult a
Expand All @@ -137,6 +139,9 @@ syntaxError n s = ParseFailed $ FromString s (Just n)
tabsError :: LineNo -> ParseResult a
tabsError ln = ParseFailed $ TabsError ln

utf8Error :: LineNo -> ParseResult a
utf8Error ln = ParseFailed $ UTFError ln

warning :: String -> ParseResult ()
warning s = ParseOk [s] ()

Expand Down Expand Up @@ -263,15 +268,18 @@ fName (Section _ n _ _) = n
fName _ = error "fname: not a field or section"

readFields :: String -> ParseResult [Field]
readFields input =
ifelse
=<< mapM (mkField 0)
=<< mkTree (tokenise input)

where tokenise = concatMap tokeniseLine
. trimLines
. lines
. normaliseLineEndings
readFields input =
case [ n | (n,l) <- zip [1..] ls
, '\xfffd' `elem` l ] of

(n:_) -> utf8Error n

[] -> ifelse
=<< mapM (mkField 0)
=<< mkTree tokens

where ls = (lines . normaliseLineEndings) input
tokens = (concatMap tokeniseLine . trimLines) ls

-- attach line number and determine indentation
trimLines :: [String] -> [(LineNo, Indent, HasTabs, String)]
Expand Down

0 comments on commit 6a89033

Please sign in to comment.