Skip to content

Commit

Permalink
Improve hyperlinker's 'spanToNewline' (#846)
Browse files Browse the repository at this point in the history
'spanToNewline' is used to help break apart the source into lines which
can then be partioned into CPP and non-CPP chunks. It is important that
'spanToNewline' not break apart tokens, so it needs to properly handle
things like

  * block comments, possibly nested
  * string literals, possibly multi-line
  * CPP macros, possibly multi-line

String literals in particular were not being properly handled. The fix
is to to fall back in 'Text.Read.lex' to help lex things that are not
comments.

Fixes #837.
  • Loading branch information
harpocrates authored and alexbiehl committed Jun 5, 2018
1 parent ee1ce11 commit bea565e
Show file tree
Hide file tree
Showing 3 changed files with 267 additions and 7 deletions.
32 changes: 25 additions & 7 deletions haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
Expand Up @@ -3,6 +3,8 @@ module Haddock.Backends.Hyperlinker.Parser (parse) where
import Data.Either ( isRight, isLeft )
import Data.List ( foldl', isPrefixOf, isSuffixOf )
import Data.Maybe ( maybeToList )
import Data.Char ( isSpace )
import qualified Text.Read as R

import GHC ( DynFlags, addSourceToTokens )
import SrcLoc
Expand Down Expand Up @@ -109,30 +111,46 @@ isCPPline :: String -> Bool
isCPPline = isPrefixOf "#" . dropWhile (`elem` " \t") . take 5


-- | Split a "line" off the front of a string, supporting newline escapes.
--
-- By "line", we understand: the shortest substring ending in a '\n' that is not
--
-- 1. immediately preceded by a '\\'
-- 2. not inside some (possibly nested) block comment
-- | Split a "line" off the front of a string, hopefully without cutting tokens
-- in half. I say "hopefully" because knowing what a token is requires lexing,
-- yet lexing depends on this function.
--
-- All characters in the input are present in the output:
--
-- prop> curry (++) . spanToNewLine 0 = id
spanToNewline :: Int -- ^ open '{-'
-> String -- ^ input
-> (String, String)
spanToNewline _ [] = ([], [])

-- Base case and space characters
spanToNewline _ "" = ("", "")
spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
spanToNewline n ('\n':str) | n <= 0 = ("\n", str)
spanToNewline n ('\\':'\n':str) =
let (str', rest) = spanToNewline n str
in ('\\':'\n':str', rest)

-- Block comments
spanToNewline n ('{':'-':str) =
let (str', rest) = spanToNewline (n+1) str
in ('{':'-':str', rest)
spanToNewline n ('-':'}':str) =
let (str', rest) = spanToNewline (n-1) str
in ('-':'}':str', rest)

-- When not in a block comment, try to lex a Haskell token
spanToNewline 0 str@(c:_) | ((lexed, str') : _) <- R.lex str, not (isSpace c) =
if all (== '-') lexed && length lexed >= 2
-- A Haskell line comment
then case span (/= '\n') str' of
(str'', '\n':rest) -> (lexed ++ str'' ++ "\n", rest)
(_, _) -> (str, "")

-- An actual Haskell token
else let (str'', rest) = spanToNewline 0 str'
in (lexed ++ str'', rest)

-- In all other cases, advance one character at a time
spanToNewline n (c:str) =
let (str', rest) = spanToNewline n str
in (c:str', rest)
Expand Down
216 changes: 216 additions & 0 deletions hypsrc-test/ref/src/CPP.html
@@ -0,0 +1,216 @@
<html xmlns="http://www.w3.org/1999/xhtml"
><head
><link rel="stylesheet" type="text/css" href="style.css"
/><script type="text/javascript" src="highlight.js"
></script
></head
><body
><pre
><span class="hs-pragma"
>{-# LANGUAGE CPP #-}</span
><span
>
</span
><a name="line-2"
></a
><span class="hs-keyword"
>module</span
><span
> </span
><span class="hs-identifier"
>CPP</span
><span
> </span
><span class="hs-keyword"
>where</span
><span
>
</span
><a name="line-3"
></a
><span
>
</span
><a name="line-4"
></a
><span class="hs-cpp"
>#define SOMETHING1
</span
><span
>
</span
><a name="line-6"
></a
><span class="hs-identifier"
>foo</span
><span
> </span
><span class="hs-glyph"
>::</span
><span
> </span
><span class="hs-identifier hs-type"
>String</span
><span
>
</span
><a name="line-7"
></a
><a name="foo"
><a href="CPP.html#foo"
><span class="hs-identifier"
>foo</span
></a
></a
><span
> </span
><span class="hs-glyph"
>=</span
><span
> </span
><span class="hs-comment"
>{- &quot; single quotes are fine in block comments
{- nested block comments are fine -}
-}</span
><span
> </span
><span class="hs-string"
>&quot;foo&quot;</span
><span
>
</span
><a name="line-10"
></a
><span
>
</span
><a name="line-11"
></a
><span class="hs-cpp"
>#define SOMETHING2
</span
><span
>
</span
><a name="line-13"
></a
><span class="hs-identifier"
>bar</span
><span
> </span
><span class="hs-glyph"
>::</span
><span
> </span
><span class="hs-identifier hs-type"
>String</span
><span
>
</span
><a name="line-14"
></a
><a name="bar"
><a href="CPP.html#bar"
><span class="hs-identifier"
>bar</span
></a
></a
><span
> </span
><span class="hs-glyph"
>=</span
><span
> </span
><span class="hs-string"
>&quot;block comment in a string is not a comment {- &quot;</span
><span
>
</span
><a name="line-15"
></a
><span
>
</span
><a name="line-16"
></a
><span class="hs-cpp"
>#define SOMETHING3
</span
><span
>
</span
><a name="line-18"
></a
><span class="hs-comment"
>-- &quot; single quotes are fine in line comments</span
><span
>
</span
><a name="line-19"
></a
><span class="hs-comment"
>-- {- unclosed block comments are fine in line comments</span
><span
>
</span
><a name="line-20"
></a
><span
>
</span
><a name="line-21"
></a
><span class="hs-comment"
>-- Multiline CPP is also fine</span
><span
>
</span
><a name="line-22"
></a
><span class="hs-cpp"
>#define FOO\
1
</span
><span
>
</span
><a name="line-25"
></a
><span class="hs-identifier"
>baz</span
><span
> </span
><span class="hs-glyph"
>::</span
><span
> </span
><span class="hs-identifier hs-type"
>String</span
><span
>
</span
><a name="line-26"
></a
><a name="baz"
><a href="CPP.html#baz"
><span class="hs-identifier"
>baz</span
></a
></a
><span
> </span
><span class="hs-glyph"
>=</span
><span
> </span
><span class="hs-string"
>&quot;line comment in a string is not a comment --&quot;</span
><span
>
</span
><a name="line-27"
></a
></pre
></body
></html
>
26 changes: 26 additions & 0 deletions hypsrc-test/src/CPP.hs
@@ -0,0 +1,26 @@
{-# LANGUAGE CPP #-}
module CPP where

#define SOMETHING1

foo :: String
foo = {- " single quotes are fine in block comments
{- nested block comments are fine -}
-} "foo"

#define SOMETHING2

bar :: String
bar = "block comment in a string is not a comment {- "

#define SOMETHING3

-- " single quotes are fine in line comments
-- {- unclosed block comments are fine in line comments

-- Multiline CPP is also fine
#define FOO\
1

baz :: String
baz = "line comment in a string is not a comment --"

0 comments on commit bea565e

Please sign in to comment.