Skip to content

Commit

Permalink
Merge pull request #156 from andreasabel/issue119
Browse files Browse the repository at this point in the history
[ fixed #119 ] latin1 encoding: each byte counts as 1 char
  • Loading branch information
simonmar committed Jan 27, 2020
2 parents ce48441 + ae525e3 commit 574ec8c
Show file tree
Hide file tree
Showing 7 changed files with 154 additions and 30 deletions.
16 changes: 14 additions & 2 deletions alex.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,21 @@ data-dir: data/

data-files:
AlexTemplate
AlexTemplate-debug
AlexTemplate-nopred
AlexTemplate-nopred-debug
AlexTemplate-latin1
AlexTemplate-latin1-debug
AlexTemplate-latin1-nopred
AlexTemplate-latin1-nopred-debug
AlexTemplate-ghc
AlexTemplate-ghc-nopred
AlexTemplate-ghc-debug
AlexTemplate-debug
AlexTemplate-ghc-nopred
AlexTemplate-ghc-nopred-debug
AlexTemplate-ghc-latin1
AlexTemplate-ghc-latin1-debug
AlexTemplate-ghc-latin1-nopred
AlexTemplate-ghc-latin1-nopred-debug
AlexWrapper-basic
AlexWrapper-basic-bytestring
AlexWrapper-strict-bytestring
Expand Down Expand Up @@ -110,6 +121,7 @@ extra-source-files:
tests/strict_typeclass.x
tests/unicode.x
tests/issue_71.x
tests/issue_119.x

source-repository head
type: git
Expand Down
39 changes: 32 additions & 7 deletions gen-alex-sdist/Main.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module Main (main) where

import Control.Monad
import qualified Data.List as List
import Language.Preprocessor.Cpphs
import System.Directory
import System.FilePath
Expand Down Expand Up @@ -51,13 +52,37 @@ all_template_files :: [FilePath]
all_template_files = map fst (templates ++ wrappers)

templates :: [(FilePath,[String])]
templates = [
("AlexTemplate", []),
("AlexTemplate-ghc", ["ALEX_GHC"]),
("AlexTemplate-ghc-nopred",["ALEX_GHC", "ALEX_NOPRED"]),
("AlexTemplate-ghc-debug", ["ALEX_GHC","ALEX_DEBUG"]),
("AlexTemplate-debug", ["ALEX_DEBUG"])
]
templates =
[ ( templateFileName ghc latin1 nopred debug
, templateFlags ghc latin1 nopred debug
)
| ghc <- allBool
, latin1 <- allBool
, nopred <- allBool
, debug <- allBool
]
where
allBool = [False, True]

-- Keep this function in sync with its twin in src/Main.hs.
templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath
templateFileName ghc latin1 nopred debug =
List.intercalate "-" $ concat
[ [ "AlexTemplate" ]
, [ "ghc" | ghc ]
, [ "latin1" | latin1 ]
, [ "nopred" | nopred ]
, [ "debug" | debug ]
]

templateFlags :: Bool -> Bool -> Bool -> Bool -> [String]
templateFlags ghc latin1 nopred debug =
map ("ALEX_" ++) $ concat
[ [ "GHC" | ghc ]
, [ "LATIN1" | latin1 ]
, [ "NOPRED" | nopred ]
, [ "DEBUG" | debug ]
]

wrappers :: [(FilePath,[String])]
wrappers = [
Expand Down
2 changes: 2 additions & 0 deletions src/AbsSyn.hs
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ type StartCode = Int
-- we can generate somewhat faster code in the case that
-- the lexer doesn't use predicates
data UsesPreds = UsesPreds | DoesntUsePreds
deriving Eq

usesPreds :: DFA s a -> UsesPreds
usesPreds dfa
Expand Down Expand Up @@ -390,3 +391,4 @@ extractActions scheme scanner = (scanner{scannerTokens = new_tokens}, decl_str .
-- Code generation targets

data Target = GhcTarget | HaskellTarget
deriving Eq
41 changes: 23 additions & 18 deletions src/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import Control.Exception ( bracketOnError )
import Control.Monad ( when, liftM )
import Data.Char ( chr )
import Data.List ( isSuffixOf, nub )
import qualified Data.List as List
import Data.Maybe ( isJust, fromJust )
import Data.Version ( showVersion )
import System.Console.GetOpt ( getOpt, usageInfo, ArgOrder(..), OptDescr(..), ArgDescr(..) )
Expand Down Expand Up @@ -222,7 +223,7 @@ alex cli file basename script = do
hPutStr out_h (actions "")

-- add the template
let template_name = templateFile template_dir target usespreds cli
let template_name = templateFile template_dir target encoding usespreds cli
tmplt <- alexReadFile template_name
hPutStr out_h tmplt

Expand Down Expand Up @@ -403,23 +404,27 @@ templateDir def cli
[] -> def
ds -> return (last ds)

templateFile :: FilePath -> Target -> UsesPreds -> [CLIFlags] -> FilePath
templateFile dir target usespreds cli
= dir ++ "/AlexTemplate" ++ maybe_ghc ++ maybe_debug ++ maybe_nopred
where
maybe_ghc = case target of
GhcTarget -> "-ghc"
_ -> ""

maybe_debug
| OptDebugParser `elem` cli = "-debug"
| otherwise = ""

maybe_nopred =
case usespreds of
DoesntUsePreds | not (null maybe_ghc)
&& null maybe_debug -> "-nopred"
_ -> ""
-- Keep this function in sync with its twin in gen-alex-sdist/Main.hs.
templateFileName :: Bool -> Bool -> Bool -> Bool -> FilePath
templateFileName ghc latin1 nopred debug =
List.intercalate "-" $ concat
[ [ "AlexTemplate" ]
, [ "ghc" | ghc ]
, [ "latin1" | latin1 ]
, [ "nopred" | nopred ]
, [ "debug" | debug ]
]

templateFile :: FilePath -> Target -> Encoding -> UsesPreds -> [CLIFlags] -> FilePath
templateFile dir target encoding usespreds cli = concat
[ dir
, "/"
, templateFileName
(target == GhcTarget)
(encoding == Latin1)
(usespreds == DoesntUsePreds)
(OptDebugParser `elem` cli)
]

wrapperFile :: FilePath -> Scheme -> Maybe FilePath
wrapperFile dir scheme =
Expand Down
12 changes: 9 additions & 3 deletions templates/GenericTemplate.hs
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,15 @@ alex_scan_tkn user__ orig_input len input__ s last_acc =
ILIT(-1) -> (new_acc, input__)
-- on an error, we want to keep the input *before* the
-- character that failed, not after.
_ -> alex_scan_tkn user__ orig_input (if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len)
-- note that the length is increased ONLY if this is the 1st byte in a char encoding)
new_input new_s new_acc
_ -> alex_scan_tkn user__ orig_input
#ifdef ALEX_LATIN1
PLUS(len,ILIT(1))
-- issue 119: in the latin1 encoding, *each* byte is one character
#else
(if c < 0x80 || c >= 0xC0 then PLUS(len,ILIT(1)) else len)
-- note that the length is increased ONLY if this is the 1st byte in a char encoding)
#endif
new_input new_s new_acc
}
where
check_accs (AlexAccNone) = last_acc
Expand Down
1 change: 1 addition & 0 deletions tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ TESTS = \
default_typeclass.x \
gscan_typeclass.x \
issue_71.x \
issue_119.x \
monad_typeclass.x \
monad_typeclass_bytestring.x \
monadUserState_typeclass.x \
Expand Down
73 changes: 73 additions & 0 deletions tests/issue_119.x
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
-- -*- haskell -*-
{
-- Issue 119,
-- reported 2017-10-11 by Herbert Valerio Riedel,
-- fixed 2020-01-26 by Andreas Abel.
--
-- Problem was: the computed token length (in number of characters)
-- attached to AlexToken is tailored to UTF8 encoding and wrong
-- for LATIN1 encoding.

module Main where

import Control.Monad (unless)
import qualified Data.ByteString as B
import Data.Word
import System.Exit (exitFailure)
}

%encoding "latin1"

:-

[\x01-\xff]+ { False }
[\x00] { True }

{
type AlexInput = B.ByteString

alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte = B.uncons

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar = undefined

-- generated by @alex@
alexScan :: AlexInput -> Int -> AlexReturn Bool

{-
GOOD cases:
("012\NUL3","012","\NUL3",3,3,False)
("\NUL0","\NUL","0",1,1,True)
("012","012","",3,3,False)
BAD case:
("0@P`p\128\144\160","0@P`p","",5,8,False)
expected:
("0@P`p\128\144\160","0@P`p\128\144\160","",8,8,False)
-}
main :: IO ()
main = do
go (B.pack [0x30,0x31,0x32,0x00,0x33]) -- GOOD
go (B.pack [0x00,0x30]) -- GOOD
go (B.pack [0x30,0x31,0x32]) -- GOOD

go (B.pack [0x30,0x40,0x50,0x60,0x70,0x80,0x90,0xa0]) -- WAS: BAD
where
go inp = do
case (alexScan inp 0) of
-- expected invariant: len == B.length inp - B.length inp'
AlexToken inp' len b -> do
let diff = B.length inp - B.length inp'
unless (len == diff) $ do
putStrLn $ "ERROR: reported length and consumed length differ!"
print (inp, B.take len inp, inp', len, diff, b)
exitFailure
_ -> undefined
}

0 comments on commit 574ec8c

Please sign in to comment.