Added lexer token class ExtToken with customized printing of tokens, …

…which
* hides the mostly-useless token type numbers, which were causing the tests to change unnecessarily, and
* displays tokens whose text is modified (e.g. "'a'" lexes to a token whose text is "a") with a reconstruction of their original text.

git-svn-id: http://switchb.org/svn/e/cl-e/trunk@786 bf3ccfa1-f3f3-0310-a3d5-cea1fe9d5a75
kpreid · Feb 19, 2012 · 0008e6e · 0008e6e
1 parent 4fc548d
commit 0008e6e
Show file tree

Hide file tree

Showing 4 changed files with 94 additions and 16 deletions.
diff --git a/antlr/ExtToken.java b/antlr/ExtToken.java
@@ -0,0 +1,47 @@
+// Copyright 2007 Kevin Reid, under the terms of the MIT X license
+// found at http://www.opensource.org/licenses/mit-license.html ...............
+
+import antlr.CommonToken;
+
+/** A CommonToken with customized printing: toString() omits the numeric token type (which is not useful information to the user) and instead shows getOriginalText(), which re-attaches the delimiters or prefix that the token's type implies (e.g. the quotes around a CHAR_LITERAL) to approximate the original source text. The reconstruction inserts getText() verbatim and does not re-escape it (see the XXX notes). */
+public class ExtToken extends CommonToken {
+
+    /** No-argument constructor; this is the constructor invoked by CharScanner (presumably after the lexer's setTokenObjectClass("ExtToken") call -- confirm). */
+    public ExtToken() { super(); }
+
+    public String getOriginalText() { // Returns getText() decorated according to this token's type; types not listed below return getText() unchanged.
+        if (type == ETokenTypes.EOF) {
+            return "<EOF>"; // fixed placeholder; getText() is not consulted for EOF
+        } else if (type == ETokenTypes.SOURCE_VALUE_HOLE) {
+            return "<$-hole #" + getText() + ">";
+        } else if (type == ETokenTypes.SOURCE_PATTERN_HOLE) {
+            return "<@-hole #" + getText() + ">";
+        } else if (type == ETokenTypes.DOLLAR_IDENT) {
+            return "$" + getText();
+        } else if (type == ETokenTypes.AT_IDENT) {
+            return "@" + getText();
+        } else if (type == ETokenTypes.DOLLARESC) {
+            return "$\\" + getText(); // XXX escape: getText() is inserted verbatim, not re-escaped
+        } else if (type == ETokenTypes.CHAR_LITERAL) {
+            return "'" + getText() + "'"; // XXX escape: getText() is inserted verbatim, not re-escaped
+        } else if (type == ETokenTypes.STRING) {
+            return "\"" + getText() + "\""; // XXX escape: embedded quotes/backslashes in getText() are not re-escaped
+        } else if (type == ETokenTypes.HEX) {
+            return "0x" + getText(); // only the "0x" prefix is added; the rest comes from getText()
+        } else if (type == ETokenTypes.OCTAL) {
+            return "0" + getText(); // only the leading "0" is added; the rest comes from getText()
+        } else if (type == ETokenTypes.URI) {
+            return "<" + getText() + ">";
+        } else if (type == ETokenTypes.URIGetter) {
+            return "<" + getText() + ">"; // same rendering as URI
+        } else if (type == ETokenTypes.URIStart) {
+            return "<" + getText(); // opening bracket only; no closing ">" is appended for URIStart
+        } else {
+            return getText();
+        }
+    }
+
+    public String toString() { // Format: <"ORIGINAL-TEXT" @ LINE:COLUMN>; the text is placed between the quotes as-is, so quotes inside it are not escaped.
+        return "<\"" + getOriginalText() + "\" @ " + getLine() + ":" + getColumn() + ">";
+    }
+}
diff --git a/jlib/parseEToSExpression.emaker b/jlib/parseEToSExpression.emaker
@@ -64,10 +64,11 @@ def author(<unsafe>) {
     def elexer := makeEALexer(makeCountingLexerSharedInputState(makeStringReader(text), 0))
     def qlexer := makeQuasiLexer(elexer.getInputState())
     def tb := makeTokenMultiBuffer(["e", "quasi"], [elexer, qlexer])
-    elexer.setSelector(tb)
-    qlexer.setSelector(tb)
-    elexer.setFilename(fname)
-    qlexer.setFilename(fname)
+    for lexer in [elexer, qlexer] {
+      lexer.setSelector(tb)
+      lexer.setFilename(fname)
+      lexer.setTokenObjectClass("ExtToken")
+    }
 
     def parser := makeEParser(tb)
     parser.setFilename(fname)

diff --git a/lisp/antlr-system.lisp b/lisp/antlr-system.lisp
@@ -111,6 +111,8 @@
   :pathname (merge-pathnames #p"antlr/" (component-pathname (find-system :e-on-cl)))
   :components
     ((:java-source-file "ExtAST")
+     (:java-source-file "ExtToken"
+                        :depends-on ("e"))
      (:java-source-file "CountingCharBuffer")
      (:java-source-file "CountingLexerSharedInputState")
      (:antlr-source-file "e"

diff --git a/tests/syntax-in.updoc b/tests/syntax-in.updoc
@@ -96,7 +96,7 @@ or both.
 A decimal point must have at least one following digit, because otherwise it is ambiguous with call syntax.
 
 ? t("3.")
-# problem: (line 1)@3: unexpected token: ["null",<1>,line=1,col=3]
+# problem: (line 1)@3: unexpected token: <"<EOF>" @ 1:3>
 ? t("3.e2")
 # stdout: CurryE. CallE.
 #           LiteralE. 3
@@ -119,7 +119,7 @@ An additional dot always indicates call syntax.
 There must also be at least one preceding digit.
 
 ? t(".2")
-# problem: (line 1)@1: unexpected token: [".",<136>,line=1,col=1]
+# problem: (line 1)@1: unexpected token: <"." @ 1:1>
 
 --- Nouns ---
 
@@ -236,7 +236,7 @@ XXX ListExpr doesn't really belong here '
 
 Fails for the sake of sane precedence rules, even though it would be unambiguous, sort of.
 ? t("1 + def a := 2")
-# problem: (line 1)@11: unexpected token: [":=",<165>,line=1,col=11]
+# problem: (line 1)@11: unexpected token: <":=" @ 1:11>
 XXX produce a nice error for this?
 
 ? t("a & def _ {}")
@@ -554,8 +554,7 @@ Parentheses disable modPow form
 #           NounE. a
 
 ? t("a :b :c")
-# problem: (line 1)@6: unexpected token: [":",<175>,line=1,col=6]
-XXX prettier error
+# problem: (line 1)@6: unexpected token: <":" @ 1:6>
 
 --- ForExpr ---
 
@@ -675,7 +674,7 @@ XXX Despite the appearance of the above test, keyword testing is not properly im
 --- URISchemeExpr ---
 
 ? t("<>")
-# problem: (line 1)@1: unexpected token: ["<",<204>,line=1,col=1]
+# problem: (line 1)@1: unexpected token: <"<" @ 1:1>
 
 ? t("<a>")
 # stdout: URISchemeE. a
@@ -690,11 +689,11 @@ would-be-keywords
 
 erroneous
 ? t("<ab cd>")
-# problem: (line 1)@1: unexpected token: ["<",<204>,line=1,col=1]
+# problem: (line 1)@1: unexpected token: <"<" @ 1:1>
 ? t("<+a>")
-# problem: (line 1)@1: unexpected token: ["<",<204>,line=1,col=1]
+# problem: (line 1)@1: unexpected token: <"<" @ 1:1>
 ? t("<a!b>")
-# problem: (line 1)@1: unexpected token: ["<",<204>,line=1,col=1]
+# problem: (line 1)@1: unexpected token: <"<" @ 1:1>
 
 --- WhenExpr ---
 
@@ -781,9 +780,38 @@ XXX incomplete tests; borrow when-fn examples from syntax-sugar
 #             NounE. b
 #             [] NounE. c
 
+--- Parsing errors ---
+
+Printing of tokens in errors, for those tokens which have special treatment:
+
+? t("[")
+# problem: (line 1)@2: unexpected token: <"<EOF>" @ 1:2>
+XXX this is not the best; we shouldn't be quoting the meta
+
+NOTE: no handy way to test quasiliteral tokens (DOLLARESC, DOLLAR_IDENT, and AT_IDENT) because they can't occur in the wrong place
+
+? t("0 'a'")
+# problem: (line 1)@3: unexpected token: <"'a'" @ 1:3>
+
+? t("def a \"b\"")
+# problem: (line 1)@7: unexpected token: <""b"" @ 1:7>
+XXX should escape quotes
+
+? t("0 0x10")
+# problem: (line 1)@3: unexpected token: <"0x10" @ 1:3>
+
+? t("0 010")
+# problem: (line 1)@3: unexpected token: <"010" @ 1:3>
+
+? t("0 <ab:c>")
+# problem: (line 1)@3: unexpected token: <"<ab:c>" @ 1:3>
+
+? t("0 <ab>")
+# problem: (line 1)@3: unexpected token: <"<ab>" @ 1:3>
+
+? t("0 <ab: ")
+# problem: (line 1)@3: unexpected token: <"<ab:" @ 1:3>
+
 
 XXX test acceptance of line breaks everywhere
 XXX general parser tests for everything
-
-x ? t("")
-x # stdout: