Mostly finish the bespoke parser

iafisher · Jan 30, 2019 · a307518 · a307518
1 parent 1e6dfa5
commit a307518
Show file tree

Hide file tree

Showing 19 changed files with 300 additions and 117 deletions.
diff --git a/hera/checker.py b/hera/checker.py
@@ -1,7 +1,17 @@
 from contextlib import suppress
 from typing import Dict, List, Optional, Tuple
 
-from .data import Constant, DataLabel, Label, Op, Messages, Program, Settings, Token
+from .data import (
+    Constant,
+    DataLabel,
+    Label,
+    Op,
+    Messages,
+    Program,
+    Settings,
+    Token,
+    TOKEN,
+)
 from .op import Operation, resolve_ops
 from .utils import (
     DATA_STATEMENTS,
@@ -214,6 +224,6 @@ def convert_ops(
 def substitute_label(op: Operation, symbol_table: Dict[str, int]) -> Operation:
     """Substitute any label in the instruction with its concrete value."""
     for i, arg in enumerate(op.args):
-        if isinstance(arg, Token) and arg.type == "SYMBOL":
+        if isinstance(arg, Token) and arg.type == TOKEN.SYMBOL:
             op.args[i] = symbol_table[arg]
     return op
diff --git a/hera/data.py b/hera/data.py
@@ -10,6 +10,7 @@
 Version: December 2018
 """
 from collections import namedtuple
+from enum import Enum
 from typing import Optional, Tuple
 
 
@@ -80,11 +81,39 @@ def __new__(cls, type_, value, loc=None):
         return self
 
     def __repr__(self):
-        return "Token({}, {}, loc={})".format(
+        return "Token({!r}, {}, loc={})".format(
             self.type, super().__repr__(), self.location
         )
 
 
+class TOKEN(Enum):
+    """Enumeration for the type field of Token objects."""

+    # Values
+    INT = "TOKEN_INT"  # digits, optionally preceded by "-"
+    REGISTER = "TOKEN_REGISTER"  # a symbol for which is_register() is true
+    SYMBOL = "TOKEN_SYMBOL"
+    STRING = "TOKEN_STRING"  # double-quoted string literal
+    BRACKETED = "TOKEN_BRACKETED"  # text following "<", up to the closing ">"
+    CHAR = "TOKEN_CHAR"  # single-quoted character literal

+    # Operators
+    MINUS = "TOKEN_MINUS"
+    AT = "TOKEN_AT"
+    ASTERISK = "TOKEN_ASTERISK"
+    PLUS = "TOKEN_PLUS"
+    SLASH = "TOKEN_SLASH"

+    LPAREN = "TOKEN_LPAREN"
+    RPAREN = "TOKEN_RPAREN"
+    COMMA = "TOKEN_COMMA"

+    FMT = "TOKEN_FMT"  # the symbol that follows a ":"
+    INCLUDE = "TOKEN_INCLUDE"  # the literal text "#include"
+    EOF = "TOKEN_EOF"
+    UNKNOWN = "TOKEN_UNKNOWN"
+
+
 class Messages:
     def __init__(self, msg=None, loc=None, *, warning=False):
         self.errors = []

diff --git a/hera/debugger/shell.py b/hera/debugger/shell.py
@@ -502,7 +502,7 @@ def print_flags(self):
     def evaluate_node(self, node):
         vm = self.debugger.vm
         if isinstance(node, IntNode):
-            if node.value >= 2 ** 16:
+            if node.value >= 2 ** 16 or node.value < -2 ** 15:
                 raise HERAError("integer literal exceeds 16 bits")
             return node.value
         elif isinstance(node, RegisterNode):

diff --git a/hera/lexer.py b/hera/lexer.py
@@ -1,6 +1,5 @@
-from enum import Enum
-
-from hera.data import Location, Token
+from hera.data import Location, Token, TOKEN
+from hera.utils import is_register
 
 
 class Lexer:
@@ -27,21 +26,61 @@ def next_token(self):
         else:
             ch = self.text[self.position]
             if ch.isalpha() or ch == "_":
-                length = self.read_register()
-                if length != -1:
+                length = self.read_symbol()
+                if is_register(self.text[self.position : self.position + length]):
                     self.set_token(TOKEN.REGISTER, length=length)
                 else:
-                    length = self.read_symbol()
                     self.set_token(TOKEN.SYMBOL, length=length)
             elif ch.isdigit():
                 length = self.read_int()
                 self.set_token(TOKEN.INT, length=length)
+            elif ch == '"':
+                loc = self.get_location()
+                s = self.consume_str()
+                self.tkn = Token(TOKEN.STRING, s, loc)
+            elif ch == "'":
+                if self.peek_char() == "\\":
+                    if self.peek_char(3) == "'":
+                        ch = self.peek_char(2)
+                        escape = escape_char(ch)
+                        self.next_char()  # open quote
+                        self.next_char()  # backslash
+                        loc = self.get_location()
+                        self.next_char()  # character
+                        self.next_char()  # end quote
+                        if len(escape) == 2:
+                            self.tkn = Token(TOKEN.CHAR, escape[1], loc)
+                        else:
+                            self.tkn = Token(TOKEN.CHAR, escape, loc)
+                else:
+                    if self.peek_char(2) == "'":
+                        ch = self.peek_char()
+                        self.next_char()  # open quote
+                        loc = self.get_location()
+                        self.next_char()  # character
+                        self.next_char()  # end quote
+                        self.tkn = Token(TOKEN.CHAR, ch, loc)
+                    else:
+                        self.set_token(TOKEN.UNKNOWN)
+            elif self.text[self.position :].startswith("#include"):
+                self.set_token(TOKEN.INCLUDE, length=len("#include"))
+            elif ch == "<":
+                self.next_char()
+                length = self.read_bracketed()
+                self.set_token(TOKEN.BRACKETED, length=length)
+                self.next_char()
             elif ch == ":":
                 self.position += 1
                 length = self.read_symbol()
                 self.set_token(TOKEN.FMT, length=length)
             elif ch == "-":
-                self.set_token(TOKEN.MINUS)
+                if self.peek_char().isdigit():
+                    self.position += 1
+                    length = self.read_int()
+                    self.position -= 1
+                    self.set_token(TOKEN.INT, length=length + 1)
+                else:
+                    self.set_token(TOKEN.MINUS)
             elif ch == "+":
                 self.set_token(TOKEN.PLUS)
             elif ch == "/":
@@ -61,33 +100,6 @@ def next_token(self):
 
         return self.tkn
 
-    def read_register(self):
-        ch = self.text[self.position]
-        if ch in "rR":
-            if self.peek_char() in "tT":
-                return 2
-            elif self.peek_char().isdigit():
-                length = 2
-                while self.peek_char(length).isdigit():
-                    length += 1
-                return length
-        elif ch in "pP":
-            if self.text[self.position :].lower().startswith("pc_ret"):
-                return 6
-            elif self.text[self.position :].lower().startswith("pc"):
-                return 2
-        elif ch in "fF":
-            if self.text[self.position :].lower().startswith("fp_alt"):
-                return 6
-            elif self.text[self.position :].lower().startswith("fp"):
-                return 2
-        elif ch in "sS":
-            if self.peek_char() in "pP":
-                return 2
-
-        # Default: not a register.
-        return -1
-
     def read_int(self):
         length = 1
         digits = set([str(i) for i in range(10)])
@@ -111,6 +123,27 @@ def read_symbol(self):
             length += 1
         return length
 
+    def read_bracketed(self):  # length of the text before the next ">" (or the end of the input)
+        length = 1  # the character at the current position is counted without being tested
+        while self.position + length < len(self.text) and self.peek_char(length) != ">":
+            length += 1
+        return length
+
+    def consume_str(self):  # consume a double-quoted string literal; return its contents with escapes decoded
+        sbuilder = []
+        self.next_char()  # skip the opening quote
+        while self.text[self.position] != '"':  # NOTE(review): no end-of-text check; an unterminated string presumably raises IndexError
+            if self.text[self.position] == "\\":
+                escape = escape_char(self.text[self.position + 1])  # decode the character after the backslash
+                sbuilder.append(escape)
+                self.next_char()  # backslash
+                self.next_char()  # escaped character
+            else:
+                sbuilder.append(self.text[self.position])
+                self.next_char()
+        self.next_char()  # skip the closing quote
+        return "".join(sbuilder)
+
     def skip(self):
         """Skip past whitespace and comments."""
         while True:
@@ -162,26 +195,19 @@ def set_token(self, typ, *, length=1):
         self.tkn = Token(typ, value, loc)
 
 
-class TOKEN(Enum):
-    # Values
-    INT = "TOKEN_INT"
-    REGISTER = "TOKEN_REGISTER"
-    SYMBOL = "TOKEN_SYMBOL"
-    STRING = "TOKEN_STRING"
-    CHAR = "TOKEN_CHAR"
-
-    # Operators
-    MINUS = "TOKEN_MINUS"
-    AT = "TOKEN_AT"
-    ASTERISK = "TOKEN_ASTERISK"
-    PLUS = "TOKEN_PLUS"
-    SLASH = "TOKEN_SLASH"
-
-    LPAREN = "TOKEN_LPAREN"
-    RPAREN = "TOKEN_RPAREN"
-    COMMA = "TOKEN_COMMA"
-
-    FMT = "TOKEN_FMT"
-    INCLUDE = "TOKEN_INCLUDE"
-    EOF = "TOKEN_EOF"
-    UNKNOWN = "TOKEN_UNKNOWN"
+def escape_char(c):
+    """Return the character encoded by the escape `\\c`, or `\\c` itself if unknown.
+
+        >>> escape_char("n")
+        '\\n'
+    """
+    if c == "n":
+        return "\n"
+    elif c == "t":
+        return "\t"
+    elif c == "\\":
+        return "\\"
+    elif c == '"':
+        return '"'
+    else:
+        return "\\" + c  # unrecognized escape: keep the backslash (two-character result)
diff --git a/hera/loader.py b/hera/loader.py
@@ -9,7 +9,7 @@
 
 from .checker import check
 from .data import HERAError, Messages, Op, Settings
-from .parser import parse
+from .parser_bespoke import parse
 from .utils import handle_messages, read_file
 
 

diff --git a/hera/op.py b/hera/op.py
@@ -2,7 +2,7 @@
 from contextlib import suppress
 from typing import Dict, List, Optional, Tuple
 
-from hera.data import Constant, DataLabel, Location, Messages, Op, Token
+from hera.data import Constant, DataLabel, Location, Messages, Op, Token, TOKEN
 from hera.utils import (
     format_int,
     from_u16,
@@ -70,7 +70,7 @@ def __str__(self):
 
 
 def arg_to_string(arg):
-    if isinstance(arg, Token) and arg.type == "STRING":
+    if isinstance(arg, Token) and arg.type == TOKEN.STRING:
         return json.dumps(arg)
     else:
         return str(arg)
@@ -937,7 +937,7 @@ def check_arglist(argtypes, args, symbol_table):
 
 
 def check_register(arg) -> Optional[str]:
-    if not isinstance(arg, Token) or arg.type != "REGISTER":
+    if not isinstance(arg, Token) or arg.type != TOKEN.REGISTER:
         return "expected register"
 
     if arg.lower() == "pc":
@@ -952,10 +952,10 @@ def check_register(arg) -> Optional[str]:
 
 
 def check_register_or_label(arg, symbol_table: Dict[str, int]) -> Optional[str]:
-    if not isinstance(arg, Token) or arg.type not in ("REGISTER", "SYMBOL"):
+    if not isinstance(arg, Token) or arg.type not in (TOKEN.REGISTER, TOKEN.SYMBOL):
         return "expected register or label"
 
-    if arg.type == "REGISTER":
+    if arg.type == TOKEN.REGISTER:
         return check_register(arg)
     else:
         try:
@@ -979,7 +979,7 @@ def check_label(arg) -> Optional[str]:
 
 
 def check_string(arg):
-    if not isinstance(arg, Token) or arg.type != "STRING":
+    if not isinstance(arg, Token) or arg.type != TOKEN.STRING:
         return "expected string literal"
     else:
         return None