Accept hex and octal escape sequences in char/str literals

Resolves #109
iafisher · Feb 9, 2019 · 542edc8 · 542edc8
1 parent f8e1678
commit 542edc8
Show file tree

Hide file tree

Showing 8 changed files with 131 additions and 53 deletions.
diff --git a/.flake8 b/.flake8
@@ -7,3 +7,8 @@ ignore =
     E203,
     # W503 line break before binary operator
     W503,
+    # W293 blank line contains whitespace (sometimes we need it in string literals,
+    # and black will catch it anyway)
+    W293,
+    # E501 line too long (black handles this)
+    E501,
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 **NOTE**: As permitted by semantic versioning, backward compatibility is NOT maintained for initial development, i.e. releases before 1.0.0.
 
 ## Unreleased
+### Added
+- Hexadecimal and octal escape sequences are now accepted in character and string literals.
+
 ### Changed
 - Execution times of long-running programs have been cut down by roughly 50%.
 - Use of `SWI` and `RTI` operations now results in parse-time errors instead of run-time warnings.

diff --git a/hera/data.py b/hera/data.py
@@ -4,7 +4,6 @@
 Version: February 2019
 """
 from collections import namedtuple
-from typing import Optional, Tuple
 
 
 DEFAULT_DATA_START = 0xC001
@@ -132,9 +131,6 @@ def warn(self, msg, loc=None):
         self.warnings.append((msg, loc))
 
 
-ErrorType = Tuple[str, Optional[Location]]
-
-
 class HERAError(Exception):
     pass
 

diff --git a/hera/lexer.py b/hera/lexer.py
@@ -6,7 +6,7 @@
 Version: February 2019
 """
 import string
-from typing import Optional
+from typing import Optional, Tuple
 
 from hera.data import Location, Messages, Token
 from hera.utils import NAMED_REGISTERS
@@ -116,6 +116,51 @@ def read_symbol(self) -> int:
             length += 1
         return length
 
+    HEX_DIGITS = "0123456789abcdefABCDEF"
+
+    def read_escape_char(self) -> Tuple[str, int]:
+        """Read an escape sequence (assuming `self.text[self.position] == "\\"`) and
+        return a pair (value, length), where `value` is what the escape sequence
+        resolves to and `length` is the number of characters read after the
+        backslash. A length of 0 means the backslash was the last character.
+        """
+        peek = self.peek_char()
+        # Shift the reported column one to the right of the current location.
+        loc = self.get_location()
+        loc = loc._replace(column=loc.column + 1)
+        if peek == "":
+            return ("", 0)
+        elif peek == "x":
+            # Hex escapes: exactly two hex digits must follow the "x". The length
+            # check matters because `peek_char` returns "" past the end of the
+            # text, and "" passes any `in` test against a string.
+            digits = self.peek_char(2) + self.peek_char(3)
+            if len(digits) == 2 and all(c in self.HEX_DIGITS for c in digits):
+                return (chr(int(digits, base=16)), 3)
+            else:
+                self.warn("invalid hex escape", loc)
+                return ("x", 1)
+        elif peek.isdigit():
+            # Octal escapes: one to three digits. `length` ends up one more than
+            # the number of digits found.
+            length = 1
+            while length <= 3 and self.peek_char(length).isdigit():
+                length += 1
+            val = self.text[self.position + 1 : self.position + length]
+            try:
+                ordv = int(val, base=8)
+            except ValueError:
+                # Not valid octal (e.g. an 8 or 9); keep only the first digit.
+                self.warn("invalid octal escape", loc)
+                return (self.peek_char(), 1)
+            else:
+                return (chr(ordv), length - 1)
+        else:
+            escape = escape_char(peek)
+            if len(escape) == 2:
+                self.warn("unrecognized backslash escape", loc)
+            return (escape, 1)
+
     def consume_bracketed(self) -> None:
         self.next_char()
         loc = self.get_location()
@@ -131,63 +176,53 @@ def consume_bracketed(self) -> None:
         self.next_char()
 
     def consume_str(self) -> None:
-        sbuilder = []
         loc = self.get_location()
         self.next_char()
-        while self.position < len(self.text) and self.text[self.position] != '"':
-            if self.text[self.position] == "\\":
-                if self.position == len(self.text) - 1:
-                    self.next_char()
-                    break
-
-                escape = escape_char(self.text[self.position + 1])
-                sbuilder.append(escape)
-                self.next_char()
-                if len(escape) == 2:
-                    self.warn("unrecognized backslash escape", self.get_location())
-                self.next_char()
-            else:
-                sbuilder.append(self.text[self.position])
-                self.next_char()
+        s = self.consume_delimited('"')
 
         if self.position == len(self.text):
             self.tkn = Token(Token.ERROR, "unclosed string literal", loc)
             return
 
         self.next_char()
-        s = "".join(sbuilder)
         self.tkn = Token(Token.STRING, s, loc)
 
     def consume_char(self) -> None:
         loc = self.get_location()
         self.next_char()
-        start = self.position
-        while self.position < len(self.text) and self.text[self.position] != "'":
-            if self.text[self.position] == "\\":
-                self.next_char()
-            self.next_char()
+        s = self.consume_delimited("'")
 
         if self.position == len(self.text):
             self.tkn = Token(Token.ERROR, "unclosed character literal", loc)
             return
 
-        contents = self.text[start : self.position]
+        self.next_char()
 
-        if len(contents) == 1:
-            loc = loc._replace(column=loc.column + 1)
-            self.tkn = Token(Token.CHAR, contents, loc)
-        elif len(contents) == 2 and contents[0] == "\\":
-            loc = loc._replace(column=loc.column + 2)
-            escape = escape_char(contents[1])
-            if len(escape) == 2:
-                self.tkn = Token(Token.CHAR, escape[1], loc)
-                self.warn("unrecognized backslash escape", loc)
-            else:
-                self.tkn = Token(Token.CHAR, escape, loc)
+        if len(s) == 1:
+            self.tkn = Token(Token.CHAR, s, loc)
+        elif len(s) == 2 and s[0] == "\\":
+            self.tkn = Token(Token.CHAR, s[1], loc)
         else:
             self.tkn = Token(Token.ERROR, "over-long character literal", loc)
 
-        self.next_char()
+    def consume_delimited(self, delimiter) -> str:
+        """Consume characters up to (but not including) `delimiter` or the end of
+        the text, and return them with backslash escapes resolved.
+        """
+        pieces = []
+        while self.position < len(self.text):
+            current = self.text[self.position]
+            if current == delimiter:
+                break
+            escaped = current == "\\"
+            value, length = self.read_escape_char() if escaped else (current, 0)
+            pieces.append(value)
+            # Step past this character and anything else the escape covered.
+            for _ in range(length + 1):
+                self.next_char()
+            if escaped and length == 0:
+                break  # Length of zero indicates EOF.
+        return "".join(pieces)
 
     def skip(self) -> None:
         """Skip past whitespace and comments."""
@@ -220,6 +255,9 @@ def skip(self) -> None:
                 break
 
     def next_char(self) -> None:
+        """Advance the position in the text by one. Do not call this method if the
+        current position is past the end of the text.
+        """
         if self.text[self.position] == "\n":
             self.line += 1
             self.column = 1
@@ -228,6 +266,9 @@ def next_char(self) -> None:
         self.position += 1
 
     def peek_char(self, n=1) -> str:
+        """Return the n'th character in the text past the current position. If past the
+        end, return the empty string.
+        """
         return (
             self.text[self.position + n] if self.position + n < len(self.text) else ""
         )
@@ -239,9 +280,6 @@ def set_token(self, typ: str, *, length=1) -> None:
             self.next_char()
         self.tkn = Token(typ, value, loc)
 
-    def err(self, msg: str, loc) -> None:
-        self.messages.err(msg, loc)
-
     def warn(self, msg: str, loc) -> None:
         self.messages.warn(msg, loc)
 

diff --git a/hera/utils.py b/hera/utils.py
@@ -130,13 +130,10 @@ def print_message(msg: str, *, loc=None) -> None:
 
     if isinstance(loc, Location):
         linetext = loc.file_lines[loc.line - 1]
-        if linetext.strip():
-            caret = align_caret(linetext, loc.column) + "^"
-            msg += ", line {} col {} of {}\n\n  {}\n  {}\n".format(
-                loc.line, loc.column, loc.path, linetext, caret
-            )
-        else:
-            msg += ", line {0.line} col {0.column} of {0.path}".format(loc)
+        caret = align_caret(linetext, loc.column) + "^"
+        msg += ", line {} col {} of {}\n\n  {}\n  {}\n".format(
+            loc.line, loc.column, loc.path, linetext, caret
+        )
 
     sys.stderr.write(msg + "\n")
 

diff --git a/test/test_error.py b/test/test_error.py
@@ -460,10 +460,10 @@ def test_mega_error(capsys):
   ADD('c', "abc")
   ^
 
-Error: expected register, line 8 col 6 of test/assets/error/mega_error.hera
+Error: expected register, line 8 col 5 of test/assets/error/mega_error.hera
 
   ADD('c', "abc")
-       ^
+      ^
 
 Error: expected register, line 8 col 10 of test/assets/error/mega_error.hera
 

diff --git a/test/test_parse_error.py b/test/test_parse_error.py
@@ -164,7 +164,14 @@ def test_parse_error_for_unclosed_arglist(capsys):
     captured = capsys.readouterr().err
     assert (
         captured
-        == "\nError: expected comma or right parenthesis, line 2 col 1 of <stdin>\n"
+        == """\
+
+Error: expected comma or right parenthesis, line 2 col 1 of <stdin>
+
+  
+  ^
+
+"""
     )
 
 

diff --git a/test/test_unit/test_lexer.py b/test/test_unit/test_lexer.py
@@ -60,6 +60,14 @@ def test_lexer_with_character_literal():
     assert eq(lexer.next_token(), Token(Token.EOF, ""))
 
 
+def test_lexer_with_consecutive_character_literals():
+    lexer = lex_helper("'a''b'")
+
+    assert eq(lexer.tkn, Token(Token.CHAR, "a"))
+    assert eq(lexer.next_token(), Token(Token.CHAR, "b"))
+    assert eq(lexer.next_token(), Token(Token.EOF, ""))
+
+
 def test_lexer_with_character_literal_backslash_escape():
     lexer = lex_helper("'\\n'")
 
@@ -68,6 +76,30 @@ def test_lexer_with_character_literal_backslash_escape():
     assert eq(lexer.next_token(), Token(Token.EOF, ""))
 
 
+def test_lexer_with_hex_escapes():
+    lexer = lex_helper("'\\x41' \"\\x41\"")
+
+    assert eq(lexer.tkn, Token(Token.CHAR, "A"))
+    assert eq(lexer.next_token(), Token(Token.STRING, "A"))
+
+
+def test_lexer_with_octal_escapes():
+    lexer = lex_helper("'\\0''\\12' \"\\141\"")
+
+    assert eq(lexer.tkn, Token(Token.CHAR, "\x00"))
+    assert eq(lexer.next_token(), Token(Token.CHAR, "\n"))
+    assert eq(lexer.next_token(), Token(Token.STRING, "a"))
+
+
+def test_lexer_with_invalid_hex_escape():
+    lexer = lex_helper("'\\xa'  \"\\xgh\"")
+
+    assert len(lexer.messages.warnings) == 1
+    assert eq(lexer.tkn, Token(Token.ERROR, "over-long character literal"))
+    assert eq(lexer.next_token(), Token(Token.STRING, "xgh"))
+    assert len(lexer.messages.warnings) == 2
+
+
 def test_lexer_with_over_long_character_literal():
     lexer = lex_helper("'abc' 10")