From 542edc806d0592f8d9dcc65e09218823a0c10f88 Mon Sep 17 00:00:00 2001 From: Ian Fisher Date: Sat, 9 Feb 2019 09:24:41 -0500 Subject: [PATCH] Accept hex and octal escape sequences in char/str literals Resolves #109 --- .flake8 | 5 ++ CHANGELOG.md | 3 + hera/data.py | 4 -- hera/lexer.py | 116 +++++++++++++++++++++++------------ hera/utils.py | 11 ++-- test/test_error.py | 4 +- test/test_parse_error.py | 9 ++- test/test_unit/test_lexer.py | 32 ++++++++++ 8 files changed, 131 insertions(+), 53 deletions(-) diff --git a/.flake8 b/.flake8 index 77c1607..f153138 100644 --- a/.flake8 +++ b/.flake8 @@ -7,3 +7,8 @@ ignore = E203, # W503 line break before binary operator W503, + # W293 blank line contains whitespace (sometimes we need it in string literals, + # and black will catch it anyway) + W293, + # E501 line too long (black handles this) + E501, diff --git a/CHANGELOG.md b/CHANGELOG.md index 920ec15..e597570 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), **NOTE**: As permitted by semantic versioning, backward compatibility is NOT maintained for initial development, i.e. releases before 1.0.0. ## Unreleased +### Added +- Hexadecimal and octal escape sequences are now accepted in character and string literals. + ### Changed - Execution times of long-running programs has been cut down by roughly 50%. - Use of `SWI` and `RTI` operations now result in parse-time errors instead of run-time warnings. diff --git a/hera/data.py b/hera/data.py index d53e230..c9e855d 100644 --- a/hera/data.py +++ b/hera/data.py @@ -4,7 +4,6 @@ Version: February 2019 """ from collections import namedtuple -from typing import Optional, Tuple DEFAULT_DATA_START = 0xC001 @@ -132,9 +131,6 @@ def warn(self, msg, loc=None): self.warnings.append((msg, loc)) -ErrorType = Tuple[str, Optional[Location]] - - class HERAError(Exception): pass diff --git a/hera/lexer.py b/hera/lexer.py index c7646e5..2a38e37 100644 --- a/hera/lexer.py +++ b/hera/lexer.py @@ -6,7 +6,7 @@ Version: February 2019 """ import string -from typing import Optional +from typing import Optional, Tuple from hera.data import Location, Messages, Token from hera.utils import NAMED_REGISTERS @@ -116,6 +116,51 @@ def read_symbol(self) -> int: length += 1 return length + HEX_DIGITS = "0123456789abcdefABCDEF" + + def read_escape_char(self) -> Tuple[str, int]: + """Read an escape sequence (assuming `self.text[self.position] == "\\"`) and + return a pair (value, length), where `value` is what the escape sequence + resolves to and `length` is the number of characters read. + """ + peek = self.peek_char() + loc = self.get_location() + loc = loc._replace(column=loc.column + 1) + if peek == "": + return ("", 0) + elif peek == "x": + # Hex escapes + peek2 = self.peek_char(2) + peek3 = self.peek_char(3) + if peek2 in self.HEX_DIGITS and peek3 in self.HEX_DIGITS: + ordv = int(peek2 + peek3, base=16) + return (chr(ordv), 3) + else: + self.warn("invalid hex escape", loc) + return ("x", 1) + elif peek.isdigit(): + # Octal escapes + length = 1 + while length <= 3: + if not self.peek_char(length).isdigit(): + break + length += 1 + + val = self.text[self.position + 1 : self.position + length] + + try: + ordv = int(val, base=8) + except ValueError: + self.warn("invalid octal escape", loc) + return (self.peek_char(), 1) + else: + return (chr(ordv), length - 1) + else: + escape = escape_char(peek) + if len(escape) == 2: + self.warn("unrecognized backslash escape", loc) + return (escape, 1) + def consume_bracketed(self) -> None: self.next_char() loc = self.get_location() @@ -131,63 +176,53 @@ def consume_bracketed(self) -> None: self.next_char() def consume_str(self) -> None: - sbuilder = [] loc = self.get_location() self.next_char() - while self.position < len(self.text) and self.text[self.position] != '"': - if self.text[self.position] == "\\": - if self.position == len(self.text) - 1: - self.next_char() - break - - escape = escape_char(self.text[self.position + 1]) - sbuilder.append(escape) - self.next_char() - if len(escape) == 2: - self.warn("unrecognized backslash escape", self.get_location()) - self.next_char() - else: - sbuilder.append(self.text[self.position]) - self.next_char() + s = self.consume_delimited('"') if self.position == len(self.text): self.tkn = Token(Token.ERROR, "unclosed string literal", loc) return self.next_char() - s = "".join(sbuilder) self.tkn = Token(Token.STRING, s, loc) def consume_char(self) -> None: loc = self.get_location() self.next_char() - start = self.position - while self.position < len(self.text) and self.text[self.position] != "'": - if self.text[self.position] == "\\": - self.next_char() - self.next_char() + s = self.consume_delimited("'") if self.position == len(self.text): self.tkn = Token(Token.ERROR, "unclosed character literal", loc) return - contents = self.text[start : self.position] + self.next_char() - if len(contents) == 1: - loc = loc._replace(column=loc.column + 1) - self.tkn = Token(Token.CHAR, contents, loc) - elif len(contents) == 2 and contents[0] == "\\": - loc = loc._replace(column=loc.column + 2) - escape = escape_char(contents[1]) - if len(escape) == 2: - self.tkn = Token(Token.CHAR, escape[1], loc) - self.warn("unrecognized backslash escape", loc) - else: - self.tkn = Token(Token.CHAR, escape, loc) + if len(s) == 1: + self.tkn = Token(Token.CHAR, s, loc) + elif len(s) == 2 and s[0] == "\\": + self.tkn = Token(Token.CHAR, s[1], loc) else: self.tkn = Token(Token.ERROR, "over-long character literal", loc) - self.next_char() + def consume_delimited(self, delimiter) -> str: + sbuilder = [] + while self.position < len(self.text) and self.text[self.position] != delimiter: + if self.text[self.position] == "\\": + value, length = self.read_escape_char() + self.next_char() + if length == 0: + # Length of zero indicates EOF. + break + + sbuilder.append(value) + for _ in range(length): + self.next_char() + else: + sbuilder.append(self.text[self.position]) + self.next_char() + + return "".join(sbuilder) def skip(self) -> None: """Skip past whitespace and comments.""" @@ -220,6 +255,9 @@ def skip(self) -> None: break def next_char(self) -> None: + """Advance the position in the text by one. Do not call this method if the + current position is past the end of the text. + """ if self.text[self.position] == "\n": self.line += 1 self.column = 1 @@ -228,6 +266,9 @@ def next_char(self) -> None: self.position += 1 def peek_char(self, n=1) -> str: + """Return the n'th character in the text past the current position. If past the + end, return the empty string. + """ return ( self.text[self.position + n] if self.position + n < len(self.text) else "" ) @@ -239,9 +280,6 @@ def set_token(self, typ: str, *, length=1) -> None: self.next_char() self.tkn = Token(typ, value, loc) - def err(self, msg: str, loc) -> None: - self.messages.err(msg, loc) - def warn(self, msg: str, loc) -> None: self.messages.warn(msg, loc) diff --git a/hera/utils.py b/hera/utils.py index 051da41..fc276de 100644 --- a/hera/utils.py +++ b/hera/utils.py @@ -130,13 +130,10 @@ def print_message(msg: str, *, loc=None) -> None: if isinstance(loc, Location): linetext = loc.file_lines[loc.line - 1] - if linetext.strip(): - caret = align_caret(linetext, loc.column) + "^" - msg += ", line {} col {} of {}\n\n {}\n {}\n".format( - loc.line, loc.column, loc.path, linetext, caret - ) - else: - msg += ", line {0.line} col {0.column} of {0.path}".format(loc) + caret = align_caret(linetext, loc.column) + "^" + msg += ", line {} col {} of {}\n\n {}\n {}\n".format( + loc.line, loc.column, loc.path, linetext, caret + ) sys.stderr.write(msg + "\n") diff --git a/test/test_error.py b/test/test_error.py index 66e9001..85e3c83 100644 --- a/test/test_error.py +++ b/test/test_error.py @@ -460,10 +460,10 @@ def test_mega_error(capsys): ADD('c', "abc") ^ -Error: expected register, line 8 col 6 of test/assets/error/mega_error.hera +Error: expected register, line 8 col 5 of test/assets/error/mega_error.hera ADD('c', "abc") - ^ + ^ Error: expected register, line 8 col 10 of test/assets/error/mega_error.hera diff --git a/test/test_parse_error.py b/test/test_parse_error.py index 5312d1c..1a65b02 100644 --- a/test/test_parse_error.py +++ b/test/test_parse_error.py @@ -164,7 +164,14 @@ def test_parse_error_for_unclosed_arglist(capsys): captured = capsys.readouterr().err assert ( captured - == "\nError: expected comma or right parenthesis, line 2 col 1 of \n" + == """\ + +Error: expected comma or right parenthesis, line 2 col 1 of + + + ^ + +""" ) diff --git a/test/test_unit/test_lexer.py b/test/test_unit/test_lexer.py index fcf72d2..9b3744a 100644 --- a/test/test_unit/test_lexer.py +++ b/test/test_unit/test_lexer.py @@ -60,6 +60,14 @@ def test_lexer_with_character_literal(): assert eq(lexer.next_token(), Token(Token.EOF, "")) +def test_lexer_with_consecutive_character_literals(): + lexer = lex_helper("'a''b'") + + assert eq(lexer.tkn, Token(Token.CHAR, "a")) + assert eq(lexer.next_token(), Token(Token.CHAR, "b")) + assert eq(lexer.next_token(), Token(Token.EOF, "")) + + def test_lexer_with_character_literal_backslash_escape(): lexer = lex_helper("'\\n'") @@ -68,6 +76,30 @@ def test_lexer_with_character_literal_backslash_escape(): assert eq(lexer.next_token(), Token(Token.EOF, "")) +def test_lexer_with_hex_escapes(): + lexer = lex_helper("'\\x41' \"\\x41\"") + + assert eq(lexer.tkn, Token(Token.CHAR, "A")) + assert eq(lexer.next_token(), Token(Token.STRING, "A")) + + +def test_lexer_with_octal_escapes(): + lexer = lex_helper("'\\0''\\12' \"\\141\"") + + assert eq(lexer.tkn, Token(Token.CHAR, "\x00")) + assert eq(lexer.next_token(), Token(Token.CHAR, "\n")) + assert eq(lexer.next_token(), Token(Token.STRING, "a")) + + +def test_lexer_with_invalid_hex_escape(): + lexer = lex_helper("'\\xa' \"\\xgh\"") + + assert len(lexer.messages.warnings) == 1 + assert eq(lexer.tkn, Token(Token.ERROR, "over-long character literal")) + assert eq(lexer.next_token(), Token(Token.STRING, "xgh")) + assert len(lexer.messages.warnings) == 2 + + def test_lexer_with_over_long_character_literal(): lexer = lex_helper("'abc' 10")