Skip to content

Commit

Permalink
Accept hex and octal escape sequences in char/str literals
Browse files Browse the repository at this point in the history
Resolves #109
  • Loading branch information
iafisher committed Feb 9, 2019
1 parent f8e1678 commit 542edc8
Show file tree
Hide file tree
Showing 8 changed files with 131 additions and 53 deletions.
5 changes: 5 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,8 @@ ignore =
E203,
# W503 line break before binary operator
W503,
# W293 blank line contains whitespace (sometimes we need it in string literals,
# and black will catch it anyway)
W293,
# E501 line too long (black handles this)
E501,
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
**NOTE**: As permitted by semantic versioning, backward compatibility is NOT maintained for initial development, i.e. releases before 1.0.0.

## Unreleased
### Added
- Hexadecimal and octal escape sequences are now accepted in character and string literals.

### Changed
- Execution times of long-running programs have been cut down by roughly 50%.
- Use of `SWI` and `RTI` operations now result in parse-time errors instead of run-time warnings.
Expand Down
4 changes: 0 additions & 4 deletions hera/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
Version: February 2019
"""
from collections import namedtuple
from typing import Optional, Tuple


DEFAULT_DATA_START = 0xC001
Expand Down Expand Up @@ -132,9 +131,6 @@ def warn(self, msg, loc=None):
self.warnings.append((msg, loc))


ErrorType = Tuple[str, Optional[Location]]


class HERAError(Exception):
pass

Expand Down
116 changes: 77 additions & 39 deletions hera/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
Version: February 2019
"""
import string
from typing import Optional
from typing import Optional, Tuple

from hera.data import Location, Messages, Token
from hera.utils import NAMED_REGISTERS
Expand Down Expand Up @@ -116,6 +116,51 @@ def read_symbol(self) -> int:
length += 1
return length

HEX_DIGITS = "0123456789abcdefABCDEF"

def read_escape_char(self) -> Tuple[str, int]:
    """Read an escape sequence (assuming `self.text[self.position] == "\\"`) and
    return a pair (value, length), where `value` is what the escape sequence
    resolves to and `length` is the number of characters read after the
    backslash.

    A length of zero means the backslash is the last character of the text.
    Malformed hex and octal escapes, and unrecognized backslash escapes, are
    reported through `self.warn`, and the character after the backslash is
    consumed on its own (length 1).
    """
    peek = self.peek_char()
    # Warnings point at the character following the backslash.
    loc = self.get_location()
    loc = loc._replace(column=loc.column + 1)
    if peek == "":
        return ("", 0)
    elif peek == "x":
        # Hex escapes: exactly two hex digits must follow the "x".
        digits = self.peek_char(2) + self.peek_char(3)
        # `peek_char` returns "" past the end of the text, and "" is `in` every
        # string, so the length must be checked explicitly. Otherwise a
        # truncated escape such as `\x4` at the end of the input would be
        # accepted and the caller would advance past the end of the text.
        if len(digits) == 2 and all(d in self.HEX_DIGITS for d in digits):
            return (chr(int(digits, base=16)), 3)
        else:
            self.warn("invalid hex escape", loc)
            return ("x", 1)
    elif peek.isdigit():
        # Octal escapes: take up to three consecutive digits. `length` ends up
        # one greater than the number of digits found.
        length = 1
        while length <= 3:
            if not self.peek_char(length).isdigit():
                break
            length += 1

        val = self.text[self.position + 1 : self.position + length]

        try:
            ordv = int(val, base=8)
        except ValueError:
            # The digits are not valid in base 8 (e.g. they contain 8 or 9).
            self.warn("invalid octal escape", loc)
            return (peek, 1)
        else:
            return (chr(ordv), length - 1)
    else:
        escape = escape_char(peek)
        # A two-character result is treated as "not a recognized escape".
        if len(escape) == 2:
            self.warn("unrecognized backslash escape", loc)
        return (escape, 1)

def consume_bracketed(self) -> None:
self.next_char()
loc = self.get_location()
Expand All @@ -131,63 +176,53 @@ def consume_bracketed(self) -> None:
self.next_char()

def consume_str(self) -> None:
sbuilder = []
loc = self.get_location()
self.next_char()
while self.position < len(self.text) and self.text[self.position] != '"':
if self.text[self.position] == "\\":
if self.position == len(self.text) - 1:
self.next_char()
break

escape = escape_char(self.text[self.position + 1])
sbuilder.append(escape)
self.next_char()
if len(escape) == 2:
self.warn("unrecognized backslash escape", self.get_location())
self.next_char()
else:
sbuilder.append(self.text[self.position])
self.next_char()
s = self.consume_delimited('"')

if self.position == len(self.text):
self.tkn = Token(Token.ERROR, "unclosed string literal", loc)
return

self.next_char()
s = "".join(sbuilder)
self.tkn = Token(Token.STRING, s, loc)

def consume_char(self) -> None:
loc = self.get_location()
self.next_char()
start = self.position
while self.position < len(self.text) and self.text[self.position] != "'":
if self.text[self.position] == "\\":
self.next_char()
self.next_char()
s = self.consume_delimited("'")

if self.position == len(self.text):
self.tkn = Token(Token.ERROR, "unclosed character literal", loc)
return

contents = self.text[start : self.position]
self.next_char()

if len(contents) == 1:
loc = loc._replace(column=loc.column + 1)
self.tkn = Token(Token.CHAR, contents, loc)
elif len(contents) == 2 and contents[0] == "\\":
loc = loc._replace(column=loc.column + 2)
escape = escape_char(contents[1])
if len(escape) == 2:
self.tkn = Token(Token.CHAR, escape[1], loc)
self.warn("unrecognized backslash escape", loc)
else:
self.tkn = Token(Token.CHAR, escape, loc)
if len(s) == 1:
self.tkn = Token(Token.CHAR, s, loc)
elif len(s) == 2 and s[0] == "\\":
self.tkn = Token(Token.CHAR, s[1], loc)
else:
self.tkn = Token(Token.ERROR, "over-long character literal", loc)

self.next_char()
def consume_delimited(self, delimiter) -> str:
    """Collect characters from the current position up to `delimiter`.

    Backslash escapes are resolved through `self.read_escape_char`. Scanning
    stops, without consuming it, at the first unescaped `delimiter`, or at the
    end of the text. Returns the collected characters joined into one string.
    """
    pieces = []
    while self.position < len(self.text):
        current = self.text[self.position]
        if current == delimiter:
            break

        if current != "\\":
            pieces.append(current)
            self.next_char()
            continue

        value, length = self.read_escape_char()
        # Step over the backslash itself.
        self.next_char()
        if length == 0:
            # Length of zero indicates EOF.
            break

        pieces.append(value)
        # Step over the characters that made up the escape sequence.
        for _ in range(length):
            self.next_char()

    return "".join(pieces)

def skip(self) -> None:
"""Skip past whitespace and comments."""
Expand Down Expand Up @@ -220,6 +255,9 @@ def skip(self) -> None:
break

def next_char(self) -> None:
"""Advance the position in the text by one. Do not call this method if the
current position is past the end of the text.
"""
if self.text[self.position] == "\n":
self.line += 1
self.column = 1
Expand All @@ -228,6 +266,9 @@ def next_char(self) -> None:
self.position += 1

def peek_char(self, n=1) -> str:
    """Look ahead `n` characters from the current position without advancing.

    Returns the character at index `self.position + n`, or the empty string
    when that index is at or beyond the end of the text.
    """
    target = self.position + n
    if target < len(self.text):
        return self.text[target]
    return ""
Expand All @@ -239,9 +280,6 @@ def set_token(self, typ: str, *, length=1) -> None:
self.next_char()
self.tkn = Token(typ, value, loc)

def err(self, msg: str, loc) -> None:
self.messages.err(msg, loc)

def warn(self, msg: str, loc) -> None:
    """Forward the warning `msg` at location `loc` to `self.messages.warn`."""
    self.messages.warn(msg, loc)

Expand Down
11 changes: 4 additions & 7 deletions hera/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,10 @@ def print_message(msg: str, *, loc=None) -> None:

if isinstance(loc, Location):
linetext = loc.file_lines[loc.line - 1]
if linetext.strip():
caret = align_caret(linetext, loc.column) + "^"
msg += ", line {} col {} of {}\n\n {}\n {}\n".format(
loc.line, loc.column, loc.path, linetext, caret
)
else:
msg += ", line {0.line} col {0.column} of {0.path}".format(loc)
caret = align_caret(linetext, loc.column) + "^"
msg += ", line {} col {} of {}\n\n {}\n {}\n".format(
loc.line, loc.column, loc.path, linetext, caret
)

sys.stderr.write(msg + "\n")

Expand Down
4 changes: 2 additions & 2 deletions test/test_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,10 +460,10 @@ def test_mega_error(capsys):
ADD('c', "abc")
^
Error: expected register, line 8 col 6 of test/assets/error/mega_error.hera
Error: expected register, line 8 col 5 of test/assets/error/mega_error.hera
ADD('c', "abc")
^
^
Error: expected register, line 8 col 10 of test/assets/error/mega_error.hera
Expand Down
9 changes: 8 additions & 1 deletion test/test_parse_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,14 @@ def test_parse_error_for_unclosed_arglist(capsys):
captured = capsys.readouterr().err
assert (
captured
== "\nError: expected comma or right parenthesis, line 2 col 1 of <stdin>\n"
== """\
Error: expected comma or right parenthesis, line 2 col 1 of <stdin>
^
"""
)


Expand Down
32 changes: 32 additions & 0 deletions test/test_unit/test_lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,14 @@ def test_lexer_with_character_literal():
assert eq(lexer.next_token(), Token(Token.EOF, ""))


def test_lexer_with_consecutive_character_literals():
    """Two adjacent character literals lex as two CHAR tokens followed by EOF."""
    lexer = lex_helper("'a''b'")

    assert eq(lexer.tkn, Token(Token.CHAR, "a"))

    second = lexer.next_token()
    assert eq(second, Token(Token.CHAR, "b"))

    last = lexer.next_token()
    assert eq(last, Token(Token.EOF, ""))


def test_lexer_with_character_literal_backslash_escape():
lexer = lex_helper("'\\n'")

Expand All @@ -68,6 +76,30 @@ def test_lexer_with_character_literal_backslash_escape():
assert eq(lexer.next_token(), Token(Token.EOF, ""))


def test_lexer_with_hex_escapes():
    """A `\\x41` escape resolves to "A" in both character and string literals."""
    lexer = lex_helper("'\\x41' \"\\x41\"")

    assert eq(lexer.tkn, Token(Token.CHAR, "A"))

    following = lexer.next_token()
    assert eq(following, Token(Token.STRING, "A"))


def test_lexer_with_octal_escapes():
    """Octal escapes of one, two and three digits resolve to the right characters."""
    lexer = lex_helper("'\\0''\\12' \"\\141\"")

    assert eq(lexer.tkn, Token(Token.CHAR, "\x00"))

    two_digit = lexer.next_token()
    assert eq(two_digit, Token(Token.CHAR, "\n"))

    three_digit = lexer.next_token()
    assert eq(three_digit, Token(Token.STRING, "a"))


def test_lexer_with_invalid_hex_escape():
    """Malformed hex escapes each produce one warning and fall back to a literal "x"."""
    lexer = lex_helper("'\\xa' \"\\xgh\"")
    warnings = lexer.messages

    # The character literal has already been lexed when the helper returns.
    assert len(warnings.warnings) == 1
    assert eq(lexer.tkn, Token(Token.ERROR, "over-long character literal"))

    string_token = lexer.next_token()
    assert eq(string_token, Token(Token.STRING, "xgh"))
    assert len(lexer.messages.warnings) == 2


def test_lexer_with_over_long_character_literal():
lexer = lex_helper("'abc' 10")

Expand Down

0 comments on commit 542edc8

Please sign in to comment.