// Package lexer is an AWK lexer (tokenizer).
//
// The lexer turns a string of AWK source code into a stream of
// tokens for parsing.
//
// To tokenize some source, create a new lexer with NewLexer(src) and
// then call Scan() until the token type is EOF or ILLEGAL.
//
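// A minimal sketch of that loop (it assumes Token's String method so
// a token prints with %s):
//
//	lex := NewLexer([]byte(`{ print $1 }`))
//	for {
//		pos, tok, val := lex.Scan()
//		if tok == EOF {
//			break
//		}
//		if tok == ILLEGAL {
//			fmt.Printf("%d:%d: %s\n", pos.Line, pos.Column, val)
//			break
//		}
//		fmt.Printf("%d:%d %s %q\n", pos.Line, pos.Column, tok, val)
//	}
//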
package lexer
import (
"fmt"
)
// Lexer tokenizes a byte string of AWK source code. Use NewLexer to
// actually create a lexer, and Scan() or ScanRegex() to get tokens.
type Lexer struct {
	src      []byte   // source code being tokenized
	offset   int      // index one past the byte in ch
	ch       byte     // current byte (0 at end of input)
	pos      Position // position of ch
	nextPos  Position // position of the byte after ch
	hadSpace bool     // whether the current token had whitespace before it
	lastTok  Token    // most recent token from Scan or ScanRegex
}
// Position stores the source line and column where a token starts.
type Position struct {
// Line number of the token (starts at 1).
Line int
// Column on the line (starts at 1). Note that this is the byte
// offset into the line, not rune offset.
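	// For example, in the line "αx" the 'x' has Column 3,
	// because 'α' is two bytes in UTF-8.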
Column int
}
// NewLexer creates a new lexer that will tokenize the given source
// code. See the module-level example for a working example.
func NewLexer(src []byte) *Lexer {
l := &Lexer{src: src}
l.nextPos.Line = 1
l.nextPos.Column = 1
l.next()
return l
}
// HadSpace returns true if the previously-scanned token had
// whitespace before it. The parser needs this because the grammar
// doesn't allow a space between a user-defined function's name and
// the left parenthesis of its call.
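// For example, "myfunc(x)" is a call to the user-defined function
// myfunc, but "myfunc (x)" is not parsed as a call.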
func (l *Lexer) HadSpace() bool {
return l.hadSpace
}
// Scan scans the next token and returns its position (line/column),
// its type (one of the uppercased Token constants), and its string
// value. For most tokens the string value is empty; for NAME,
// NUMBER, and STRING tokens it's the token's text (REGEX tokens,
// returned by ScanRegex, carry their text the same way), and for an
// ILLEGAL token it's the error message.
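// For example, scanning the source "x = 3" yields NAME with value
// "x", ASSIGN with an empty value, NUMBER with value "3", then EOF.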
func (l *Lexer) Scan() (Position, Token, string) {
pos, tok, val := l.scan()
l.lastTok = tok
return pos, tok, val
}
// Does the real work of scanning. Scan() wraps this to more easily
// set lastTok.
func (l *Lexer) scan() (Position, Token, string) {
// Skip whitespace (except newline, which is a token)
l.hadSpace = false
for l.ch == ' ' || l.ch == '\t' || l.ch == '\r' || l.ch == '\\' {
l.hadSpace = true
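		// A backslash at the end of a line (a '\' followed by '\n',
		// optionally with '\r' before it) is a line continuation.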
if l.ch == '\\' {
l.next()
if l.ch == '\r' {
l.next()
}
if l.ch != '\n' {
return l.pos, ILLEGAL, "expected \\n after \\ line continuation"
}
}
l.next()
}
if l.ch == '#' {
// Skip comment till end of line
l.next()
for l.ch != '\n' && l.ch != 0 {
l.next()
}
}
if l.ch == 0 {
// l.next() reached end of input
return l.pos, EOF, ""
}
pos := l.pos
tok := ILLEGAL
val := ""
ch := l.ch
l.next()
// Names: keywords and functions
if isNameStart(ch) {
start := l.offset - 2
for isNameStart(l.ch) || (l.ch >= '0' && l.ch <= '9') {
l.next()
}
name := string(l.src[start : l.offset-1])
tok := KeywordToken(name)
if tok == ILLEGAL {
tok = NAME
val = name
}
return pos, tok, val
}
// These are ordered by my guess at frequency of use. Should run
// through a corpus of real AWK programs to determine actual
// frequency.
switch ch {
case '$':
tok = DOLLAR
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.':
// Avoid make/append and use l.offset directly for performance
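		// Accepts integer and float forms like "12", "12.", ".5",
		// "1e3", and "1e-3".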
start := l.offset - 2
gotDigit := false
if ch != '.' {
gotDigit = true
for l.ch >= '0' && l.ch <= '9' {
l.next()
}
if l.ch == '.' {
l.next()
}
}
for l.ch >= '0' && l.ch <= '9' {
gotDigit = true
l.next()
}
if !gotDigit {
return l.pos, ILLEGAL, "expected digits"
}
if l.ch == 'e' || l.ch == 'E' {
l.next()
gotSign := false
if l.ch == '+' || l.ch == '-' {
gotSign = true
l.next()
}
gotDigit = false
for l.ch >= '0' && l.ch <= '9' {
l.next()
gotDigit = true
}
// Per awk/gawk, "1e" is allowed, but not "1e+"
if gotSign && !gotDigit {
return l.pos, ILLEGAL, "expected digits"
}
}
tok = NUMBER
val = string(l.src[start : l.offset-1])
case '{':
tok = LBRACE
case '}':
tok = RBRACE
case '=':
tok = l.choice('=', ASSIGN, EQUALS)
case '<':
tok = l.choice('=', LESS, LTE)
case '>':
switch l.ch {
case '=':
l.next()
tok = GTE
case '>':
l.next()
tok = APPEND
default:
tok = GREATER
}
case '"', '\'':
		// Note: the POSIX awk spec doesn't allow single-quoted strings,
		// but supporting them helps with quoting, especially on Windows,
		// where the shell quote character is " (double quote).
chars := make([]byte, 0, 32) // most won't require heap allocation
for l.ch != ch {
c := l.ch
if c == 0 {
return l.pos, ILLEGAL, "didn't find end quote in string"
}
if c == '\r' || c == '\n' {
return l.pos, ILLEGAL, "can't have newline in string"
}
if c == '\\' {
l.next()
switch l.ch {
case 'n':
c = '\n'
case 't':
c = '\t'
case 'r':
c = '\r'
default:
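					// Unknown escape: drop the backslash and keep the
					// escaped character as-is.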
c = l.ch
}
}
chars = append(chars, c)
l.next()
}
l.next()
tok = STRING
val = string(chars)
case '(':
tok = LPAREN
case ')':
tok = RPAREN
case ',':
tok = COMMA
case ';':
tok = SEMICOLON
case '+':
switch l.ch {
case '+':
l.next()
tok = INCR
case '=':
l.next()
tok = ADD_ASSIGN
default:
tok = ADD
}
case '-':
switch l.ch {
case '-':
l.next()
tok = DECR
case '=':
l.next()
tok = SUB_ASSIGN
default:
tok = SUB
}
case '*':
switch l.ch {
case '*':
l.next()
tok = l.choice('=', POW, POW_ASSIGN)
case '=':
l.next()
tok = MUL_ASSIGN
default:
tok = MUL
}
case '/':
tok = l.choice('=', DIV, DIV_ASSIGN)
case '%':
tok = l.choice('=', MOD, MOD_ASSIGN)
case '[':
tok = LBRACKET
case ']':
tok = RBRACKET
case '\n':
tok = NEWLINE
case '^':
tok = l.choice('=', POW, POW_ASSIGN)
case '!':
switch l.ch {
case '=':
l.next()
tok = NOT_EQUALS
case '~':
l.next()
tok = NOT_MATCH
default:
tok = NOT
}
case '~':
tok = MATCH
case '?':
tok = QUESTION
case ':':
tok = COLON
case '&':
tok = l.choice('&', ILLEGAL, AND)
if tok == ILLEGAL {
return l.pos, ILLEGAL, "unexpected char after '&'"
}
case '|':
tok = l.choice('|', PIPE, OR)
default:
tok = ILLEGAL
val = "unexpected char"
}
return pos, tok, val
}
// ScanRegex parses an AWK regular expression in /slash/ syntax. The
// AWK grammar has somewhat special handling of regex tokens, so the
// parser can only call this after a DIV or DIV_ASSIGN token has just
// been scanned.
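// For example, in "$0 ~ /foo/" the '/' first scans as DIV, and the
// parser then calls ScanRegex to read the rest of the regex. A regex
// beginning with '=', such as /=x/, first scans as DIV_ASSIGN, which
// scanRegex compensates for by prepending '='.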
func (l *Lexer) ScanRegex() (Position, Token, string) {
pos, tok, val := l.scanRegex()
l.lastTok = tok
return pos, tok, val
}
// Does the real work of scanning a regex. ScanRegex() wraps this to
// more easily set lastTok.
func (l *Lexer) scanRegex() (Position, Token, string) {
pos := l.pos
chars := make([]byte, 0, 32) // most won't require heap allocation
switch l.lastTok {
case DIV:
// Regex after '/' (the usual case)
pos.Column -= 1
case DIV_ASSIGN:
// Regex after '/=' (happens when regex starts with '=')
pos.Column -= 2
chars = append(chars, '=')
default:
return l.pos, ILLEGAL, fmt.Sprintf("unexpected %s preceding regex", l.lastTok)
}
for l.ch != '/' {
c := l.ch
if c == 0 {
return l.pos, ILLEGAL, "didn't find end slash in regex"
}
if c == '\r' || c == '\n' {
return l.pos, ILLEGAL, "can't have newline in regex"
}
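		// A backslash escape: "\/" becomes "/"; any other escape is
		// passed through with the backslash kept.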
if c == '\\' {
l.next()
if l.ch != '/' {
chars = append(chars, '\\')
}
c = l.ch
}
chars = append(chars, c)
l.next()
}
l.next()
return pos, REGEX, string(chars)
}
// Load the next character into l.ch (or 0 on end of input) and update
// line and column position.
func (l *Lexer) next() {
l.pos = l.nextPos
if l.offset >= len(l.src) {
// For last character, move offset 1 past the end as it
// simplifies offset calculations in NAME and NUMBER
if l.ch != 0 {
l.ch = 0
l.offset++
}
return
}
ch := l.src[l.offset]
if ch == '\n' {
l.nextPos.Line++
l.nextPos.Column = 1
} else {
l.nextPos.Column++
}
l.ch = ch
l.offset++
}
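// isNameStart reports whether ch can start an AWK name: an ASCII
// letter or an underscore.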
func isNameStart(ch byte) bool {
return ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
}
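// choice consumes the next character and returns token two if that
// character equals ch; otherwise it consumes nothing and returns one.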
func (l *Lexer) choice(ch byte, one, two Token) Token {
if l.ch == ch {
l.next()
return two
}
return one
}