frontend/dockerfile/parser/parser.go

// The parser package implements a parser that transforms a raw byte-stream
// into a low-level Abstract Syntax Tree.
package parser

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"regexp"
	"strconv"
	"strings"
	"unicode"

	"github.com/khulnasoft/devkit/frontend/dockerfile/command"
	"github.com/khulnasoft/devkit/frontend/dockerfile/shell"
	"github.com/pkg/errors"
)

// Node is a structure used to represent a parse tree.
//
// In the node there are three fields, Value, Next, and Children. Value is the
// current token's string value. Next is always the next non-child token, and
// children contains all the children. Here's an example:
//
// (value next (child child-next child-next-next) next-next)
//
// This data structure is frankly pretty lousy for handling complex languages,
// but lucky for us the Dockerfile isn't very complicated. This structure
// works a little more effectively than a "proper" parse tree for our needs.
type Node struct {
	Value       string          // actual content
	Next        *Node           // the next item in the current sexp
	Children    []*Node         // the children of this sexp
	Heredocs    []Heredoc       // extra heredoc content attachments
	Attributes  map[string]bool // special attributes for this node
	Original    string          // original line used before parsing
	Flags       []string        // only top Node should have this set
	StartLine   int             // the line in the original dockerfile where the node begins
	EndLine     int             // the line in the original dockerfile where the node ends
	PrevComment []string
}

// Location return the location of node in source code
func (node *Node) Location() []Range {
	return toRanges(node.StartLine, node.EndLine)
}

// Dump dumps the AST defined by `node` as a list of sexps.
// Returns a string suitable for printing.
func (node *Node) Dump() string {
	str := strings.ToLower(node.Value)

	if len(node.Flags) > 0 {
		str += fmt.Sprintf(" %q", node.Flags)
	}

	for _, n := range node.Children {
		str += "(" + n.Dump() + ")\n"
	}

	for n := node.Next; n != nil; n = n.Next {
		if len(n.Children) > 0 {
			str += " " + n.Dump()
		} else {
			str += " " + strconv.Quote(n.Value)
		}
	}

	return strings.TrimSpace(str)
}

func (node *Node) lines(start, end int) {
	node.StartLine = start
	node.EndLine = end
}

func (node *Node) canContainHeredoc() bool {
	// check for compound commands, like ONBUILD
	if ok := heredocCompoundDirectives[strings.ToLower(node.Value)]; ok {
		if node.Next != nil && len(node.Next.Children) > 0 {
			node = node.Next.Children[0]
		}
	}

	if ok := heredocDirectives[strings.ToLower(node.Value)]; !ok {
		return false
	}
	if isJSON := node.Attributes["json"]; isJSON {
		return false
	}

	return true
}

// AddChild adds a new child node, and updates line information
func (node *Node) AddChild(child *Node, startLine, endLine int) {
	child.lines(startLine, endLine)
	if node.StartLine < 0 {
		node.StartLine = startLine
	}
	node.EndLine = endLine
	node.Children = append(node.Children, child)
}

type Heredoc struct {
	Name           string
	FileDescriptor uint
	Expand         bool
	Chomp          bool
	Content        string
}

var (
	dispatch      map[string]func(string, *directives) (*Node, map[string]bool, error)
	reWhitespace  = regexp.MustCompile(`[\t\v\f\r ]+`)
	reComment     = regexp.MustCompile(`^#.*$`)
	reHeredoc     = regexp.MustCompile(`^(\d*)<<(-?)([^<]*)$`)
	reLeadingTabs = regexp.MustCompile(`(?m)^\t+`)
)

// DefaultEscapeToken is the default escape token
const DefaultEscapeToken = '\\'

var (
	// Directives allowed to contain heredocs
	heredocDirectives = map[string]bool{
		command.Add:  true,
		command.Copy: true,
		command.Run:  true,
	}

	// Directives allowed to contain directives containing heredocs
	heredocCompoundDirectives = map[string]bool{
		command.Onbuild: true,
	}
)

// directives is the structure used during a build run to hold the state of
// parsing directives.
type directives struct {
	parser                DirectiveParser
	escapeToken           rune           // Current escape token
	lineContinuationRegex *regexp.Regexp // Current line continuation regex
}

// setEscapeToken sets the default token for escaping characters and as line-
// continuation token in a Dockerfile. Only ` (backtick) and \ (backslash) are
// allowed as token.
func (d *directives) setEscapeToken(s string) error {
	if s != "`" && s != `\` {
		return errors.Errorf("invalid escape token '%s' does not match ` or \\", s)
	}
	d.escapeToken = rune(s[0])
	// The escape token is used both to escape characters in a line and as line
	// continuation token. If it's the last non-whitespace token, it is used as
	// line-continuation token, *unless* preceded by an escape-token.
	//
	// The second branch in the regular expression handles line-continuation
	// tokens on their own line, which don't have any character preceding them.
	//
	// Due to Go lacking negative look-ahead matching, this regular expression
	// does not currently handle a line-continuation token preceded by an *escaped*
	// escape-token ("foo \\\").
	d.lineContinuationRegex = regexp.MustCompile(`([^\` + s + `])\` + s + `[ \t]*$|^\` + s + `[ \t]*$`)
	return nil
}

// possibleParserDirective looks for parser directives, eg '# escapeToken=<char>'.
// Parser directives must precede any builder instruction or other comments,
// and cannot be repeated.
func (d *directives) possibleParserDirective(line string) error {
	directive, err := d.parser.ParseLine([]byte(line))
	if err != nil {
		return err
	}
	if directive != nil && directive.Name == keyEscape {
		return d.setEscapeToken(directive.Value)
	}
	return nil
}

// newDefaultDirectives returns a new directives structure with the default escapeToken token
func newDefaultDirectives() *directives {
	d := &directives{}
	d.setEscapeToken(string(DefaultEscapeToken))
	return d
}

func init() {
	// Dispatch Table. see line_parsers.go for the parse functions.
	// The command is parsed and mapped to the line parser. The line parser
	// receives the arguments but not the command, and returns an AST after
	// reformulating the arguments according to the rules in the parser
	// functions. Errors are propagated up by Parse() and the resulting AST can
	// be incorporated directly into the existing AST as a next.
	dispatch = map[string]func(string, *directives) (*Node, map[string]bool, error){
		command.Add:         parseMaybeJSONToList,
		command.Arg:         parseNameOrNameVal,
		command.Cmd:         parseMaybeJSON,
		command.Copy:        parseMaybeJSONToList,
		command.Entrypoint:  parseMaybeJSON,
		command.Env:         parseEnv,
		command.Expose:      parseStringsWhitespaceDelimited,
		command.From:        parseStringsWhitespaceDelimited,
		command.Healthcheck: parseHealthConfig,
		command.Label:       parseLabel,
		command.Maintainer:  parseString,
		command.Onbuild:     parseSubCommand,
		command.Run:         parseMaybeJSON,
		command.Shell:       parseMaybeJSON,
		command.StopSignal:  parseString,
		command.User:        parseString,
		command.Volume:      parseMaybeJSONToList,
		command.Workdir:     parseString,
	}
}

// newNodeFromLine splits the line into parts, and dispatches to a function
// based on the command and command arguments. A Node is created from the
// result of the dispatch.
func newNodeFromLine(line string, d *directives, comments []string) (*Node, error) {
	cmd, flags, args, err := splitCommand(line)
	if err != nil {
		return nil, err
	}

	fn := dispatch[strings.ToLower(cmd)]
	// Ignore invalid Dockerfile instructions
	if fn == nil {
		fn = parseIgnore
	}
	next, attrs, err := fn(args, d)
	if err != nil {
		return nil, err
	}

	return &Node{
		Value:       cmd,
		Original:    line,
		Flags:       flags,
		Next:        next,
		Attributes:  attrs,
		PrevComment: comments,
	}, nil
}

// Result contains the bundled outputs from parsing a Dockerfile.
type Result struct {
	AST         *Node
	EscapeToken rune
	Warnings    []Warning
}

// Warning contains information to identify and locate a warning generated
// during parsing.
type Warning struct {
	Short    string
	Detail   [][]byte
	URL      string
	Location *Range
}

// PrintWarnings to the writer
func (r *Result) PrintWarnings(out io.Writer) {
	if len(r.Warnings) == 0 {
		return
	}
	for _, w := range r.Warnings {
		fmt.Fprintf(out, "[WARNING]: %s\n", w.Short)
	}
	if len(r.Warnings) > 0 {
		fmt.Fprintf(out, "[WARNING]: Empty continuation lines will become errors in a future release.\n")
	}
}

// Parse consumes lines from a provided Reader, parses each line into an AST
// and returns the results of doing so.
func Parse(rwc io.Reader) (*Result, error) {
	d := newDefaultDirectives()
	currentLine := 0
	root := &Node{StartLine: -1}
	scanner := bufio.NewScanner(rwc)
	scanner.Split(scanLines)
	warnings := []Warning{}
	var comments []string

	var err error
	for scanner.Scan() {
		bytesRead := scanner.Bytes()
		if currentLine == 0 {
			// First line, strip the byte-order-marker if present
			bytesRead = bytes.TrimPrefix(bytesRead, utf8bom)
		}
		if isComment(bytesRead) {
			comment := strings.TrimSpace(string(bytesRead[1:]))
			if comment == "" {
				comments = nil
			} else {
				comments = append(comments, comment)
			}
		}
		bytesRead, err = processLine(d, bytesRead, true)
		if err != nil {
			return nil, withLocation(err, currentLine, 0)
		}
		currentLine++

		startLine := currentLine
		line, isEndOfLine := trimContinuationCharacter(string(bytesRead), d)
		if isEndOfLine && line == "" {
			continue
		}

		var hasEmptyContinuationLine bool
		for !isEndOfLine && scanner.Scan() {
			bytesRead, err := processLine(d, scanner.Bytes(), false)
			if err != nil {
				return nil, withLocation(err, currentLine, 0)
			}
			currentLine++

			if isComment(scanner.Bytes()) {
				// original line was a comment (processLine strips comments)
				continue
			}
			if isEmptyContinuationLine(bytesRead) {
				hasEmptyContinuationLine = true
				continue
			}

			continuationLine := string(bytesRead)
			continuationLine, isEndOfLine = trimContinuationCharacter(continuationLine, d)
			line += continuationLine
		}

		if hasEmptyContinuationLine {
			warnings = append(warnings, Warning{
				Short:    "Empty continuation line found in: " + line,
				Detail:   [][]byte{[]byte("Empty continuation lines will become errors in a future release")},
				URL:      "https://github.com/moby/moby/pull/33719",
				Location: &Range{Start: Position{Line: currentLine}, End: Position{Line: currentLine}},
			})
		}

		child, err := newNodeFromLine(line, d, comments)
		if err != nil {
			return nil, withLocation(err, startLine, currentLine)
		}

		if child.canContainHeredoc() {
			heredocs, err := heredocsFromLine(line)
			if err != nil {
				return nil, withLocation(err, startLine, currentLine)
			}

			for _, heredoc := range heredocs {
				terminator := []byte(heredoc.Name)
				terminated := false
				for scanner.Scan() {
					bytesRead := scanner.Bytes()
					currentLine++

					possibleTerminator := trimNewline(bytesRead)
					if heredoc.Chomp {
						possibleTerminator = trimLeadingTabs(possibleTerminator)
					}
					if bytes.Equal(possibleTerminator, terminator) {
						terminated = true
						break
					}
					heredoc.Content += string(bytesRead)
				}
				if !terminated {
					return nil, withLocation(errors.New("unterminated heredoc"), startLine, currentLine)
				}

				child.Heredocs = append(child.Heredocs, heredoc)
			}
		}

		root.AddChild(child, startLine, currentLine)
		comments = nil
	}

	if root.StartLine < 0 {
		return nil, withLocation(errors.New("file with no instructions"), currentLine, 0)
	}

	return &Result{
		AST:         root,
		Warnings:    warnings,
		EscapeToken: d.escapeToken,
	}, withLocation(handleScannerError(scanner.Err()), currentLine, 0)
}

// heredocFromMatch extracts a heredoc from a possible heredoc regex match.
func heredocFromMatch(match []string) (*Heredoc, error) {
	if len(match) == 0 {
		return nil, nil
	}

	fd, _ := strconv.ParseUint(match[1], 10, 0)
	chomp := match[2] == "-"
	rest := match[3]

	if len(rest) == 0 {
		return nil, nil
	}

	shlex := shell.NewLex('\\')
	shlex.SkipUnsetEnv = true

	// Attempt to parse both the heredoc both with *and* without quotes.
	// If there are quotes in one but not the other, then we know that some
	// part of the heredoc word is quoted, so we shouldn't expand the content.
	shlex.RawQuotes = false
	words, err := shlex.ProcessWords(rest, []string{})
	if err != nil {
		return nil, err
	}
	// quick sanity check that rest is a single word
	if len(words) != 1 {
		return nil, nil
	}

	shlex.RawQuotes = true
	wordsRaw, err := shlex.ProcessWords(rest, []string{})
	if err != nil {
		return nil, err
	}
	if len(wordsRaw) != len(words) {
		return nil, errors.Errorf("internal lexing of heredoc produced inconsistent results: %s", rest)
	}

	word := words[0]
	wordQuoteCount := strings.Count(word, `'`) + strings.Count(word, `"`)
	wordRaw := wordsRaw[0]
	wordRawQuoteCount := strings.Count(wordRaw, `'`) + strings.Count(wordRaw, `"`)

	expand := wordQuoteCount == wordRawQuoteCount

	return &Heredoc{
		Name:           word,
		Expand:         expand,
		Chomp:          chomp,
		FileDescriptor: uint(fd),
	}, nil
}

// ParseHeredoc parses a heredoc word from a target string, returning the
// components from the doc.
func ParseHeredoc(src string) (*Heredoc, error) {
	return heredocFromMatch(reHeredoc.FindStringSubmatch(src))
}

// MustParseHeredoc is a variant of ParseHeredoc that discards the error, if
// there was one present.
func MustParseHeredoc(src string) *Heredoc {
	heredoc, _ := ParseHeredoc(src)
	return heredoc
}

func heredocsFromLine(line string) ([]Heredoc, error) {
	shlex := shell.NewLex('\\')
	shlex.RawQuotes = true
	shlex.RawEscapes = true
	shlex.SkipUnsetEnv = true
	words, _ := shlex.ProcessWords(line, []string{})

	var docs []Heredoc
	for _, word := range words {
		heredoc, err := ParseHeredoc(word)
		if err != nil {
			return nil, err
		}
		if heredoc != nil {
			docs = append(docs, *heredoc)
		}
	}
	return docs, nil
}

// ChompHeredocContent chomps leading tabs from the heredoc.
func ChompHeredocContent(src string) string {
	return reLeadingTabs.ReplaceAllString(src, "")
}

func trimComments(src []byte) []byte {
	return reComment.ReplaceAll(src, []byte{})
}

func trimLeadingWhitespace(src []byte) []byte {
	return bytes.TrimLeftFunc(src, unicode.IsSpace)
}
func trimLeadingTabs(src []byte) []byte {
	return bytes.TrimLeft(src, "\t")
}
func trimNewline(src []byte) []byte {
	return bytes.TrimRight(src, "\r\n")
}

func isComment(line []byte) bool {
	return reComment.Match(trimLeadingWhitespace(trimNewline(line)))
}

func isEmptyContinuationLine(line []byte) bool {
	return len(trimLeadingWhitespace(trimNewline(line))) == 0
}

var utf8bom = []byte{0xEF, 0xBB, 0xBF}

func trimContinuationCharacter(line string, d *directives) (string, bool) {
	if d.lineContinuationRegex.MatchString(line) {
		line = d.lineContinuationRegex.ReplaceAllString(line, "$1")
		return line, false
	}
	return line, true
}

// TODO: remove stripLeftWhitespace after deprecation period. It seems silly
// to preserve whitespace on continuation lines. Why is that done?
func processLine(d *directives, token []byte, stripLeftWhitespace bool) ([]byte, error) {
	token = trimNewline(token)
	if stripLeftWhitespace {
		token = trimLeadingWhitespace(token)
	}
	return trimComments(token), d.possibleParserDirective(string(token))
}

// Variation of bufio.ScanLines that preserves the line endings
func scanLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	if i := bytes.IndexByte(data, '\n'); i >= 0 {
		return i + 1, data[0 : i+1], nil
	}
	if atEOF {
		return len(data), data, nil
	}
	return 0, nil, nil
}

func handleScannerError(err error) error {
	switch err {
	case bufio.ErrTooLong:
		return errors.Errorf("dockerfile line greater than max allowed size of %d", bufio.MaxScanTokenSize-1)
	default:
		return err
	}
}