-
Notifications
You must be signed in to change notification settings - Fork 15
/
LexToken.scala
130 lines (119 loc) · 6.21 KB
/
LexToken.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/*
* Copyright 2020 Parsley Contributors <https://github.com/j-mie6/Parsley/graphs/contributors>
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package parsley.errors.tokenextractors
import scala.collection.immutable.WrappedString
import parsley.Parsley, Parsley.{atomic, lookAhead, notFollowedBy}
import parsley.Success
import parsley.XAssert.assert
import parsley.character.{item, stringOfSome}
import parsley.combinator.{option, traverse}
import parsley.errors.{ErrorBuilder, Token}
import parsley.position
import org.typelevel.scalaccompat.annotation.unused
/** This extractor mixin provides an implementation for
* [[parsley.errors.ErrorBuilder.unexpectedToken `ErrorBuilder.unexpectedToken`]] when mixed into
* an error builder: it will try and parse the residual input to identify a valid lexical token
* to report.
*
 * When parsing a grammar that has a dedicated lexical distinction, it is nice to be able to report
* problematic tokens relevant to that grammar as opposed to generic input lifted straight from the
* input stream. The easiest way of doing this would be having a pre-lexing pass and parsing based
* on tokens, but this is deliberately not how Parsley is designed. Instead, this extractor can
* try and parse the remaining input to try and identify a token on demand.
*
* If the `lexicalError` flag of the `unexpectedToken` method is not set, which would indicate a
* problem within a token reported by a classical lexer and not the parser, the extractor will
* try to parse each of the provided `tokens` in turn: whichever is the longest matched of these
* tokens will be reported as the problematic one (this can be changed by overriding `selectToken`).
* For best effect, these tokens should not consume
* whitespace (which would otherwise be included at the end of the token!): this means that, if
* using the `Lexer` class, the functionality in '''`nonlexeme`''' should be used. If one of the
 * given tokens cannot be parsed, the input until the ''next'' valid parsable token (or end of input)
* is returned as a `Token.Raw`.
*
* Currently, if `lexicalError` is true, this extractor will just return the next character
* as the problematic item (this may be changed by overriding the `extractItem` method).
*
* @since 4.0.0
*/
trait LexToken { this: ErrorBuilder[_] =>
    /** The tokens that should be recognised by this extractor: each parser should return the
      * intended name of the token exactly as it should appear in the `Named` token.
      *
      * This '''should''' include a whitespace parser for "unexpected whitespace".
      *
      * @since 4.0.0
      * @note with the exception of the whitespace parser, these tokens should not consume
      *       trailing (and certainly not leading) whitespace: if using definitions from
      *       `parsley.token.Lexer` functionality, the `nonlexeme` versions of the tokens
      *       should be used.
      */
    def tokens: Seq[Parsley[String]]

    // Recogniser run over the residual input: a `Left` carries the (guaranteed non-empty)
    // list of (token name, end offset) pairs for every token that matches at the front of
    // the input; a `Right` carries the raw input up to the next parsable token (or EOF).
    // This parser cannot and must not fail (extractToken pattern-matches on Success).
    private lazy val makeParser: Parsley[Either[::[(String, Int)], String]] = tokens match {
        case t0 +: ts =>
            // try every token speculatively: `atomic` rolls back partial matches, `lookAhead`
            // ensures no input is consumed, `option` converts per-token failure into None,
            // and the `collect` keeps only the non-empty outcome — so `toks` itself fails
            // when no token matches, handing control to `rawTok` via `<+>` below
            val toks = traverse(t0, ts: _*)(p => option(lookAhead(atomic(p) <~> position.offset))).map(_.flatten).collect { case toks@(_::_) => toks }
            // this can only fail if either there is no input (which there must be), or there is a token at the front, in which case `rawTok` is not parsed anyway
            val rawTok = stringOfSome(traverse(t0, ts: _*)(notFollowedBy) *> item)
            // `<+>` tags the successful branch: Left for named tokens, Right for raw input
            toks <+> rawTok
        // no tokens were provided at all: the whole remaining input is one raw token
        case _ => stringOfSome(_ => true).map(Right(_))
    }

    /** If the extractor is successful in identifying tokens that can be parsed from
      * the residual input, this function will select ''one'' of them to report back.
      *
      * The default behaviour is to take the longest matched token (i.e. the one with
      * the largest paired position). In case of a tie, the first token is chosen:
      * this means that more specific tokens should be put sooner in the `tokens` list.
      *
      * @param matchedToks the list of tokens successfully parsed, along with the position
      *                    at the end of that parse (careful: this position starts back at
      *                    `(1, 1)`, ''not'' where the original parser left off!)
      * @return the chosen token and position pair
      * @note the `matchedToks` list is guaranteed to be non-empty
      * @since 4.0.0
      */
    // `maxBy` keeps the first maximal element, giving the documented tie-breaking order
    def selectToken(matchedToks: List[(String, Int)]): (String, Int) = matchedToks.maxBy(_._2)

    // Wraps the (user-overridable) selection into the `Token.Named` the builder reports.
    private final def selectTokenAndBuild(matchedToks: ::[(String, Int)]): Token = {
        val (name, offset) = selectToken(matchedToks)
        Token.Named(name, offset)
    }

    // Runs `makeParser` over the residual input and turns its result into a `Token`.
    // The irrefutable `Success` match is safe because `makeParser` cannot fail on
    // non-empty input (asserted below); `@unchecked` silences the exhaustivity warning.
    private final def extractToken(cs: Iterable[Char]): Token = {
        assert(cs.nonEmpty, "we promised that the input is non-empty!")
        val Success(rawOrToks) = makeParser.parse {
            cs match {
                // avoid an O(n) copy when the characters already wrap a String
                case cs: WrappedString => cs.toString
                case cs => cs.mkString
            }
        }: @unchecked
        rawOrToks.fold(selectTokenAndBuild, Token.Raw.apply)
    }

    /** If the parser failed during the parsing of a token, this function extracts the problematic
      * item from the remaining input.
      *
      * The default behaviour mimics [[parsley.errors.tokenextractors.SingleChar `SingleChar`]].
      *
      * @since 4.0.0
      */
    def extractItem(cs: Iterable[Char], @unused amountOfInputParserWanted: Int): Token = SingleChar.unexpectedToken(cs)

    /** @see [[parsley.errors.ErrorBuilder.unexpectedToken `ErrorBuilder.unexpectedToken`]] */
    override final def unexpectedToken(cs: Iterable[Char], amountOfInputParserWanted: Int, lexicalError: Boolean): Token = {
        // a lexical error means we are *inside* a token already, so re-tokenising
        // the input would be wrong — fall back to the simpler item extraction
        if (!lexicalError) extractToken(cs)
        // No lexical extraction should occur here!
        else extractItem(cs, amountOfInputParserWanted)
    }
}
/** This object contains helper functions useful for interacting with `LexToken`.
  * @since 4.0.0
  */
object LexToken {
    /** A convenient way to map a bunch of constant symbols to their names for
      * the generated error.
      * @since 4.0.0
      */
    def constantSymbols(ps: (Parsley[_], String)*): Seq[Parsley[String]] =
        ps.map { case (parser, name) => parser.as(name) }
}