-
Notifications
You must be signed in to change notification settings - Fork 275
/
LineTokenizer.scala
249 lines (223 loc) · 9.74 KB
/
LineTokenizer.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
package net.liftweb.markdown
/*
* Copyright 2013 WorldWide Conferencing, LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Based on https://github.com/chenkelmann/actuarius originally developed by
* Christoph Henkelmann http://henkelmann.eu/
*/
import scala.collection.mutable.{HashMap, ArrayBuffer}
import scala.language.postfixOps
import scala.util.parsing.combinator.Parsers
import scala.util.parsing.input.{Position, Reader}
import scala.xml
/**
 * A Reader that hands out whole Strings (lines) as its tokens.
 * The Tokenizer uses it so that every parse element is one complete line of input.
 */
case class LineReader private (val lines:Seq[String],
                               val lineCount:Int)
        extends Reader[String] {

  /** Sentinel handed out past the end of input; should never show up in real
   *  parse results, so it is a string that sticks out when debugging. */
  private def eofLine = "EOF"

  /** Starts reading the given lines at line number 1. */
  def this(ls:Seq[String]) = this(ls, 1)

  /** The current line, or the EOF sentinel once all lines are consumed. */
  def first = lines.headOption.getOrElse(eofLine)

  /** A reader positioned on the following line; at the end, returns itself. */
  def rest =
    if (atEnd) this
    else new LineReader(lines.tail, lineCount + 1)

  /** True once every line has been consumed. */
  def atEnd = lines.isEmpty

  /** Position information for error messages: the line counter, always column 1. */
  def pos = new Position {
    def line = lineCount
    def column = 1
    protected def lineContents = first
  }
}
/**
 * Chops the input into lines and turns those lines into line tokens.
 * Also takes care of preprocessing link definitions and xml blocks.
 */
class LineTokenizer() extends Parsers {

  /** The per-line parsers used to classify each individual line. */
  object lineParsers extends LineParsers

  /** We munch whole lines (OM NOM NOM): each parser element is one complete line. */
  type Elem = String

  /** Determines if xml blocks may be included verbatim.
   * If true, they are passed through, else they are escaped and turned into paragraphs.
   * Subclasses can override this to change the policy.
   */
  def allowXmlBlocks = true

  /**
   * Returns a parser based on the given line parser.
   * The resulting parser succeeds if the given line parser consumes the whole String
   * (parseAll on the current line); on success the reader advances by exactly one line,
   * on failure nothing is consumed and the line parser's message is reported.
   */
  def p[T](parser:lineParsers.Parser[T]):Parser[T] = Parser{in =>
    if (in.atEnd) {
      Failure("End of Input.", in)
    } else {
      lineParsers.parseAll(parser, in.first) match {
        case lineParsers.Success(t, _) => Success(t, in.rest)
        case n:lineParsers.NoSuccess => Failure(n.msg, in)
      }
    }
  }

  /** Returns the first char in the given string or a newline if the string is empty.
   * This is done to speed up header parsing. Used to speed up line tokenizing substantially
   * by using the first char in a line as lookahead for which parsers to even try.
   */
  def firstChar(line:String):Char = {
    if (line.length == 0) '\n' else line.charAt(0)
  }

  /** Finds the char in the given line that is the best indication of what kind of markdown line this is.
   * The "special" Markdown lines all start with up to three spaces. Those are skipped if present.
   * The first char after those (up to) three spaces is returned, or a newline if the line
   * holds nothing beyond its leading spaces.
   */
  def indicatorChar(line:String):Char = {
    var i = 0
    //skip the first three spaces, if present
    while (i < 3 && i < line.length && line.charAt(i) == ' ') i += 1
    //return the next char after the spaces or a newline if there are no more
    if (i==line.length) '\n'
    else line.charAt(i)
  }

  ////////////////////////
  //  Link definitions  //
  ////////////////////////

  /** Tries to parse an URL title from the next line if necessary.
   * The passed tuple is the result from a previous parser (a link definition start plus an
   * optional title found on the same line) and is used to decide how to continue parsing:
   * a present title finishes the definition immediately; otherwise the next line is
   * inspected and consumed only when it parses as a stand-alone title.
   */
  def maybeUrlInNextLine(prev:(LinkDefinitionStart, Option[String])):Parser[LinkDefinition] = prev match {
    case (lds, Some(title)) => success(lds.toLinkDefinition(Some(title)))
    case (lds, None) => Parser {in =>
      if (in.atEnd) {
        //no next line to look at: definition has no title
        Success(lds.toLinkDefinition(None), in)
      } else {
        lineParsers.parseAll(lineParsers.linkDefinitionTitle, in.first) match {
          case lineParsers.Success(title, _) => Success(lds.toLinkDefinition(Some(title)), in.rest)
          //next line is not a title: finish without one and consume nothing
          case _ => Success(lds.toLinkDefinition(None), in)
        }
      }
    }
  }

  /**
   * Parses a link definition: its start line, then optionally a title on the following line.
   */
  def linkDefinition:Parser[LinkDefinition] = p(lineParsers.linkDefinitionStart) into(maybeUrlInNextLine)

  /////////////////
  //  XML blocks //
  /////////////////

  /** The start of a verbatim XML chunk: any line starting directly with an XML element
   */
  def xmlChunkStart = p(lineParsers.xmlBlockStartLine)

  /** Parses any line that does not start with a closing XML element.
   */
  def notXmlChunkEnd = p(lineParsers.notXmlBlockEndLine)

  /** Parses a line beginning with a closing XML tag.
   */
  def xmlChunkEnd = p(lineParsers.xmlBlockEndLine)

  /** Very dumb parser for XML chunks: a start line, any number of lines that do not
   * open with a closing tag, then a line that does. Does not track tag nesting.
   */
  def xmlChunk = xmlChunkStart ~ (notXmlChunkEnd*) ~ xmlChunkEnd ^^ {
    case s ~ ms ~ e => new XmlChunk(s + "\n" + ms.mkString("\n") + "\n" + e + "\n")
  }

  /** Parses Markdown Lines. Always succeeds.
   * Dispatches on (firstChar, indicatorChar) as a cheap two-char lookahead to pick
   * the one specialized line parser worth trying for this line.
   */
  def lineToken = Parser{ in =>
    if (in.atEnd) {
      Failure("End of Input.", in)
    } else {
      val line = in.first
      (firstChar(line), indicatorChar(line)) match {
        //'=' and '-' in column 0 may close a setext header, so they are checked first
        case ('=', _) => p(lineParsers.setextHeader1)(in)
        case ('-', _) => p(lineParsers.setext2OrRulerOrUItem)(in)
        case ('#', _) => p(lineParsers.atxHeader)(in)
        case (_, '-') => p(lineParsers.rulerOrUItem)(in)
        case (_, '*') => p(lineParsers.rulerOrUItem)(in)
        case (_, '+') => p(lineParsers.uItemStartLine)(in)
        case (_, '>') => p(lineParsers.blockquoteLine)(in)
        case (_, n) if (n >= '0' && n <= '9') => p(lineParsers.oItemStartLine)(in)
        case (_, ' ') => p(lineParsers.emptyOrCode)(in)
        case (_, '\t')=> p(lineParsers.emptyOrCode)(in)
        //indicatorChar returns '\n' only for (effectively) blank lines
        case (_, '\n')=> p(lineParsers.emptyLine)(in)
        case (_, '`') => p(lineParsers.fencedCodeStartOrEnd)(in)
        case _ => p(lineParsers.otherLine)(in)
      }
    }
  } | p(lineParsers.otherLine) //this makes sure every line is consumed, even if our guess was no good

  /** Parses link definitions and verbatim xml blocks.
   * Fails (without consuming input) on any line that starts neither; tokens/lineToken
   * then handles that line instead.
   */
  def preprocessToken = Parser{ in =>
    if (in.atEnd) {
      Failure("End of Input.", in)
    } else {
      val line = in.first
      (firstChar(line), indicatorChar(line)) match {
        //link definitions have absolute precedence
        case (_, '[') => linkDefinition(in)
        //then filter out xml blocks if allowed
        case ('<', _) if (allowXmlBlocks) => xmlChunk(in)
        //no token for preprocessing
        case _ => Failure("No preprocessing token.", in)
      }
    }
  }

  /** Parses tokens that may occur inside a block. Works like the normal token parser except that
   * it does not check for link definitions and verbatim XML. The given lookup table of link
   * definitions is carried into the resulting reader unchanged.
   */
  def innerTokens(lookup:Map[String, LinkDefinition]):Parser[MarkdownLineReader] = phrase(lineToken *) ^^ {
    case ts => new MarkdownLineReader(ts, lookup)
  }

  /** Parses first level line tokens, i.e. Markdown lines, XML chunks and link definitions.
   * Link definitions are split off into a lookup table keyed by their id; everything else
   * stays in document order as the line stream of the resulting reader.
   */
  def tokens:Parser[MarkdownLineReader] = phrase((preprocessToken | lineToken) *) ^^ { case ts =>
    val lines = new ArrayBuffer[MarkdownLine]()
    val lookup = new HashMap[String, LinkDefinition]()
    for (t <- ts) { t match {
      case ld:LinkDefinition => lookup(ld.id) = ld
      case ml:MarkdownLine => lines.append(ml)
    } }
    new MarkdownLineReader(lines.toList, lookup.toMap)
  }

  /** Simple preprocessing: split the input at each newline. These whole lines are then fed to
   * the actual Tokenizer. Windows line endings are normalized by stripping a trailing '\r'.
   * NOTE: String.split drops trailing empty strings, so trailing blank lines of the
   * document do not produce tokens.
   */
  def splitLines(s:String):List[String] = {
    //removes a trailing '\r' left over from a "\r\n" Windows line ending
    def chopWindoze(line:String) = {
      if (line.endsWith("\r")) {
        line.substring(0, line.length-1)
      } else {
        line
      }
    }
    s.split('\n').map(chopWindoze(_)).toList
  }

  /** Turns a list of inner lines (the payloads of the lines making up the block)
   * into line tokens. Does not check for XML chunks or link definitions.
   * Throws IllegalStateException on failure, which indicates a bug since lineToken
   * is designed to always succeed.
   */
  def innerTokenize(lines:List[String], lookup:Map[String, LinkDefinition])=
    innerTokens(lookup)(new LineReader(lines)) match {
      case Success(reader, _) => reader
      case n:NoSuccess =>
        throw new IllegalStateException("Inner line Tokenizing failed. This is a bug. Message was: " + n.msg)
    }

  /** Tokenizes a whole Markdown document given as a single String.
   */
  def tokenize(s:String):MarkdownLineReader = tokenize(splitLines(s))

  /** Tokenizes a preprocessed Markdown document (already split into lines).
   */
  def tokenize(lines:List[String]):MarkdownLineReader = tokenize(new LineReader(lines))

  /** Tokenizes preprocessed lines read from a line reader.
   * Throws IllegalStateException on failure, which indicates a bug since every line
   * should be consumed by some parser.
   */
  def tokenize(lines:Reader[String]):MarkdownLineReader = tokens(lines) match {
    case Success(reader, _) => reader
    case n:NoSuccess =>
      throw new IllegalStateException("Tokenizing failed. This is a bug. Message was: " + n.msg)
  }
}