-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarkdownParser.scala
488 lines (406 loc) · 16.5 KB
/
markdownParser.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
package parser
import ast.{MarkdownDoc, Uri}
import parser.Parser.{isNot, list1}
import parser.markdownParser.inlineParser
import scala.annotation.tailrec
/**
* Parse Markdown into a parse tree
* Created by robertk on 13/05/17.
*/
object markdownParser {
import ast._
import Parser._
import Li._
// Characters that stop a run of raw text in rawHtmlParser: a newline, or a character that
// opens one of the inline elements ('*', '_', '~', '[', '`').
val inLineStartTokens: List[Char] = List('\n', '*', '_', '~', '[', '`')
// Parse a String into an MDDoc: the list of parsed Markdown nodes plus a small symbol table
// (refKey -> uri) that can later be used to fix up reference links.
def markdownDocumentParser: Parser[MarkdownDoc] =
  Parser { input =>
    import Parser._
    parseAll(markdownASTParser, input) match {
      case Left(errorMessage) => ParseKo(errorMessage)
      case Right(nodes) =>
        // Second walk over the parse result: every RefLinkUri contributes one symbol-table
        // entry; when a key occurs more than once the last occurrence wins.
        // TODO: maybe let parseAll collect these while parsing instead of walking the list twice.
        val symbols: Map[String, Uri] =
          nodes.collect { case definition: RefLinkUri => definition.refKey -> definition.uri }.toMap
        ParseOk("", MDDoc(nodes, symbols))
    }
  }
// Applies markdownASTParser through list1 and returns the collected nodes unchanged.
def markdownParser: Parser[List[Markdown]] =
  list1(markdownASTParser).map(nodes => nodes)
// Parser for a single Markdown node: the block-level alternatives combined with the inline ones.
// NOTE(review): `|||` is defined on Parser outside this file; the ordering comments elsewhere in
// this object suggest the left alternative is tried first — confirm.
def markdownASTParser: Parser[Markdown] = blockParser ||| inlineParser

// All block-level parsers of this object, combined with `|||` in this order.
def blockParser: Parser[Markdown] =
  paragraphParser |||
    brParser |||
    headerParser |||
    blockquoteParser |||
    blockCodeParser |||
    listParser |||
    hardwrapParser

// All inline parsers of this object, combined with `|||` in this order; rawHtmlParser is last.
def inlineParser: Parser[Markdown] =
  emphasisParser |||
    imageParser |||
    hyperlinkParser |||
    inlineCodeParser |||
    rawHtmlParser
// Images are tested before links (see inlineParser) because the syntax is the same apart from
// the leading '!'.
def imageParser: Parser[Markdown] =
  imgParser |||
    refImgParser |||
    refLinkUriParser

// Inline link, reference link, or a reference-link definition.
def hyperlinkParser: Parser[Markdown] =
  linkParser |||
    refLinkParser |||
    refLinkUriParser
/**
 * Parse Markdown headers, e.g.
 *
 *   # this is a h1 header
 *   ### this is a h3 header
 *
 * The line is read as `list(space)`, `list1(is('#'))`, `list(space)`, `list1(inlineParser)` and
 * an optional trailing newline. The number of '#' characters selects the node type: one to six
 * hashes give H1 to H6, anything else gives H7.
 *
 * NOTE(review): the earlier description of this parser said h6 is the smallest level
 * ("10 '#'s will be <h6/>"), but the code produces H7 for seven or more hashes — confirm which
 * of the two is intended.
 */
def headerParser: Parser[Markdown] =
  for {
    _       <- list(space)
    hashes  <- list1(is('#'))
    _       <- list(space)
    content <- list1(inlineParser)
    _       <- is('\n') ||| value('\n') // consume a newline if there is one, otherwise just succeed
  } yield hashes.size match {
    case 1 => H1(content)
    case 2 => H2(content)
    case 3 => H3(content)
    case 4 => H4(content)
    case 5 => H5(content)
    case 6 => H6(content)
    case _ => H7(content) // seven or more '#'
  }
/**
 * Block quotes can cover several lines but must be presented as one block quote, e.g.
 *
 *   > this is line1
 *   > this is line2
 *
 * should produce
 *
 *   <blockquote>this is line1<br/>this is line2</blockquote>
 *
 * This parser handles a single line of such a quote: `list(space)`, a '>', `list(space)`,
 * `list1(inlineParser)` and an optional trailing newline.
 */
private def simpleblockquoteParser: Parser[SimpleBlockquote] =
  for {
    _      <- list(space)
    _      <- is('>')
    _      <- list(space)
    quoted <- list1(inlineParser)
    _      <- is('\n') ||| value('\n') // consume a newline if there is one, otherwise just succeed
  } yield SimpleBlockquote(quoted)
/**
 * Runs simpleblockquoteParser through list1 to pick up all consecutive '> text' lines, joins
 * the contents of those lines with a Br between them and wraps the result in one Blockquote.
 */
def blockquoteParser: Parser[Markdown] =
  Parser { input =>
    list1(simpleblockquoteParser).run(input) match {
      case ParseKo(reason) =>
        ParseKo(s"Not a blockquote, expected '> text', error message: $reason")
      case ParseOk(remaining, quoteLines) =>
        // Put a Br in front of every line's content, then drop the unwanted leading Br.
        val withLeadingBr = quoteLines.foldRight(List.empty[Markdown]) { (quoteLine, acc) =>
          Br :: quoteLine.value ++ acc
        }
        ParseOk(remaining, Blockquote(withLeadingBr.tail))
    }
  }
// Parses one line of an indented code block: four `space` matches, then every character that is
// not a newline (kept verbatim as the code text), then an optional trailing newline.
def simpleblockCodeParser: Parser[String] =
  for {
    _         <- space
    _         <- space
    _         <- space
    _         <- space
    codeChars <- list(isNot('\n'))
    _         <- is('\n') ||| value('\n') // consume a newline if there is one, otherwise just succeed
  } yield codeChars.mkString
// Collects consecutive indented code lines (list1 of simpleblockCodeParser) and joins them with
// "\n" into a single BlockCode; a failure is re-reported with a BlockCode-specific message.
def blockCodeParser: Parser[Markdown] =
  Parser { input =>
    list1(simpleblockCodeParser).run(input) match {
      case ParseKo(reason) =>
        ParseKo(s"Not a BlockCode, expected '4 x spaces then text', error message: $reason")
      case ParseOk(remaining, codeLines) =>
        ParseOk(remaining, BlockCode(codeLines.mkString("\n")))
    }
  }
// Inline code: a backtick, `list1` of non-backtick characters, and a closing backtick.
def inlineCodeParser: Parser[Markdown] =
  for {
    _         <- is('`')
    codeChars <- list1(isNot('`'))
    _         <- is('`')
  } yield InlineCode(codeChars.mkString)
// Parse a line to see if it is an unordered list item: leading spaces (their count becomes the
// item's indent), one of '*', '+' or '-', `list1(space)`, the inline content and an optional
// trailing newline.
def uLIparser: Parser[Li] =
  for {
    leading <- list(space)
    _       <- isIn(List('*', '+', '-'))
    _       <- list1(space)
    content <- list1(inlineParser)
    _       <- is('\n') ||| value('\n') // consume a newline if there is one, otherwise just succeed
  } yield uli(content, leading.size)
// Parse a line to see if it is an ordered list item: leading spaces (their count becomes the
// item's indent), `list1(digit)`, a '.', `list1(space)`, the inline content and an optional
// trailing newline. The parsed digits are discarded.
def oLIparser: Parser[Li] =
  for {
    leading <- list(space)
    _       <- list1(digit)
    _       <- is('.')
    _       <- list1(space)
    content <- list1(inlineParser)
    _       <- is('\n') ||| value('\n') // consume a newline if there is one, otherwise just succeed
  } yield oli(content, leading.size)
// Parse consecutive lines as long as each one is some kind of li (unordered or ordered);
// both kinds may be mixed within the same run.
def liParser: Parser[List[Li]] =
  list1(uLIparser ||| oLIparser).map(items => items)
// Create a forest of rose trees of ul/ol items, because lists of either type can be nested.
// The flat List[Li] produced by liParser is folded into trees using each item's indent and
// kind (ordered / unordered); the result is wrapped in an MdForest.
def listParser: Parser[Markdown] = {
  import Tree._

  // Converts a List[Li] into a Parser; used in the flatMap at the bottom.
  def rose(lis: List[Li]): Parser[Markdown] = {

    // tail        : list items still to be placed
    // parentStack : trees we have descended out of, innermost first, so we can climb back up
    // listForest  : trees built so far, most recent first (reversed by the caller)
    @tailrec
    def rec(tail: List[Li], parentStack: List[Tree[Li]], listForest: List[Tree[Li]]): List[Tree[Li]] = {
      import Li._
      (tail, parentStack) match {
        // base case: nothing left to place and no open parents, return the forest
        case (Nil, Nil) => listForest

        // Items are exhausted but a parent is still open: pop it, attach the current forest
        // head to it as a branch and recurse. The forest is not expected to be empty while a
        // parent exists, but an empty forest is handled explicitly instead of calling `.tail`
        // on it (which would throw).
        case (Nil, p :: ps) =>
          listForest match {
            case current :: others => rec(tail, ps, (p ++ current) :: others)
            case Nil               => rec(tail, ps, p :: Nil)
          }

        // attach the next li to the correct place and recurse down the list
        case (lii :: rest, _) =>
          (lii, listForest) match {
            // First tree: wrap the li in a branch (rather than a bare leaf) so the renderer
            // can surround it with the correct list type.
            case (fresh @ Oli(_, indentf), Nil) =>
              rec(rest, parentStack, branch(List(oleaf(fresh)), indentf) :: listForest)
            case (fresh @ Uli(_, indentf), Nil) =>
              rec(rest, parentStack, branch(List(uleaf(fresh)), indentf) :: listForest)

            // Same kind and same nesting: add the li as a leaf to the current forest head.
            case (fresh @ Oli(_, indentf), t :: restForest)
                if indentf == t.depth && t.firstValue.getOrElse(Uli(Nil, indentf)).isOrdered =>
              val nodeTree = t ++ oleaf(fresh)
              rec(rest, parentStack, nodeTree :: restForest)
            case (fresh @ Uli(_, indentf), t :: restForest)
                if indentf == t.depth && !t.firstValue.getOrElse(Oli(Nil, indentf)).isOrdered =>
              val nodeTree = t ++ uleaf(fresh)
              rec(rest, parentStack, nodeTree :: restForest)

            // The li is nested more deeply than the current tree: remember the current tree
            // as a parent and start a new tree (of the li's own kind) as the forest head.
            case (fresh, t :: restForest) if fresh.indent > t.depth =>
              val newHeadTree = fresh match {
                case f @ Oli(_, indentF) => branch(List(oleaf(f)), indentF)
                case f @ Uli(_, indentF) => branch(List(uleaf(f)), indentF)
              }
              rec(rest, t :: parentStack, newHeadTree :: restForest)

            // End of a nested list: climb one level without consuming the li, so the next
            // iteration deals with it again at the outer level.
            case (fresh, t :: restForest) if fresh.indent < t.depth =>
              parentStack match {
                // Indented but no parent: the user started indented. Create the implied outer
                // list at the li's indent with the nested tree as its first child.
                case Nil =>
                  val newForestTree: Tree[Li] = fresh match {
                    case Oli(_, indentF) => branch(List(t), indentF)
                    case Uli(_, indentF) => branch(List(t), indentF)
                  }
                  rec(tail, parentStack, newForestTree :: restForest)
                // Pop the parent and attach the exhausted tree to it as a branch.
                case parent :: olderParents =>
                  rec(tail, olderParents, (parent ++ t) :: restForest)
              }

            // Same level as the current tree but a different kind: this must be a new list,
            // so start a new tree in the forest.
            case (fresh, forest) =>
              val newForestTree = fresh match {
                case o @ Oli(_, indentF) => branch(List(oleaf(o)), indentF)
                case u @ Uli(_, indentF) => branch(List(uleaf(u)), indentF)
              }
              rec(rest, parentStack, newForestTree :: forest)
          }
      }
    }

    // Kick off the list-walking recursion.
    // PRECONDITION: lis is non-empty because liParser (list1) succeeded; a non-empty list type
    // would express this better than List.
    lis match {
      case Nil    => failed("no List[li] to parse") // not expected to be reached
      case _ :: _ => value(MdForest(rec(lis, Nil, Nil).reverse))
    }
  }

  // flatMap the liParser with rose to get a parser yielding the MdForest
  liParser flatMap rose
}
// A paragraph break: one newline followed by `list1` of further newlines.
def paragraphParser: Parser[Markdown] =
  is('\n').flatMap(_ => list1(is('\n')).map(_ => Paragraph))
/* -------------------------------------- */
/* ------- inline markdown ----------- */
/* ------ all have an end marker, then consume whitespace ---------- */
/* -------------------------------------- */
// Inline link of the form [text](url title).
// The text between the brackets is handed to inlineParser via `usemap`; the url is the run of
// characters that are neither whitespace nor ')', validated by parseHelper.buildURL; after an
// optional `space`, any remaining characters before ')' become the optional title.
def linkParser: Parser[Markdown] =
  for {
    _      <- is('[')
    text   <- inlineParser.usemap[List[Char]](list1(isNot(']')))(_.mkString)
    _      <- is(']')
    _      <- is('(')
    target <- list1(satisfy(c => !(c.isWhitespace || c == ')'))).flatMap(parseHelper.buildURL)
    _      <- opt(space)
    title  <- opt(list1(isNot(')')).map(_.mkString))
    _      <- is(')')
  } yield Link(text, target, title)
// list1(satisfy(c => !c.isWhitespace && c != ')' ))
// Parses the "[ref]" part of a reference link or image and returns the text between the
// brackets (`list1` of characters other than ']').
private def refLinkRefPartParser: Parser[String] =
  for {
    _       <- is('[')
    reftext <- list1(isNot(']'))
    _       <- is(']')
  } yield reftext.mkString // method-call syntax; postfix `reftext mkString` needs the postfixOps feature
/**
 * Parse the definition of a referred-to link, e.g.
 *
 *   [2] http://foo.ar.com
 *
 * The text between the brackets becomes the key; after `list(space)` the `list(visible)`
 * characters are validated by parseHelper.buildURL (which fails on an empty url).
 *
 * @return a parser yielding a RefLinkUri(key, uri)
 */
def refLinkUriParser: Parser[Markdown] =
  for {
    _        <- is('[')
    keyChars <- list1(isNot(']'))
    _        <- is(']')
    _        <- list(space)
    target   <- list(visible).flatMap(parseHelper.buildURL)
  } yield RefLinkUri(keyChars.mkString, target)
// Reference link: [text] optionally followed by [ref]. When the [ref] part is missing, the
// string form of the link text is used as the reference key.
// TODO: instead of toString use a Show typeclass, so that RawHtml can show as its value.
def refLinkParser: Parser[Markdown] =
  for {
    _      <- is('[')
    text   <- inlineParser.usemap[List[Char]](list1(isNot(']')))(_.mkString)
    _      <- is(']')
    refKey <- opt(refLinkRefPartParser)
  } yield RefLink(text, refKey.getOrElse(text.toString))
// Inline image of the form ![alt](uri title).
// The alt text is kept as a plain String; the uri is the run of characters that are neither
// whitespace nor ')', validated by parseHelper.buildURL; after an optional `space`, any
// remaining characters before ')' become the optional title.
def imgParser: Parser[Markdown] =
  for {
    _      <- is('!')
    _      <- is('[')
    alt    <- list1(isNot(']')).map(_.mkString)
    _      <- is(']')
    _      <- is('(')
    source <- list1(satisfy(c => !(c.isWhitespace || c == ')'))).flatMap(parseHelper.buildURL)
    _      <- opt(space)
    title  <- opt(list1(isNot(')')).map(_.mkString))
    _      <- is(')')
  } yield Img(alt, source, title)
// Reference image: ![alt] optionally followed by [ref]. When the [ref] part is missing, the
// alt text itself is used as the reference key.
def refImgParser: Parser[Markdown] =
  for {
    _      <- is('!')
    _      <- is('[')
    alt    <- list1(isNot(']')).map(_.mkString)
    _      <- is(']')
    refKey <- opt(refLinkRefPartParser)
  } yield RefImg(alt, refKey.getOrElse(alt))
/**
 * Raw HTML is just an inline String that is kept as-is. Characters are read until one of
 * inLineStartTokens (the start of an inline element, or the end of the line) is encountered.
 */
def rawHtmlParser: Parser[Markdown] =
  list1(isNotIn(inLineStartTokens)).map(chars => RawHtml(chars.mkString))
// Line break: a `space`, then `spaces1`, then a newline.
def brParser: Parser[Markdown] =
  for {
    _ <- space
    _ <- spaces1
    _ <- is('\n')
  } yield Br
// A single newline on its own is a Hardwrap.
def hardwrapParser: Parser[Markdown] =
  is('\n').map(_ => Hardwrap)
// Strikethrough, bold or italic. italicParser deliberately comes after boldParser so that
// "**x" is understood as bold rather than as italic.
def emphasisParser: Parser[Markdown] =
  strikethroughParser |||
    boldParser |||
    italicParser
// Bold written with asterisks: "**", `list1(inlineParser)`, "**".
def boldParser1: Parser[Markdown] =
  for {
    _       <- is('*')
    _       <- is('*')
    content <- list1(inlineParser)
    _       <- is('*')
    _       <- is('*')
  } yield Bold(content)

// Bold written with underscores: "__", `list1(inlineParser)`, "__".
def boldParser2: Parser[Markdown] =
  for {
    _       <- is('_')
    _       <- is('_')
    content <- list1(inlineParser)
    _       <- is('_')
    _       <- is('_')
  } yield Bold(content)

// Either spelling of bold; the asterisk form is listed first.
def boldParser: Parser[Markdown] =
  boldParser1 |||
    boldParser2
// Italic written with asterisks: "*", `list1(inlineParser)`, "*".
def italicParser1: Parser[Markdown] =
  for {
    _       <- is('*')
    content <- list1(inlineParser)
    _       <- is('*')
  } yield Italic(content)

// Italic written with underscores: "_", `list1(inlineParser)`, "_".
def italicParser2: Parser[Markdown] =
  for {
    _       <- is('_')
    content <- list1(inlineParser)
    _       <- is('_')
  } yield Italic(content)

// Either spelling of italic; the asterisk form is listed first.
def italicParser: Parser[Markdown] =
  italicParser1 |||
    italicParser2
// Strikethrough: "~~", `list1(inlineParser)`, "~~".
def strikethroughParser: Parser[Markdown] =
  for {
    _       <- is('~')
    _       <- is('~')
    content <- list1(inlineParser)
    _       <- is('~')
    _       <- is('~')
  } yield Strikethrough(content)
/*
* thing
* thing
1. thing
2. foo
- bar
* thing
*/
// val linkParser: Parser[Markdown] = for {
// _ <-
// }
}
object parseHelper {
  import Parser._
  import java.net.{URI => JURI}
  import scala.util.control.NonFatal

  /**
   * Builds a Parser[Uri]: instantiates the Uri case class, but first tests the uri string using
   * the Java URI constructor. If that constructor does not throw, the text is accepted.
   *
   * @param attemptURL the characters of the candidate url
   * @return a parser built with `value(Uri(text))` when the text is non-empty and accepted by
   *         java.net.URI; otherwise a parser built with `failed`, carrying either the
   *         empty-url message or the message of the exception that was thrown
   */
  def buildURL(attemptURL: List[Char]): Parser[Uri] =
    if (attemptURL.isEmpty) failed("The URL can not be an empty string")
    else {
      val theUrlText = attemptURL.mkString
      try {
        new JURI(theUrlText) // constructed only for its validation; the instance is discarded
        value(Uri(theUrlText))
      } catch {
        // NonFatal instead of Throwable, so that fatal JVM errors (e.g. OutOfMemoryError)
        // propagate instead of being reported as a parse failure.
        case NonFatal(e) => failed(e.getMessage)
      }
    }
}