-
Notifications
You must be signed in to change notification settings - Fork 0
/
lex.cs
126 lines (114 loc) · 3.96 KB
/
lex.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/*
* lex.cs:
*
* Quick and dirty lexer using regular expressions. There's probably a
* better way, but I wasn't much interested in writing a lexer.
*/
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
namespace SaturnValley.SharpF
{
// Classification tags for lexemes produced by the lexer and consumed
// by the parser.
enum TokenType
{
// Spaces/tabs and ";"-to-end-of-line comments; matched by the lexer
// but filtered out before tokens reach the parser.
Whitespace,
// Scheme symbols, plus the special identifiers "+", "-" and "...".
Identifier,
// "#t" / "#f" (case-insensitive).
Boolean,
// Optionally negative decimal integer.
Integer,
// Fraction of the form "<int>/<digits>".
Rational,
// NOTE(review): no rule in this file's vocabulary table produces
// Character — presumably lexed or synthesized elsewhere; confirm.
Character,
// Double-quoted string literal.
String,
// "(".
Open,
// ")".
Close,
// NOTE(review): SharpOpen, Quasiquote, Unquote, InterpUnquote and Dot
// have no regex in the vocabulary table below — apparently reserved
// for future or external use; confirm before relying on them.
SharpOpen,
// "'" (quote shorthand).
Quote,
Quasiquote,
Unquote,
InterpUnquote,
Dot
}
// A single lexeme: its classification together with the exact source
// text that matched it.
class Token
{
    public TokenType type;
    public string text;

    public Token(TokenType t, string tx)
    {
        this.type = t;
        this.text = tx;
    }
}
// One entry in the lexer's vocabulary table: a token classification
// paired with the regular expression that recognizes it.
class TokenData
{
    public TokenType type;
    public Regex regex;

    public TokenData(TokenType t, Regex r)
    {
        this.regex = r;
        this.type = t;
    }
}
internal class Lexer
{
    // Characters that may begin a Scheme identifier.
    const string symbolChars = @"!$%&*/:<=>?^_~A-Za-z";
    // Additional characters allowed after the first identifier character.
    const string subseqChars = @"+-.@0-9";
    // Lookahead requiring the token to be followed by a delimiter
    // (whitespace, parenthesis, quote, comment start) or end of line.
    const string wordBoundary = @"(?=([ \t()';]|$))";

    public static readonly TokenData[] Tokens =
    {
        // The vocabulary definition. Every regex must start with the
        // contiguous-match anchor \G: Regex.Match(line, pos) then only
        // matches exactly at pos, which removes the per-attempt
        // line.Substring(pos) allocation the old "^"-anchored patterns
        // required (that was accidentally O(n^2) per line).
        //
        // Order matters: e.g. Identifier must be tried before Integer
        // so that a bare "-" or "+" lexes as an identifier.
        new TokenData(TokenType.Whitespace,
                      new Regex(@"\G([ \t]+|;.*)")),
        new TokenData(TokenType.Identifier,
                      new Regex(@"\G([" + symbolChars + @"]" +
                                @"[" + symbolChars + subseqChars + @"]*" +
                                @"|\+|-|\.\.\.)" + wordBoundary)),
        new TokenData(TokenType.Integer,
                      new Regex(@"\G-?[0-9]+" + wordBoundary)),
        new TokenData(TokenType.Rational,
                      new Regex(@"\G-?[0-9]+/[0-9]+" + wordBoundary)),
        new TokenData(TokenType.Open,
                      new Regex(@"\G\(")),
        new TokenData(TokenType.Close,
                      new Regex(@"\G\)")),
        new TokenData(TokenType.Quote,
                      new Regex(@"\G'")),
        // Accept backslash escapes (e.g. \" and \\) inside string
        // literals; the previous pattern terminated the literal at the
        // first double quote, mislexing any string containing \".
        // Strings without escapes lex exactly as before.
        new TokenData(TokenType.String,
                      new Regex(@"\G""(\\.|[^""\\])*""")),
        // NOTE(review): no wordBoundary here, so "#tx" lexes as the
        // boolean "#t" followed by the identifier "x" — confirm that is
        // intended before tightening it.
        new TokenData(TokenType.Boolean,
                      new Regex(@"\G#[TtFf]"))
    };

    // Generates the token stream consumed by the parser, one Token per
    // lexeme, with whitespace and comments filtered out.
    //
    // We use yield return so that execution flops back and forth
    // between Lex and Parse without anyone having to explicitly juggle
    // state — "yield" as in yielding the right of way, not crop yield.
    //
    // Throws TokenException (carrying the unconsumed tail of the line)
    // when no vocabulary regex matches at the current position; the
    // throw surfaces during enumeration, as with the original.
    //
    // Note the goto where break would almost certainly work: with this
    // many nested loops, goto makes the target explicit and survives
    // refactoring.
    public static IEnumerable<Token> Lex(TextReader sr)
    {
        string line;
        while (null != (line = sr.ReadLine()))
        {
            int pos = 0;
            while (pos < line.Length)
            {
                foreach (TokenData td in Tokens)
                {
                    // \G-anchored, so this can only match at pos.
                    Match m = td.regex.Match(line, pos);
                    if (m.Success)
                    {
                        if (td.type != TokenType.Whitespace)
                            yield return new Token(td.type, m.Value);
                        pos += m.Length;
                        goto okay;
                    }
                }
                throw new TokenException(line.Substring(pos));
            okay: ;
            }
        }
    }
}
}