-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.c
204 lines (187 loc) · 5.21 KB
/
parser.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/*
* parser.c
*
* Converts textual input to a list of tokens to be processed by the macro
* processor.
*/
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include "parser.h"
/*
 * Forward declarations for helpers that are defined further down in this
 * file, so that parse() can call them before their definitions appear.
 */
value_t number(const char *word);
int next_word(FILE *fd, char *word, int break_on_newline);
/*
 * ~~ a brief interlude about the parser ~~
 *
 * parse() reads whitespace-separated words from a stdio stream (a file,
 * stdin, whatever) and turns them into a list of tokens.
 *
 *   fd                stream the words are read from
 *   scope             address of the dict for the current scope. THIS DICT
 *                     WILL PROBABLY BE MODIFIED: every word that is neither
 *                     already in the dict nor a number is added to it, and
 *                     *scope is updated to whatever dict_add() returns.
 *   break_on_newline  handed straight to next_word(); when non-zero, a
 *                     newline ends the input just like EOF does
 *
 * Here is the parser algorithm:
 *   1. Get the next word. If there is no next word, end.
 *   2. Look up the word in the dictionary.
 *      a. if it's in the dictionary, push a symbol wrapping that entry
 *         onto the output list
 *      b. if it's not in the dictionary, pass it to number().
 *         1. if it's a number, push it onto the output list as a literal
 *         2. if it's not a number, add the word to the dictionary with a
 *            null value and push a symbol wrapping the result of that
 *            dict_add() call
 *   3. Go to 1
 *
 * Words have a maximum length of 255 bytes. UTF-8 characters will take more
 * space. Words longer than 255 bytes will be truncated.
 *
 * Returns the list built by successive list_push() calls, or NULL when no
 * word was read. Per the original notes this is a REVERSED program that has
 * to be passed through the macro processor before it can be run.
 */
cons_t *parse(FILE *fd, dict_t **scope, int break_on_newline) {
    char word[256];
    cons_t *prog = NULL;

    for (;;) {
        dict_t *known;
        value_t literal;

        if (!next_word(fd, word, break_on_newline)) {
            break; /* input exhausted */
        }

        /* a word the scope already knows becomes a symbol reference */
        known = dict_get(*scope, word);
        if (known != NULL) {
            list_push(&prog, wrap_symbol(known));
            continue;
        }

        /* otherwise it is either a numeric literal or a brand-new symbol */
        literal = number(word);
        if (null_p(literal)) {
            *scope = dict_add(*scope, word, null());
            list_push(&prog, wrap_symbol(*scope));
        } else {
            list_push(&prog, literal);
        }
    }

    return prog;
}
/*
 * Converts a single character to its digit value in the given radix.
 *
 *   c      character to convert; '0'-'9', 'a'-'f' and 'A'-'F' are recognised
 *   radix  number base; only values strictly below radix count as digits
 *
 * Returns the digit value (0 .. radix-1), or -1 when c is not a digit in
 * that radix.
 */
int char2digit(char c, int radix) {
    /*
     * <ctype.h> functions require an argument representable as unsigned
     * char (or EOF); passing a negative plain char is undefined behaviour.
     */
    const unsigned char uc = (unsigned char)c;
    int val = -1;

    if (isdigit(uc)) {
        val = uc - '0';
    } else if ('a' <= c && c <= 'f') {
        val = 10 + (c - 'a');
    } else if ('A' <= c && c <= 'F') {
        val = 10 + (c - 'A');
    }

    /* reject unrecognised characters and digits too large for the radix */
    if (val >= 0 && val < radix) {
        return val;
    }
    return -1;
}
/*
 * States of the scanner in number(). Each one records what has been
 * consumed from the word so far.
 */
enum state {
NUM_ST_BEGIN, /* nothing eaten yet */
NUM_ST_SIGN, /* a leading '+' or '-' eaten (further signs invalid) */
NUM_ST_ZERO, /* a leading zero eaten, after an optional sign; a radix letter may follow */
NUM_ST_RADIX, /* radix prefix letter eaten; radix is now set */
NUM_ST_DIGITS /* at least one digit eaten. more non-digit chars invalid */
};
/*
 * Tries to parse a word as a number.
 * At the moment it only handles integers.
 *
 * if word is a valid integer, return a numeric value
 * if word is not a valid number, return a null value
 *
 * integer format:
 *   [+-]?(0[XxBbDdOo])?<digit>+
 * radices:
 *   x = hex (digits 0-9, A-F, a-f)
 *   b = binary (valid 0, 1)
 *   d = decimal (valid 0-9)
 *   o = octal (valid 0-7)
 * Without a prefix the digits are read in default_radix.
 *
 * Notes:
 *  - A word with no digits at all ("", "+", "-", "0x") is NOT a number.
 *  - The letter directly after a leading zero is always tried as a radix
 *    prefix first, so "0b1" is binary 1 even when 'b' would be a valid
 *    digit in the default radix.
 *  - A word whose magnitude does not fit in a long is rejected rather than
 *    overflowing (signed overflow is undefined behaviour).
 */
value_t number(const char *word) {
    long acc = 0;
    int sign = 1, radix = default_radix, digit;
    unsigned long k = 0;
    char c;
    enum state st = NUM_ST_BEGIN;

    while ((c = word[k++]) != '\0') {
        switch (st) {
        case NUM_ST_BEGIN:
            if (c == '+' || c == '-') {
                sign = (c == '-') ? -1 : 1;
                st = NUM_ST_SIGN;
                break;
            }
            /* no sign: the first character is handled as if one was eaten */
            /* fallthrough */
        case NUM_ST_SIGN:
            if (c == '0') {
                st = NUM_ST_ZERO;
            } else if ((digit = char2digit(c, radix)) != -1) {
                acc = digit; /* acc was zero */
                st = NUM_ST_DIGITS;
            } else {
                return null();
            }
            break;

        case NUM_ST_ZERO:
            if (c == 'x' || c == 'X') {
                radix = 16;
                st = NUM_ST_RADIX;
            } else if (c == 'b' || c == 'B') {
                radix = 2;
                st = NUM_ST_RADIX;
            } else if (c == 'd' || c == 'D') {
                radix = 10;
                st = NUM_ST_RADIX;
            } else if (c == 'o' || c == 'O') {
                radix = 8;
                st = NUM_ST_RADIX;
            } else if ((digit = char2digit(c, radix)) != -1) {
                acc = digit; /* acc was zero */
                st = NUM_ST_DIGITS;
            } else {
                return null();
            }
            break;

        case NUM_ST_RADIX:
            /* first character after the prefix must be a digit */
            st = NUM_ST_DIGITS;
            /* fallthrough */
        case NUM_ST_DIGITS:
            digit = char2digit(c, radix);
            if (digit == -1) {
                return null();
            }
            /*
             * digit is in [0, radix), so radix >= 1 here and the division
             * is safe. Refuse to accumulate past LONG_MAX.
             */
            if (acc > (LONG_MAX - digit) / radix) {
                return null();
            }
            acc = acc * radix + digit;
            break;
        }
    }

    /*
     * Only NUM_ST_ZERO (a lone "0") and NUM_ST_DIGITS mean that at least
     * one digit was seen; ending anywhere else means the word was empty,
     * a bare sign, or a radix prefix with nothing after it.
     */
    if (st != NUM_ST_ZERO && st != NUM_ST_DIGITS) {
        return null();
    }
    return wrap_number(sign * acc);
}
/*
 * Tells whether c terminates the input being parsed.
 *
 * Returns 1 for EOF, and also for '\n' when break_on_newline is non-zero;
 * returns 0 otherwise.
 */
int is_end(int c, int break_on_newline) {
    return c == EOF || (break_on_newline && c == '\n');
}

/*
 * Reads the next whitespace-delimited word from fd into word.
 *
 *   fd                stream to read from
 *   word              destination buffer; must have room for at least 256
 *                     bytes (up to 255 word bytes plus the terminator)
 *   break_on_newline  when non-zero, a newline ends the input like EOF does
 *
 * At most 255 bytes are stored; the rest of an over-long word is read and
 * thrown away so that it does not show up as the next word.
 *
 * Returns 1 when a word was stored, 0 when the end of input (see is_end())
 * was reached before any word started. When break_on_newline is set and the
 * word is ended by a newline, that newline is pushed back so the following
 * call returns 0.
 */
int next_word(FILE *fd, char *word, int break_on_newline) {
    enum { WORD_MAX_LEN = 255 };
    int c, k = 0;

    /* eat up any whitespace */
    do {
        c = fgetc(fd);
    } while (!is_end(c, break_on_newline) && isspace(c));

    if (is_end(c, break_on_newline)) {
        return 0; /* no more words :c */
    }

    /* c now contains the first character of the word */
    do {
        word[k++] = (char)c; /* fill in the word */
    } while (k < WORD_MAX_LEN
             && !is_end(c = fgetc(fd), break_on_newline)
             && !isspace(c));

    /*
     * at this point, if we ran up against the k limit, then c contains the
     * last character in the word. if we didn't, it contains the first
     * blank (or end marker) after the word.
     */
    if (!is_end(c, break_on_newline) && !isspace(c)) {
        /* we ran out of word space: skip up to the next blank or the end */
        do {
            c = fgetc(fd);
        } while (!is_end(c, break_on_newline) && !isspace(c));
    }

    word[k] = '\0'; /* add a terminator */

    if (break_on_newline && c == '\n') {
        ungetc('\n', fd);
    }

    /*
     * c now contains either EOF or a blank, which we can safely ignore.
     * if it's an EOF, the next call to this function will return zero
     * and end the parse loop.
     */
    return 1;
}