%% Copyright (c) 2008 Robert Virding. All rights reserved.

%% Redistribution and use in source and binary forms, with or without
%% modification, are permitted provided that the following conditions
%% are met:

%% 1. Redistributions of source code must retain the above copyright
%%    notice, this list of conditions and the following disclaimer.
%% 2. Redistributions in binary form must reproduce the above copyright
%%    notice, this list of conditions and the following disclaimer in the
%%    documentation and/or other materials provided with the distribution.

%% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
%% "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
%% LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
%% FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
%% COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
%% INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
%% BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
%% LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
%% CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
%% LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
%% ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
%% POSSIBILITY OF SUCH DAMAGE.

%%% File    : leex.erl
%%% Author  : Robert Virding (robert.virding@telia.com)
%%% Purpose : A Lexical Analyser Generator for Erlang.
%%%
%%% Most of the algorithms used here are taken pretty much as
%%% described in the "Dragon Book" by Aho, Sethi and Ullman. Some
%%% complementary details were taken from "Compiler Design in C" by
%%% Holub.

-module(leex).

-export([file/1,file/2,format_error/1]).

-import(lists, [member/2,reverse/1,seq/2,sort/1,
                keymember/3,keysearch/3,keysort/2,
                map/2,foldl/3,foldr/3,foreach/2]).
-import(string, [substr/2,substr/3,span/2,tokens/2]).
-import(ordsets, [is_element/2,add_element/2,union/2,subtract/2]).

%% -compile([export_all]).

-record(leex, {xfile=[],        %Xrl file
               efile=[],        %Erl file
               ifile=[],        %Include file
               gfile=[],        %Graph file
               module,          %Module name
               opts=[],         %Options
               errors=[],
               warnings=[],
               xport=none,      %Xrl file port
               eport=none,      %Erl file port
               iport=none,      %Include file port
               gport=none       %Graph file port
              }).

-record(nfa_state, {no,edges=[],accept=noaccept}).
-record(dfa_state, {no,nfa=[],trans=[],accept=noaccept}).

%% file(File) -> ok | error.
%% file(File, Options) -> ok | error.

file(File) -> file(File, []).

file(File, Opts) ->
    St0 = #leex{},
    St1 = filenames(File, Opts, St0),   %Get all the filenames
    try
        case parse_file(St1) of
            {ok,REAs,Actions,Code} ->
                {NFA,NF} = build_combined_nfa(REAs),
                io:fwrite("NFA contains ~w states, ", [size(NFA)]),
                {DFA0,DF0} = build_dfa(NFA, NF),
                io:fwrite("DFA contains ~w states, ", [length(DFA0)]),
                {DFA,DF} = minimise_dfa(DFA0, DF0),
                io:fwrite("minimised to ~w states.~n", [length(DFA)]),
                %% io:fwrite("~p\n", [{NF,NFA}]),
                %% io:fwrite("~p\n", [{DF,DFA}]),
                Ret = out_file(St1, DFA, DF, Actions, Code),
                case member(dfa_graph, Opts) of
                    true -> out_dfa_graph(St1, DFA, DF);
                    false -> ok
                end,
                Ret;                    %The important return!
            {error,PError} ->
                io:put_chars([$\n,gcc_error(St1#leex.xfile, PError),$\n]),
                error
        end
    catch
        %% This is not used yet!
        throw:{leex_error,Error,St2} ->
            close_files(St2),
            io:put_chars([$\n,gcc_error(St2#leex.xfile, Error),$\n]),
            error
    end.

%% filenames(File, Options, State) -> State.
%%  The default output dir is the current directory unless an
%%  explicit one has been given in the options.

filenames(File, Opts, St0) ->
    Dir = filename:dirname(File),
    Base = filename:basename(File, ".xrl"),
    Xfile = filename:join(Dir, Base ++ ".xrl"),
    Efile = Base ++ ".erl",
    Gfile = Base ++ ".dot",
    Ifile = "leexinc.hrl",
    Module = list_to_atom(Base),
    St1 = St0#leex{xfile=Xfile,
                   ifile=Ifile,
                   opts=Opts,
                   module=Module},
    %% Test for explicit out dir.
    case keysearch(outdir, 1, Opts) of
        {value,{outdir,D}} ->
            St1#leex{efile=filename:join(D, Efile),
                     gfile=filename:join(D, Gfile)};
        _ ->
            St1#leex{efile=Efile,gfile=Gfile}
    end.

close_files(St) ->
    MaybeClose = fun (none) -> ok; (Port) -> file:close(Port) end,
    MaybeClose(St#leex.xport),
    MaybeClose(St#leex.eport),
    MaybeClose(St#leex.iport),
    MaybeClose(St#leex.gport).

format_error({open,F}) -> ["error opening ",io_lib:write_string(F)];
format_error(missing_defs) -> "missing definitions";
format_error(missing_rules) -> "missing rules";
format_error(bad_rule) -> "bad rule";
format_error({regexp,E}) -> ["bad regexp `",regexp:format_error(E),"'"];
format_error({after_regexp,S}) ->
    ["bad code after regexp ",io_lib:write_string(S)].

gcc_error(File, {Line,Mod,Error}) ->
    io_lib:format("~s:~w: ~s", [File,Line,apply(Mod, format_error, [Error])]);
gcc_error(File, {Mod,Error}) ->
    io_lib:format("~s: ~s", [File,apply(Mod, format_error, [Error])]).

%% parse_file(State) -> {ok,[REA],[Action],Code} | {error,Error}
%%  when
%%      REA = {RegExp,ActionNo};
%%      Action = {ActionNo,ActionString};
%%      Code = [char()].
%%
%%  Read and parse the file Xfile.
%%  After each section of the file has been parsed we directly call the
%%  next section. This is done when we detect a line we don't recognise
%%  in the current section. The file format is very simple and Erlang
%%  token based; we allow empty lines and Erlang style comments.

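%%  For illustration, a hypothetical minimal Xrl file (the section
%%  headers "Definitions.", "Rules." and "Erlang code." are what the
%%  parser below looks for; the rule itself is made up):
%%
%%      Definitions.
%%      D = [0-9]
%%
%%      Rules.
%%      {D}+ : {token,{integer,TokenLine,list_to_integer(TokenChars)}}.
%%
%%      Erlang code.
%%      %% Helper functions used by the actions would go here.
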
parse_file(St0) ->
    case file:open(St0#leex.xfile, [read]) of
        {ok,Xfile} ->
            St1 = St0#leex{xport=Xfile},
            io:fwrite("Parsing file ~s, ", [St1#leex.xfile]),
            case parse_head(Xfile) of
                {ok,REAs,Actions,Code} ->
                    io:fwrite("contained ~w rules.~n", [length(REAs)]),
                    file:close(Xfile),
                    {ok,REAs,Actions,Code};
                Error ->
                    file:close(Xfile),
                    Error
            end;
        {error,_} ->
            {error,{none,leex,{open,St0#leex.xfile}}}
    end.

%% parse_head(File)
%%  Parse the head of the file.

parse_head(Ifile) -> parse_defs(Ifile, nextline(Ifile, 0)).

%% parse_defs(File, Line)
%%  Parse the macro definition section of a file. Allow no definitions.

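%%  A definition is a single line of the form "Name = Replacement",
%%  e.g. (hypothetical):
%%
%%      D = [0-9]
%%
%%  which can then be referenced from a rule regexp as {D}.
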
parse_defs(Ifile, {ok,"Definitions." ++ _,L}) ->
    parse_defs(Ifile, nextline(Ifile, L), []);
parse_defs(_, {ok,_,L}) ->
    {error,{L,leex,missing_defs}};
parse_defs(_, {eof,L}) ->
    {error,{L,leex,missing_defs}}.

parse_defs(Ifile, {ok,Chars,L}, Ms) ->
    case tokens(Chars, " \t\n") of      %Also strips \n from eol!
        [Name,"=",Def] ->
            parse_defs(Ifile, nextline(Ifile, L), [{Name,Def}|Ms]);
        _ ->                            %Anything else
            parse_rules(Ifile, {ok,Chars,L}, Ms)
    end;
parse_defs(Ifile, Line, Ms) ->
    parse_rules(Ifile, Line, Ms).

%% parse_rules(File, Line, Macros)
%%  Parse the RE rules section of the file. This must exist.

parse_rules(Ifile, {ok,"Rules." ++ _Rest,L}, Ms) ->
    parse_rules(Ifile, nextline(Ifile, L), Ms, [], [], 0);
parse_rules(_, {ok,_,L}, _) ->
    {error,{L,leex,missing_rules}};
parse_rules(_, {eof,L}, _) ->
    {error,{L,leex,missing_rules}}.

%% parse_rules(File, Result, Macros, RegExpActions, Actions, Acount) ->
%%      {ok,RegExpActions,Actions} | {error,E}.

parse_rules(Ifile, NextLine, Ms, REAs, As, N) ->
    case NextLine of
        {ok,"Erlang code." ++ _Rest,L} ->
            %% Must be careful to put rules in correct order!
            parse_code(Ifile, L, reverse(REAs), reverse(As));
        {ok,Chars,L0} ->
            %%io:fwrite("~w: ~p~n", [L0,Chars]),
            case collect_rule(Ifile, Chars, L0) of
                {ok,Re,Atoks,L1} ->
                    case parse_rule(Re, L0, Atoks, Ms, N) of
                        {ok,REA,A} ->
                            parse_rules(Ifile, nextline(Ifile, L1), Ms,
                                        [REA|REAs], [A|As], N+1);
                        {error,E} -> {error,E}
                    end;
                {error,E} -> {error,E}
            end;
        {eof,_} ->
            %% Must be careful to put rules in correct order!
            {ok,reverse(REAs),reverse(As),[]}
    end.

%% collect_rule(File, Line, Lineno) ->
%%      {ok,RegExp,ActionTokens,NewLineno} | {error,E}.
%%  Collect a complete rule by reading lines until the regexp and
%%  action have been read. Keep track of the line number.

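%%  A rule is a regexp followed by ':' and an Erlang expression
%%  sequence terminated by '.', which may span several lines, e.g.
%%  (a made-up rule):
%%
%%      [a-z]+ :
%%          {token,{atom,TokenLine,list_to_atom(TokenChars)}}.
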
collect_rule(Ifile, Chars, L0) ->
    {match,St,Len} = regexp:first_match(Chars, "[^ \t]+"),
    %%io:fwrite("RE = ~p~n", [substr(Chars, St, Len)]),
    case collect_action(Ifile, substr(Chars, St+Len), L0, []) of
        {ok,[{':',_}|Toks],L1} -> {ok,substr(Chars, St, Len),Toks,L1};
        {ok,_,_} -> {error,{L0,leex,bad_rule}};
        {eof,L1} -> {error,{L1,leex,bad_rule}};
        {error,E,_} -> {error,E}
    end.

collect_action(Ifile, Chars, L0, Cont0) ->
    case erl_scan:tokens(Cont0, Chars, L0) of
        {done,{ok,Toks,_},_} -> {ok,Toks,L0};
        {done,{eof,_},_} -> {eof,L0};
        {done,{error,E,_},_} -> {error,E,L0};
        {more,Cont1} ->
            collect_action(Ifile, io:get_line(Ifile, leex), L0+1, Cont1)
    end.

%% parse_rule(RegExpString, RegExpLine, ActionTokens, Macros, Counter)
%%  Parse one regexp after performing macro substitution.

parse_rule(S, Line, [{dot,_}], Ms, N) ->
    case parse_rule_regexp(S, Ms) of
        {ok,R} ->
            {ok,{R,N},{N,empty_action}};
        {error,E} ->
            {error,{Line,leex,{regexp,E}}}
    end;
parse_rule(S, Line, Atoks, Ms, N) ->
    case parse_rule_regexp(S, Ms) of
        {ok,R} ->
            case erl_parse:parse_exprs(Atoks) of
                {ok,Aes} ->
                    %% Check for token variables.
                    TokenChars = var_used('TokenChars', Atoks),
                    TokenLen = var_used('TokenLen', Atoks),
                    TokenLine = var_used('TokenLine', Atoks),
                    {ok,{R,N},{N,Aes,TokenChars,TokenLen,TokenLine}};
                {error,_} ->
                    {error,{Line,leex,{after_regexp,S}}}
            end;
        {error,E} ->
            {error,{Line,leex,{regexp,E}}}
    end.

var_used(Name, Toks) ->
    case keysearch(Name, 3, Toks) of
        {value,{var,_,Name}} -> true;
        _ -> false
    end.

%% parse_rule_regexp(RegExpString, Macros) -> {ok,RegExp} | {error,Error}.
%%  Substitute in macros and parse RegExpString. Cannot use regexp:gsub
%%  here as it interprets & in the replacement string.

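%%  For example (with the hypothetical definition above), the macro
%%  {D} in "{D}+" is replaced literally, so with D = [0-9] the regexp
%%  becomes "[0-9]+" before being handed to regexp:parse/1.
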
parse_rule_regexp(RE0, [{M,Exp}|Ms]) ->
    case regexp:matches(RE0, "{" ++ M ++ "}") of
        {match,Mats} ->
            RE1 = sub_repl(Mats, Exp, RE0, 1),
            parse_rule_regexp(RE1, Ms);
        {error,_} ->
            parse_rule_regexp(RE0, Ms)
    end;
parse_rule_regexp(RE, []) ->
    %% io:fwrite("RE = ~p~n", [RE]),
    regexp:parse(RE).

sub_repl([{St,L}|Ss], Rep, S, Pos) ->
    Rs = sub_repl(Ss, Rep, S, St+L),
    substr(S, Pos, St-Pos) ++ Rep ++ Rs;
sub_repl([], _Rep, S, Pos) -> substr(S, Pos).

%% parse_code(File, Line, REAs, Actions)
%%  Parse the code section of the file.

parse_code(Ifile, _, REAs, As) ->
    {ok,REAs,As,io:get_chars(Ifile, leex, 102400)}.

%% nextline(InputFile, PrevLineNo) -> {ok,Chars,LineNo} | {eof,LineNo}.
%%  Get the next line skipping comment lines and blank lines.

nextline(Ifile, L) ->
    case io:get_line(Ifile, leex) of
        eof -> {eof,L};
        Chars ->
            case substr(Chars, span(Chars, " \t\n")+1) of
                [$%|_Rest] -> nextline(Ifile, L+1);
                [] -> nextline(Ifile, L+1);
                _Other -> {ok,Chars,L+1}
            end
    end.

%% build_combined_nfa(RegExpActionList) -> {NFA,FirstState}.
%%  Build the combined NFA using Thompson's construction straight out
%%  of the book. Build the separate NFAs in the same order as the
%%  rules so that the accepting states have ascending state numbers.
%%  Start numbering the states from 1 as we put the states in a tuple
%%  with the state number as the index.

build_combined_nfa(REAs) ->
    {NFA0,Firsts,Free} = build_nfa_list(REAs, [], [], 1),
    F = #nfa_state{no=Free,edges=epsilon_trans(Firsts)},
    {list_to_tuple(keysort(#nfa_state.no, [F|NFA0])),Free}.

build_nfa_list([{RE,Action}|REAs], NFA0, Firsts, Free0) ->
    {NFA1,Free1,First} = build_nfa(RE, Free0, Action),
    build_nfa_list(REAs, NFA1 ++ NFA0, [First|Firsts], Free1);
build_nfa_list([], NFA, Firsts, Free) ->
    {NFA,reverse(Firsts),Free}.

epsilon_trans(Firsts) -> [ {epsilon,F} || F <- Firsts ].

%% build_nfa(RegExp, NextState, Action) -> {NFA,NextState,FirstState}.
%%  When building the NFA states for a regexp we don't build the end
%%  state, just allocate a State for it and return this state
%%  number. This allows us to avoid building unnecessary states for
%%  concatenation which would then have to be removed by overwriting
%%  an existing state.

build_nfa(RE, N0, Action) ->
    {NFA,N1,E} = build_nfa(RE, N0+1, N0, []),
    {[#nfa_state{no=E,accept={accept,Action}}|NFA],N1,N0}.

%% build_nfa(RegExp, NextState, FirstState, NFA) -> {NFA,NextState,EndState}.
%%  The NFA is a list of nfa_state in no predefined order. The state
%%  number of the returned EndState is already allocated!

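%%  As a rough sketch of the 'or' case below (F and E are the new
%%  first and end states, [RE1] and [RE2] the sub-NFAs):
%%
%%          +--eps--> [RE1] --eps--+
%%      F --+                      +--> E
%%          +--eps--> [RE2] --eps--+
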
build_nfa({'or',RE1,RE2}, N0, F, NFA0) ->
    {NFA1,N1,E1} = build_nfa(RE1, N0+1, N0, NFA0),
    {NFA2,N2,E2} = build_nfa(RE2, N1+1, N1, NFA1),
    E = N2,                             %End state
    {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,N1}]},
      #nfa_state{no=E1,edges=[{epsilon,E}]},
      #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2],
     N2+1,E};
build_nfa({concat,RE1,RE2}, N0, F, NFA0) ->
    {NFA1,N1,E1} = build_nfa(RE1, N0, F, NFA0),
    {NFA2,N2,E2} = build_nfa(RE2, N1, E1, NFA1),
    {NFA2,N2,E2};
build_nfa({kclosure,RE}, N0, F, NFA0) ->
    {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0),
    E = N1,                             %End state
    {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,E}]},
      #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1],
     N1+1,E};
build_nfa({pclosure,RE}, N0, F, NFA0) ->
    {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0),
    E = N1,                             %End state
    {[#nfa_state{no=F,edges=[{epsilon,N0}]},
      #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1],
     N1+1,E};
build_nfa({optional,RE}, N0, F, NFA0) ->
    {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0),
    E = N1,                             %End state
    {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,E}]},
      #nfa_state{no=E1,edges=[{epsilon,E}]}|NFA1],
     N1+1,E};
build_nfa({char_class,Cc}, N, F, NFA) ->
    {[#nfa_state{no=F,edges=[{char_class(Cc),N}]}|NFA],N+1,N};
build_nfa({comp_class,Cc}, N, F, NFA) ->
    {[#nfa_state{no=F,edges=[{comp_class(Cc),N}]}|NFA],N+1,N};
build_nfa(C, N, F, NFA) when is_integer(C) ->
    {[#nfa_state{no=F,edges=[{[C],N}]}|NFA],N+1,N}.

char_class(Cc) ->
    lists:foldl(fun ({C1,C2}, Set) -> union(seq(C1, C2), Set);
                    (C, Set) -> add_element(C, Set) end, [], Cc).

comp_class(Cc) -> subtract(seq(0, 255), char_class(Cc)).

%% build_dfa(NFA, NfaFirstState) -> {DFA,DfaFirstState}.
%%  Build a DFA from an NFA using "subset construction". The major
%%  difference from the book is that we keep the marked and unmarked
%%  DFA states in separate lists. New DFA states are added to the
%%  unmarked list and states are marked by moving them to the marked
%%  list. We assume that the NFA accepting state numbers are in
%%  ascending order for the rules and use ordsets to keep this order.

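%%  For illustration (made-up state numbers): if the NFA start state 1
%%  has epsilon edges to 3 and 5, then eclosure([1], NFA) gives the
%%  ordset [1,3,5], and that whole set of NFA states becomes a single
%%  DFA state.
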
build_dfa(NFA, Nf) ->
    D = #dfa_state{no=0,nfa=eclosure([Nf], NFA)},
    {build_dfa([D], 1, [], NFA),0}.

%% build_dfa([UnMarked], NextState, [Marked], NFA) -> DFA.
%%  Traverse the unmarked states. Temporarily add the current unmarked
%%  state to the marked list before calculating the transitions; this
%%  is to avoid adding too many duplicate states. Add it properly to
%%  the marked list afterwards with the correct transitions.

build_dfa([U|Us0], N0, Ms, NFA) ->
    {Ts,Us1,N1} = build_dfa(255, U#dfa_state.nfa, Us0, N0, [], [U|Ms], NFA),
    M = U#dfa_state{trans=Ts,accept=accept(U#dfa_state.nfa, NFA)},
    build_dfa(Us1, N1, [M|Ms], NFA);
build_dfa([], _, Ms, _) -> Ms.

%% build_dfa(Char, [NfaState], [Unmarked], NextState, [Transition], [Marked], NFA) ->
%%      {Transitions,UnmarkedStates,NextState}.
%%  For each NFA state set, calculate the legal transitions. N.B. we
%%  must search *BOTH* the unmarked and marked lists to check if a DFA
%%  state already exists. By testing characters downwards and
%%  prepending transitions we get the transition lists in ascending
%%  order.

build_dfa(C, Set, Us, N, Ts, Ms, NFA) when C >= 0 ->
    case eclosure(move(Set, C, NFA), NFA) of
        S when S /= [] ->
            case dfa_state_exist(S, Us, Ms) of
                {yes,T} ->
                    build_dfa(C-1, Set, Us, N, [{C,T}|Ts], Ms, NFA);
                no ->
                    U = #dfa_state{no=N,nfa=S},
                    build_dfa(C-1, Set, [U|Us], N+1, [{C,N}|Ts], Ms, NFA)
            end;
        [] ->
            build_dfa(C-1, Set, Us, N, Ts, Ms, NFA)
    end;
build_dfa(-1, _, Us, N, Ts, _, _) ->
    {Ts,Us,N}.

%% dfa_state_exist(Set, Unmarked, Marked) -> {yes,State} | no.

dfa_state_exist(S, Us, Ms) ->
    case keysearch(S, #dfa_state.nfa, Us) of
        {value,#dfa_state{no=T}} -> {yes,T};
        false ->
            case keysearch(S, #dfa_state.nfa, Ms) of
                {value,#dfa_state{no=T}} -> {yes,T};
                false -> no
            end
    end.

%% eclosure([State], NFA) -> [State].
%% move([State], Char, NFA) -> [State].
%%  These are straight out of the book. As eclosure uses ordsets the
%%  generated state sets are in ascending order.

eclosure(Sts, NFA) -> eclosure(Sts, NFA, []).

eclosure([St|Sts], NFA, Ec) ->
    #nfa_state{edges=Es} = element(St, NFA),
    eclosure([ N || {epsilon,N} <- Es,
                    not is_element(N, Ec) ] ++ Sts,
             NFA, add_element(St, Ec));
eclosure([], _, Ec) -> Ec.

move(Sts, C, NFA) ->
    [ St || N <- Sts,
            {C1,St} <- (element(N, NFA))#nfa_state.edges,
            is_list(C1),
            member(C, C1) ].

%% accept([State], NFA) -> {accept,A} | noaccept.
%%  Scan down the state list until we find an accepting state.

accept([St|Sts], NFA) ->
    case element(St, NFA) of
        #nfa_state{accept={accept,A}} -> {accept,A};
        #nfa_state{accept=noaccept} -> accept(Sts, NFA)
    end;
accept([], _) -> noaccept.

%% minimise_dfa(DFA, DfaFirst) -> {DFA,DfaFirst}.
%%  Minimise the DFA by removing equivalent states. We consider two
%%  states equivalent if both their transitions and their accept
%%  action are the same. First repeatedly run through the DFA state
%%  list removing equivalent states and updating the remaining
%%  transitions with the remaining equivalent state numbers. When no
%%  more reductions are possible, pack the remaining state numbers to
%%  get consecutive states.

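%%  For example (made-up states): if states 4 and 7 have identical
%%  transition lists and the same accept action, 7 is deleted, every
%%  transition to 7 is rewritten to point at 4, and the surviving
%%  states are finally renumbered 0,1,2,... by pack_dfa/1.
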
minimise_dfa(DFA0, Df0) ->
    case min_dfa(DFA0) of
        {DFA1,[]} ->                    %No reduction!
            {DFA2,Rs} = pack_dfa(DFA1),
            {min_update(DFA2, Rs),min_use(Df0, Rs)};
        {DFA1,Rs} ->
            minimise_dfa(min_update(DFA1, Rs), min_use(Df0, Rs))
    end.

min_dfa(DFA) -> min_dfa(DFA, [], []).

min_dfa([D|DFA0], Rs0, MDFA) ->
    {DFA1,Rs1} = min_delete(DFA0, D#dfa_state.trans, D#dfa_state.accept,
                            D#dfa_state.no, Rs0, []),
    min_dfa(DFA1, Rs1, [D|MDFA]);
min_dfa([], Rs, MDFA) -> {MDFA,Rs}.

%% min_delete(States, Trans, Action, NewN, Rs, MiniDFA) -> {MiniDFA,Rs}.
%%  Delete all states with the same transitions and action. Return the
%%  rewrites and the minimised DFA with no duplicate states.

min_delete([#dfa_state{no=N,trans=T,accept=A}|DFA], T, A, NewN, Rs, MDFA) ->
    min_delete(DFA, T, A, NewN, [{N,NewN}|Rs], MDFA);
min_delete([D|DFA], T, A, NewN, Rs, MDFA) ->
    min_delete(DFA, T, A, NewN, Rs, [D|MDFA]);
min_delete([], _, _, _, Rs, MDFA) -> {MDFA,Rs}.

min_update(DFA, Rs) ->
    [ D#dfa_state{trans=min_update_trans(D#dfa_state.trans, Rs)} || D <- DFA ].

min_update_trans(Tr, Rs) ->
    [ {C,min_use(S, Rs)} || {C,S} <- Tr ].

min_use(Old, [{Old,New}|_]) -> New;
min_use(Old, [_|Reds]) -> min_use(Old, Reds);
min_use(Old, []) -> Old.

pack_dfa(DFA) -> pack_dfa(DFA, 0, [], []).

pack_dfa([D|DFA], NewN, Rs, PDFA) ->
    pack_dfa(DFA, NewN+1,
             [{D#dfa_state.no,NewN}|Rs],
             [D#dfa_state{no=NewN}|PDFA]);
pack_dfa([], _, Rs, PDFA) -> {PDFA,Rs}.

%% The main output is the yystate function which is built from the
%% DFA. It has the spec:
%%
%% yystate() -> InitialState.
%% yystate(State, InChars, Line, CurrTokLen, AcceptAction, AcceptLen) ->
%%      {Action, AcceptLength, RestChars, Line} |         Accepting end state
%%      {Action, AcceptLength, RestChars, Line, State} |  Accepting state
%%      {reject, AcceptLength, CurrTokLen, RestChars, Line, State} |
%%      {Action, AcceptLength, CurrTokLen, RestChars, Line, State}.

%% The returned CurrTokLen is always the current number of characters
%% scanned in the current token. The returns have the following
%% meanings:
%% {Action, AcceptLength, RestChars, Line} -
%%  The scanner has reached an accepting end-state, for example after
%%  a regexp "abc". Action is the action number and AcceptLength is
%%  the length of the matching token.
%%
%% {Action, AcceptLength, RestChars, Line, State} -
%%  The scanner has reached an accepting transition state; what to do
%%  next depends on RestChars. If RestChars == [] (no more current
%%  characters) then we need to get more characters to see if it is an
%%  end-state, otherwise (eof or chars) it is an end state.
%%
%% {reject, AcceptLength, CurrTokLen, RestChars, Line, State} -
%% {Action, AcceptLength, CurrTokLen, RestChars, Line, State} -
%%  The scanner has reached a non-accepting transition state. If
%%  RestChars == [] we need to get more characters to continue.
%%  Otherwise, if 'reject' then no accepting state has been reached
%%  and it is an error. If we have an Action and AcceptLength then
%%  these are from the last accept state; use them and continue from
%%  there.

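%% As an illustration, out_trans/2 below emits clauses shaped like
%% these (the state and action numbers are made up):
%%
%%     yystate(3, [C|Ics], Line, Tlen, _, _) when C >= $a, C =< $z ->
%%         yystate(3, Ics, Line, Tlen+1, 2, Tlen);
%%     yystate(3, Ics, Line, Tlen, _, _) ->
%%         {2,Tlen,Ics,Line,3};
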
%% out_file(LeexState, DFA, DfaStart, [Action], Code) -> ok | error.
%%  Generate an output .erl file from the include file, the DFA and
%%  the code for the actions.

out_file(St0, DFA, DF, Actions, Code) ->
    io:fwrite("Writing file ~s, ", [St0#leex.efile]),
    case file:path_open([".", [code:lib_dir(),"/tools/include"]],
                        St0#leex.ifile, [read]) of
        {ok,Ifile,_} ->
            St1 = St0#leex{iport=Ifile},
            case file:open(St1#leex.efile, [write]) of
                {ok,Ofile} ->
                    St2 = St1#leex{eport=Ofile},
                    out_file(Ifile, Ofile, St2#leex.module,
                             DFA, DF, Actions, Code),
                    file:close(Ifile),
                    file:close(Ofile),
                    io:fwrite("ok~n"),
                    ok;
                {error,_} ->
                    file:close(Ifile),
                    io:fwrite("~s: open error~n", [St1#leex.efile]),
                    error
            end;
        {error,_} ->
            io:fwrite("~s: open error~n", [St0#leex.ifile]),
            error
    end.

%% out_file(IncFile, OutFile, ModName, DFA, DfaStart, Actions, Code) -> ok.
%%  Copy the include file line by line, substituting special lines with
%%  generated code. We cheat by only looking at the first 5 characters.

out_file(Ifile, Ofile, Mod, DFA, DF, Actions, Code) ->
    case io:get_line(Ifile, leex) of
        eof -> ok;
        Line ->
            case substr(Line, 1, 5) of
                "##mod" -> io:fwrite(Ofile, "-module(~w).\n", [Mod]);
                "##cod" -> io:put_chars(Ofile, Code);
                "##dfa" -> out_dfa(Ofile, DFA, DF);
                "##act" -> out_actions(Ofile, Actions);
                _ -> io:put_chars(Ofile, Line)
            end,
            out_file(Ifile, Ofile, Mod, DFA, DF, Actions, Code)
    end.

out_dfa(File, DFA, DF) ->
    io:fwrite(File, "yystate() -> ~w.~n~n", [DF]),
    foreach(fun (S) -> out_trans(File, S) end, DFA),
    io:fwrite(File, "yystate(S, Ics, Line, Tlen, Action, Alen) ->~n", []),
    io:fwrite(File, "    {Action,Alen,Tlen,Ics,Line,S}.~n", []).

out_trans(File, #dfa_state{no=N,trans=[],accept={accept,A}}) ->
    %% Accepting end state, guaranteed done.
    io:fwrite(File, "yystate(~w, Ics, Line, Tlen, _, _) ->~n", [N]),
    io:fwrite(File, "    {~w,Tlen,Ics,Line};~n", [A]);
out_trans(File, #dfa_state{no=N,trans=Tr,accept={accept,A}}) ->
    %% Accepting state, but there may be more.
    foreach(fun (T) -> out_accept_tran(File, N, A, T) end, pack_trans(Tr)),
    io:fwrite(File, "yystate(~w, Ics, Line, Tlen, _, _) ->~n", [N]),
    io:fwrite(File, "    {~w,Tlen,Ics,Line,~w};~n", [A,N]);
out_trans(File, #dfa_state{no=N,trans=Tr,accept=noaccept}) ->
    %% Non-accepting transition state.
    foreach(fun (T) -> out_noaccept_tran(File, N, T) end, pack_trans(Tr)),
    io:fwrite(File, "yystate(~w, Ics, Line, Tlen, Action, Alen) ->~n", [N]),
    io:fwrite(File, "    {Action,Alen,Tlen,Ics,Line,~w};~n", [N]).

out_accept_tran(File, N, A, {{Cf,Cl},S}) ->
    out_accept_head(File, N, io_lib:write_char(Cf), io_lib:write_char(Cl)),
    out_accept_body(File, S, "Line", A);
out_accept_tran(File, N, A, {$\n,S}) ->
    out_accept_head(File, N, "$\\n"),
    out_accept_body(File, S, "Line+1", A);
out_accept_tran(File, N, A, {C,S}) ->
    Char = io_lib:write_char(C),
    out_accept_head(File, N, Char),
    out_accept_body(File, S, "Line", A).

out_noaccept_tran(File, N, {{Cf,Cl},S}) ->
    out_noaccept_head(File, N, io_lib:write_char(Cf), io_lib:write_char(Cl)),
    out_noaccept_body(File, S, "Line");
out_noaccept_tran(File, N, {$\n,S}) ->
    out_noaccept_head(File, N, "$\\n"),
    out_noaccept_body(File, S, "Line+1");
out_noaccept_tran(File, N, {C,S}) ->
    Char = io_lib:write_char(C),
    out_noaccept_head(File, N, Char),
    out_noaccept_body(File, S, "Line").

out_accept_head(File, State, Char) ->
    io:fwrite(File, "yystate(~w, [~s|Ics], Line, Tlen, _, _) ->\n",
              [State,Char]).

out_accept_head(File, State, Min, Max) ->
    io:fwrite(File, "yystate(~w, [C|Ics], Line, Tlen, _, _) when C >= ~s, C =< ~s ->\n",
              [State,Min,Max]).

out_accept_body(File, Next, Line, Action) ->
    io:fwrite(File, "    yystate(~w, Ics, ~s, Tlen+1, ~w, Tlen);\n",
              [Next,Line,Action]).

out_noaccept_head(File, State, Char) ->
    io:fwrite(File, "yystate(~w, [~s|Ics], Line, Tlen, Action, Alen) ->\n",
              [State,Char]).

out_noaccept_head(File, State, Min, Max) ->
    io:fwrite(File, "yystate(~w, [C|Ics], Line, Tlen, Action, Alen) when C >= ~s, C =< ~s ->\n",
              [State,Min,Max]).

out_noaccept_body(File, Next, Line) ->
    io:fwrite(File, "    yystate(~w, Ics, ~s, Tlen+1, Action, Alen);\n",
              [Next,Line]).

%% pack_trans([{Char,State}]) -> [{Crange,State}] when
%%      Crange = {Char,Char} | Char.
%%  Pack the transition table into something more suitable for
%%  generating code. Ranges of characters with the same State are
%%  packed together, while solitary characters are left "as is". We
%%  KNOW how the pattern matching compiler works so solitary
%%  characters are stored before ranges. We do this using ordsets for
%%  the packed table. Always break out $\n as a solitary character.

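%%  For example (a made-up transition list):
%%
%%      pack_trans([{$a,1},{$b,1},{$c,1},{$x,2}])
%%          => [{$x,2},{{$a,$c},1}]
%%
%%  The solitary $x sorts before the {$a,$c} range because in the
%%  Erlang term order integers come before tuples.
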
pack_trans(Tr) -> pack_trans(Tr, []).

pack_trans([{C,S}|Tr], Pt) -> pack_trans(Tr, C, C, S, Pt);
pack_trans([], Pt) -> Pt.

pack_trans([{$\n,S1}|Tr], Cf, Cl, S, Pt0) ->
    Pt1 = pack_trans(Cf, Cl, S, Pt0),   %Take what we got ...
    Pt2 = add_element({$\n,S1}, Pt1),   %add separate $\n ...
    pack_trans(Tr, Pt2);                %and keep going
pack_trans([{C,S}|Tr], Cf, Cl, S, Pt) when C == Cl + 1 ->
    pack_trans(Tr, Cf, C, S, Pt);       %Add to range
pack_trans(Tr, Cf, Cl, S, Pt) ->
    pack_trans(Tr, pack_trans(Cf, Cl, S, Pt)).

pack_trans(C, C, S, Pt) -> add_element({C,S}, Pt);
pack_trans(Cf, Cl, S, Pt) when Cl == Cf + 1 ->
    add_element({Cf,S}, add_element({Cl,S}, Pt));
pack_trans(Cf, Cl, S, Pt) -> add_element({{Cf,Cl},S}, Pt).

%% out_actions(File, ActionList) -> ok.
%%  Write out the action table.

out_actions(File, As) ->
    foreach(fun (A) -> out_action(File, A) end, As),
    io:fwrite(File, "yyaction(_, _, _, _) -> error.~n", []).

out_action(File, {A,empty_action}) ->
    io:fwrite(File, "yyaction(~w, _, _, _) -> skip_token;~n", [A]);
out_action(File, {A,Code,TokenChars,TokenLen,TokenLine}) ->
    Len = if TokenLen or TokenChars -> "TokenLen"; true -> "_" end,
    Line = if TokenLine -> "TokenLine"; true -> "_" end,
    Tcs = if TokenChars -> "YYtcs"; true -> "_" end,
    io:fwrite(File, "yyaction(~w, ~s, ~s, ~s) ->~n", [A,Len,Tcs,Line]),
    if
        TokenChars == true ->
            io:fwrite(File, "    TokenChars = yypre(YYtcs, TokenLen),~n", []);
        TokenChars == false -> ok
    end,
    io:fwrite(File, "    ~s;~n", [erl_pp:exprs(Code, 4, none)]).

%% out_dfa_graph(LeexState, DFA, DfaStart) -> ok | error.
%%  Write the DFA out to a .dot file in DOT format, which can be
%%  viewed with Graphviz.

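%%  A generated file starts roughly like this (made-up states, with
%%  the start state drawn in green):
%%
%%      digraph DFA {
%%        0 [shape=circle color=green];
%%        1 [shape=doublecircle];
%%
%%        0 -> 1 [label="[a-z]"];
%%      }
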
out_dfa_graph(St, DFA, DF) ->
    io:fwrite("Writing DFA to file ~s, ", [St#leex.gfile]),
    case file:open(St#leex.gfile, [write]) of
        {ok,Gfile} ->
            io:fwrite(Gfile, "digraph DFA {~n", []),
            out_dfa_states(Gfile, DFA, DF),
            out_dfa_edges(Gfile, DFA),
            io:fwrite(Gfile, "}~n", []),
            file:close(Gfile),
            io:fwrite("ok~n"),
            ok;
        {error,_} ->
            io:fwrite("~s: open error~n", [St#leex.gfile]),
            error
    end.

out_dfa_states(File, DFA, DF) ->
    foreach(fun (S) -> out_dfa_state(File, DF, S) end, DFA),
    io:fwrite(File, "~n", []).

out_dfa_state(File, DF, #dfa_state{no=DF, accept={accept,_}}) ->
    io:fwrite(File, "  ~b [shape=doublecircle color=green];~n", [DF]);
out_dfa_state(File, DF, #dfa_state{no=DF, accept=noaccept}) ->
    io:fwrite(File, "  ~b [shape=circle color=green];~n", [DF]);
out_dfa_state(File, _, #dfa_state{no=S, accept={accept,_}}) ->
    io:fwrite(File, "  ~b [shape=doublecircle];~n", [S]);
out_dfa_state(File, _, #dfa_state{no=S, accept=noaccept}) ->
    io:fwrite(File, "  ~b [shape=circle];~n", [S]).

out_dfa_edges(File, DFA) ->
    foreach(fun (#dfa_state{no=S,trans=Trans}) ->
                    Pt = pack_trans(Trans),
                    Tdict = foldl(fun ({Cr,T}, D) ->
                                          orddict:append(T, Cr, D)
                                  end, orddict:new(), Pt),
                    foreach(fun (T) ->
                                    Crs = orddict:fetch(T, Tdict),
                                    Edgelab = dfa_edgelabel(Crs),
                                    io:fwrite(File, "  ~b -> ~b [label=\"~s\"];~n",
                                              [S,T,Edgelab])
                            end, sort(orddict:fetch_keys(Tdict)))
            end, DFA).

dfa_edgelabel([C]) when is_integer(C) -> quote(C);
dfa_edgelabel(Cranges) ->
    "[" ++ map(fun ({A,B}) -> [quote(A), "-", quote(B)];
                   (C) -> [quote(C)]
               end, Cranges) ++ "]".

quote($^)  -> "\\^";
quote($.)  -> "\\.";
quote($$)  -> "\\$";
quote($-)  -> "\\-";
quote($[)  -> "\\[";
quote($])  -> "\\]";
quote($\s) -> "\\\\s";
quote($\") -> "\\\"";
quote($\b) -> "\\\\b";
quote($\f) -> "\\\\f";
quote($\n) -> "\\\\n";
quote($\r) -> "\\\\r";
quote($\t) -> "\\\\t";
quote($\e) -> "\\\\e";
quote($\v) -> "\\\\v";
quote($\d) -> "\\\\d";
quote($\\) -> "\\\\";
quote(C) when 32 =< C, C =< 126 -> [C];
quote(C) when 0 =< C, C =< 255 ->
    <<T2:2,T1:3,T0:3>> = <<C>>,
    ["\\\\", $0+T2, $0+T1, $0+T0].