Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 283 lines (240 sloc) 7.715 kb
455578a Steve Vinoski major trailing whitespace cleanup
vinoski authored
1 %% -*- Erlang -*-
0be3c7e Claes Wikstrom untabified all of yaws
authored
2 %% File: yaws_html.erl
3 %% Author: Johan Bevemyr
4 %% Created: Tue Nov 25 20:53:36 2003
5 %% Purpose: Transform HTML to an Erlang representation (ehtml)
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
6
7 -module('yaws_html').
8 -author('jb@bevemyr.com').
9
10 -export([parse/1,parse/2,h2e/1]).
11
%% parse(Name) -> {ehtml, [], Ehtml} | {error, Msg}
%%
%% Reads the file Name and converts its content to ehtml with h2e/1.
%% The read is matched against {ok, _}, so an unreadable file crashes
%% the caller with a badmatch.
parse(Name) ->
    {ok, Contents} = file:read_file(Name),
    Html = binary_to_list(Contents),
    h2e(Html).
15
%% parse(Name, Out) -> ok | {error, Reason}
%%
%% Converts the HTML file Name to ehtml and writes the pretty-printed
%% {ehtml, Ehtml} term to the file Out. Anything other than a successful
%% conversion (i.e. the {error, Msg} tuple from h2e/1) is returned as is
%% and nothing is written.
parse(Name,Out) ->
    {ok, Contents} = file:read_file(Name),
    Result = h2e(binary_to_list(Contents)),
    case Result of
        {ehtml, [], Ehtml} ->
            file:write_file(Out, io_lib:format("~p", [{ehtml, Ehtml}]));
        _ ->
            Result
    end.
25
%% h2e(Input) -> {ehtml, [], Ehtml} | {error, Msg}
%%
%% Converts an HTML string to ehtml: the input is tokenized starting at
%% line 1 and the tokens are parsed below a synthetic 'ehtml' root tag
%% (which carries line number 0).
h2e(Input) ->
    parse(tokenize(Input, [], [], 1), {ehtml,[],0}, [], []).
29
0be3c7e Claes Wikstrom untabified all of yaws
authored
%% parse(Tokens, CurrentTag, Stack, Acc)
%%
%% Builds the ehtml tree from the token list produced by tokenize/4.
%%   CurrentTag - {Tag, Attrs, Line} of the element currently being filled
%%   Stack      - [{ParentTag, ParentAcc}] for the enclosing open elements
%%   Acc        - children of CurrentTag collected so far, in reverse order
%% Returns {Tag, Attrs, Children} for the outermost element, or
%% {error, Msg} when an end tag of node type does not match the element
%% that is currently open.

%% End of input with nothing left open: emit the root element.
parse([], {Tag,Attrs,_Line}, [], Acc) ->
    {Tag, Attrs, lists:reverse(Acc)};
%% End of input with elements still open: report and close them one by one.
parse([], {Tag,Attrs,Line}, [{Parent,ParentAcc}|Stack], Acc) ->
    io:format("Unterminated tag '~p' at line ~p\n", [Tag,Line]),
    Node = {Tag, Attrs, lists:reverse(Acc)},
    parse([], Parent, Stack, [Node|ParentAcc]);
%% Start tag: leaves are added directly, nodes become the current element.
parse([{begin_tag,Tag,Attrs,Line}|Rest], Current, Stack, Acc) ->
    case tag_type(Tag) of
        node ->
            parse(Rest, {Tag,Attrs,Line}, [{Current,Acc}|Stack], []);
        leaf ->
            parse(Rest, Current, Stack, [{Tag,Attrs}|Acc])
    end;
%% End tag matching the current element: close it and return to the parent.
parse([{end_tag,Tag,[],_Line}|Rest], {Tag,Attrs,_}, [{Parent,ParentAcc}|Stack], Acc) ->
    Node = {Tag, Attrs, node_body(Acc)},
    parse(Rest, Parent, Stack, [Node|ParentAcc]);
%% Any other end tag: stray leaf end tags are ignored, the rest is an error.
parse([{end_tag,EndTag,[],EndLine}|Rest], Current = {OpenTag,_Attrs,OpenLine}, Stack, Acc) ->
    case tag_type(EndTag) of
        leaf ->
            parse(Rest, Current, Stack, Acc);
        node ->
            {error, lists:flatten(io_lib:format(
                                    "expected '</~p>' on line ~p, start "
                                    "tag at line: ~p",
                                    [OpenTag,EndLine,OpenLine]))}
    end;
%% Character data: dropped when it consists of white space only.
parse([{data,Data,_Line}|Rest], Current, Stack, Acc) ->
    case skip_space(Data, 0) of
        {[], _} ->
            parse(Rest, Current, Stack, Acc);
        _ ->
            parse(Rest, Current, Stack, [Data|Acc])
    end.

%% Body of a closed element: a single child is stored unwrapped, anything
%% else as the child list in document order.
node_body([Single]) -> Single;
node_body(Acc)      -> lists:reverse(Acc).
0be3c7e Claes Wikstrom untabified all of yaws
authored
72 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
73
455578a Steve Vinoski major trailing whitespace cleanup
vinoski authored
%% tag_type(Tag) -> leaf | node
%%
%% Classifies a tag name. The tags listed here are treated as leaves,
%% i.e. they never get children and need no end tag; every other tag is
%% a node that stays open until its end tag is seen.
tag_type(Tag) ->
    Leaves = [p, hr, input, base, img, '!doctype', meta, link, br],
    case lists:member(Tag, Leaves) of
        true  -> leaf;
        false -> node
    end.
84
0be3c7e Claes Wikstrom untabified all of yaws
authored
85 %% tokenize(Input, DataAcc, TokenAcc, LineNr)
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
86
%% tokenize(Input, DataAcc, TokenAcc, LineNr) -> [Token]
%%
%% Splits the HTML string Input into a list of
%%   {begin_tag, Name, Args, Line} | {end_tag, Name, Args, Line} |
%%   {data, String, Line}
%% DataAcc holds the characters of the text run being collected (reversed),
%% TokenAcc the tokens found so far (reversed). "<!-- ... -->" comments are
%% skipped. A line break is "\n", "\r" or the pair "\r\n"; the pair counts
%% as a single line so that line numbers stay correct for CRLF input.
tokenize([], [], Tokens, _Line) ->
    lists:reverse(Tokens);
tokenize([], Acc, Tokens, Line) ->
    lists:reverse([{data, lists:reverse(Acc), Line}|Tokens]);
tokenize([$<,$!,$-,$-|R0], Acc, Tokens, L0) ->
    {R1, L1} = skip_comment(R0,L0),
    tokenize(R1, Acc, Tokens, L1);
tokenize([$<|R0], Acc, Tokens, L0) ->
    {Tag,R1,L1} = scan_tag(R0,L0),
    %% Flush pending text (if any) in front of the tag token.
    Tokens1 = case Acc of
                  [] -> [Tag|Tokens];
                  _  -> [Tag,{data,lists:reverse(Acc),L0}|Tokens]
              end,
    next_token(Tag, R1, Tokens1, L1);
tokenize([$\r,$\n|R0], Acc, Tokens, L) ->
    %% CRLF is one line break: keep both characters, count one line.
    tokenize(R0, [$\n,$\r|Acc], Tokens, L+1);
tokenize([C|R0], Acc, Tokens, L) when C =:= $\n; C =:= $\r ->
    tokenize(R0, [C|Acc], Tokens, L+1);
tokenize([C|R0], Acc, Tokens, L) ->
    tokenize(R0, [C|Acc], Tokens, L).
109
0be3c7e Claes Wikstrom untabified all of yaws
authored
110 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
111
%% next_token(Tag, Rest, Tokens, Line)
%%
%% Continues tokenizing after the tag token Tag has been emitted. The
%% content of <script> and <style> elements is not tokenized as HTML:
%% everything up to the matching end tag is emitted as one data token.
next_token({begin_tag, Tag, _Args, _TagLine}, Rest, Tokens, Line)
  when Tag =:= script; Tag =:= style ->
    {Raw, AfterRaw, NextLine} = scan_endtag(Rest, atom_to_list(Tag), Line),
    tokenize(AfterRaw, [], [{data, Raw, Line}|Tokens], NextLine);
next_token(_Tag, Rest, Tokens, Line) ->
    tokenize(Rest, [], Tokens, Line).
120
121 %% '<' <id> <sp>+ [<id><sp>*['='<val>]]* ['/'] '>'
122
%% scan_tag(Input, Line) -> {TagToken, Rest, NextLine}
%%
%% Scans one tag; Input starts right after the opening "<". Returns
%% {end_tag, Name, Args, TagLine} when Input starts with "/", otherwise
%% {begin_tag, Name, Args, TagLine}. Name is the lower-cased tag name as
%% an atom, TagLine the line on which the name starts, Rest the input
%% following the tag.
scan_tag([$/|I], L) ->
    scan_tag(end_tag, I, L);
scan_tag(I, L) ->
    scan_tag(begin_tag, I, L).

%% Shared body for start and end tags; Kind is begin_tag or end_tag.
scan_tag(Kind, I, L) ->
    %% The name must be scanned from the input that follows the skipped
    %% white space. Scanning from I again yields an empty name and counts
    %% a leading newline twice.
    {R0,L0} = skip_space(I, L),
    {Name,R1,L1} = scan_tag_name(R0, L0),
    {R2,L2} = skip_space(R1, L1),
    {Args,R3,L3} = scan_tag_args(R2, L2),
    %% NOTE(review): list_to_atom/1 on names taken from the document can
    %% fill the atom table when parsing untrusted HTML.
    {{Kind,list_to_atom(lowercase(Name)),Args,L0}, R3, L3}.
135
0be3c7e Claes Wikstrom untabified all of yaws
authored
136 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
137
%% scan_tag_name(Input, Line) -> {Name, Rest, NextLine}
%%
%% Reads the tag name at the head of Input; a tag name is an ordinary
%% token as recognised by scan_token/3.
scan_tag_name(Input, Line) ->
    scan_token(Input, [], Line).
140
0be3c7e Claes Wikstrom untabified all of yaws
authored
141 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
142
%% scan_tag_args(Input, Line) -> {Args, Rest, NextLine}
%%
%% Reads the attributes of a tag up to and including the closing ">",
%% starting with an empty attribute accumulator.
scan_tag_args(Input, Line) ->
    scan_tag_args(Input, [], Line).
145
%% scan_tag_args(Input, Acc, Line) -> {Args, Rest, NextLine}
%%
%% Collects tag attributes into Acc (reversed). An attribute written as
%% name=value becomes {Name, Value} with Name a lower-cased atom and
%% Value a string; an attribute without "=" is kept as the plain string.
%% Scanning stops at end of input, after the closing ">", or in front of
%% a "<" (unterminated tag in broken HTML).
scan_tag_args([], Acc, Line) ->
    {lists:reverse(Acc), [], Line};
scan_tag_args([$>|Rest], Acc, Line) ->
    {lists:reverse(Acc), Rest, Line};
scan_tag_args([$<|_] = Rest, Acc, Line) ->           %% bad html
    {lists:reverse(Acc), Rest, Line};
scan_tag_args(Input, Acc, Line) ->
    {Name, AfterName, NameLine} = scan_value(Input, Line),
    case skip_space(AfterName, NameLine) of
        {[$=|AfterEq], EqLine} ->
            {ValueStart, ValueLine} = skip_space(AfterEq, EqLine),
            {Value, AfterValue, EndLine} = scan_value(ValueStart, ValueLine),
            {Next, NextLine} = skip_space(AfterValue, EndLine),
            %% NOTE(review): list_to_atom/1 on attribute names from the
            %% document can fill the atom table for untrusted HTML.
            Key = list_to_atom(lowercase(Name)),
            scan_tag_args(Next, [{Key,Value}|Acc], NextLine);
        {Next, NextLine} ->
            scan_tag_args(Next, [Name|Acc], NextLine)
    end.
165
0be3c7e Claes Wikstrom untabified all of yaws
authored
166 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
167
%% scan_value(Input, Line) -> {Value, Rest, NextLine}
%%
%% Reads an attribute name or value. Input starting with a single or
%% double quote is read up to the matching quote, anything else is read
%% as a plain token.
scan_value([Quote|Rest], Line) when Quote =:= $"; Quote =:= $' ->
    scan_quote(Rest, [], Quote, Line);
scan_value(Input, Line) ->
    scan_token(Input, [], Line).
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
174
0be3c7e Claes Wikstrom untabified all of yaws
authored
175 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
176
%% scan_token(Input, Acc, Line) -> {Token, Rest, NextLine}
%%
%% Reads characters into Acc (reversed) until the token ends. A token
%% ends at end of input, in front of ">", "<" or "=" (which stay in
%% Rest), or at a white space character or line break (which is
%% consumed). A "\r\n" pair is consumed as one line break, so CRLF input
%% advances the line counter by one instead of two.
scan_token([], Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_token(R=[C|_], Acc, L) when C =:= $>; C =:= $<; C =:= $= ->
    %% "<" and "=" here only occur in bad html, but must end the token.
    {lists:reverse(Acc), R, L};
scan_token([$\r,$\n|R], Acc, L) ->
    {lists:reverse(Acc), R, L+1};
scan_token([C|R], Acc, L) ->
    case char_class(C) of
        space ->
            {lists:reverse(Acc), R, L};
        nl ->
            {lists:reverse(Acc), R, L+1};
        _ ->
            scan_token(R, [C|Acc], L)
    end.
194
0be3c7e Claes Wikstrom untabified all of yaws
authored
195 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
196
%% scan_quote(Input, Acc, Quote, Line) -> {String, Rest, NextLine}
%%
%% Reads a quoted string whose opening quote has already been consumed.
%% Characters are collected into Acc (reversed) until the closing Quote,
%% which is dropped, or until the input ends. Line breaks inside the
%% string are kept; "\n", "\r" and the pair "\r\n" each count as one line.
scan_quote([], Acc, _Q, L) ->
    {lists:reverse(Acc), [], L};
scan_quote([Q|R], Acc, Q, L) ->
    {lists:reverse(Acc), R, L};
scan_quote([$\r,$\n|R], Acc, Q, L) ->
    %% CRLF is a single line break: keep both characters, count one line.
    scan_quote(R, [$\n,$\r|Acc], Q, L+1);
scan_quote([C|R], Acc, Q, L) when C =:= $\n; C =:= $\r ->
    scan_quote(R, [C|Acc], Q, L+1);
scan_quote([C|R], Acc, Q, L) ->
    scan_quote(R, [C|Acc], Q, L).
0be3c7e Claes Wikstrom untabified all of yaws
authored
207
208 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
209
%% scan_endtag(Input, Tag, Line) -> {Data, Rest, NextLine}
%%
%% Collects the raw text in front of the end tag named Tag (a lower-case
%% string such as "script"), starting with an empty accumulator.
scan_endtag(Input, Tag, Line) ->
    scan_endtag(Input, Tag, [], Line).
212
%% scan_endtag(Input, Tag, Acc, Line) -> {Data, Rest, NextLine}
%%
%% Collects characters into Acc (reversed) until "</Tag" followed by
%% optional white space and ">" is found; the tag name is compared case
%% insensitively. Rest starts at that "</", so the end tag itself is
%% still tokenized by the caller. If the input ends first, all of it is
%% returned as Data. "\n", "\r" and the pair "\r\n" each count as one
%% line.
scan_endtag([], _Tag, Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_endtag(R=[$<,$/|R0], Tag, Acc, L0) ->
    case closes_tag(Tag, R0, L0) of
        true ->
            {lists:reverse(Acc), R, L0};
        false ->
            %% Some other "</": it is ordinary content and must stay in
            %% the collected data instead of being dropped.
            scan_endtag(R0, Tag, [$/,$<|Acc], L0)
    end;
scan_endtag([$\r,$\n|R], Tag, Acc, L) ->
    %% CRLF is a single line break: keep both characters, count one line.
    scan_endtag(R, Tag, [$\n,$\r|Acc], L+1);
scan_endtag([C|R], Tag, Acc, L) when C =:= $\n; C =:= $\r ->
    scan_endtag(R, Tag, [C|Acc], L+1);
scan_endtag([C|R], Tag, Acc, L) ->
    scan_endtag(R, Tag, [C|Acc], L).

%% True when AfterSlash (the input following "</") is Tag, optional white
%% space and then ">".
closes_tag(Tag, AfterSlash, L) ->
    case casecmp(Tag, AfterSlash) of
        {true, AfterName} ->
            case skip_space(AfterName, L) of
                {[$>|_], _} -> true;
                _           -> false
            end;
        false ->
            false
    end.
233
0be3c7e Claes Wikstrom untabified all of yaws
authored
234 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
235
%% casecmp(Tag, Input) -> {true, Rest} | false
%%
%% Checks whether Input starts with Tag, ignoring the case of Input.
%% Tag must already be lower case. On success Rest is the input that
%% follows the matched prefix. Input that ends before Tag is fully
%% matched is not a match (it used to crash with function_clause).
casecmp([], R) ->
    {true, R};
casecmp(_Tag, []) ->
    false;
casecmp([C1|T1], [C2|T2]) ->
    case lowercase_ch(C2) of
        C1 -> casecmp(T1, T2);
        _  -> false
    end.
242
0be3c7e Claes Wikstrom untabified all of yaws
authored
243 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
244
%% char_class(Char) -> nl | space | alpha | digit | other
%%
%% Classifies a character: line feed and carriage return are 'nl', blank
%% and tab are 'space', ASCII letters are 'alpha', ASCII digits are
%% 'digit' and everything else is 'other'.
char_class(C) when C =:= $\n; C =:= $\r -> nl;
char_class(C) when C =:= $\s; C =:= $\t -> space;
char_class(C) when C >= $a, C =< $z; C >= $A, C =< $Z -> alpha;
char_class(C) when C >= $0, C =< $9 -> digit;
char_class(_Other) -> other.
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
253
0be3c7e Claes Wikstrom untabified all of yaws
authored
254 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
255
%% skip_space(Input, Line) -> {Rest, NextLine}
%%
%% Drops leading white space (blank, tab, line breaks) from Input and
%% returns the remaining input together with the updated line number.
%% "\n", "\r" and the pair "\r\n" each count as one line, so CRLF input
%% advances the line counter by one instead of two.
skip_space([], L) ->
    {[], L};
skip_space([$\r,$\n|R], L) ->
    skip_space(R, L+1);
skip_space(R = [C|R0], L) ->
    case char_class(C) of
        nl ->
            skip_space(R0, L+1);
        space ->
            skip_space(R0, L);
        _ ->
            {R, L}
    end.
267
0be3c7e Claes Wikstrom untabified all of yaws
authored
268 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
269
%% skip_comment(Input, Line) -> {Rest, NextLine}
%%
%% Skips the body of an HTML comment whose opening "<!--" has already
%% been consumed. Rest is the input following the closing "-->", or []
%% when the comment is not terminated. "\n", "\r" and the pair "\r\n"
%% each count as one line.
skip_comment([], L) -> {[], L};
skip_comment([$-,$-,$>|R], L) -> {R, L};
skip_comment([$\r,$\n|R], L) -> skip_comment(R, L+1);
skip_comment([C|R], L) when C =:= $\n; C =:= $\r -> skip_comment(R, L+1);
skip_comment([_C|R], L) -> skip_comment(R, L).
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
275
0be3c7e Claes Wikstrom untabified all of yaws
authored
276 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
277
%% lowercase(Str) -> LowerStr
%%
%% Returns Str with every ASCII upper-case letter replaced by its
%% lower-case counterpart; all other characters are left unchanged.
lowercase(Str) ->
    lists:map(fun lowercase_ch/1, Str).
280
%% lowercase_ch(Char) -> LowerChar
%%
%% Maps an ASCII upper-case letter to the matching lower-case letter and
%% returns any other character unchanged.
lowercase_ch(C) ->
    if
        C >= $A, C =< $Z -> C + ($a - $A);
        true             -> C
    end.
Something went wrong with that request. Please try again.