Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 283 lines (240 sloc) 7.715 kb
455578a @vinoski major trailing whitespace cleanup
vinoski authored
1 %% -*- Erlang -*-
0be3c7e @klacke untabified all of yaws
authored
2 %% File: yaws_html.erl
3 %% Author: Johan Bevemyr
4 %% Created: Tue Nov 25 20:53:36 2003
5 %% Purpose: Transform HTML to an Erlang representation (ehtml)
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
6
7 -module('yaws_html').
8 -author('jb@bevemyr.com').
9
10 -export([parse/1,parse/2,h2e/1]).
11
%% parse(Name)
%% Reads the file Name and converts its contents with h2e/1.
%% Crashes with badmatch when the file cannot be read.
parse(Name) ->
    {ok, Contents} = file:read_file(Name),
    Html = binary_to_list(Contents),
    h2e(Html).
15
%% parse(Name, Out)
%% Reads the file Name, converts it with h2e/1 and, when the result is
%% an {ehtml, [], Body} term, pretty-prints {ehtml, Body} into the file
%% Out. In that case the result of file:write_file/2 is returned; any
%% other result of h2e/1 is returned unchanged.
parse(Name,Out) ->
    {ok, Contents} = file:read_file(Name),
    Result = h2e(binary_to_list(Contents)),
    case Result of
        {ehtml, [], Body} ->
            Text = io_lib:format("~p", [{ehtml, Body}]),
            file:write_file(Out, Text);
        Other ->
            Other
    end.
25
%% h2e(Input)
%% Converts an HTML string to ehtml: the input is tokenized starting at
%% line 1 and the tokens are parsed below a synthetic 'ehtml' root tag
%% with no attributes.
h2e(Input) ->
    Root = {ehtml, [], 0},
    parse(tokenize(Input, [], [], 1), Root, [], []).
29
0be3c7e @klacke untabified all of yaws
authored
30 %% parse(Tokens, CurrentTag, Stack, Acc)
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
31
92bafb4 @klacke removed the ability change userid, also stopped writing to /tmp/yaws …
authored
%% parse(Tokens, CurrentTag, Stack, Acc)
%%
%% Builds the ehtml tree from a token list.
%%   Tokens     - {begin_tag,Tag,Args,Line} | {end_tag,Tag,Args,Line} |
%%                {data,Chars,Line}
%%   CurrentTag - {Tag, Args, Line} of the element currently being filled
%%   Stack      - [{ParentTag, ParentAcc}] for the enclosing open elements
%%   Acc        - children of CurrentTag collected so far, newest first
%%
%% Returns {Tag, Args, Children} for the outermost element, or
%% {error, Msg} when the end tag of a node element does not match the
%% element that is currently open.

%% End of input with nothing left open: finish the root element.
parse([], {T,A,_L}, [], Acc) ->
    {T, A, lists:reverse(Acc)};
%% End of input while T is still open: report it and close it implicitly.
parse([], {T,A,L}, [{CTag,CAcc}|Stack], Acc) ->
    io:format("Unterminated tag '~p' at line ~p\n", [T,L]),
    parse([], CTag, Stack, [{T,A,lists:reverse(Acc)}|CAcc]);
parse([{begin_tag,T,A,L}|Tokens], CTag, Stack, Acc) ->
    case tag_type(T) of
        leaf ->
            %% Leaf tags have no body and never open a new level.
            parse(Tokens, CTag, Stack, [{T,A}|Acc]);
        node ->
            parse(Tokens, {T,A,L}, [{CTag,Acc}|Stack],[])
    end;

%% End tag matching the open element. Attributes on an end tag are
%% ignored instead of requiring an empty list, so that e.g. "</p foo>"
%% does not crash the parser with a function_clause error.
parse([{end_tag,T,_Args,_L}|Tokens], {T,A,_}, [{CTag,CAcc}|Stack], Acc) ->
    E = case Acc of
            [Single] ->
                %% A single child is stored bare, not wrapped in a list.
                {T,A,Single};
            _ ->
                {T,A,lists:reverse(Acc)}
        end,
    parse(Tokens, CTag, Stack, [E|CAcc]);

%% End tag that does not match the open element.
parse([{end_tag,T1,_Args,L1}|Tokens], CTag = {T2,_A,L2}, Stack, Acc) ->
    case tag_type(T1) of
        leaf -> %% ignore
            parse(Tokens, CTag, Stack, Acc);
        node ->
            Msg = lists:flatten(io_lib:format(
                                  "expected '</~p>' on line ~p, start "
                                  "tag at line: ~p", [T2,L1,L2])),
            {error, Msg}
    end;

%% Character data: dropped when it consists of whitespace only.
parse([{data, Data, _Line}|Tokens], CTag, Stack, Acc) ->
    case skip_space(Data, 0) of
        {[], _} ->
            parse(Tokens, CTag, Stack, Acc);
        _ ->
            parse(Tokens, CTag, Stack, [Data|Acc])
    end.
0be3c7e @klacke untabified all of yaws
authored
72 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
73
455578a @vinoski major trailing whitespace cleanup
vinoski authored
%% tag_type(Tag)
%% Classifies a tag name: 'leaf' for the tags listed below, which are
%% parsed without a body, and 'node' for everything else.
tag_type(Tag) ->
    Leaves = [p, hr, input, base, img, '!doctype', meta, link, br],
    case lists:member(Tag, Leaves) of
        true  -> leaf;
        false -> node
    end.
84
0be3c7e @klacke untabified all of yaws
authored
85 %% tokenize(Input, DataAcc, TokenAcc, LineNr)
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
86
%% tokenize(Input, DataAcc, TokenAcc, LineNr)
%%
%% Splits the character list Input into a list of tokens.
%%   DataAcc  - text seen since the last tag, in reverse order
%%   TokenAcc - tokens produced so far, in reverse order
%%   LineNr   - current line number
%% Text between tags becomes {data, Chars, Line}; tags are produced by
%% scan_tag/2 and "<!-- ... -->" comments are skipped.
%% A CR LF pair is counted as ONE line break (it used to be counted as
%% two); a lone CR or LF still counts as one.
tokenize([], [], Tokens, _Line) ->
    lists:reverse(Tokens);
tokenize([], Acc, Tokens, Line) ->
    lists:reverse([{data, lists:reverse(Acc), Line}|Tokens]);
tokenize([$<,$!,$-,$-|R0], Acc, Tokens, L0) ->
    {R1, L1} = skip_comment(R0,L0),
    tokenize(R1, Acc, Tokens, L1);
tokenize([$<|R0], Acc, Tokens, L0) ->
    {Tag,R1,L1} = scan_tag(R0,L0),
    case Acc of
        [] ->
            next_token(Tag, R1, [Tag|Tokens], L1);
        _ ->
            %% Flush the pending text before the tag itself.
            Data = {data,lists:reverse(Acc),L0},
            next_token(Tag, R1, [Tag,Data|Tokens], L1)
    end;
tokenize([$\r,$\n|R0], Acc, Tokens, L) ->
    %% Keep both characters in the data, but advance the line only once.
    tokenize(R0, [$\n,$\r|Acc], Tokens, L+1);
tokenize([C|R0], Acc, Tokens, L) when C =:= $\n; C =:= $\r ->
    tokenize(R0, [C|Acc], Tokens, L+1);
tokenize([C|R0], Acc, Tokens, L) ->
    tokenize(R0, [C|Acc], Tokens, L).
109
0be3c7e @klacke untabified all of yaws
authored
110 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
111
%% next_token(Tag, Rest, Tokens, Line)
%% Continues tokenizing after Tag has been pushed onto Tokens. After a
%% <script> or <style> start tag everything up to the matching end tag
%% is taken verbatim as a single data token; any other tag simply
%% resumes normal tokenizing.
next_token({begin_tag, Name, _, _}, Rest, Tokens, Line)
  when Name =:= script; Name =:= style ->
    {Raw, Rest1, Line1} = scan_endtag(Rest, atom_to_list(Name), Line),
    tokenize(Rest1, [], [{data, Raw, Line}|Tokens], Line1);
next_token(_Tag, Rest, Tokens, Line) ->
    tokenize(Rest, [], Tokens, Line).
120
121 %% '<' <id> <sp>+ [<id><sp>*['='<val>]]* ['/'] '>'
122
%% scan_tag(Input, Line)
%% Scans one tag; Input is the text following the opening '<'.
%% Returns {{Kind, Name, Args, TagLine}, Rest, Line1} where Kind is
%% end_tag when the input starts with '/' and begin_tag otherwise, Name
%% is the lower-cased tag name as an atom and Args is the result of
%% scan_tag_args/2.
scan_tag([$/|I], L) ->
    scan_tag(end_tag, I, L);
scan_tag(I, L) ->
    scan_tag(begin_tag, I, L).

%% Shared body of both tag kinds.
scan_tag(Kind, I, L) ->
    %% The name is scanned from the input AFTER the leading whitespace.
    %% Previously the skipped remainder was discarded, which produced an
    %% empty tag name and counted leading newlines twice.
    {R0,L0} = skip_space(I, L),
    {Name,R1,L1} = scan_tag_name(R0, L0),
    {R2,L2} = skip_space(R1, L1),
    {Args,R3,L3} = scan_tag_args(R2, L2),
    %% NOTE(review): list_to_atom/1 on document content creates atoms
    %% that are never garbage collected; avoid feeding untrusted HTML.
    {{Kind,list_to_atom(lowercase(Name)),Args,L0}, R3, L3}.
135
0be3c7e @klacke untabified all of yaws
authored
136 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
137
%% scan_tag_name(Input, Line)
%% Reads the tag name at the head of Input by scanning a single token.
%% Returns {Name, Rest, Line1}.
scan_tag_name(Input, Line) ->
    scan_token(Input, [], Line).
140
0be3c7e @klacke untabified all of yaws
authored
141 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
142
%% scan_tag_args(Input, Line)
%% Scans the attributes of a tag up to and including the closing '>'.
%% Returns {Args, Rest, Line1}. Each element of Args is either
%% {Name, Value}, with Name a lower-cased atom, for "name=value", or the
%% bare name string for an attribute without a value.
scan_tag_args(Input, Line) ->
    scan_tag_args(Input, [], Line).

scan_tag_args([], Args, Line) ->
    {lists:reverse(Args), [], Line};
scan_tag_args([$>|Rest], Args, Line) ->
    {lists:reverse(Args), Rest, Line};
scan_tag_args([$<|_] = Rest, Args, Line) ->
    %% Bad html: a new tag starts before this one was closed.
    {lists:reverse(Args), Rest, Line};
scan_tag_args(Input, Args, Line) ->
    {Name, AfterName, NameLine} = scan_value(Input, Line),
    case skip_space(AfterName, NameLine) of
        {[$=|AfterEq], EqLine} ->
            {ValueInput, ValueLine} = skip_space(AfterEq, EqLine),
            {Value, AfterValue, EndLine} = scan_value(ValueInput, ValueLine),
            {Next, NextLine} = skip_space(AfterValue, EndLine),
            Key = list_to_atom(lowercase(Name)),
            scan_tag_args(Next, [{Key,Value}|Args], NextLine);
        {Next, NextLine} ->
            scan_tag_args(Next, [Name|Args], NextLine)
    end.
165
0be3c7e @klacke untabified all of yaws
authored
166 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
167
%% scan_value(Input, Line)
%% Scans one attribute name or value. Input starting with a single or
%% double quote is read up to the matching quote; anything else is read
%% as a plain token. Returns {Chars, Rest, Line1}.
scan_value([Quote|Rest], Line) when Quote =:= $"; Quote =:= $' ->
    scan_quote(Rest, [], Quote, Line);
scan_value(Input, Line) ->
    scan_token(Input, [], Line).
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
174
0be3c7e @klacke untabified all of yaws
authored
175 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
176
%% scan_token(Input, Acc, Line)
%% Collects characters into Acc (reversed) until a delimiter is found.
%% Returns {Token, Rest, Line1}.
%%   - '>', '<' and '=' end the token and are left in Rest.
%%   - A space or tab ends the token and is consumed.
%%   - A line break ends the token, is consumed and advances the line.
%%     CR LF is consumed as ONE line break; previously the CR ended the
%%     token and the LF left in Rest was counted again by the caller.
scan_token([], Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_token([C|_] = R, Acc, L) when C =:= $>; C =:= $<; C =:= $= ->
    {lists:reverse(Acc), R, L};
scan_token([$\r,$\n|R], Acc, L) ->
    {lists:reverse(Acc), R, L+1};
scan_token([C|R], Acc, L0) ->
    case char_class(C) of
        space ->
            {lists:reverse(Acc), R, L0};
        nl ->
            {lists:reverse(Acc), R, L0+1};
        _ ->
            scan_token(R, [C|Acc], L0)
    end.
194
0be3c7e @klacke untabified all of yaws
authored
195 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
196
%% scan_quote(Input, Acc, Quote, Line)
%% Collects characters into Acc (reversed) up to the closing Quote
%% character, which is consumed but not included. Returns
%% {Chars, Rest, Line1}; at end of input the text collected so far is
%% returned with an empty Rest.
%% Line breaks inside the quoted text are kept. CR LF advances the line
%% count by one (it used to advance it by two).
scan_quote([], Acc, _Q, L) ->
    {lists:reverse(Acc), [], L};
scan_quote([Q|R], Acc, Q, L) ->
    {lists:reverse(Acc), R, L};
scan_quote([$\r,$\n|R], Acc, Q, L) ->
    scan_quote(R, [$\n,$\r|Acc], Q, L+1);
scan_quote([C|R], Acc, Q, L) when C =:= $\n; C =:= $\r ->
    scan_quote(R, [C|Acc], Q, L+1);
scan_quote([C|R], Acc, Q, L) ->
    scan_quote(R, [C|Acc], Q, L).
0be3c7e @klacke untabified all of yaws
authored
207
208 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
209
%% scan_endtag(Input, Tag, Line)
%% Collects raw text up to, but not including, the end tag "</Tag>".
%% Tag is a lower-case string; the comparison ignores case and allows
%% whitespace before the closing '>'. Returns {Data, Rest, Line1},
%% where Rest starts at the "</" of the end tag, or is [] when the
%% input ends first.
scan_endtag(R, Tag, L) ->
    scan_endtag(R, Tag, [], L).

scan_endtag([], _Tag, Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_endtag(R=[$<,$/|R0], Tag, Acc, L0) ->
    %% A "</" that does not start the awaited end tag is ordinary data
    %% and must be kept; it used to be silently dropped, corrupting
    %% e.g. script text that contains "</b>".
    case casecmp(Tag, R0) of
        {true, R1} ->
            case skip_space(R1, L0) of
                {[$>|_], _} ->
                    {lists:reverse(Acc), R, L0};
                _ ->
                    scan_endtag(R0, Tag, [$/,$<|Acc], L0)
            end;
        false ->
            scan_endtag(R0, Tag, [$/,$<|Acc], L0)
    end;
scan_endtag([$\r,$\n|R], Tag, Acc, L) ->
    %% CR LF is one line break, not two.
    scan_endtag(R, Tag, [$\n,$\r|Acc], L+1);
scan_endtag([C|R], Tag, Acc, L) when C =:= $\n; C =:= $\r ->
    scan_endtag(R, Tag, [C|Acc], L+1);
scan_endtag([C|R], Tag, Acc, L) ->
    scan_endtag(R, Tag, [C|Acc], L).
233
0be3c7e @klacke untabified all of yaws
authored
234 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
235
%% casecmp(Prefix, Input)
%% Checks whether Input starts with Prefix, ignoring the case of the
%% characters in Input (Prefix is expected to be lower case already).
%% Returns {true, Rest} with the input following the prefix, or false.
casecmp([], R) -> {true, R};
%% Input ended before the whole prefix was seen: not a match. Without
%% this clause a truncated document crashed with function_clause.
casecmp([_|_], []) -> false;
casecmp([C1|T1], [C2|T2]) ->
    case lowercase_ch(C2) of
        C1 -> casecmp(T1,T2);
        _ -> false
    end.
242
0be3c7e @klacke untabified all of yaws
authored
243 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
244
%% char_class(Char)
%% Classifies a character as nl (LF or CR), space (blank or tab),
%% alpha (a-z, A-Z), digit (0-9) or other.
char_class(C) when C =:= $\n; C =:= $\r ->
    nl;
char_class(C) when C =:= $\s; C =:= $\t ->
    space;
char_class(C) when C >= $a, C =< $z; C >= $A, C =< $Z ->
    alpha;
char_class(C) when C >= $0, C =< $9 ->
    digit;
char_class(_Other) ->
    other.
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
253
0be3c7e @klacke untabified all of yaws
authored
254 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
255
%% skip_space(Input, Line)
%% Drops leading blanks, tabs and line breaks from Input.
%% Returns {Rest, Line1} where Line1 is Line advanced by the number of
%% line breaks skipped. CR LF counts as ONE line break (it used to be
%% counted as two); a lone CR or LF counts as one.
skip_space([], L) ->
    {[], L};
skip_space([$\r,$\n|R0], L) ->
    skip_space(R0, L+1);
skip_space(R = [C|R0], L) ->
    case char_class(C) of
        nl ->
            skip_space(R0, L+1);
        space ->
            skip_space(R0, L);
        _ ->
            {R, L}
    end.
267
0be3c7e @klacke untabified all of yaws
authored
268 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
269
%% skip_comment(Input, Line)
%% Skips the remainder of a comment, i.e. everything up to and including
%% the first "-->". Returns {Rest, Line1}; Rest is [] when the input
%% ends before the comment is closed.
%% CR LF counts as ONE line break (it used to be counted as two).
skip_comment([], L) -> {[], L};
skip_comment([$-,$-,$>|R],L) -> {R,L};
skip_comment([$\r,$\n|R],L) -> skip_comment(R,L+1);
skip_comment([C|R],L) when C =:= $\n; C =:= $\r -> skip_comment(R,L+1);
skip_comment([_C|R],L) -> skip_comment(R,L).
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
275
0be3c7e @klacke untabified all of yaws
authored
276 %%
ac01b59 Added html->ehtml parser.
Johan Bevemyr authored
277
%% lowercase(Str)
%% Returns Str with every character passed through lowercase_ch/1.
lowercase(Str) ->
    lists:map(fun lowercase_ch/1, Str).
280
%% lowercase_ch(Char)
%% Maps the letters A-Z to a-z; every other value is returned as is.
lowercase_ch(C) when $A =< C, C =< $Z ->
    C + ($a - $A);
lowercase_ch(C) ->
    C.
Something went wrong with that request. Please try again.