Skip to content

Commit

Permalink
now <img and <a are converted to absolute urls
Browse files Browse the repository at this point in the history
  • Loading branch information
ivkosh committed Jan 24, 2011
1 parent 53afafd commit a9978cf
Showing 1 changed file with 2 additions and 11 deletions.
13 changes: 2 additions & 11 deletions src/t.erl
Expand Up @@ -112,23 +112,14 @@ fetch_page(Url) ->
io_lib:format(<<"<html><head><title>Error</title></head><body>Error: cannot fetch ~s ~p</body></html>">>, [Url, ErrVal])
end.

% U is a binary!
to_abs_url({<<"src">>, U}, Ctx) -> {<<"src">>, list_to_binary(full_url(Ctx, binary_to_list(U)))};
to_abs_url({<<"href">>, U}, Ctx) -> {<<"href">>, list_to_binary(full_url(Ctx, binary_to_list(U)))};
to_abs_url(A, _) -> A.

simplify_page(Url) ->
Body = fetch_page(Url), % FIXME: делать в отдельном процессе и слать сообщение по завершению
Ctx = url_context(Url),
Body = fetch_page(Url), % TODO: делать в отдельном процессе и слать сообщение по завершению
try mochiweb_html:parse(Body) of % не сработает если в файле нет ни одного тэга html
TreeOrig ->
TitleStr = get_title(TreeOrig),
{_, _, TreeBody} = find_first_el(<<"body">>, TreeOrig),
% ??? Every html has <body> or not? what if html is mailformed?
TreeBodyClean = clean_html_tree(TreeBody),
% превращаем относительные url в абсолютные
TreeBodyWithImg = repl_el_attr_f(<<"img">>, fun(L) -> [ to_abs_url(El, Ctx) || El <- L ] end, TreeBodyClean),
TreeBodyWithA = repl_el_attr_f(<<"a">>, fun(L) -> [ to_abs_url(El, Ctx) || El <- L ] end, TreeBodyWithImg),
TreeOut2 = {
<<"html">>, [], [
{
Expand All @@ -137,7 +128,7 @@ simplify_page(Url) ->
{
<<"body">>, [],
[ {<<"h1">>, [], TitleStr} ] ++ % можно и убрать
TreeBodyClean
TreeBodyWithA
}
]
},
Expand Down

0 comments on commit a9978cf

Please sign in to comment.