From 5631e9c4bfaef0f7af55175c8b1c830b3225df85 Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Tue, 1 Feb 2011 10:54:39 +0300 Subject: [PATCH 01/32] or changed to orelse in fold --- src/rdbl.erl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/rdbl.erl b/src/rdbl.erl index 5bfde4b..86df918 100644 --- a/src/rdbl.erl +++ b/src/rdbl.erl @@ -395,14 +395,20 @@ get_score(Node) -> score_by_class_or_id({_, _, Attrs, _})-> score_by_class_or_id(Attrs); score_by_class_or_id([]) -> 0; score_by_class_or_id(Attrs=[_|_]) -> - AttrVals = [ V || {K, V} <- [Attrs], (K == <<"id">>) or (K == <<"class">>) ], + AttrVals = [ V || {K, V} <- [Attrs], (K == <<"id">>) orelse (K == <<"class">>) ], if AttrVals == [] -> 0; % no id or class (list is empty) true -> % e.g. we have id or class or both - case lists:foldl(fun(El, Acc) -> Acc or (re:run(El, ?RE_NEGATIVE, [{capture, none}]) == match) end, false, AttrVals) of + case lists:foldl( + fun(El, Acc) -> + Acc orelse (re:run(El, ?RE_NEGATIVE, [{capture, none}]) == match) + end, false, AttrVals) of true -> -50; false -> - case lists:foldl(fun(El, Acc) -> Acc or (re:run(El, ?RE_POSITIVE, [{capture, none}]) == match) end, false, AttrVals) of + case lists:foldl( + fun(El, Acc) -> + Acc orelse (re:run(El, ?RE_POSITIVE, [{capture, none}]) == match) + end, false, AttrVals) of true -> 25; false -> 0 end From 56ef4283c6707543ac3b1ce8d6e2b3651c5361c1 Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Tue, 1 Feb 2011 12:55:43 +0300 Subject: [PATCH 02/32] @spec of replace_node changed --- src/rdbl.erl | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/rdbl.erl b/src/rdbl.erl index 86df918..4107a71 100644 --- a/src/rdbl.erl +++ b/src/rdbl.erl @@ -13,7 +13,7 @@ -define(DEBUG, 1). -ifdef(DEBUG). --export([find_node/2, find_all_nodes/2, remove_node/2, replace_node/3, replace_node/4]). 
+-export([find_node/2, find_all_nodes/2, remove_node/2, replace_node/2, replace_node/3]). -export([fetch_page/1, simplify_page/3]). -export([brbr_to_p/1, count_commas/1, clean_html_tree/1]). -export([init_scores/1, clean_scores/1, modify_score/3, get_score/1, get_ref/1, get_parent_ref/1, score_tree/1, score_list/1]). @@ -117,8 +117,8 @@ simplify_page(Body, Ctx, DefaultContentType) -> init_scores( % converting urls in and to absolute urls % TODO: ! rewrite replace node to work with list of tags (to pass once on tree) - replace_node(<<"a">>, <<"a">>, fun(L) -> [ to_abs_url(El, Ctx) || El <- L ] end, - replace_node(<<"img">>, <<"img">>, fun(L) -> [ to_abs_url(El, Ctx) || El <- L ] end, + replace_node({<<"a">>, <<"a">>}, fun(L) -> [ to_abs_url(El, Ctx) || El <- L ] end, + replace_node({<<"img">>, <<"img">>}, fun(L) -> [ to_abs_url(El, Ctx) || El <- L ] end, clean_html_tree({<<"div">>, [], TreeBody}))) % converting body to div ) ), @@ -248,23 +248,33 @@ remove_node(Key, {E, S, A, R}) -> {E, S, A, remove_node(Key, R)}; % continue to remove_node(_, []) -> []; remove_node(Key, [H|T]) -> [remove_node(Key, H) | remove_node(Key, T)]. % processing list -%% @spec replace_node(binary(), binary(), fun( [html_attr()] ) -> [html_attr()], html_node() | scored_html_node()) -> html_node() | scored_html_node(). +%% @spec replace_node({binary(), binary()}|[{binary(), binary()}], fun( [html_attr()] ) -> [html_attr()], html_node() | scored_html_node()) -> html_node() | scored_html_node(). %% @doc HTML tag & attribute replacer. +%% @doc First parameter is tuple of two binaries: {Key, NewKey} or list of such tuples to replace +%% @doc multiple keys at once in one run on html tree. %% @doc Func is used to transform list of tag attributes: fun(AttrList) -> ModifiedAttrList %% @doc if Func is omitted, F(L)->L end is used, e.g. list of attrs will be not modified at all. 
%% %% @doc example: replace_node(<<"br">>, <<"p">>, HtmlTree) -> HtmlTreeWithBrReplacedToP %% @doc example: replace_node(<<"br">>, <<"br">>, fun(L)->TransformedL end, HtmlTree) -> HtmlTreeWithBrReplacedToP -replace_node(Key, NewKey, NodeIn) -> replace_node(Key, NewKey, fun(L)->L end, NodeIn). +replace_node({Key, NewKey}, NodeIn) -> replace_node({Key, NewKey}, fun(L)->L end, NodeIn). % -replace_node(_K, _NK, _Func, NodeIn) when is_binary(NodeIn) -> NodeIn; -replace_node(_K, _NK, _Func, {comment, _}) -> []; % dropping comments -replace_node(Key, NewKey, Func, {Key, Attr, Rest}) -> {NewKey, Func(Attr), replace_node(Key, NewKey, Func, Rest)}; % Key found changing and processing subtree -replace_node(Key, NewKey, Func, {Key, S, Attr, Rest}) -> {NewKey, S, Func(Attr), replace_node(Key, NewKey, Func, Rest)}; % Key found changing and processing subtree -replace_node(Key, NewKey, Func, {E, A, R}) -> {E, A, replace_node(Key, NewKey, Func, R)}; % continue to subtree -replace_node(Key, NewKey, Func, {E, S, A, R}) -> {E, S, A, replace_node(Key, NewKey, Func, R)}; % continue to subtree -replace_node(_, _, _, []) -> []; -replace_node(Key, NewKey, Func, [H|T]) -> [replace_node(Key, NewKey, Func, H) | replace_node(Key, NewKey, Func, T)]. 
% processing list recursively +replace_node({_K, _NK}, _Func, NodeIn) when is_binary(NodeIn) -> NodeIn; +replace_node({_K, _NK}, _Func, {comment, _}) -> []; % dropping comments +replace_node({Key, NewKey}, Func, {Key, Attr, Rest}) when is_binary(Key) -> {NewKey, Func(Attr), replace_node({Key, NewKey}, Func, Rest)}; % Key found changing and processing subtree +replace_node({Key, NewKey}, Func, {Key, S, Attr, Rest}) when is_binary(Key) -> {NewKey, S, Func(Attr), replace_node({Key, NewKey}, Func, Rest)}; % Key found changing and processing subtree + + +%%%%replace_node(KeyList=[_|_], _NewKey, Func, {Key, Attr, Rest}) when is_binary(Key) -> {NewKey, Func(Attr), replace_node(Key, NewKey, Func, Rest)}; % Key found changing and processing subtree + +%replace_node({Key, + + +%!!!! +replace_node({Key, NewKey}, Func, {E, A, R}) -> {E, A, replace_node({Key, NewKey}, Func, R)}; % continue to subtree +replace_node({Key, NewKey}, Func, {E, S, A, R}) -> {E, S, A, replace_node({Key, NewKey}, Func, R)}; % continue to subtree +replace_node({_, _}, _, []) -> []; +replace_node({Key, NewKey}, Func, [H|T]) -> [replace_node({Key, NewKey}, Func, H) | replace_node({Key, NewKey}, Func, T)]. % processing list recursively %% @spec brbr_to_p(html_node() | scored_html_node()) -> html_node() | scored_html_node() %% @doc replaces more than 2
<br>s in row with <p>

From 3a304004b59866d6a3d742d71b888a230c5b773f Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Tue, 1 Feb 2011 20:47:43 +0300 Subject: [PATCH 03/32] replace_node now can work with list of keys and process multiple tag replacements at once in one walk of the tree --- src/rdbl.erl | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/rdbl.erl b/src/rdbl.erl index 4107a71..56c1dbb 100644 --- a/src/rdbl.erl +++ b/src/rdbl.erl @@ -116,10 +116,10 @@ simplify_page(Body, Ctx, DefaultContentType) -> ScoredTree = score_tree( init_scores( % converting urls in and to absolute urls - % TODO: ! rewrite replace node to work with list of tags (to pass once on tree) - replace_node({<<"a">>, <<"a">>}, fun(L) -> [ to_abs_url(El, Ctx) || El <- L ] end, - replace_node({<<"img">>, <<"img">>}, fun(L) -> [ to_abs_url(El, Ctx) || El <- L ] end, - clean_html_tree({<<"div">>, [], TreeBody}))) % converting body to div + replace_node( + [{<<"a">>, <<"a">>}, {<<"img">>, <<"img">>}], fun(L) -> [ to_abs_url(El, Ctx) || El <- L ] end, + clean_html_tree({<<"div">>, [], TreeBody}) % converting body to div + ) ) ), OptimumRef = get_max_score_ref(ScoredTree), @@ -261,16 +261,18 @@ replace_node({Key, NewKey}, NodeIn) -> replace_node({Key, NewKey}, fun(L)->L end % replace_node({_K, _NK}, _Func, NodeIn) when is_binary(NodeIn) -> NodeIn; replace_node({_K, _NK}, _Func, {comment, _}) -> []; % dropping comments -replace_node({Key, NewKey}, Func, {Key, Attr, Rest}) when is_binary(Key) -> {NewKey, Func(Attr), replace_node({Key, NewKey}, Func, Rest)}; % Key found changing and processing subtree -replace_node({Key, NewKey}, Func, {Key, S, Attr, Rest}) when is_binary(Key) -> {NewKey, S, Func(Attr), replace_node({Key, NewKey}, Func, Rest)}; % Key found changing and processing subtree - - -%%%%replace_node(KeyList=[_|_], _NewKey, Func, {Key, Attr, Rest}) when is_binary(Key) -> {NewKey, Func(Attr), replace_node(Key, NewKey, Func, Rest)}; % Key found changing and 
processing subtree - -%replace_node({Key, - - -%!!!! +replace_node({Key, NewKey}, Func, {Key, Attr, Rest}) -> {NewKey, Func(Attr), replace_node({Key, NewKey}, Func, Rest)}; % Key found changing and processing subtree +replace_node({Key, NewKey}, Func, {Key, S, Attr, Rest}) -> {NewKey, S, Func(Attr), replace_node({Key, NewKey}, Func, Rest)}; % Key found changing and processing subtree +replace_node(KeyList=[_|_], Func, {Key, Attr, Rest}) -> % case when KeyList is list - changing multiple keys in one walk of tree + case lists:keyfind(Key, 1, KeyList) of + {Key, NewKey} -> {NewKey, Func(Attr), replace_node(KeyList, Func, Rest)}; % Key found changing and processing subtree + false -> {Key, Attr, replace_node(KeyList, Func, Rest)} + end; +replace_node(KeyList=[_|_], Func, {Key, S, Attr, Rest}) -> + case lists:keyfind(Key, 1, KeyList) of + {Key, NewKey} -> {NewKey, S, Func(Attr), replace_node(KeyList, Func, Rest)}; % Key found changing and processing subtree + false -> {Key, S, Attr, replace_node(KeyList, Func, Rest)} + end; replace_node({Key, NewKey}, Func, {E, A, R}) -> {E, A, replace_node({Key, NewKey}, Func, R)}; % continue to subtree replace_node({Key, NewKey}, Func, {E, S, A, R}) -> {E, S, A, replace_node({Key, NewKey}, Func, R)}; % continue to subtree replace_node({_, _}, _, []) -> []; From 603308e16ee72185cc5ad3c6ebc8232a2c1b2c07 Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Tue, 1 Feb 2011 20:58:31 +0300 Subject: [PATCH 04/32] fix --- src/rdbl.erl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/rdbl.erl b/src/rdbl.erl index 56c1dbb..74dd315 100644 --- a/src/rdbl.erl +++ b/src/rdbl.erl @@ -257,12 +257,14 @@ remove_node(Key, [H|T]) -> [remove_node(Key, H) | remove_node(Key, T)]. 
% proces %% %% @doc example: replace_node(<<"br">>, <<"p">>, HtmlTree) -> HtmlTreeWithBrReplacedToP %% @doc example: replace_node(<<"br">>, <<"br">>, fun(L)->TransformedL end, HtmlTree) -> HtmlTreeWithBrReplacedToP -replace_node({Key, NewKey}, NodeIn) -> replace_node({Key, NewKey}, fun(L)->L end, NodeIn). +replace_node(Ks, NodeIn) -> replace_node(Ks, fun(L)->L end, NodeIn). % -replace_node({_K, _NK}, _Func, NodeIn) when is_binary(NodeIn) -> NodeIn; -replace_node({_K, _NK}, _Func, {comment, _}) -> []; % dropping comments +replace_node(_Ks, _Func, NodeIn) when is_binary(NodeIn) -> NodeIn; +replace_node(_Ks, _Func, {comment, _}) -> []; % dropping comments replace_node({Key, NewKey}, Func, {Key, Attr, Rest}) -> {NewKey, Func(Attr), replace_node({Key, NewKey}, Func, Rest)}; % Key found changing and processing subtree replace_node({Key, NewKey}, Func, {Key, S, Attr, Rest}) -> {NewKey, S, Func(Attr), replace_node({Key, NewKey}, Func, Rest)}; % Key found changing and processing subtree +replace_node({Key, NewKey}, Func, {E, A, R}) -> {E, A, replace_node({Key, NewKey}, Func, R)}; % continue to subtree +replace_node({Key, NewKey}, Func, {E, S, A, R}) -> {E, S, A, replace_node({Key, NewKey}, Func, R)}; % continue to subtree replace_node(KeyList=[_|_], Func, {Key, Attr, Rest}) -> % case when KeyList is list - changing multiple keys in one walk of tree case lists:keyfind(Key, 1, KeyList) of {Key, NewKey} -> {NewKey, Func(Attr), replace_node(KeyList, Func, Rest)}; % Key found changing and processing subtree @@ -273,10 +275,8 @@ replace_node(KeyList=[_|_], Func, {Key, S, Attr, Rest}) -> {Key, NewKey} -> {NewKey, S, Func(Attr), replace_node(KeyList, Func, Rest)}; % Key found changing and processing subtree false -> {Key, S, Attr, replace_node(KeyList, Func, Rest)} end; -replace_node({Key, NewKey}, Func, {E, A, R}) -> {E, A, replace_node({Key, NewKey}, Func, R)}; % continue to subtree -replace_node({Key, NewKey}, Func, {E, S, A, R}) -> {E, S, A, replace_node({Key, NewKey}, Func, 
R)}; % continue to subtree -replace_node({_, _}, _, []) -> []; -replace_node({Key, NewKey}, Func, [H|T]) -> [replace_node({Key, NewKey}, Func, H) | replace_node({Key, NewKey}, Func, T)]. % processing list recursively +replace_node(_, _, []) -> []; +replace_node(Ks, Func, [H|T]) -> [replace_node(Ks, Func, H) | replace_node(Ks, Func, T)]. % processing list recursively %% @spec brbr_to_p(html_node() | scored_html_node()) -> html_node() | scored_html_node() %% @doc replaces more than 2
<br>s in row with <p>

From ab1b859a8362abf21beebaec50c7665b6dae38a9 Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Fri, 18 Feb 2011 09:29:03 +0300 Subject: [PATCH 05/32] comments changed --- src/rdbl.erl | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/rdbl.erl b/src/rdbl.erl index 74dd315..3f0aa32 100644 --- a/src/rdbl.erl +++ b/src/rdbl.erl @@ -20,7 +20,7 @@ -export([full_url/2, url_context/1]). -endif. -%% @type score(): +%% type score(): %% Keeps readability score and additional references for every HTML tree element -record(score, { ref, @@ -36,7 +36,7 @@ % or maybe commas are not counted??? %% -%% @type scored_html_node() = {string(), score(), [html_attr()], [html_node() | string()]} +%% type scored_html_node() = {string(), score(), [html_attr()], [html_node() | string()]} %% %% See definitions of html_node() and html_attr() in mochiweb_html.erl @@ -48,7 +48,7 @@ %% @spec simplify_url(string()) -> string() %% @doc fetches url, simplifies its content and returns simplified page as a string() -%% @doc example: simplify_url("http://news.yandex.ru/") -> SimplifiedPageText +%% example: simplify_url("http://news.yandex.ru/") -> SimplifiedPageText simplify_url(Url) -> {ContentType, Body} = fetch_page(Url), % TODO: делать в отдельном процессе и слать сообщение по завершению Ctx = url_context(Url), @@ -56,7 +56,7 @@ simplify_url(Url) -> %% @spec simplify_url(string(), string()) -> ok %% @doc fetches url, simplifies its content and saves to file -%% @doc example: simplify_url("http://news.yandex.ru/", "out.html") +%% example: simplify_url("http://news.yandex.ru/", "out.html") simplify_url(Url, FileName) -> Page = simplify_url(Url), {ok, F} = file:open(FileName, [binary, write]), % TODO: check if file open @@ -66,7 +66,7 @@ simplify_url(Url, FileName) -> %% @spec simplify_file(string(), string()) -> ok %% @doc reads file from disk, simplifies its content and saves to file -%% @doc example: simplify_url("index.html", 
"out.html") +%% example: simplify_url("index.html", "out.html") simplify_file(FileNameIn, FileNameOut) -> {ok, Html} = file:read_file(FileNameIn),% TODO: check errors Page = simplify_page(Html), @@ -103,8 +103,8 @@ extract_content_type(Tree, DefaultContentType) -> %% @spec simplify_page(string(), {string(), string()}, string()) -> string() %% @doc main function -%% @doc takes document contens (Body) and document context (Ctx, see url_context/1) -%% @doc returns simplified page as a string() +%% takes document contens (Body) and document context (Ctx, see url_context/1) +%% returns simplified page as a string() simplify_page(Body, Ctx, DefaultContentType) -> try mochiweb_html:parse(Body) of % parse() will not work if Body contains no html tags TreeOrig -> @@ -178,11 +178,11 @@ find_node(Key, HtmlNode) when is_binary(Key) -> find_node_bykey(Key, HtmlNode %% @doc the same as find_node(), but returns all nodes with specific tag as a list find_all_nodes(Key, HtmlNode) when is_binary(Key) -> find_node_bykey(Key, HtmlNode, multi). 
-%% @spec find_node_by_key(binary(), html_node() | scored_html_node(), first | multi) -> html_node() | scored_html_node() | [html_node() | scored_html_node()] +%% @spec find_node_bykey(binary(), html_node() | scored_html_node(), first | multi) -> html_node() | scored_html_node() | [html_node() | scored_html_node()] %% @doc helper function for find_node() and find_all_nodes() -%% @doc returns: -%% @doc - first element found as html_node() | scored_html_node() if SearchType != multi -%% @doc - list of all found elements as [html_node() | scored_html_node()] if SearchType == multi +%% returns: +%% - first element found as html_node() | scored_html_node() if SearchType != multi +%% - list of all found elements as [html_node() | scored_html_node()] if SearchType == multi find_node_bykey(_, HtmlNode, _) when is_binary(HtmlNode) -> []; % don't searching for leafs find_node_bykey(_, {comment, _}, _) -> []; % comments in mochiweb_html:parse are 2-element tuples, dropping them find_node_bykey(Key, Elem, SearchType) when is_tuple(Elem) -> % Element found @@ -214,7 +214,7 @@ find_node_bykey(Key, [H|T], SearchType) -> find_node_bykey(Key, T, SearchType) end. -%% @spec find_node_by_ref(reference(), scored_html_node()) -> scored_html_node() +%% @spec find_node_byref(reference(), scored_html_node()) -> scored_html_node() %% @doc helper function for find_node() find_node_byref(_Ref, HtmlNode) when is_binary(HtmlNode) -> []; % leaf is not an option find_node_byref(_, {comment, _}) -> []; % comment is not an option @@ -232,7 +232,7 @@ find_node_byref(Ref, [H|T]) -> % walk list if it is not empty %% @spec remove_node(binary() | [binary()], html_node() | scored_html_node()) -> html_node() | scored_html_node() %% @doc HTML tag remover. 
Removes from node all subtrees with Key and returns cleaned html_node() | scored_html_node() -%% @doc example: remove_node(<<"script">>, HtmlTree) -> HtmlTreeWithoutScripts +%% example: remove_node(<<"script">>, HtmlTree) -> HtmlTreeWithoutScripts % if Key is a list - removing all list elements from tree remove_node([], HtmlTree) -> HtmlTree; % TODO: неэфективно - дерево пробегается столько раз, какова длина списка ключей. @@ -250,13 +250,13 @@ remove_node(Key, [H|T]) -> [remove_node(Key, H) | remove_node(Key, T)]. % proces %% @spec replace_node({binary(), binary()}|[{binary(), binary()}], fun( [html_attr()] ) -> [html_attr()], html_node() | scored_html_node()) -> html_node() | scored_html_node(). %% @doc HTML tag & attribute replacer. -%% @doc First parameter is tuple of two binaries: {Key, NewKey} or list of such tuples to replace -%% @doc multiple keys at once in one run on html tree. -%% @doc Func is used to transform list of tag attributes: fun(AttrList) -> ModifiedAttrList -%% @doc if Func is omitted, F(L)->L end is used, e.g. list of attrs will be not modified at all. +%% First parameter is tuple of two binaries: {Key, NewKey} or list of such tuples to replace +%% multiple keys at once in one run on html tree. +%% Func is used to transform list of tag attributes: fun(AttrList) -> ModifiedAttrList +%% if Func is omitted, F(L)->L end is used, e.g. list of attrs will be not modified at all. %% -%% @doc example: replace_node(<<"br">>, <<"p">>, HtmlTree) -> HtmlTreeWithBrReplacedToP -%% @doc example: replace_node(<<"br">>, <<"br">>, fun(L)->TransformedL end, HtmlTree) -> HtmlTreeWithBrReplacedToP +%% example: replace_node(<<"br">>, <<"p">>, HtmlTree) -> HtmlTreeWithBrReplacedToP +%% example: replace_node(<<"br">>, <<"br">>, fun(L)->TransformedL end, HtmlTree) -> HtmlTreeWithBrReplacedToP replace_node(Ks, NodeIn) -> replace_node(Ks, fun(L)->L end, NodeIn). 
% replace_node(_Ks, _Func, NodeIn) when is_binary(NodeIn) -> NodeIn; @@ -308,8 +308,8 @@ count_commas(Leaf) when is_binary(Leaf) -> lists:foldl(fun(E, S) -> if E == $, - %% @spec init_scores(html_node()) -> scored_html_node() %% @doc transforms html_node() to scored_html_node(). It now has 4 elements in tuple (not 3 as in mochiweb_html type), -%% @doc the second element in tuple is Score - record of #score, containing readability score, current element ref -%% @doc and ref to parent of current element (see -record(score, ...) below) +%% the second element in tuple is Score - record of #score, containing readability score, current element ref +%% and ref to parent of current element (see -record(score, ...) below) init_scores(Tree) -> init_scores(Tree, make_ref()). % adding reference for topmost element too % init_scores(R, _) when is_binary(R) -> R; @@ -330,7 +330,7 @@ clean_scores([H|T]) -> [clean_scores(H) | clean_scores(T)]. %% @spec modify_score(reference(), scored_html_node(), int()) -> scored_html_node() %% @doc modify readability score for specific element on scored tree. Score is added to current node score -%% @doc e.g. int() is ScoreDiff, to subtract score for element pass negative int() +%% e.g. 
int() is ScoreDiff, to subtract score for element pass negative int() modify_score(_, Leaf, _) when is_binary(Leaf) -> Leaf; modify_score(_, {comment, _}, _) -> []; modify_score(Ref, {Key, #score{ref=Ref, readability=Rdbl, parent=ParentRef}, A, R}, Score) -> {Key, #score{ref=Ref, readability=Rdbl+Score, parent=ParentRef}, A, R}; @@ -471,7 +471,7 @@ get_max_score_ref(Tree) -> %% @spec score_list(scored_html_node()) -> [{reference(), int()}] %% @doc builds list of pairs {Node_Ref, Node_Score} from html tree -%% @doc helper function for get_max_score_ref() +%% helper function for get_max_score_ref() score_list(HtmlNode) when is_binary(HtmlNode) -> []; % leaf score_list({comment, _}) -> []; score_list({_, #score{readability=Rdbl, ref=Ref}, _, R}) -> [{Ref, Rdbl} | score_list(R)]; @@ -481,7 +481,7 @@ score_list([H|T]) -> score_list(H) ++ score_list(T). %% @spec url_context(string()) -> {string(), string()} %% @doc returns the domain, and current context path. -%% @doc example: url_context("http://www.some.domain.com/content/index.html) -> {"http://www.some.domain.com", "/content"} +%% example: url_context("http://www.some.domain.com/content/index.html) -> {"http://www.some.domain.com", "/content"} url_context(URL) -> {Proto, _, Root, _Port, Path, _Query} = http_uri:parse(URL), Ctx = string:sub_string(Path, 1, string:rstr(Path,"/")), From e54c94b535749c98ecd637aec7d05c8ee48bc62a Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Fri, 18 Feb 2011 10:02:53 +0300 Subject: [PATCH 06/32] score tree rewritten (prepared to parallelization) --- src/rdbl.erl | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/rdbl.erl b/src/rdbl.erl index 3f0aa32..dabaf4b 100644 --- a/src/rdbl.erl +++ b/src/rdbl.erl @@ -20,7 +20,7 @@ -export([full_url/2, url_context/1]). -endif. 
-%% type score(): +%% @type score(): %% Keeps readability score and additional references for every HTML tree element -record(score, { ref, @@ -36,7 +36,7 @@ % or maybe commas are not counted??? %% -%% type scored_html_node() = {string(), score(), [html_attr()], [html_node() | string()]} +%% @type scored_html_node() = {string(), score(), [html_attr()], [html_node() | string()]} %% %% See definitions of html_node() and html_attr() in mochiweb_html.erl @@ -430,6 +430,25 @@ score_by_class_or_id(Attrs=[_|_]) -> %% @spec score_tree(scored_html_node()) -> scored_html_node() %% @doc score whole html tree depending on its contents score_tree(Tree) -> % TODO: do score_tree in parallel (multiplie processes, map+reduce) + Paragraphs = find_all_nodes(<<"p">>, Tree), + Map1 = [ {1, get_parent_ref(P)} || P <- Paragraphs ], % список вида {1, Parent} для каждого P (пары могут повторяться) + UniqParents = lists:foldl( % building list of unique parent refs for all

's + fun(P, ParentRefList) -> + ParentRef = get_parent_ref(P), + case lists:member(ParentRef, ParentRefList) of + true -> ParentRefList; + false -> [ParentRef | ParentRefList] + end + end, [], Paragraphs), + % replace with MAP phase in parallel !!! + Map2 = lists:map(fun(P_ref) -> Parent = find_node(P_ref, Tree), {score_by_class_or_id(Parent)+count_commas(Parent), P_ref} end, UniqParents), + lists:foldl( % TODO: REDUCE phaze here + fun({Score, P_ref}, TreeAcc) -> + modify_score(P_ref, TreeAcc, Score) + end, Tree, Map1++Map2). + + +score_tree1(Tree) -> % TODO: do score_tree in parallel (multiplie processes, map+reduce) Paragraphs = find_all_nodes(<<"p">>, Tree), Tree1 = lists:foldl( fun(P, TreeAcc) -> From bbadf6c9444214b766d29aa0c52a85da2296e0e0 Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Fri, 18 Feb 2011 10:45:00 +0300 Subject: [PATCH 07/32] score_tree parallelized (map is done in parallel). not such effective way - elements are gathered in the same order, as they were processed; so gather() function may wait too long. Should be rewritten for map/reduce without order (order is not important here). --- src/rdbl.erl | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/src/rdbl.erl b/src/rdbl.erl index dabaf4b..a00cc4a 100644 --- a/src/rdbl.erl +++ b/src/rdbl.erl @@ -441,36 +441,32 @@ score_tree(Tree) -> % TODO: do score_tree in parallel (multiplie processes, map+ end end, [], Paragraphs), % replace with MAP phase in parallel !!! - Map2 = lists:map(fun(P_ref) -> Parent = find_node(P_ref, Tree), {score_by_class_or_id(Parent)+count_commas(Parent), P_ref} end, UniqParents), + Map2 = score_parallel(Tree, UniqParents), lists:foldl( % TODO: REDUCE phaze here fun({Score, P_ref}, TreeAcc) -> modify_score(P_ref, TreeAcc, Score) end, Tree, Map1++Map2). 
-score_tree1(Tree) -> % TODO: do score_tree in parallel (multiplie processes, map+reduce) - Paragraphs = find_all_nodes(<<"p">>, Tree), - Tree1 = lists:foldl( - fun(P, TreeAcc) -> - modify_score(get_parent_ref(P), TreeAcc, 1) % +1 to parent for each inner

- % TODO: replace with ListAcc and [{1, ParentRef}|ListAcc] - map phaze!!! - end, Tree, Paragraphs), - UniqParents = lists:foldl( % building list of unique parent refs for all

's - fun(P, ParentRefList) -> - ParentRef = get_parent_ref(P), - case lists:member(ParentRef, ParentRefList) of - true -> ParentRefList; - false -> [ParentRef | ParentRefList] - end - end, [], Paragraphs), - lists:foldl( % replace with MAP phase in parallel !!! - fun(ParentRef, TreeAcc) -> - Parent = find_node(ParentRef, TreeAcc), - Score1 = score_by_class_or_id(Parent), - Commas = count_commas(Parent), - modify_score(ParentRef, TreeAcc, Commas+Score1) - end, Tree1, UniqParents). - % TODO: REDUCE phaze here +score_parallel(Tree, UniqParents) -> + S = self(), + Ref = make_ref(), + Pids = lists:map(fun(P_ref) -> + spawn(fun() -> do_score(S, Ref, Tree, P_ref) end) + end, UniqParents), + gather(Pids, Ref). + +do_score(ParentPid, Ref, Tree, P_ref) -> + ParentElem = find_node(P_ref, Tree), + Score1 = score_by_class_or_id(ParentElem), + Score2 = count_commas(ParentElem), + ParentPid ! {self(), Ref, {Score1+Score2, P_ref}}. + +gather([Pid|T], Ref) -> + receive + {Pid, Ref, Ret} -> [Ret|gather(T, Ref)] + end; +gather([], _) -> []. %% @spec get_max_score_ref(scored_html_node()) -> reference() %% @doc finds ref to node with maximum readability score From 7298d6d163fb366f6e92e67427bf5906872a0f78 Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Fri, 18 Feb 2011 11:12:02 +0300 Subject: [PATCH 08/32] gather now works without order --- src/rdbl.erl | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/rdbl.erl b/src/rdbl.erl index a00cc4a..cbe97e6 100644 --- a/src/rdbl.erl +++ b/src/rdbl.erl @@ -449,24 +449,27 @@ score_tree(Tree) -> % TODO: do score_tree in parallel (multiplie processes, map+ score_parallel(Tree, UniqParents) -> + process_flag(trap_exit, true), S = self(), - Ref = make_ref(), - Pids = lists:map(fun(P_ref) -> - spawn(fun() -> do_score(S, Ref, Tree, P_ref) end) + lists:foreach(fun(P_ref) -> + spawn(fun() -> do_score(S, Tree, P_ref) end) end, UniqParents), - gather(Pids, Ref). + gather(length(UniqParents), []). 
-do_score(ParentPid, Ref, Tree, P_ref) -> +do_score(ParentPid, Tree, P_ref) -> ParentElem = find_node(P_ref, Tree), Score1 = score_by_class_or_id(ParentElem), Score2 = count_commas(ParentElem), - ParentPid ! {self(), Ref, {Score1+Score2, P_ref}}. + ParentPid ! {Score1+Score2, P_ref}. -gather([Pid|T], Ref) -> +gather(0, L) -> L; +gather(N, L) -> receive - {Pid, Ref, Ret} -> [Ret|gather(T, Ref)] - end; -gather([], _) -> []. + {Score, P_ref} -> + gather(N-1, [{Score, P_ref} | L]); + {'EXIT', _, _Why} -> + gather(N-1, L) + end. %% @spec get_max_score_ref(scored_html_node()) -> reference() %% @doc finds ref to node with maximum readability score From f29f5bb579f38c15dd270a351381cc99cc1201f4 Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Fri, 18 Feb 2011 11:41:47 +0300 Subject: [PATCH 09/32] full mapreduce in score_tree/1 --- src/rdbl.erl | 47 +++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/rdbl.erl b/src/rdbl.erl index cbe97e6..9e5633a 100644 --- a/src/rdbl.erl +++ b/src/rdbl.erl @@ -431,7 +431,7 @@ score_by_class_or_id(Attrs=[_|_]) -> %% @doc score whole html tree depending on its contents score_tree(Tree) -> % TODO: do score_tree in parallel (multiplie processes, map+reduce) Paragraphs = find_all_nodes(<<"p">>, Tree), - Map1 = [ {1, get_parent_ref(P)} || P <- Paragraphs ], % список вида {1, Parent} для каждого P (пары могут повторяться) + %Map1 = [ {1, get_parent_ref(P)} || P <- Paragraphs ], % список вида {1, Parent} для каждого P (пары могут повторяться) UniqParents = lists:foldl( % building list of unique parent refs for all

's fun(P, ParentRefList) -> ParentRef = get_parent_ref(P), @@ -440,35 +440,46 @@ score_tree(Tree) -> % TODO: do score_tree in parallel (multiplie processes, map+ false -> [ParentRef | ParentRefList] end end, [], Paragraphs), - % replace with MAP phase in parallel !!! - Map2 = score_parallel(Tree, UniqParents), - lists:foldl( % TODO: REDUCE phaze here + ScoreList = score_parallel(Tree, UniqParents, Paragraphs), % вернет список вида { Score, Element }, каждый Element встречается 1 раз + lists:foldl( fun({Score, P_ref}, TreeAcc) -> - modify_score(P_ref, TreeAcc, Score) - end, Tree, Map1++Map2). + if + Score /= 0 -> + modify_score(P_ref, TreeAcc, Score); + true -> % если в процессе reduce score стало равно 0, то не вносим измненения в дерево для этого элемента + TreeAcc + end + end, Tree, ScoreList). -score_parallel(Tree, UniqParents) -> +score_parallel(Tree, UniqParents, Paragraphs) -> process_flag(trap_exit, true), S = self(), - lists:foreach(fun(P_ref) -> - spawn(fun() -> do_score(S, Tree, P_ref) end) - end, UniqParents), - gather(length(UniqParents), []). + lists:foreach(fun(P_ref) -> spawn(fun() -> do_score(S, Tree, P_ref) end) end, UniqParents), + lists:foreach(fun(P) -> S ! { 1, get_parent_ref(P) } end, Paragraphs), + Dict0 = dict:new(), + Dict1 = gather(length(Paragraphs) + 2*length(UniqParents), Dict0), % *2 тк do_score запускает 2 процесса + dict:fold(fun(K, ValList, L)-> [{lists:foldl(fun(E, Acc)-> E+Acc end, 0, ValList), K}| L] end, [], Dict1). do_score(ParentPid, Tree, P_ref) -> ParentElem = find_node(P_ref, Tree), - Score1 = score_by_class_or_id(ParentElem), - Score2 = count_commas(ParentElem), - ParentPid ! {Score1+Score2, P_ref}. + spawn(fun()-> ParentPid ! { score_by_class_or_id(ParentElem), P_ref} end), + spawn(fun()-> ParentPid ! { count_commas(ParentElem), P_ref} end). 
-gather(0, L) -> L; -gather(N, L) -> +gather(0, Dict) -> Dict; +gather(N, Dict) -> receive {Score, P_ref} -> - gather(N-1, [{Score, P_ref} | L]); + case dict:is_key(P_ref, Dict) of + true -> + Dict1 = dict:append(P_ref, Score, Dict), + gather(N-1, Dict1); + false -> + Dict1 = dict:store(P_ref, [Score], Dict), + gather(N-1, Dict1) + end; {'EXIT', _, _Why} -> - gather(N-1, L) + gather(N-1, Dict) end. %% @spec get_max_score_ref(scored_html_node()) -> reference() From 8f80d5c344d6c7c230f64333388ef6e56fe50fc4 Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Fri, 18 Feb 2011 11:48:05 +0300 Subject: [PATCH 10/32] minor chgs --- .gitignore | 2 ++ src/rdbl.app.src | 12 ------------ 2 files changed, 2 insertions(+), 12 deletions(-) delete mode 100644 src/rdbl.app.src diff --git a/.gitignore b/.gitignore index 3ef25db..62bb564 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ ebin/* .settings/* .project* TODO +doc/* +.eunit/* diff --git a/src/rdbl.app.src b/src/rdbl.app.src deleted file mode 100644 index 72aa9ed..0000000 --- a/src/rdbl.app.src +++ /dev/null @@ -1,12 +0,0 @@ -{application, rdbl, - [ - {description, ""}, - {vsn, "0.3"}, - {registered, []}, - {applications, [ - kernel, - stdlib - ]}, - {mod, { rdbl_app, []}}, - {env, []} - ]}. 
From 73f694110659e8262f226945f056073a198996a2 Mon Sep 17 00:00:00 2001 From: Ivan Koshkin Date: Fri, 18 Feb 2011 12:11:35 +0300 Subject: [PATCH 11/32] support files added mochiweb is in rebar deps now --- .gitignore | 1 + README.md | 6 +- rebar.config | 7 + runme.sh | 1 + src/mochinum.erl | 354 ----------- src/mochiutf8.erl | 317 ---------- src/mochiweb_charref.erl | 308 ---------- src/mochiweb_html.erl | 1264 -------------------------------------- src/rdbl.app.src | 12 + 9 files changed, 26 insertions(+), 2244 deletions(-) create mode 100644 rebar.config create mode 100755 runme.sh delete mode 100644 src/mochinum.erl delete mode 100644 src/mochiutf8.erl delete mode 100644 src/mochiweb_charref.erl delete mode 100644 src/mochiweb_html.erl create mode 100644 src/rdbl.app.src diff --git a/.gitignore b/.gitignore index 62bb564..264dfd9 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ ebin/* TODO doc/* .eunit/* +deps/* diff --git a/README.md b/README.md index 9510520..2c284ca 100644 --- a/README.md +++ b/README.md @@ -24,4 +24,8 @@ See other examples in rdbl.erl. Dependencies ------------ Library uses [mochiweb](https://github.com/mochi/mochiweb) html library to parse HTML-content (included). - +Only following files from mochiweb needed: +mochinum.erl +mochiutf8.erl +mochiweb_charref.erl +mochiweb_html.erl diff --git a/rebar.config b/rebar.config new file mode 100644 index 0000000..8d9d524 --- /dev/null +++ b/rebar.config @@ -0,0 +1,7 @@ +%% -*- erlang -*- +{erl_opts, [debug_info]}. +{deps, [ + {mochiweb, ".*", + {git, "git://github.com/mochi/mochiweb.git", "master"}}]}. +{cover_enabled, true}. +{eunit_opts, [verbose, {report,{eunit_surefire,[{dir,"."}]}}]}. 
diff --git a/runme.sh b/runme.sh new file mode 100755 index 0000000..d5f6208 --- /dev/null +++ b/runme.sh @@ -0,0 +1 @@ +exec erl -pa ebin deps/*/ebin diff --git a/src/mochinum.erl b/src/mochinum.erl deleted file mode 100644 index c52b15c..0000000 --- a/src/mochinum.erl +++ /dev/null @@ -1,354 +0,0 @@ -%% @copyright 2007 Mochi Media, Inc. -%% @author Bob Ippolito - -%% @doc Useful numeric algorithms for floats that cover some deficiencies -%% in the math module. More interesting is digits/1, which implements -%% the algorithm from: -%% http://www.cs.indiana.edu/~burger/fp/index.html -%% See also "Printing Floating-Point Numbers Quickly and Accurately" -%% in Proceedings of the SIGPLAN '96 Conference on Programming Language -%% Design and Implementation. - --module(mochinum). --author("Bob Ippolito "). --export([digits/1, frexp/1, int_pow/2, int_ceil/1]). - -%% IEEE 754 Float exponent bias --define(FLOAT_BIAS, 1022). --define(MIN_EXP, -1074). --define(BIG_POW, 4503599627370496). - -%% External API - -%% @spec digits(number()) -> string() -%% @doc Returns a string that accurately represents the given integer or float -%% using a conservative amount of digits. Great for generating -%% human-readable output, or compact ASCII serializations for floats. -digits(N) when is_integer(N) -> - integer_to_list(N); -digits(0.0) -> - "0.0"; -digits(Float) -> - {Frac1, Exp1} = frexp_int(Float), - [Place0 | Digits0] = digits1(Float, Exp1, Frac1), - {Place, Digits} = transform_digits(Place0, Digits0), - R = insert_decimal(Place, Digits), - case Float < 0 of - true -> - [$- | R]; - _ -> - R - end. - -%% @spec frexp(F::float()) -> {Frac::float(), Exp::float()} -%% @doc Return the fractional and exponent part of an IEEE 754 double, -%% equivalent to the libc function of the same name. -%% F = Frac * pow(2, Exp). -frexp(F) -> - frexp1(unpack(F)). - -%% @spec int_pow(X::integer(), N::integer()) -> Y::integer() -%% @doc Moderately efficient way to exponentiate integers. 
-%% int_pow(10, 2) = 100. -int_pow(_X, 0) -> - 1; -int_pow(X, N) when N > 0 -> - int_pow(X, N, 1). - -%% @spec int_ceil(F::float()) -> integer() -%% @doc Return the ceiling of F as an integer. The ceiling is defined as -%% F when F == trunc(F); -%% trunc(F) when F < 0; -%% trunc(F) + 1 when F > 0. -int_ceil(X) -> - T = trunc(X), - case (X - T) of - Pos when Pos > 0 -> T + 1; - _ -> T - end. - - -%% Internal API - -int_pow(X, N, R) when N < 2 -> - R * X; -int_pow(X, N, R) -> - int_pow(X * X, N bsr 1, case N band 1 of 1 -> R * X; 0 -> R end). - -insert_decimal(0, S) -> - "0." ++ S; -insert_decimal(Place, S) when Place > 0 -> - L = length(S), - case Place - L of - 0 -> - S ++ ".0"; - N when N < 0 -> - {S0, S1} = lists:split(L + N, S), - S0 ++ "." ++ S1; - N when N < 6 -> - %% More places than digits - S ++ lists:duplicate(N, $0) ++ ".0"; - _ -> - insert_decimal_exp(Place, S) - end; -insert_decimal(Place, S) when Place > -6 -> - "0." ++ lists:duplicate(abs(Place), $0) ++ S; -insert_decimal(Place, S) -> - insert_decimal_exp(Place, S). - -insert_decimal_exp(Place, S) -> - [C | S0] = S, - S1 = case S0 of - [] -> - "0"; - _ -> - S0 - end, - Exp = case Place < 0 of - true -> - "e-"; - false -> - "e+" - end, - [C] ++ "." ++ S1 ++ Exp ++ integer_to_list(abs(Place - 1)). - - -digits1(Float, Exp, Frac) -> - Round = ((Frac band 1) =:= 0), - case Exp >= 0 of - true -> - BExp = 1 bsl Exp, - case (Frac =/= ?BIG_POW) of - true -> - scale((Frac * BExp * 2), 2, BExp, BExp, - Round, Round, Float); - false -> - scale((Frac * BExp * 4), 4, (BExp * 2), BExp, - Round, Round, Float) - end; - false -> - case (Exp =:= ?MIN_EXP) orelse (Frac =/= ?BIG_POW) of - true -> - scale((Frac * 2), 1 bsl (1 - Exp), 1, 1, - Round, Round, Float); - false -> - scale((Frac * 4), 1 bsl (2 - Exp), 2, 1, - Round, Round, Float) - end - end. 
- -scale(R, S, MPlus, MMinus, LowOk, HighOk, Float) -> - Est = int_ceil(math:log10(abs(Float)) - 1.0e-10), - %% Note that the scheme implementation uses a 326 element look-up table - %% for int_pow(10, N) where we do not. - case Est >= 0 of - true -> - fixup(R, S * int_pow(10, Est), MPlus, MMinus, Est, - LowOk, HighOk); - false -> - Scale = int_pow(10, -Est), - fixup(R * Scale, S, MPlus * Scale, MMinus * Scale, Est, - LowOk, HighOk) - end. - -fixup(R, S, MPlus, MMinus, K, LowOk, HighOk) -> - TooLow = case HighOk of - true -> - (R + MPlus) >= S; - false -> - (R + MPlus) > S - end, - case TooLow of - true -> - [(K + 1) | generate(R, S, MPlus, MMinus, LowOk, HighOk)]; - false -> - [K | generate(R * 10, S, MPlus * 10, MMinus * 10, LowOk, HighOk)] - end. - -generate(R0, S, MPlus, MMinus, LowOk, HighOk) -> - D = R0 div S, - R = R0 rem S, - TC1 = case LowOk of - true -> - R =< MMinus; - false -> - R < MMinus - end, - TC2 = case HighOk of - true -> - (R + MPlus) >= S; - false -> - (R + MPlus) > S - end, - case TC1 of - false -> - case TC2 of - false -> - [D | generate(R * 10, S, MPlus * 10, MMinus * 10, - LowOk, HighOk)]; - true -> - [D + 1] - end; - true -> - case TC2 of - false -> - [D]; - true -> - case R * 2 < S of - true -> - [D]; - false -> - [D + 1] - end - end - end. - -unpack(Float) -> - <> = <>, - {Sign, Exp, Frac}. - -frexp1({_Sign, 0, 0}) -> - {0.0, 0}; -frexp1({Sign, 0, Frac}) -> - Exp = log2floor(Frac), - <> = <>, - {Frac1, -(?FLOAT_BIAS) - 52 + Exp}; -frexp1({Sign, Exp, Frac}) -> - <> = <>, - {Frac1, Exp - ?FLOAT_BIAS}. - -log2floor(Int) -> - log2floor(Int, 0). - -log2floor(0, N) -> - N; -log2floor(Int, N) -> - log2floor(Int bsr 1, 1 + N). - - -transform_digits(Place, [0 | Rest]) -> - transform_digits(Place, Rest); -transform_digits(Place, Digits) -> - {Place, [$0 + D || D <- Digits]}. - - -frexp_int(F) -> - case unpack(F) of - {_Sign, 0, Frac} -> - {Frac, ?MIN_EXP}; - {_Sign, Exp, Frac} -> - {Frac + (1 bsl 52), Exp - 53 - ?FLOAT_BIAS} - end. 
- -%% -%% Tests -%% --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). - -int_ceil_test() -> - ?assertEqual(1, int_ceil(0.0001)), - ?assertEqual(0, int_ceil(0.0)), - ?assertEqual(1, int_ceil(0.99)), - ?assertEqual(1, int_ceil(1.0)), - ?assertEqual(-1, int_ceil(-1.5)), - ?assertEqual(-2, int_ceil(-2.0)), - ok. - -int_pow_test() -> - ?assertEqual(1, int_pow(1, 1)), - ?assertEqual(1, int_pow(1, 0)), - ?assertEqual(1, int_pow(10, 0)), - ?assertEqual(10, int_pow(10, 1)), - ?assertEqual(100, int_pow(10, 2)), - ?assertEqual(1000, int_pow(10, 3)), - ok. - -digits_test() -> - ?assertEqual("0", - digits(0)), - ?assertEqual("0.0", - digits(0.0)), - ?assertEqual("1.0", - digits(1.0)), - ?assertEqual("-1.0", - digits(-1.0)), - ?assertEqual("0.1", - digits(0.1)), - ?assertEqual("0.01", - digits(0.01)), - ?assertEqual("0.001", - digits(0.001)), - ?assertEqual("1.0e+6", - digits(1000000.0)), - ?assertEqual("0.5", - digits(0.5)), - ?assertEqual("4503599627370496.0", - digits(4503599627370496.0)), - %% small denormalized number - %% 4.94065645841246544177e-324 =:= 5.0e-324 - <> = <<0,0,0,0,0,0,0,1>>, - ?assertEqual("5.0e-324", - digits(SmallDenorm)), - ?assertEqual(SmallDenorm, - list_to_float(digits(SmallDenorm))), - %% large denormalized number - %% 2.22507385850720088902e-308 - <> = <<0,15,255,255,255,255,255,255>>, - ?assertEqual("2.225073858507201e-308", - digits(BigDenorm)), - ?assertEqual(BigDenorm, - list_to_float(digits(BigDenorm))), - %% small normalized number - %% 2.22507385850720138309e-308 - <> = <<0,16,0,0,0,0,0,0>>, - ?assertEqual("2.2250738585072014e-308", - digits(SmallNorm)), - ?assertEqual(SmallNorm, - list_to_float(digits(SmallNorm))), - %% large normalized number - %% 1.79769313486231570815e+308 - <> = <<127,239,255,255,255,255,255,255>>, - ?assertEqual("1.7976931348623157e+308", - digits(LargeNorm)), - ?assertEqual(LargeNorm, - list_to_float(digits(LargeNorm))), - %% issue #10 - mochinum:frexp(math:pow(2, -1074)). 
- ?assertEqual("5.0e-324", - digits(math:pow(2, -1074))), - ok. - -frexp_test() -> - %% zero - ?assertEqual({0.0, 0}, frexp(0.0)), - %% one - ?assertEqual({0.5, 1}, frexp(1.0)), - %% negative one - ?assertEqual({-0.5, 1}, frexp(-1.0)), - %% small denormalized number - %% 4.94065645841246544177e-324 - <> = <<0,0,0,0,0,0,0,1>>, - ?assertEqual({0.5, -1073}, frexp(SmallDenorm)), - %% large denormalized number - %% 2.22507385850720088902e-308 - <> = <<0,15,255,255,255,255,255,255>>, - ?assertEqual( - {0.99999999999999978, -1022}, - frexp(BigDenorm)), - %% small normalized number - %% 2.22507385850720138309e-308 - <> = <<0,16,0,0,0,0,0,0>>, - ?assertEqual({0.5, -1021}, frexp(SmallNorm)), - %% large normalized number - %% 1.79769313486231570815e+308 - <> = <<127,239,255,255,255,255,255,255>>, - ?assertEqual( - {0.99999999999999989, 1024}, - frexp(LargeNorm)), - %% issue #10 - mochinum:frexp(math:pow(2, -1074)). - ?assertEqual( - {0.5, -1073}, - frexp(math:pow(2, -1074))), - ok. - --endif. diff --git a/src/mochiutf8.erl b/src/mochiutf8.erl deleted file mode 100644 index 28f28c1..0000000 --- a/src/mochiutf8.erl +++ /dev/null @@ -1,317 +0,0 @@ -%% @copyright 2010 Mochi Media, Inc. -%% @author Bob Ippolito - -%% @doc Algorithm to convert any binary to a valid UTF-8 sequence by ignoring -%% invalid bytes. - --module(mochiutf8). --export([valid_utf8_bytes/1, codepoint_to_bytes/1, codepoints_to_bytes/1]). --export([bytes_to_codepoints/1, bytes_foldl/3, codepoint_foldl/3]). --export([read_codepoint/1, len/1]). - -%% External API - --type unichar_low() :: 0..16#d7ff. --type unichar_high() :: 16#e000..16#10ffff. --type unichar() :: unichar_low() | unichar_high(). - --spec codepoint_to_bytes(unichar()) -> binary(). -%% @doc Convert a unicode codepoint to UTF-8 bytes. 
-codepoint_to_bytes(C) when (C >= 16#00 andalso C =< 16#7f) -> - %% U+0000 - U+007F - 7 bits - <>; -codepoint_to_bytes(C) when (C >= 16#080 andalso C =< 16#07FF) -> - %% U+0080 - U+07FF - 11 bits - <<0:5, B1:5, B0:6>> = <>, - <<2#110:3, B1:5, - 2#10:2, B0:6>>; -codepoint_to_bytes(C) when (C >= 16#0800 andalso C =< 16#FFFF) andalso - (C < 16#D800 orelse C > 16#DFFF) -> - %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points) - <> = <>, - <<2#1110:4, B2:4, - 2#10:2, B1:6, - 2#10:2, B0:6>>; -codepoint_to_bytes(C) when (C >= 16#010000 andalso C =< 16#10FFFF) -> - %% U+10000 - U+10FFFF - 21 bits - <<0:3, B3:3, B2:6, B1:6, B0:6>> = <>, - <<2#11110:5, B3:3, - 2#10:2, B2:6, - 2#10:2, B1:6, - 2#10:2, B0:6>>. - --spec codepoints_to_bytes([unichar()]) -> binary(). -%% @doc Convert a list of codepoints to a UTF-8 binary. -codepoints_to_bytes(L) -> - <<<<(codepoint_to_bytes(C))/binary>> || C <- L>>. - --spec read_codepoint(binary()) -> {unichar(), binary(), binary()}. -read_codepoint(Bin = <<2#0:1, C:7, Rest/binary>>) -> - %% U+0000 - U+007F - 7 bits - <> = Bin, - {C, B, Rest}; -read_codepoint(Bin = <<2#110:3, B1:5, - 2#10:2, B0:6, - Rest/binary>>) -> - %% U+0080 - U+07FF - 11 bits - case <> of - <> when C >= 16#80 -> - <> = Bin, - {C, B, Rest} - end; -read_codepoint(Bin = <<2#1110:4, B2:4, - 2#10:2, B1:6, - 2#10:2, B0:6, - Rest/binary>>) -> - %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points) - case <> of - <> when (C >= 16#0800 andalso C =< 16#FFFF) andalso - (C < 16#D800 orelse C > 16#DFFF) -> - <> = Bin, - {C, B, Rest} - end; -read_codepoint(Bin = <<2#11110:5, B3:3, - 2#10:2, B2:6, - 2#10:2, B1:6, - 2#10:2, B0:6, - Rest/binary>>) -> - %% U+10000 - U+10FFFF - 21 bits - case <> of - <> when (C >= 16#010000 andalso C =< 16#10FFFF) -> - <> = Bin, - {C, B, Rest} - end. - --spec codepoint_foldl(fun((unichar(), _) -> _), _, binary()) -> _. 
-codepoint_foldl(F, Acc, <<>>) when is_function(F, 2) -> - Acc; -codepoint_foldl(F, Acc, Bin) -> - {C, _, Rest} = read_codepoint(Bin), - codepoint_foldl(F, F(C, Acc), Rest). - --spec bytes_foldl(fun((binary(), _) -> _), _, binary()) -> _. -bytes_foldl(F, Acc, <<>>) when is_function(F, 2) -> - Acc; -bytes_foldl(F, Acc, Bin) -> - {_, B, Rest} = read_codepoint(Bin), - bytes_foldl(F, F(B, Acc), Rest). - --spec bytes_to_codepoints(binary()) -> [unichar()]. -bytes_to_codepoints(B) -> - lists:reverse(codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], B)). - --spec len(binary()) -> non_neg_integer(). -len(<<>>) -> - 0; -len(B) -> - {_, _, Rest} = read_codepoint(B), - 1 + len(Rest). - --spec valid_utf8_bytes(B::binary()) -> binary(). -%% @doc Return only the bytes in B that represent valid UTF-8. Uses -%% the following recursive algorithm: skip one byte if B does not -%% follow UTF-8 syntax (a 1-4 byte encoding of some number), -%% skip sequence of 2-4 bytes if it represents an overlong encoding -%% or bad code point (surrogate U+D800 - U+DFFF or > U+10FFFF). -valid_utf8_bytes(B) when is_binary(B) -> - binary_skip_bytes(B, invalid_utf8_indexes(B)). - -%% Internal API - --spec binary_skip_bytes(binary(), [non_neg_integer()]) -> binary(). -%% @doc Return B, but skipping the 0-based indexes in L. -binary_skip_bytes(B, []) -> - B; -binary_skip_bytes(B, L) -> - binary_skip_bytes(B, L, 0, []). - -%% @private --spec binary_skip_bytes(binary(), [non_neg_integer()], non_neg_integer(), iolist()) -> binary(). -binary_skip_bytes(B, [], _N, Acc) -> - iolist_to_binary(lists:reverse([B | Acc])); -binary_skip_bytes(<<_, RestB/binary>>, [N | RestL], N, Acc) -> - binary_skip_bytes(RestB, RestL, 1 + N, Acc); -binary_skip_bytes(<>, L, N, Acc) -> - binary_skip_bytes(RestB, L, 1 + N, [C | Acc]). - --spec invalid_utf8_indexes(binary()) -> [non_neg_integer()]. -%% @doc Return the 0-based indexes in B that are not valid UTF-8. -invalid_utf8_indexes(B) -> - invalid_utf8_indexes(B, 0, []). 
- -%% @private. --spec invalid_utf8_indexes(binary(), non_neg_integer(), [non_neg_integer()]) -> [non_neg_integer()]. -invalid_utf8_indexes(<>, N, Acc) when C < 16#80 -> - %% U+0000 - U+007F - 7 bits - invalid_utf8_indexes(Rest, 1 + N, Acc); -invalid_utf8_indexes(<>, N, Acc) - when C1 band 16#E0 =:= 16#C0, - C2 band 16#C0 =:= 16#80 -> - %% U+0080 - U+07FF - 11 bits - case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of - C when C < 16#80 -> - %% Overlong encoding. - invalid_utf8_indexes(Rest, 2 + N, [1 + N, N | Acc]); - _ -> - %% Upper bound U+07FF does not need to be checked - invalid_utf8_indexes(Rest, 2 + N, Acc) - end; -invalid_utf8_indexes(<>, N, Acc) - when C1 band 16#F0 =:= 16#E0, - C2 band 16#C0 =:= 16#80, - C3 band 16#C0 =:= 16#80 -> - %% U+0800 - U+FFFF - 16 bits - case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor - (C3 band 16#3F) of - C when (C < 16#800) orelse (C >= 16#D800 andalso C =< 16#DFFF) -> - %% Overlong encoding or surrogate. - invalid_utf8_indexes(Rest, 3 + N, [2 + N, 1 + N, N | Acc]); - _ -> - %% Upper bound U+FFFF does not need to be checked - invalid_utf8_indexes(Rest, 3 + N, Acc) - end; -invalid_utf8_indexes(<>, N, Acc) - when C1 band 16#F8 =:= 16#F0, - C2 band 16#C0 =:= 16#80, - C3 band 16#C0 =:= 16#80, - C4 band 16#C0 =:= 16#80 -> - %% U+10000 - U+10FFFF - 21 bits - case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor - (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of - C when (C < 16#10000) orelse (C > 16#10FFFF) -> - %% Overlong encoding or invalid code point. - invalid_utf8_indexes(Rest, 4 + N, [3 + N, 2 + N, 1 + N, N | Acc]); - _ -> - invalid_utf8_indexes(Rest, 4 + N, Acc) - end; -invalid_utf8_indexes(<<_, Rest/binary>>, N, Acc) -> - %% Invalid char - invalid_utf8_indexes(Rest, 1 + N, [N | Acc]); -invalid_utf8_indexes(<<>>, _N, Acc) -> - lists:reverse(Acc). - -%% -%% Tests -%% --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). 
- -binary_skip_bytes_test() -> - ?assertEqual(<<"foo">>, - binary_skip_bytes(<<"foo">>, [])), - ?assertEqual(<<"foobar">>, - binary_skip_bytes(<<"foo bar">>, [3])), - ?assertEqual(<<"foo">>, - binary_skip_bytes(<<"foo bar">>, [3, 4, 5, 6])), - ?assertEqual(<<"oo bar">>, - binary_skip_bytes(<<"foo bar">>, [0])), - ok. - -invalid_utf8_indexes_test() -> - ?assertEqual( - [], - invalid_utf8_indexes(<<"unicode snowman for you: ", 226, 152, 131>>)), - ?assertEqual( - [0], - invalid_utf8_indexes(<<128>>)), - ?assertEqual( - [57,59,60,64,66,67], - invalid_utf8_indexes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (", - 167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)), - ok. - -codepoint_to_bytes_test() -> - %% U+0000 - U+007F - 7 bits - %% U+0080 - U+07FF - 11 bits - %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points) - %% U+10000 - U+10FFFF - 21 bits - ?assertEqual( - <<"a">>, - codepoint_to_bytes($a)), - ?assertEqual( - <<16#c2, 16#80>>, - codepoint_to_bytes(16#80)), - ?assertEqual( - <<16#df, 16#bf>>, - codepoint_to_bytes(16#07ff)), - ?assertEqual( - <<16#ef, 16#bf, 16#bf>>, - codepoint_to_bytes(16#ffff)), - ?assertEqual( - <<16#f4, 16#8f, 16#bf, 16#bf>>, - codepoint_to_bytes(16#10ffff)), - ok. - -bytes_foldl_test() -> - ?assertEqual( - <<"abc">>, - bytes_foldl(fun (B, Acc) -> <> end, <<>>, <<"abc">>)), - ?assertEqual( - <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>, - bytes_foldl(fun (B, Acc) -> <> end, <<>>, - <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)), - ok. - -bytes_to_codepoints_test() -> - ?assertEqual( - "abc" ++ [16#2603, 16#4e2d, 16#85, 16#10ffff], - bytes_to_codepoints(<<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)), - ok. 
- -codepoint_foldl_test() -> - ?assertEqual( - "cba", - codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], <<"abc">>)), - ?assertEqual( - [16#10ffff, 16#85, 16#4e2d, 16#2603 | "cba"], - codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], - <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)), - ok. - -len_test() -> - ?assertEqual( - 29, - len(<<"unicode snowman for you: ", 226, 152, 131, 228, 184, 173, 194, 133, 244, 143, 191, 191>>)), - ok. - -codepoints_to_bytes_test() -> - ?assertEqual( - iolist_to_binary(lists:map(fun codepoint_to_bytes/1, lists:seq(1, 1000))), - codepoints_to_bytes(lists:seq(1, 1000))), - ok. - -valid_utf8_bytes_test() -> - ?assertEqual( - <<"invalid U+11ffff: ">>, - valid_utf8_bytes(<<"invalid U+11ffff: ", 244, 159, 191, 191>>)), - ?assertEqual( - <<"U+10ffff: ", 244, 143, 191, 191>>, - valid_utf8_bytes(<<"U+10ffff: ", 244, 143, 191, 191>>)), - ?assertEqual( - <<"overlong 2-byte encoding (a): ">>, - valid_utf8_bytes(<<"overlong 2-byte encoding (a): ", 2#11000001, 2#10100001>>)), - ?assertEqual( - <<"overlong 2-byte encoding (!): ">>, - valid_utf8_bytes(<<"overlong 2-byte encoding (!): ", 2#11000000, 2#10100001>>)), - ?assertEqual( - <<"mu: ", 194, 181>>, - valid_utf8_bytes(<<"mu: ", 194, 181>>)), - ?assertEqual( - <<"bad coding bytes: ">>, - valid_utf8_bytes(<<"bad coding bytes: ", 2#10011111, 2#10111111, 2#11111111>>)), - ?assertEqual( - <<"low surrogate (unpaired): ">>, - valid_utf8_bytes(<<"low surrogate (unpaired): ", 237, 176, 128>>)), - ?assertEqual( - <<"high surrogate (unpaired): ">>, - valid_utf8_bytes(<<"high surrogate (unpaired): ", 237, 191, 191>>)), - ?assertEqual( - <<"unicode snowman for you: ", 226, 152, 131>>, - valid_utf8_bytes(<<"unicode snowman for you: ", 226, 152, 131>>)), - ?assertEqual( - <<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (AISPW))">>, - valid_utf8_bytes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (", - 167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)), - 
ok. - --endif. diff --git a/src/mochiweb_charref.erl b/src/mochiweb_charref.erl deleted file mode 100644 index d98e016..0000000 --- a/src/mochiweb_charref.erl +++ /dev/null @@ -1,308 +0,0 @@ -%% @author Bob Ippolito -%% @copyright 2007 Mochi Media, Inc. - -%% @doc Converts HTML 4 charrefs and entities to codepoints. --module(mochiweb_charref). --export([charref/1]). - -%% External API. - -%% @spec charref(S) -> integer() | undefined -%% @doc Convert a decimal charref, hex charref, or html entity to a unicode -%% codepoint, or return undefined on failure. -%% The input should not include an ampersand or semicolon. -%% charref("#38") = 38, charref("#x26") = 38, charref("amp") = 38. -charref(B) when is_binary(B) -> - charref(binary_to_list(B)); -charref([$#, C | L]) when C =:= $x orelse C =:= $X -> - try erlang:list_to_integer(L, 16) - catch - error:badarg -> undefined - end; -charref([$# | L]) -> - try list_to_integer(L) - catch - error:badarg -> undefined - end; -charref(L) -> - entity(L). - -%% Internal API. 
- -entity("nbsp") -> 160; -entity("iexcl") -> 161; -entity("cent") -> 162; -entity("pound") -> 163; -entity("curren") -> 164; -entity("yen") -> 165; -entity("brvbar") -> 166; -entity("sect") -> 167; -entity("uml") -> 168; -entity("copy") -> 169; -entity("ordf") -> 170; -entity("laquo") -> 171; -entity("not") -> 172; -entity("shy") -> 173; -entity("reg") -> 174; -entity("macr") -> 175; -entity("deg") -> 176; -entity("plusmn") -> 177; -entity("sup2") -> 178; -entity("sup3") -> 179; -entity("acute") -> 180; -entity("micro") -> 181; -entity("para") -> 182; -entity("middot") -> 183; -entity("cedil") -> 184; -entity("sup1") -> 185; -entity("ordm") -> 186; -entity("raquo") -> 187; -entity("frac14") -> 188; -entity("frac12") -> 189; -entity("frac34") -> 190; -entity("iquest") -> 191; -entity("Agrave") -> 192; -entity("Aacute") -> 193; -entity("Acirc") -> 194; -entity("Atilde") -> 195; -entity("Auml") -> 196; -entity("Aring") -> 197; -entity("AElig") -> 198; -entity("Ccedil") -> 199; -entity("Egrave") -> 200; -entity("Eacute") -> 201; -entity("Ecirc") -> 202; -entity("Euml") -> 203; -entity("Igrave") -> 204; -entity("Iacute") -> 205; -entity("Icirc") -> 206; -entity("Iuml") -> 207; -entity("ETH") -> 208; -entity("Ntilde") -> 209; -entity("Ograve") -> 210; -entity("Oacute") -> 211; -entity("Ocirc") -> 212; -entity("Otilde") -> 213; -entity("Ouml") -> 214; -entity("times") -> 215; -entity("Oslash") -> 216; -entity("Ugrave") -> 217; -entity("Uacute") -> 218; -entity("Ucirc") -> 219; -entity("Uuml") -> 220; -entity("Yacute") -> 221; -entity("THORN") -> 222; -entity("szlig") -> 223; -entity("agrave") -> 224; -entity("aacute") -> 225; -entity("acirc") -> 226; -entity("atilde") -> 227; -entity("auml") -> 228; -entity("aring") -> 229; -entity("aelig") -> 230; -entity("ccedil") -> 231; -entity("egrave") -> 232; -entity("eacute") -> 233; -entity("ecirc") -> 234; -entity("euml") -> 235; -entity("igrave") -> 236; -entity("iacute") -> 237; -entity("icirc") -> 238; -entity("iuml") -> 
239; -entity("eth") -> 240; -entity("ntilde") -> 241; -entity("ograve") -> 242; -entity("oacute") -> 243; -entity("ocirc") -> 244; -entity("otilde") -> 245; -entity("ouml") -> 246; -entity("divide") -> 247; -entity("oslash") -> 248; -entity("ugrave") -> 249; -entity("uacute") -> 250; -entity("ucirc") -> 251; -entity("uuml") -> 252; -entity("yacute") -> 253; -entity("thorn") -> 254; -entity("yuml") -> 255; -entity("fnof") -> 402; -entity("Alpha") -> 913; -entity("Beta") -> 914; -entity("Gamma") -> 915; -entity("Delta") -> 916; -entity("Epsilon") -> 917; -entity("Zeta") -> 918; -entity("Eta") -> 919; -entity("Theta") -> 920; -entity("Iota") -> 921; -entity("Kappa") -> 922; -entity("Lambda") -> 923; -entity("Mu") -> 924; -entity("Nu") -> 925; -entity("Xi") -> 926; -entity("Omicron") -> 927; -entity("Pi") -> 928; -entity("Rho") -> 929; -entity("Sigma") -> 931; -entity("Tau") -> 932; -entity("Upsilon") -> 933; -entity("Phi") -> 934; -entity("Chi") -> 935; -entity("Psi") -> 936; -entity("Omega") -> 937; -entity("alpha") -> 945; -entity("beta") -> 946; -entity("gamma") -> 947; -entity("delta") -> 948; -entity("epsilon") -> 949; -entity("zeta") -> 950; -entity("eta") -> 951; -entity("theta") -> 952; -entity("iota") -> 953; -entity("kappa") -> 954; -entity("lambda") -> 955; -entity("mu") -> 956; -entity("nu") -> 957; -entity("xi") -> 958; -entity("omicron") -> 959; -entity("pi") -> 960; -entity("rho") -> 961; -entity("sigmaf") -> 962; -entity("sigma") -> 963; -entity("tau") -> 964; -entity("upsilon") -> 965; -entity("phi") -> 966; -entity("chi") -> 967; -entity("psi") -> 968; -entity("omega") -> 969; -entity("thetasym") -> 977; -entity("upsih") -> 978; -entity("piv") -> 982; -entity("bull") -> 8226; -entity("hellip") -> 8230; -entity("prime") -> 8242; -entity("Prime") -> 8243; -entity("oline") -> 8254; -entity("frasl") -> 8260; -entity("weierp") -> 8472; -entity("image") -> 8465; -entity("real") -> 8476; -entity("trade") -> 8482; -entity("alefsym") -> 8501; -entity("larr") 
-> 8592; -entity("uarr") -> 8593; -entity("rarr") -> 8594; -entity("darr") -> 8595; -entity("harr") -> 8596; -entity("crarr") -> 8629; -entity("lArr") -> 8656; -entity("uArr") -> 8657; -entity("rArr") -> 8658; -entity("dArr") -> 8659; -entity("hArr") -> 8660; -entity("forall") -> 8704; -entity("part") -> 8706; -entity("exist") -> 8707; -entity("empty") -> 8709; -entity("nabla") -> 8711; -entity("isin") -> 8712; -entity("notin") -> 8713; -entity("ni") -> 8715; -entity("prod") -> 8719; -entity("sum") -> 8721; -entity("minus") -> 8722; -entity("lowast") -> 8727; -entity("radic") -> 8730; -entity("prop") -> 8733; -entity("infin") -> 8734; -entity("ang") -> 8736; -entity("and") -> 8743; -entity("or") -> 8744; -entity("cap") -> 8745; -entity("cup") -> 8746; -entity("int") -> 8747; -entity("there4") -> 8756; -entity("sim") -> 8764; -entity("cong") -> 8773; -entity("asymp") -> 8776; -entity("ne") -> 8800; -entity("equiv") -> 8801; -entity("le") -> 8804; -entity("ge") -> 8805; -entity("sub") -> 8834; -entity("sup") -> 8835; -entity("nsub") -> 8836; -entity("sube") -> 8838; -entity("supe") -> 8839; -entity("oplus") -> 8853; -entity("otimes") -> 8855; -entity("perp") -> 8869; -entity("sdot") -> 8901; -entity("lceil") -> 8968; -entity("rceil") -> 8969; -entity("lfloor") -> 8970; -entity("rfloor") -> 8971; -entity("lang") -> 9001; -entity("rang") -> 9002; -entity("loz") -> 9674; -entity("spades") -> 9824; -entity("clubs") -> 9827; -entity("hearts") -> 9829; -entity("diams") -> 9830; -entity("quot") -> 34; -entity("amp") -> 38; -entity("lt") -> 60; -entity("gt") -> 62; -entity("OElig") -> 338; -entity("oelig") -> 339; -entity("Scaron") -> 352; -entity("scaron") -> 353; -entity("Yuml") -> 376; -entity("circ") -> 710; -entity("tilde") -> 732; -entity("ensp") -> 8194; -entity("emsp") -> 8195; -entity("thinsp") -> 8201; -entity("zwnj") -> 8204; -entity("zwj") -> 8205; -entity("lrm") -> 8206; -entity("rlm") -> 8207; -entity("ndash") -> 8211; -entity("mdash") -> 8212; -entity("lsquo") 
-> 8216; -entity("rsquo") -> 8217; -entity("sbquo") -> 8218; -entity("ldquo") -> 8220; -entity("rdquo") -> 8221; -entity("bdquo") -> 8222; -entity("dagger") -> 8224; -entity("Dagger") -> 8225; -entity("permil") -> 8240; -entity("lsaquo") -> 8249; -entity("rsaquo") -> 8250; -entity("euro") -> 8364; -entity(_) -> undefined. - - -%% -%% Tests -%% --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). - -exhaustive_entity_test() -> - T = mochiweb_cover:clause_lookup_table(?MODULE, entity), - [?assertEqual(V, entity(K)) || {K, V} <- T]. - -charref_test() -> - 1234 = charref("#1234"), - 255 = charref("#xfF"), - 255 = charref(<<"#XFf">>), - 38 = charref("amp"), - 38 = charref(<<"amp">>), - undefined = charref("not_an_entity"), - undefined = charref("#not_an_entity"), - undefined = charref("#xnot_an_entity"), - ok. - --endif. diff --git a/src/mochiweb_html.erl b/src/mochiweb_html.erl deleted file mode 100644 index d40a391..0000000 --- a/src/mochiweb_html.erl +++ /dev/null @@ -1,1264 +0,0 @@ -%% @author Bob Ippolito -%% @copyright 2007 Mochi Media, Inc. - -%% @doc Loosely tokenizes and generates parse trees for HTML 4. --module(mochiweb_html). --export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1, - escape_attr/1, to_html/1]). - -%% This is a macro to placate syntax highlighters.. --define(QUOTE, $\"). --define(SQUOTE, $\'). --define(ADV_COL(S, N), - S#decoder{column=N+S#decoder.column, - offset=N+S#decoder.offset}). --define(INC_COL(S), - S#decoder{column=1+S#decoder.column, - offset=1+S#decoder.offset}). --define(INC_LINE(S), - S#decoder{column=1, - line=1+S#decoder.line, - offset=1+S#decoder.offset}). --define(INC_CHAR(S, C), - case C of - $\n -> - S#decoder{column=1, - line=1+S#decoder.line, - offset=1+S#decoder.offset}; - _ -> - S#decoder{column=1+S#decoder.column, - offset=1+S#decoder.offset} - end). - --define(IS_WHITESPACE(C), - (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)). 
--define(IS_LITERAL_SAFE(C), - ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z) - orelse (C >= $0 andalso C =< $9))). --define(PROBABLE_CLOSE(C), - (C =:= $> orelse ?IS_WHITESPACE(C))). - --record(decoder, {line=1, - column=1, - offset=0}). - -%% @type html_node() = {string(), [html_attr()], [html_node() | string()]} -%% @type html_attr() = {string(), string()} -%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype() -%% @type html_data() = {data, string(), Whitespace::boolean()} -%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()} -%% @type end_tag() = {end_tag, Name} -%% @type html_comment() = {comment, Comment} -%% @type html_doctype() = {doctype, [Doctype]} -%% @type inline_html() = {'=', iolist()} - -%% External API. - -%% @spec parse(string() | binary()) -> html_node() -%% @doc tokenize and then transform the token stream into a HTML tree. -parse(Input) -> - parse_tokens(tokens(Input)). - -%% @spec parse_tokens([html_token()]) -> html_node() -%% @doc Transform the output of tokens(Doc) into a HTML tree. -parse_tokens(Tokens) when is_list(Tokens) -> - %% Skip over doctype, processing instructions - F = fun (X) -> - case X of - {start_tag, _, _, false} -> - false; - _ -> - true - end - end, - [{start_tag, Tag, Attrs, false} | Rest] = lists:dropwhile(F, Tokens), - {Tree, _} = tree(Rest, [norm({Tag, Attrs})]), - Tree. - -%% @spec tokens(StringOrBinary) -> [html_token()] -%% @doc Transform the input UTF-8 HTML into a token stream. -tokens(Input) -> - tokens(iolist_to_binary(Input), #decoder{}, []). - -%% @spec to_tokens(html_node()) -> [html_token()] -%% @doc Convert a html_node() tree to a list of tokens. 
-to_tokens({Tag0}) -> - to_tokens({Tag0, [], []}); -to_tokens(T={'=', _}) -> - [T]; -to_tokens(T={doctype, _}) -> - [T]; -to_tokens(T={comment, _}) -> - [T]; -to_tokens({Tag0, Acc}) -> - %% This is only allowed in sub-tags: {p, [{"class", "foo"}]} - to_tokens({Tag0, [], Acc}); -to_tokens({Tag0, Attrs, Acc}) -> - Tag = to_tag(Tag0), - to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, is_singleton(Tag)}]). - -%% @spec to_html([html_token()] | html_node()) -> iolist() -%% @doc Convert a list of html_token() to a HTML document. -to_html(Node) when is_tuple(Node) -> - to_html(to_tokens(Node)); -to_html(Tokens) when is_list(Tokens) -> - to_html(Tokens, []). - -%% @spec escape(string() | atom() | binary()) -> binary() -%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;). -escape(B) when is_binary(B) -> - escape(binary_to_list(B), []); -escape(A) when is_atom(A) -> - escape(atom_to_list(A), []); -escape(S) when is_list(S) -> - escape(S, []). - -%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary() -%% @doc Escape a string such that it's safe for HTML attrs -%% (amp; lt; gt; quot;). -escape_attr(B) when is_binary(B) -> - escape_attr(binary_to_list(B), []); -escape_attr(A) when is_atom(A) -> - escape_attr(atom_to_list(A), []); -escape_attr(S) when is_list(S) -> - escape_attr(S, []); -escape_attr(I) when is_integer(I) -> - escape_attr(integer_to_list(I), []); -escape_attr(F) when is_float(F) -> - escape_attr(mochinum:digits(F), []). 
- -to_html([], Acc) -> - lists:reverse(Acc); -to_html([{'=', Content} | Rest], Acc) -> - to_html(Rest, [Content | Acc]); -to_html([{pi, Bin} | Rest], Acc) -> - Open = [<<">, - Bin, - <<"?>">>], - to_html(Rest, [Open | Acc]); -to_html([{pi, Tag, Attrs} | Rest], Acc) -> - Open = [<<">, - Tag, - attrs_to_html(Attrs, []), - <<"?>">>], - to_html(Rest, [Open | Acc]); -to_html([{comment, Comment} | Rest], Acc) -> - to_html(Rest, [[<<"">>] | Acc]); -to_html([{doctype, Parts} | Rest], Acc) -> - Inside = doctype_to_html(Parts, Acc), - to_html(Rest, [[<<">, Inside, <<">">>] | Acc]); -to_html([{data, Data, _Whitespace} | Rest], Acc) -> - to_html(Rest, [escape(Data) | Acc]); -to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc) -> - Open = [<<"<">>, - Tag, - attrs_to_html(Attrs, []), - case Singleton of - true -> <<" />">>; - false -> <<">">> - end], - to_html(Rest, [Open | Acc]); -to_html([{end_tag, Tag} | Rest], Acc) -> - to_html(Rest, [[<<">, Tag, <<">">>] | Acc]). - -doctype_to_html([], Acc) -> - lists:reverse(Acc); -doctype_to_html([Word | Rest], Acc) -> - case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end, - binary_to_list(iolist_to_binary(Word))) of - true -> - doctype_to_html(Rest, [[<<" ">>, Word] | Acc]); - false -> - doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc]) - end. - -attrs_to_html([], Acc) -> - lists:reverse(Acc); -attrs_to_html([{K, V} | Rest], Acc) -> - attrs_to_html(Rest, - [[<<" ">>, escape(K), <<"=\"">>, - escape_attr(V), <<"\"">>] | Acc]). - -escape([], Acc) -> - list_to_binary(lists:reverse(Acc)); -escape("<" ++ Rest, Acc) -> - escape(Rest, lists:reverse("<", Acc)); -escape(">" ++ Rest, Acc) -> - escape(Rest, lists:reverse(">", Acc)); -escape("&" ++ Rest, Acc) -> - escape(Rest, lists:reverse("&", Acc)); -escape([C | Rest], Acc) -> - escape(Rest, [C | Acc]). 
- -escape_attr([], Acc) -> - list_to_binary(lists:reverse(Acc)); -escape_attr("<" ++ Rest, Acc) -> - escape_attr(Rest, lists:reverse("<", Acc)); -escape_attr(">" ++ Rest, Acc) -> - escape_attr(Rest, lists:reverse(">", Acc)); -escape_attr("&" ++ Rest, Acc) -> - escape_attr(Rest, lists:reverse("&", Acc)); -escape_attr([?QUOTE | Rest], Acc) -> - escape_attr(Rest, lists:reverse(""", Acc)); -escape_attr([C | Rest], Acc) -> - escape_attr(Rest, [C | Acc]). - -to_tag(A) when is_atom(A) -> - norm(atom_to_list(A)); -to_tag(L) -> - norm(L). - -to_tokens([], Acc) -> - lists:reverse(Acc); -to_tokens([{Tag, []} | Rest], Acc) -> - to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]); -to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) -> - %% Allow {br} - to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc); -to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) -> - %% Allow {'=', iolist()} - to_tokens([{Tag0, R1} | Rest], [T0 | Acc]); -to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) -> - %% Allow {comment, iolist()} - to_tokens([{Tag0, R1} | Rest], [T0 | Acc]); -to_tokens([{Tag0, [T0={pi, _S0} | R1]} | Rest], Acc) -> - %% Allow {pi, binary()} - to_tokens([{Tag0, R1} | Rest], [T0 | Acc]); -to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) -> - %% Allow {pi, binary(), list()} - to_tokens([{Tag0, R1} | Rest], [T0 | Acc]); -to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) -> - %% Allow {p, [{"class", "foo"}]} - to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc); -to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) -> - %% Allow {p, "content"} and {p, <<"content">>} - to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc); -to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) -> - %% Allow {"p", [{"class", "foo"}], <<"content">>} - to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc); -to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc) - when is_integer(C) -> - %% Allow {"p", [{"class", "foo"}], "content"} - 
to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc); -to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) -> - %% Native {"p", [{"class", "foo"}], ["content"]} - Tag = to_tag(Tag0), - T1 = to_tag(T0), - case is_singleton(norm(T1)) of - true -> - to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]); - false -> - to_tokens([{T1, C1}, {Tag, R1} | Rest], - [{start_tag, T1, A1, false} | Acc]) - end; -to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) -> - %% List text - Tag = to_tag(Tag0), - to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]); -to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) -> - %% Binary text - Tag = to_tag(Tag0), - to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]). - -tokens(B, S=#decoder{offset=O}, Acc) -> - case B of - <<_:O/binary>> -> - lists:reverse(Acc); - _ -> - {Tag, S1} = tokenize(B, S), - case parse_flag(Tag) of - script -> - {Tag2, S2} = tokenize_script(B, S1), - tokens(B, S2, [Tag2, Tag | Acc]); - textarea -> - {Tag2, S2} = tokenize_textarea(B, S1), - tokens(B, S2, [Tag2, Tag | Acc]); - none -> - tokens(B, S1, [Tag | Acc]) - end - end. - -parse_flag({start_tag, B, _, false}) -> - case string:to_lower(binary_to_list(B)) of - "script" -> - script; - "textarea" -> - textarea; - _ -> - none - end; -parse_flag(_) -> - none. - -tokenize(B, S=#decoder{offset=O}) -> - case B of - <<_:O/binary, "", _/binary>> -> - Len = O - Start, - <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin, - {{comment, Raw}, ?ADV_COL(S, 3)}; - <<_:O/binary, C, _/binary>> -> - tokenize_comment(Bin, ?INC_CHAR(S, C), Start); - <<_:Start/binary, Raw/binary>> -> - {{comment, Raw}, S} - end. - -tokenize_script(Bin, S=#decoder{offset=O}) -> - tokenize_script(Bin, S, O). 
- -tokenize_script(Bin, S=#decoder{offset=O}, Start) -> - case Bin of - %% Just a look-ahead, we want the end_tag separately - <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>> - when (SS =:= $s orelse SS =:= $S) andalso - (CC =:= $c orelse CC =:= $C) andalso - (RR =:= $r orelse RR =:= $R) andalso - (II =:= $i orelse II =:= $I) andalso - (PP =:= $p orelse PP =:= $P) andalso - (TT=:= $t orelse TT =:= $T) andalso - ?PROBABLE_CLOSE(ZZ) -> - Len = O - Start, - <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin, - {{data, Raw, false}, S}; - <<_:O/binary, C, _/binary>> -> - tokenize_script(Bin, ?INC_CHAR(S, C), Start); - <<_:Start/binary, Raw/binary>> -> - {{data, Raw, false}, S} - end. - -tokenize_textarea(Bin, S=#decoder{offset=O}) -> - tokenize_textarea(Bin, S, O). - -tokenize_textarea(Bin, S=#decoder{offset=O}, Start) -> - case Bin of - %% Just a look-ahead, we want the end_tag separately - <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>> - when (TT =:= $t orelse TT =:= $T) andalso - (EE =:= $e orelse EE =:= $E) andalso - (XX =:= $x orelse XX =:= $X) andalso - (TT2 =:= $t orelse TT2 =:= $T) andalso - (AA =:= $a orelse AA =:= $A) andalso - (RR =:= $r orelse RR =:= $R) andalso - (EE2 =:= $e orelse EE2 =:= $E) andalso - (AA2 =:= $a orelse AA2 =:= $A) andalso - ?PROBABLE_CLOSE(ZZ) -> - Len = O - Start, - <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin, - {{data, Raw, false}, S}; - <<_:O/binary, C, _/binary>> -> - tokenize_textarea(Bin, ?INC_CHAR(S, C), Start); - <<_:Start/binary, Raw/binary>> -> - {{data, Raw, false}, S} - end. - - -%% -%% Tests -%% --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). - -to_html_test() -> - ?assertEqual( - <<"hey!

what's up

sucka
RAW!">>, - iolist_to_binary( - to_html({html, [], - [{<<"head">>, [], - [{title, <<"hey!">>}]}, - {body, [], - [{p, [{class, foo}], [<<"what's">>, <<" up">>, {br}]}, - {'div', <<"sucka">>}, - {'=', <<"RAW!">>}, - {comment, <<" comment! ">>}]}]}))), - ?assertEqual( - <<"">>, - iolist_to_binary( - to_html({doctype, - [<<"html">>, <<"PUBLIC">>, - <<"-//W3C//DTD XHTML 1.0 Transitional//EN">>, - <<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">>]}))), - ?assertEqual( - <<"">>, - iolist_to_binary( - to_html({<<"html">>,[], - [{pi, <<"xml:namespace">>, - [{<<"prefix">>,<<"o">>}, - {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]}))), - ok. - -escape_test() -> - ?assertEqual( - <<"&quot;\"word ><<up!&quot;">>, - escape(<<""\"word ><>)), - ?assertEqual( - <<"&quot;\"word ><<up!&quot;">>, - escape(""\"word ><>, - escape('"\"word >< - ?assertEqual( - <<"&quot;"word ><<up!&quot;">>, - escape_attr(<<""\"word ><>)), - ?assertEqual( - <<"&quot;"word ><<up!&quot;">>, - escape_attr(""\"word ><>, - escape_attr('"\"word ><>, - escape_attr(12345)), - ?assertEqual( - <<"1.5">>, - escape_attr(1.5)), - ok. 
- -tokens_test() -> - ?assertEqual( - [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>}, - {<<"wibble">>, <<"wibble">>}, - {<<"alice">>, <<"bob">>}], true}], - tokens(<<"">>)), - ?assertEqual( - [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>}, - {<<"wibble">>, <<"wibble">>}, - {<<"alice">>, <<"bob">>}], true}], - tokens(<<"">>)), - ?assertEqual( - [{comment, <<"[if lt IE 7]>\n\n>}], - tokens(<<"">>)), - ?assertEqual( - [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false}, - {data, <<" A= B <= C ">>, false}, - {end_tag, <<"script">>}], - tokens(<<"">>)), - ?assertEqual( - [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false}, - {data, <<" A= B <= C ">>, false}, - {end_tag, <<"script">>}], - tokens(<<"">>)), - ?assertEqual( - [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false}, - {data, <<" A= B <= C ">>, false}, - {end_tag, <<"script">>}], - tokens(<<"">>)), - ?assertEqual( - [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false}, - {data, <<" A= B <= C ">>, false}, - {end_tag, <<"script">>}], - tokens(<<"">>)), - ?assertEqual( - [{start_tag, <<"textarea">>, [], false}, - {data, <<"">>, false}, - {end_tag, <<"textarea">>}], - tokens(<<"">>)), - ?assertEqual( - [{start_tag, <<"textarea">>, [], false}, - {data, <<"">>, false}], - tokens(<<"