diff --git a/src/httpu.erl b/src/httpu.erl
index 2c3d67e..35e3518 100644
--- a/src/httpu.erl
+++ b/src/httpu.erl
@@ -22,12 +22,11 @@ get_http(Url) ->
 
 %% -----------------------------------------------------------------------------------------
 
-%% retrieves the content-type from http response headers
-
-get_content_type(Headers) ->
-    lists:foldl(fun({Type, Val}, Acc) ->
-                    case Type of
-                        "content-type" -> Val;
-                        _ -> Acc
-                    end,
-                end, [], Headers)
+%% posts json-encoded data to the given url over http
+
+post_http(Url, Data) ->
+    JsonData = mochijson2:encode(Data),
+    http:request(post, {Url, [{"User-Agent", ?UA}], "text/json", JsonData}, [], []).
+
+
+
diff --git a/src/interface.erl b/src/interface.erl
new file mode 100644
index 0000000..cd5d797
--- /dev/null
+++ b/src/interface.erl
@@ -0,0 +1,41 @@
+-module(interface).
+-export([start/1, wait/0]).
+
+%% -----------------------------------------------------------------------------------------
+
+%% starts the specified number of interfaces
+
+start(Number) ->
+    Pid = spawn(?MODULE, wait, []),
+    start(Number - 1, [Pid]).
+
+start(N, L) when N =/= 0 ->
+    Pid = spawn(?MODULE, wait, []),
+    start(N - 1, [Pid|L]);
+
+start(N, L) when N == 0 ->
+    L.
+
+
+%% -----------------------------------------------------------------------------------------
+
+%% waits to provide an interface to the Simple REST api
+
+wait() ->
+    receive
+        {From, large_batch_unvisited_urls} ->
+            {Headers, Body} = httpu:get_http("http://localhost:3000/detritus?size=large"),
+            BinaryBody = list_to_binary(Body),
+            Links = mochijson2:decode(BinaryBody),
+            From ! {self(), {large_batch_unvisited_urls, Links}};
+        {From, std_batch_unvisited_urls} ->
+            {Headers, Body} = httpu:get_http("http://localhost:3000/detritus?size=standard"),
+            BinaryBody = list_to_binary(Body),
+            Links = mochijson2:decode(BinaryBody),
+            From ! {self(), {std_batch_unvisited_urls, Links}};
+        {From, {store_feeds, Xml}} ->
+            {ok, _Response} = httpu:post_http("http://localhost:3000/arborage", Xml);
+        {From, {store_unvisited, Urls}} ->
+            {ok, _Response} = httpu:post_http("http://localhost:3000/rake", Urls)
+    end,
+    wait().
diff --git a/src/master.erl b/src/master.erl
index 8a60dca..5e0b7f8 100644
--- a/src/master.erl
+++ b/src/master.erl
@@ -1,4 +1,5 @@
 -module(master).
+-export([start_system/1, wait/0, interface_distribute/0, pid_distribute/0, store_crawl_results/0]).
 
 %% -----------------------------------------------------------------------------------------
 
@@ -16,7 +17,7 @@ start_system(NumCrawlers) ->
 
 %% spawns a process which distributes individual interface pids in a cycle
 
-spawn_interface_distributer(Ints),
+spawn_interface_distributer(Ints) ->
     Dist = spawn(?MODULE, interface_distribute, []),
     Dist ! {self(), Ints},
     Dist.
@@ -26,7 +27,7 @@
 
 %% spawns a process which distributes individual crawler pids in a cycle
 
-spawn_pid_distributer(Pids),
+spawn_pid_distributer(Pids) ->
     Dist = spawn(?MODULE, pid_distribute, []),
     Dist ! {self(), Pids},
     Dist.
@@ -54,7 +55,7 @@ interface_distribute(Ints, Index) ->
         _ -> interface_distribute(Ints, Index)
     end,
-    ListLen = length(Ints)
+    ListLen = length(Ints),
     case (ListLen - Index) of
         0 -> interface_distribute(Ints, 1);
         _ -> interface_distribute(Ints, (Index + 1))
@@ -98,7 +99,7 @@ pid_distribute(Pids, Index) ->
 
 %% spawns the master
 
 spawn_listener(PidDist, IntDist) ->
-    Listener = spawn(?MODULE, listen, []),
+    Listener = spawn(?MODULE, wait, []),
     Listener ! {self(), {init, {PidDist, IntDist}}}.
@@ -122,14 +123,14 @@ spawn_interfaces(Num) ->
 
 %% oversees the crawler system
 
-listen() ->
+wait() ->
     receive
         {From, {init, {PidDist, IntDist}}} ->
             self() ! init,
-            listen(PidDist, IntDist)
+            wait(PidDist, IntDist)
     end.
 
-listen(PidDist, IntDist) ->
+wait(PidDist, IntDist) ->
     receive
         %% initialization of all crawlers
         init ->
@@ -149,7 +150,27 @@ listen(PidDist, IntDist) ->
             submit_to_crawler(self(), PidDist, Urls);
 
         %% store feeds
-
+        {From, {crawled, {NewLinks, Xml}}} ->
+            Str = spawn(?MODULE, store_crawl_results, []),
+            Str ! {self(), {IntDist, NewLinks, Xml}}
+    end,
+    wait(PidDist, IntDist).
+
+
+%% -----------------------------------------------------------------------------------------
+
+%% handles storing the results of a web crawl
+
+store_crawl_results() ->
+    receive
+        {From, {IntDist, NewLinks, Xml}} ->
+            IntDist ! {self(), int_request},
+            receive
+                {_, {requested_int, Int}} ->
+                    Int ! {self(), {store_feeds, Xml}},
+                    Int ! {self(), {store_unvisited, NewLinks}}
+            end
+    end.
 
 
 %% -----------------------------------------------------------------------------------------
@@ -163,7 +184,7 @@ entask_crawlers() ->
             Respondant = From
         end,
         receive
-            {From, {length_pids, L}} ->
+            {_, {length_pids, L}} ->
                LengthPids = L
         end,
         LengthUrls = length(Urls),
@@ -177,8 +198,8 @@ entask_crawlers() ->
                 submit_to_crawler(Respondant, PidDist, [X|List]),
                 {1, []};
             _ ->
-                {(Count + 1), [X|List]};
-    end.
+                {(Count + 1), [X|List]}
+        end
     end, {1, []}, Urls).
 
 %% -----------------------------------------------------------------------------------------
@@ -186,7 +207,7 @@
 
 %% submits a list of urls to a crawler, given a crawler distributer
 
-submit_to_crawler(Respondant, PidDist, List)
+submit_to_crawler(Respondant, PidDist, List) ->
     PidDist ! {self(), pid_request},
     receive
         {From, {requested_pid, Pid}} ->
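
A minimal usage sketch (not part of the patch) of the request/reply convention the modules above rely on: a caller sends {self(), Tag} and the serving process answers with {self(), {Tag, Result}}. Here `Int` is assumed to be one of the interface pids returned by interface:start/1, and the five-second timeout is an illustrative choice, not something the patch specifies.

%% asks an interface for a large batch of unvisited urls and waits for the
%% decoded json reply; gives up after five seconds
request_large_batch(Int) ->
    Int ! {self(), large_batch_unvisited_urls},
    receive
        {Int, {large_batch_unvisited_urls, Links}} ->
            {ok, Links}
    after 5000 ->
        {error, timeout}
    end.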
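For reference, a hedged sketch of the cyclic loop that pid_distribute/0 and interface_distribute/0 appear to implement; only a fragment of interface_distribute/2 is visible in this patch, so the function and variable names below are assumptions. The idea is to answer each request with the pid at the current index, then advance the index, wrapping back to 1 at the end of the list exactly as the `ListLen - Index` case above does.

%% round-robin pid distribution: answer each pid_request with the pid at
%% Index, then step the index, wrapping to the head of the list
distribute(Pids, Index) ->
    receive
        {From, pid_request} ->
            From ! {self(), {requested_pid, lists:nth(Index, Pids)}}
    end,
    case length(Pids) - Index of
        0 -> distribute(Pids, 1);
        _ -> distribute(Pids, Index + 1)
    end.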