From eb12b70afe1c95021fd44a14770e4131b378f70f Mon Sep 17 00:00:00 2001 From: Jacob Vorreuter Date: Fri, 1 May 2009 14:33:07 -0700 Subject: [PATCH] expanding instruction list --- Makefile | 1 + public/gracenote.html | 0 public/gracenote_album_2021ab38530960de.html | 405 ++++++++++++++++ public/gracenote_album_b917d0542e3ab9a7.html | 453 +++++++++++++++++ public/gracenote_albums.html | 480 +++++++++++++++++++ src/ex_consumer.erl | 87 +++- src/ex_engine.erl | 44 ++ src/ex_re.erl | 3 + src/ex_web.erl | 4 +- src/ex_xpath.erl | 10 +- t/Makefile | 9 + t/excavator_t_004.t | 51 ++ t/test_server.erl | 24 + 13 files changed, 1547 insertions(+), 24 deletions(-) delete mode 100644 public/gracenote.html create mode 100644 public/gracenote_album_2021ab38530960de.html create mode 100644 public/gracenote_album_b917d0542e3ab9a7.html create mode 100644 public/gracenote_albums.html create mode 100644 src/ex_engine.erl create mode 100644 t/Makefile create mode 100644 t/excavator_t_004.t create mode 100644 t/test_server.erl diff --git a/Makefile b/Makefile index c5493df..f50fe20 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ all: mkdir -p ebin/ #(cd templates;$(MAKE)) (cd src;$(MAKE)) + (cd t;$(MAKE)) test: all prove -v t/*.t diff --git a/public/gracenote.html b/public/gracenote.html deleted file mode 100644 index e69de29..0000000 diff --git a/public/gracenote_album_2021ab38530960de.html b/public/gracenote_album_2021ab38530960de.html new file mode 100644 index 0000000..cd87e2b --- /dev/null +++ b/public/gracenote_album_2021ab38530960de.html @@ -0,0 +1,405 @@ + + + + + +Gracenote: Album > The Beatles - The Beatles (White Album) [Disc 2] + + + + + + + + + + + + + + + + + + + +
+

The Beatles

+ + + + + +
+
+
+ +
Album > The Beatles (White Album) [Disc 2]
+ + + +
Artist > The Beatles
+ + +
Year of Release > 1968
+ + +
Label > Apple
+ +
+
+
+ + +
+ + + +
+ +
+ + +
+ + +
+ + + +

+Member of MOG Music Network + + +

+
+ + + +
+ +
+ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/public/gracenote_album_b917d0542e3ab9a7.html b/public/gracenote_album_b917d0542e3ab9a7.html new file mode 100644 index 0000000..3608588 --- /dev/null +++ b/public/gracenote_album_b917d0542e3ab9a7.html @@ -0,0 +1,453 @@ + + + + + +Gracenote: Album > The Beatles - The Beatles (White Album) [Disc 1] + + + + + + + + + + + + + + + + + + + +
+

The Beatles

+ + + + + +
+
+
+ +
Album > The Beatles (White Album) [Disc 1]
+ + + +
Artist > The Beatles
+ + +
Year of Release > 1968
+ + +
Label > Apple
+ +
+
+
+ + +
+ + + +
+ +
+ + +
+ + +
+ + + +

+Member of MOG Music Network + + +

+
+ + + +
+ +
+ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/public/gracenote_albums.html b/public/gracenote_albums.html new file mode 100644 index 0000000..45d07a5 --- /dev/null +++ b/public/gracenote_albums.html @@ -0,0 +1,480 @@ + + + + + + +Gracenote: Music Search > Results for beatles + + + + + + + + + + + + + + + + + + + +
+ +

Gracenote: Music Search > Results for beatles

+ + + +
+ To search the Gracenote Web site for music, enter the artist, album, or track, and click the "Search" button. +

+

+ + + +
+ +
+
+ Search results for beatles
+ Displaying results 1 - 20 of 23 results. +

+


+ +
1   |   2      »      
+
+
+
+
+ +
+ + + + + +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ + + +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ + + + +
+ +
+ +
Current Gracenote Media Database Stats
7,620,519 CDs
+ 97,206,484 Tracks
+ +
+ + + +

+Member of MOG Music Network + + +

+ +
+
+ +
+ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/ex_consumer.erl b/src/ex_consumer.erl index 7f107ba..6da63e3 100644 --- a/src/ex_consumer.erl +++ b/src/ex_consumer.erl @@ -1,10 +1,11 @@ -module(ex_consumer). --export([execute/2, fetch/3, assign/3, assert/3, commit/3, each/4, configure/3]). +-export([execute/2, fetch/3, assign/3, assert/3, commit/3, commit/4, each/4, configure/3, function/2, print/2, onfail/3]). -include("excavator.hrl"). %% @spec execute(instr(), State) -> State1 execute({instr, Function, Args}, State) -> + ?INFO_MSG("~p(~p)~n", [Function, [state|Args]]), apply(?MODULE, Function, [State|Args]). %% template functions @@ -13,51 +14,97 @@ fetch(State, Key, {Method, Url}) when Method==options;Method==get;Method==head;M fetch(State, Key, {Method, Url, Headers}) when Method==options;Method==get;Method==head;Method==delete;Method==trace -> fetch(State, Key, {Method, Url, Headers, []}); fetch(State, Key, {Method, Url, Headers, Body}) -> - Response = ex_web:request(Method, Url, Headers, Body), + Url1 = lists:flatten([begin + case I of + String when is_list(String) -> String; + Atom when is_atom(Atom) -> to_string(?FETCH(State, Atom)); + Other -> Other + end + end || I <- Url]), + io:format("url: ~p~n", [Url1]), + Response = ex_web:request(Method, Url1, Headers, Body), ?STORE(State, Key, Response). assign(State, Key, Term) -> - ?STORE(State, Key, evaluate(State, Term)). + ?STORE(State, Key, compute(State, Term)). assert(State, Key, Assertion) -> assert_true(?FETCH(State, Key), Assertion), State. -commit(State, _Key, _Value) -> +commit(State, Key, Value) -> + %Key1 = evaluate(State, Key), + Value1 = evaluate(State, Value), + io:format("commit ~p:~p~n", [Key, Value1]), %% commit Key/Value to CouchDB or some disk-based key/value store State. + +commit(State, Key, Value, {CallbackModule, CallbackFunction}) -> + %Key1 = evaluate(State, Key), + Value1 = evaluate(State, Value), + io:format("commit ~p:~p~n", [Key, Value1]), + spawn(CallbackModule, CallbackFunction, [Key, Value1]), + State. -each(#state{stack=Stack, instructions=OldInstructions}=State, Key, Source, NewInstructions) -> - case ?FETCH(Source) of - Val when Val==undefined; Val==[] -> +each(#state{stack=Stack}=State, Key, Source, NewInstructions) -> + case ?FETCH(State, Source) of + {_Type, Val} when Val==undefined; Val==[] -> exit({?MODULE, ?LINE, fetch_failed, Source, Val}); - [Val] -> %% last item in each list - NewState = ?STORE(State, Key, Val), + {Type, [Val]} -> %% last item in list + NewState = ?STORE(State, Key, typify_value(Type, Val)), NewState#state{instructions=NewInstructions}; - [Val|Tail] -> - OldState0 = ?STORE(State, Source, Tail), %% insert list tail for source key - %% add this instruction to head of list so that when stack is popped - %% and this state begins processing again, the each list is the next instruction still - OldState1 = OldState0#state{instructions=[{instr, each, [Key, Source, NewInstructions]}|OldInstructions]}, - NewState = ?STORE(State, Key, Val), - NewState#state{instructions=NewInstructions, stack=[OldState1|Stack]} %% push old state on to stack of new state + {Type, [Val|Tail]} -> + OldState = ?STORE(State, Source, {Type, Tail}), %% insert list tail for source key + NewState = ?STORE(State, Key, typify_value(Type, Val)), %% insert item from source + NewState#state{instructions=NewInstructions, stack=[OldState|Stack]} %% push old state on to stack of new state %% next instruction processed will be first from Instructions end. configure(State, Key, Value) -> ?CONFIGURE(State, Key, Value). +function(State, Fun) when is_function(Fun) -> + Fun(State), + State. + +print(State, Key) -> + error_logger:info_report({print, ?FETCH(State, Key)}), + State. + +onfail(#state{stack=Stack}=State, AttemptInstrs, _FailInstr) when is_list(AttemptInstrs) -> + State#state{instructions=AttemptInstrs, stack=[State|Stack]}. + %% internal functions -evaluate(State, {xpath, Source, XPath}) -> +compute(State, {xpath, Source, XPath}) -> ex_xpath:run(XPath, ?FETCH(State, Source)); -evaluate(State, {regexp, Source, Regexp}) -> +compute(State, {regexp, Source, Regexp}) -> ex_re:run(Regexp, ?FETCH(State, Source)). +evaluate(State, Tuple) when is_tuple(Tuple) -> + list_to_tuple([evaluate(State, I) || I <- tuple_to_list(Tuple)]); +evaluate(State, Key) when is_atom(Key) -> + case ?FETCH(State, Key) of undefined -> Key; {_, Other} -> Other end; +evaluate(_State, Other) -> Other. + assert_true({nil, Key}, nil) when Key==[]; Key==undefined -> ok; -assert_true({string, Key}, string) when is_list(Key) -> ok; +assert_true({string, Key}, string) when is_list(Key), length(Key) > 0 -> ok; assert_true({node, Key}, node) when is_tuple(Key) -> ok; +assert_true(Key, node) when is_tuple(Key) -> ok; assert_true({list_of_strings, Key}, list_of_strings) when is_list(Key) -> [assert_true(Item, string) || Item <- Key], ok; assert_true({list_of_nodes, Key}, list_of_nodes) when is_list(Key) -> [assert_true(Item, node) || Item <- Key], ok; +assert_true({http_response, _S, _H, Body}, string) when is_list(Body), length(Body) > 0 -> ok; +assert_true({http_response, Status, _H, _B}, {status, Status}) -> ok; assert_true(Key, Assertion) -> exit({?MODULE, assertion_failed, {Key, Assertion}}). - +typify_value(list_of_strings, {string, String}) -> + {string, String}; +typify_value(list_of_strings, String) when is_list(String) -> + {string, String}; +typify_value(list_of_nodes, {node, Node}) -> + {node, Node}; +typify_value(list_of_nodes, Node) -> + {node, Node}. + +to_string(List) when is_list(List) -> List; +to_string({string, String}) -> String; +to_string({node, Node}) -> to_string(ex_xpath:reassemble({node, Node})). \ No newline at end of file diff --git a/src/ex_engine.erl b/src/ex_engine.erl new file mode 100644 index 0000000..88b4b8d --- /dev/null +++ b/src/ex_engine.erl @@ -0,0 +1,44 @@ +-module(ex_engine). +-export([run/1]). + +-include("excavator.hrl"). + +%% when a list of instructions is passed in, create a +%% new state and begin processing +run(Instructions) when is_list(Instructions) -> + run(#state{instructions=Instructions}); + +%% finished running +run(#state{instructions=[], stack=[]}) -> + ok; + +%% finished local instruction set, pop stack +run(#state{instructions=[], stack=[#state{instructions=[{instr, onfail, _}|Instrs]}=S|_]}) -> + run(S#state{instructions=Instrs}); + +run(#state{instructions=[], stack=[OldState|_]}) -> + run(OldState); + +run(#state{instructions=[Instr|Instrs], stack=Stack}=State) -> + NewState0 = + case keep_instruction(Instr) of + true -> State#state{instructions=[Instr|Instrs]}; + false -> State#state{instructions=Instrs} + end, + case catch ex_consumer:execute(Instr, NewState0) of + {'EXIT', Error} -> + case Stack of + [#state{instructions=[{instr, onfail, [_, FailInstrs]}|InstrsTail]}=OldState|StackTail] -> + NewStack = [OldState#state{instructions=InstrsTail}|StackTail], + NewState1 = #state{instructions=FailInstrs, stack=NewStack}, + run(NewState1); + _ -> + exit(Error) %% perhaps someone else will enjoy this + end; + NewState1 -> + run(NewState1) + end. + +%% internal functions +keep_instruction({instr, Instr, _}) when Instr==each; Instr==onfail -> true; +keep_instruction(_) -> false. \ No newline at end of file diff --git a/src/ex_re.erl b/src/ex_re.erl index 9ff1075..e21ca16 100644 --- a/src/ex_re.erl +++ b/src/ex_re.erl @@ -5,6 +5,9 @@ %% Regexp = {re_pattern, _, _, _} %% Subject = {Type, Value} %% Result = {nil, _} | {string, _} | {list_of_strings, _} +run(Regexp, {node, Subject}) when is_tuple(Regexp), is_tuple(Subject) -> + run(Regexp, ex_xpath:reassemble({node, Subject})); + run(Regexp, {string, Subject}) when is_tuple(Regexp), is_list(Subject) -> case re:run(Subject, Regexp, [global]) of nomatch -> diff --git a/src/ex_web.erl b/src/ex_web.erl index 7d5e9c5..33b5f42 100644 --- a/src/ex_web.erl +++ b/src/ex_web.erl @@ -17,13 +17,13 @@ request(Method, Url, [], Body) -> request(Method, Url, ?HEADERS, Body); request(Method, Url, Headers, []) when Method==options;Method==get;Method==head;Method==delete;Method==trace -> case http:request(Method, {Url, Headers}, [], []) of - {ok, Response} -> {string, Response}; + {ok, {{_,RspStatus,_}, RspHeaders, RspBody}} -> {http_response, RspStatus, RspHeaders, RspBody}; {error, Reason} -> exit({?MODULE, ?LINE, Reason}) end; request(Method, Url, Headers, Body) when Method == post; Method == put -> case http:request(Method, {Url, Headers, "text/html", Body}, [], []) of - {ok, Response} -> {string, Response}; + {ok, {{_,RspStatus,_}, RspHeaders, RspBody}} -> {http_response, RspStatus, RspHeaders, RspBody}; {error, Reason} -> exit({?MODULE, ?LINE, Reason}) end. \ No newline at end of file diff --git a/src/ex_xpath.erl b/src/ex_xpath.erl index 8171d32..f908022 100644 --- a/src/ex_xpath.erl +++ b/src/ex_xpath.erl @@ -1,11 +1,14 @@ -module(ex_xpath). --export([run/2]). +-export([run/2, reassemble/1]). %% @spec run(XPath, Subject) -> Result %% XPath = string() %% Subject = {Type, Value} %% Result = {nil, []} | {node, _} | {list_of_nodes, _} | {string, _} | {list_of_strings, _} +run(XPath, {http_response, _, _, Body}) -> + run(XPath, {string, Body}); + run(XPath, {string, Subject0}) when is_list(XPath), is_list(Subject0) -> case mochiweb_html:parse(Subject0) of Subject when is_tuple(Subject) -> @@ -32,4 +35,7 @@ run(XPath, {node, Subject}) when is_list(XPath), is_tuple(Subject) -> {list_of_strings, [binary_to_list(Bin) || Bin <- List]}; _ -> exit({?MODULE, ?LINE, XPath, Subject}) - end. \ No newline at end of file + end. + +reassemble({node, Node}) -> + {string, binary_to_list(iolist_to_binary(mochiweb_html:to_html(Node)))}. \ No newline at end of file diff --git a/t/Makefile b/t/Makefile new file mode 100644 index 0000000..c56f980 --- /dev/null +++ b/t/Makefile @@ -0,0 +1,9 @@ +include ../support/include.mk + +all: $(EBIN_FILES) + +debug: + $(MAKE) DEBUG=-DDEBUG + +clean: + rm -rf $(EBIN_FILES) erl_crash.dump diff --git a/t/excavator_t_004.t b/t/excavator_t_004.t new file mode 100644 index 0000000..6bb511c --- /dev/null +++ b/t/excavator_t_004.t @@ -0,0 +1,51 @@ +#!/usr/local/bin/escript +%% -*- erlang -*- +%%! -pa ebin -sasl errlog_type error -boot start_sasl -noshell + +main(_) -> + etap:plan(unknown), + case (catch start()) of + {'EXIT', Err} -> + io:format("# ~p~n", [Err]), + etap:bail(); + _ -> + etap:end_tests() + end, + ok. + +start() -> + application:start(inets), + test_server:start_link(), + Instrs = + [ {instr, configure, [qps, 10]}, + {instr, fetch, [artist_page, {get, "http://127.0.0.1:8888/gracenote_albums.html"}]}, + {instr, assert, [artist_page, {status, 200}]}, + {instr, assert, [artist_page, string]}, + {instr, assign, [albums, {xpath, artist_page, "//div[@class='album-meta-data-wrapper']"}]}, + {instr, assert, [albums, list_of_nodes]}, + {instr, each, [album, albums, [ + {instr, assign, [album_href, {xpath, album, "//a[1]/@href"}]}, + {instr, assign, [album_id, {regexp, album_href, compile_re("tui_id=(.*)tui")}]}, + {instr, assert, [album_id, string]}, + {instr, fetch, [album_page, {get, ["http://127.0.0.1:8888/gracenote_album_", album_id, ".html"]}]}, + {instr, onfail, [ + [ {instr, assert, [album_page, {status, 200}]}, + {instr, assert, [album_page, string]}, + {instr, assign, [album_name_node, {xpath, album_page, "//div[@class='album-name']"}]}, + {instr, assert, [album_name_node, node]}, + {instr, assign, [album_name, {regexp, album_name_node, compile_re(" > (.*)")}]}, + {instr, assert, [album_name, string]}, + {instr, commit, [{album, beatles}, {album_id, album_name}]}, + {instr, print, [album_name]} + ], + [{instr, function, [fun(S) -> io:format("This shit fAiLeD~n") end]}] + ]} + ]]} + ], + + ex_engine:run(Instrs), + + ok. + +compile_re(Regexp) -> + {ok, RE} = re:compile(Regexp), RE. \ No newline at end of file diff --git a/t/test_server.erl b/t/test_server.erl new file mode 100644 index 0000000..0e2ccb4 --- /dev/null +++ b/t/test_server.erl @@ -0,0 +1,24 @@ +-module(test_server). +-behaviour(web_server). + +-export([start_link/0]). + +%% web_server callbacks +-export([dispatch/2]). + +start_link() -> + web_server:start(?MODULE, [{address, "127.0.0.1"}, {port, 8888}]). + +%%==================================================================== +%% web_server callbacks +%%==================================================================== + +%%-------------------------------------------------------------------- +%% Function: dispatch(Req, PathTokens) -> {reply, Status, Headers, Body} | +%% {reply, Module, Function, Args} | +%% undefined +%% Req = mochiweb_request() +%% PathTokens = list() +%%-------------------------------------------------------------------- +dispatch(_, _) -> + undefined. \ No newline at end of file