Skip to content
Browse files

0.4.4

  • Loading branch information...
1 parent 2de5c90 commit 6c24a9667ffbbf368907d5ce85c7ed7ca03ee195 @johnnagro committed May 21, 2009
View
6 CHANGES
@@ -1,3 +1,9 @@
+2009-05-21
+* fixed an issue with robots.txt on ssl hosts
+* fixed an issue with pulling robots.txt from disallowed hosts
+* fixed a documentation error with ExpiredLinks
+* Many thanks to Brian Campbell
+
2008-10-09
* fixed a situation with nested slashes in urls, thanks to Sander van der Vliet and John Buckley
View
6 README
@@ -62,13 +62,12 @@ scraping, collecting, and looping so that you can just handle the data.
=== Track cycles with a custom object
require 'spider'
-
class ExpireLinks < Hash
def <<(v)
- [v] = Time.now
+ self[v] = Time.now
end
def include?(v)
- [v] && (Time.now + 86400) <= [v]
+ self[v].kind_of?(Time) && (self[v] + 86400) >= Time.now
end
end
@@ -141,6 +140,7 @@ Matt Horan
Henri Cook
Sander van der Vliet
John Buckley
+Brian Campbell
With `robot_rules' from James Edward Gray II via
http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
View
63 doc/classes/IncludedInMemcached.html
@@ -81,8 +81,8 @@
<div id="description">
<p>
A specialized class using memcached to track items stored. It supports
-three operations: <a href="IncludedInMemcached.html#M000001">new</a>,
-&lt;&lt;, and <a href="IncludedInMemcached.html#M000003">include?</a> .
+three operations: <a href="IncludedInMemcached.html#M000015">new</a>,
+&lt;&lt;, and <a href="IncludedInMemcached.html#M000017">include?</a> .
Together these can be used to add items to the memcache, then determine
whether the item has been added.
</p>
@@ -105,9 +105,9 @@
<h3 class="section-bar">Methods</h3>
<div class="name-list">
- <a href="#M000002">&lt;&lt;</a>&nbsp;&nbsp;
- <a href="#M000003">include?</a>&nbsp;&nbsp;
- <a href="#M000001">new</a>&nbsp;&nbsp;
+ <a href="#M000016">&lt;&lt;</a>&nbsp;&nbsp;
+ <a href="#M000017">include?</a>&nbsp;&nbsp;
+ <a href="#M000015">new</a>&nbsp;&nbsp;
</div>
</div>
@@ -129,41 +129,33 @@ <h3 class="section-bar">Methods</h3>
<div id="methods">
<h3 class="section-bar">Public Class methods</h3>
- <div id="method-M000001" class="method-detail">
- <a name="M000001"></a>
+ <div id="method-M000015" class="method-detail">
+ <a name="M000015"></a>
<div class="method-heading">
- <a href="#M000001" class="method-signature">
+ <a href="IncludedInMemcached.src/M000015.html" target="Code" class="method-signature"
+ onclick="popupCode('IncludedInMemcached.src/M000015.html');return false;">
<span class="method-name">new</span><span class="method-args">(*a)</span>
</a>
</div>
<div class="method-description">
<p>
-Construct a <a href="IncludedInMemcached.html#M000001">new</a> <a
+Construct a <a href="IncludedInMemcached.html#M000015">new</a> <a
href="IncludedInMemcached.html">IncludedInMemcached</a> instance. All
arguments here are passed to MemCache (part of the memcache-client gem).
</p>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000001-source">
-<pre>
-<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 39</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
- <span class="ruby-ivar">@c</span> = <span class="ruby-constant">MemCache</span>.<span class="ruby-identifier">new</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
- <span class="ruby-keyword kw">end</span>
-</pre>
- </div>
</div>
</div>
<h3 class="section-bar">Public Instance methods</h3>
- <div id="method-M000002" class="method-detail">
- <a name="M000002"></a>
+ <div id="method-M000016" class="method-detail">
+ <a name="M000016"></a>
<div class="method-heading">
- <a href="#M000002" class="method-signature">
+ <a href="IncludedInMemcached.src/M000016.html" target="Code" class="method-signature"
+ onclick="popupCode('IncludedInMemcached.src/M000016.html');return false;">
<span class="method-name">&lt;&lt;</span><span class="method-args">(v)</span>
</a>
</div>
@@ -172,24 +164,15 @@ <h3 class="section-bar">Public Instance methods</h3>
<p>
Add an item to the memcache.
</p>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000002-source">
-<pre>
-<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 44</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-operator">&lt;&lt;</span>(<span class="ruby-identifier">v</span>)
- <span class="ruby-ivar">@c</span>.<span class="ruby-identifier">add</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>, <span class="ruby-identifier">v</span>)
- <span class="ruby-keyword kw">end</span>
-</pre>
- </div>
</div>
</div>
- <div id="method-M000003" class="method-detail">
- <a name="M000003"></a>
+ <div id="method-M000017" class="method-detail">
+ <a name="M000017"></a>
<div class="method-heading">
- <a href="#M000003" class="method-signature">
+ <a href="IncludedInMemcached.src/M000017.html" target="Code" class="method-signature"
+ onclick="popupCode('IncludedInMemcached.src/M000017.html');return false;">
<span class="method-name">include?</span><span class="method-args">(v)</span>
</a>
</div>
@@ -198,16 +181,6 @@ <h3 class="section-bar">Public Instance methods</h3>
<p>
True if the item is in the memcache.
</p>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000003-source">
-<pre>
-<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 49</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">include?</span>(<span class="ruby-identifier">v</span>)
- <span class="ruby-ivar">@c</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>) <span class="ruby-operator">==</span> <span class="ruby-identifier">v</span>
- <span class="ruby-keyword kw">end</span>
-</pre>
- </div>
</div>
</div>
View
22 doc/classes/Spider.html
@@ -93,7 +93,7 @@
<h3 class="section-bar">Methods</h3>
<div class="name-list">
- <a href="#M000011">start_at</a>&nbsp;&nbsp;
+ <a href="#M000029">start_at</a>&nbsp;&nbsp;
</div>
</div>
@@ -115,11 +115,12 @@ <h3 class="section-bar">Methods</h3>
<div id="methods">
<h3 class="section-bar">Public Class methods</h3>
- <div id="method-M000011" class="method-detail">
- <a name="M000011"></a>
+ <div id="method-M000029" class="method-detail">
+ <a name="M000029"></a>
<div class="method-heading">
- <a href="#M000011" class="method-signature">
+ <a href="Spider.src/M000029.html" target="Code" class="method-signature"
+ onclick="popupCode('Spider.src/M000029.html');return false;">
<span class="method-name">start_at</span><span class="method-args">(a_url, &amp;block)</span>
</a>
</div>
@@ -151,19 +152,6 @@ <h3 class="section-bar">Public Class methods</h3>
end
end
</pre>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000011-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000011-source">
-<pre>
-<span class="ruby-comment cmt"># File lib/spider.rb, line 54</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
- <span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
- <span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>({<span class="ruby-keyword kw">nil</span> =<span class="ruby-operator">&gt;</span> <span class="ruby-identifier">a_url</span>}, [], <span class="ruby-identifier">rules</span>, [])
- <span class="ruby-identifier">block</span>.<span class="ruby-identifier">call</span>(<span class="ruby-identifier">a_spider</span>)
- <span class="ruby-identifier">a_spider</span>.<span class="ruby-identifier">start!</span>
- <span class="ruby-keyword kw">end</span>
-</pre>
- </div>
</div>
</div>
View
180 doc/classes/SpiderInstance.html
@@ -86,13 +86,14 @@
<h3 class="section-bar">Methods</h3>
<div class="name-list">
- <a href="#M000004">add_url_check</a>&nbsp;&nbsp;
- <a href="#M000005">check_already_seen_with</a>&nbsp;&nbsp;
- <a href="#M000010">clear_headers</a>&nbsp;&nbsp;
- <a href="#M000009">headers</a>&nbsp;&nbsp;
- <a href="#M000006">on</a>&nbsp;&nbsp;
- <a href="#M000007">setup</a>&nbsp;&nbsp;
- <a href="#M000008">teardown</a>&nbsp;&nbsp;
+ <a href="#M000021">add_url_check</a>&nbsp;&nbsp;
+ <a href="#M000022">check_already_seen_with</a>&nbsp;&nbsp;
+ <a href="#M000028">clear_headers</a>&nbsp;&nbsp;
+ <a href="#M000027">headers</a>&nbsp;&nbsp;
+ <a href="#M000024">on</a>&nbsp;&nbsp;
+ <a href="#M000025">setup</a>&nbsp;&nbsp;
+ <a href="#M000023">store_next_urls_with</a>&nbsp;&nbsp;
+ <a href="#M000026">teardown</a>&nbsp;&nbsp;
</div>
</div>
@@ -114,11 +115,12 @@ <h3 class="section-bar">Methods</h3>
<div id="methods">
<h3 class="section-bar">Public Instance methods</h3>
- <div id="method-M000004" class="method-detail">
- <a name="M000004"></a>
+ <div id="method-M000021" class="method-detail">
+ <a name="M000021"></a>
<div class="method-heading">
- <a href="#M000004" class="method-signature">
+ <a href="SpiderInstance.src/M000021.html" target="Code" class="method-signature"
+ onclick="popupCode('SpiderInstance.src/M000021.html');return false;">
<span class="method-name">add_url_check</span><span class="method-args">(&amp;block)</span>
</a>
</div>
@@ -136,24 +138,15 @@ <h3 class="section-bar">Public Instance methods</h3>
<pre>
add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
</pre>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000004-source">
-<pre>
-<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 70</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
- <span class="ruby-ivar">@url_checks</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">block</span>
- <span class="ruby-keyword kw">end</span>
-</pre>
- </div>
</div>
</div>
- <div id="method-M000005" class="method-detail">
- <a name="M000005"></a>
+ <div id="method-M000022" class="method-detail">
+ <a name="M000022"></a>
<div class="method-heading">
- <a href="#M000005" class="method-signature">
+ <a href="SpiderInstance.src/M000022.html" target="Code" class="method-signature"
+ onclick="popupCode('SpiderInstance.src/M000022.html');return false;">
<span class="method-name">check_already_seen_with</span><span class="method-args">(cacher)</span>
</a>
</div>
@@ -171,7 +164,7 @@ <h3 class="section-bar">Public Instance methods</h3>
</p>
<p>
You can implement a custom class for this; any object passed to <a
-href="SpiderInstance.html#M000005">check_already_seen_with</a> must
+href="SpiderInstance.html#M000022">check_already_seen_with</a> must
understand just &lt;&lt; and include? .
</p>
<pre>
@@ -182,54 +175,32 @@ <h3 class="section-bar">Public Instance methods</h3>
require 'spider/included_in_memcached'
check_already_seen_with IncludedInMemcached.new('localhost:11211')
</pre>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000005-source">
-<pre>
-<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 91</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">check_already_seen_with</span>(<span class="ruby-identifier">cacher</span>)
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:&lt;&lt;</span>) <span class="ruby-operator">&amp;&amp;</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:include?</span>)
- <span class="ruby-ivar">@seen</span> = <span class="ruby-identifier">cacher</span>
- <span class="ruby-keyword kw">else</span>
- <span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">'expected something that responds to &lt;&lt; and included?'</span>
- <span class="ruby-keyword kw">end</span>
- <span class="ruby-keyword kw">end</span>
-</pre>
- </div>
</div>
</div>
- <div id="method-M000010" class="method-detail">
- <a name="M000010"></a>
+ <div id="method-M000028" class="method-detail">
+ <a name="M000028"></a>
<div class="method-heading">
- <a href="#M000010" class="method-signature">
+ <a href="SpiderInstance.src/M000028.html" target="Code" class="method-signature"
+ onclick="popupCode('SpiderInstance.src/M000028.html');return false;">
<span class="method-name">clear_headers</span><span class="method-args">()</span>
</a>
</div>
<div class="method-description">
<p>
-Reset the <a href="SpiderInstance.html#M000009">headers</a> hash.
+Reset the <a href="SpiderInstance.html#M000027">headers</a> hash.
</p>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000010-source">
-<pre>
-<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 158</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">clear_headers</span>
- <span class="ruby-ivar">@headers</span> = {}
- <span class="ruby-keyword kw">end</span>
-</pre>
- </div>
</div>
</div>
- <div id="method-M000009" class="method-detail">
- <a name="M000009"></a>
+ <div id="method-M000027" class="method-detail">
+ <a name="M000027"></a>
<div class="method-heading">
- <a href="#M000009" class="method-signature">
+ <a href="SpiderInstance.src/M000027.html" target="Code" class="method-signature"
+ onclick="popupCode('SpiderInstance.src/M000027.html');return false;">
<span class="method-name">headers</span><span class="method-args">()</span>
</a>
</div>
@@ -241,24 +212,15 @@ <h3 class="section-bar">Public Instance methods</h3>
<pre>
headers['Cookies'] = 'user_id=1;password=btrross3'
</pre>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000009-source">
-<pre>
-<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 146</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">headers</span>
- <span class="ruby-constant">HeaderSetter</span>.<span class="ruby-identifier">new</span>(<span class="ruby-keyword kw">self</span>)
- <span class="ruby-keyword kw">end</span>
-</pre>
- </div>
</div>
</div>
- <div id="method-M000006" class="method-detail">
- <a name="M000006"></a>
+ <div id="method-M000024" class="method-detail">
+ <a name="M000024"></a>
<div class="method-heading">
- <a href="#M000006" class="method-signature">
+ <a href="SpiderInstance.src/M000024.html" target="Code" class="method-signature"
+ onclick="popupCode('SpiderInstance.src/M000024.html');return false;">
<span class="method-name">on</span><span class="method-args">(code, p = nil, &amp;block)</span>
</a>
</div>
@@ -290,30 +252,15 @@ <h3 class="section-bar">Public Instance methods</h3>
puts &quot;Given this code: #{resp.code}&quot;
end
</pre>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000006-source">
-<pre>
-<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 121</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
- <span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
- <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
- <span class="ruby-keyword kw">when</span> <span class="ruby-constant">Fixnum</span>
- <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>] = <span class="ruby-identifier">f</span>
- <span class="ruby-keyword kw">else</span>
- <span class="ruby-ivar">@callbacks</span>[<span class="ruby-identifier">code</span>.<span class="ruby-identifier">to_sym</span>] = <span class="ruby-identifier">f</span>
- <span class="ruby-keyword kw">end</span>
- <span class="ruby-keyword kw">end</span>
-</pre>
- </div>
</div>
</div>
- <div id="method-M000007" class="method-detail">
- <a name="M000007"></a>
+ <div id="method-M000025" class="method-detail">
+ <a name="M000025"></a>
<div class="method-heading">
- <a href="#M000007" class="method-signature">
+ <a href="SpiderInstance.src/M000025.html" target="Code" class="method-signature"
+ onclick="popupCode('SpiderInstance.src/M000025.html');return false;">
<span class="method-name">setup</span><span class="method-args">(p = nil, &amp;block)</span>
</a>
</div>
@@ -327,24 +274,51 @@ <h3 class="section-bar">Public Instance methods</h3>
headers['Cookies'] = 'user_id=1;admin=true'
end
</pre>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000007-source">
+ </div>
+ </div>
+
+ <div id="method-M000023" class="method-detail">
+ <a name="M000023"></a>
+
+ <div class="method-heading">
+ <a href="SpiderInstance.src/M000023.html" target="Code" class="method-signature"
+ onclick="popupCode('SpiderInstance.src/M000023.html');return false;">
+ <span class="method-name">store_next_urls_with</span><span class="method-args">(a_store)</span>
+ </a>
+ </div>
+
+ <div class="method-description">
+ <p>
+The Web is a really, really, really big graph; as such, this list of nodes
+to visit grows really, really, really big.
+</p>
+<p>
+Change the object used to store nodes we have yet to walk. The default
+object is an instance of Array. Available with <a
+href="Spider.html">Spider</a> is a wrapper of AmazonSQS.
+</p>
+<p>
+You can implement a custom class for this; any object passed to <a
+href="SpiderInstance.html#M000023">store_next_urls_with</a> must
+understand just push and pop .
+</p>
<pre>
-<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 135</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">setup</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
- <span class="ruby-ivar">@setup</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
- <span class="ruby-keyword kw">end</span>
+ # default
+ store_next_urls_with Array.new
+
+ # AmazonSQS
+ require 'spider/next_urls_in_sqs'
+ store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
</pre>
- </div>
</div>
</div>
- <div id="method-M000008" class="method-detail">
- <a name="M000008"></a>
+ <div id="method-M000026" class="method-detail">
+ <a name="M000026"></a>
<div class="method-heading">
- <a href="#M000008" class="method-signature">
+ <a href="SpiderInstance.src/M000026.html" target="Code" class="method-signature"
+ onclick="popupCode('SpiderInstance.src/M000026.html');return false;">
<span class="method-name">teardown</span><span class="method-args">(p = nil, &amp;block)</span>
</a>
</div>
@@ -353,16 +327,6 @@ <h3 class="section-bar">Public Instance methods</h3>
<p>
Run last, once for each page. Given the URL as a string.
</p>
- <p><a class="source-toggle" href="#"
- onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
- <div class="method-source-code" id="M000008-source">
-<pre>
-<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 140</span>
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">teardown</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
- <span class="ruby-ivar">@teardown</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
- <span class="ruby-keyword kw">end</span>
-</pre>
- </div>
</div>
</div>
View
2 doc/created.rid
@@ -1 +1 @@
-Sat, 10 Nov 2007 00:25:19 -0500
+Thu, 21 May 2009 15:42:01 +0000
View
30 doc/files/lib/spider/included_in_memcached_rb.html
@@ -56,7 +56,7 @@
</tr>
<tr class="top-aligned-row">
<td><strong>Last Update:</strong></td>
- <td>Sat Nov 10 00:24:11 -0500 2007</td>
+ <td>Thu May 21 13:19:06 +0000 2009</td>
</tr>
</table>
</div>
@@ -72,6 +72,34 @@
<p>
Use memcached to track cycles.
</p>
+<p>
+Copyright 2007 Mike Burns Redistribution and use in source and binary
+forms, with or without modification, are permitted provided that the
+following conditions are met:
+</p>
+<pre>
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name Mike Burns nor the
+ names of his contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+</pre>
+<p>
+THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS&#8217;&#8217; AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
+</p>
</div>
View
3 doc/files/lib/spider/spider_instance_rb.html
@@ -56,7 +56,7 @@
</tr>
<tr class="top-aligned-row">
<td><strong>Last Update:</strong></td>
- <td>Sat Nov 10 00:25:04 -0500 2007</td>
+ <td>Thu May 21 15:38:44 +0000 2009</td>
</tr>
</table>
</div>
@@ -79,7 +79,6 @@
<h3 class="section-bar">Required files</h3>
<div class="name-list">
- robot_rules&nbsp;&nbsp;
open-uri&nbsp;&nbsp;
uri&nbsp;&nbsp;
net/http&nbsp;&nbsp;
View
49 doc/files/lib/spider_rb.html
@@ -56,7 +56,7 @@
</tr>
<tr class="top-aligned-row">
<td><strong>Last Update:</strong></td>
- <td>Thu Nov 08 17:29:01 -0500 2007</td>
+ <td>Thu May 21 13:19:06 +0000 2009</td>
</tr>
</table>
</div>
@@ -70,9 +70,12 @@
<div id="description">
<p>
-Copyright 2007 Mike Burns <a href="../../classes/Spider.html">Spider</a>, a
-Web spidering library for Ruby. It handles the robots.txt, scraping,
-collecting, and looping so that you can just handle the data.
+Copyright 2007-2008 Mike Burns &amp; John Nagro
+</p>
+<p>
+<a href="../../classes/Spider.html">Spider</a>, a Web spidering library for
+Ruby. It handles the robots.txt, scraping, collecting, and looping so that
+you can just handle the data.
</p>
<h2>Examples</h2>
<h3>Crawl the Web, loading each page in turn, until you run out of memory</h3>
@@ -133,20 +136,44 @@
<h3>Track cycles with a custom object</h3>
<pre>
require 'spider'
-
class ExpireLinks &lt; Hash
def &lt;&lt;(v)
- [v] = Time.now
+ self[v] = Time.now
end
def include?(v)
- [v] &amp;&amp; (Time.now + 86400) &lt;= [v]
+ self[v].kind_of?(Time) &amp;&amp; (self[v] + 86400) &gt;= Time.now
end
end
Spider.start_at('http://mike-burns.com/') do |s|
s.check_already_seen_with ExpireLinks.new
end
</pre>
+<h3>Store nodes to visit with Amazon SQS</h3>
+<pre>
+ require 'spider'
+ require 'spider/next_urls_in_sqs'
+ Spider.start_at('http://mike-burns.com') do |s|
+ s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
+ end
+</pre>
+<h4>Store nodes to visit with a custom object</h4>
+<pre>
+ require 'spider'
+ class MyArray &lt; Array
+ def pop
+ super
+ end
+
+ def push(a_msg)
+ super(a_msg)
+ end
+ end
+
+ Spider.start_at('http://mike-burns.com') do |s|
+ s.store_next_urls_with MyArray.new
+ end
+</pre>
<h3>Create a URL graph</h3>
<pre>
require 'spider'
@@ -178,11 +205,15 @@
</pre>
<h2>Author</h2>
<p>
+John Nagro john.nagro@gmail.com
+</p>
+<p>
Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
-mike@mike-burns.com
+mike@mike-burns.com (original author)
</p>
<p>
-Help from Matt Horan, John Nagro, and Henri Cook.
+Many thanks to: Matt Horan Henri Cook Sander van der Vliet John Buckley
+Brian Campbell
</p>
<p>
With `robot_rules&#8217; from James Edward Gray II via <a
View
5 doc/fr_class_index.html
@@ -20,7 +20,12 @@
<div id="index">
<h1 class="section-bar">Classes</h1>
<div id="index-entries">
+ <a href="classes/BeStaticServerPages.html">BeStaticServerPages</a><br />
<a href="classes/IncludedInMemcached.html">IncludedInMemcached</a><br />
+ <a href="classes/LoopingServlet.html">LoopingServlet</a><br />
+ <a href="classes/NextUrlsInSQS.html">NextUrlsInSQS</a><br />
+ <a href="classes/QueryServlet.html">QueryServlet</a><br />
+ <a href="classes/RobotRules.html">RobotRules</a><br />
<a href="classes/Spider.html">Spider</a><br />
<a href="classes/SpiderInstance.html">SpiderInstance</a><br />
</div>
View
7 doc/fr_file_index.html
@@ -20,10 +20,15 @@
<div id="index">
<h1 class="section-bar">Files</h1>
<div id="index-entries">
- <a href="files/README.html">README</a><br />
<a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
<a href="files/lib/spider/included_in_memcached_rb.html">lib/spider/included_in_memcached.rb</a><br />
+ <a href="files/lib/spider/next_urls_in_sqs_rb.html">lib/spider/next_urls_in_sqs.rb</a><br />
+ <a href="files/lib/spider/robot_rules_rb.html">lib/spider/robot_rules.rb</a><br />
<a href="files/lib/spider/spider_instance_rb.html">lib/spider/spider_instance.rb</a><br />
+ <a href="files/spec/spec_helper_rb.html">spec/spec_helper.rb</a><br />
+ <a href="files/spec/spider/included_in_memcached_spec_rb.html">spec/spider/included_in_memcached_spec.rb</a><br />
+ <a href="files/spec/spider/spider_instance_spec_rb.html">spec/spider/spider_instance_spec.rb</a><br />
+ <a href="files/spec/spider_spec_rb.html">spec/spider_spec.rb</a><br />
</div>
</div>
</body>
View
49 doc/fr_method_index.html
@@ -20,17 +20,44 @@
<div id="index">
<h1 class="section-bar">Methods</h1>
<div id="index-entries">
- <a href="classes/IncludedInMemcached.html#M000002"><< (IncludedInMemcached)</a><br />
- <a href="classes/SpiderInstance.html#M000004">add_url_check (SpiderInstance)</a><br />
- <a href="classes/SpiderInstance.html#M000005">check_already_seen_with (SpiderInstance)</a><br />
- <a href="classes/SpiderInstance.html#M000010">clear_headers (SpiderInstance)</a><br />
- <a href="classes/SpiderInstance.html#M000009">headers (SpiderInstance)</a><br />
- <a href="classes/IncludedInMemcached.html#M000003">include? (IncludedInMemcached)</a><br />
- <a href="classes/IncludedInMemcached.html#M000001">new (IncludedInMemcached)</a><br />
- <a href="classes/SpiderInstance.html#M000006">on (SpiderInstance)</a><br />
- <a href="classes/SpiderInstance.html#M000007">setup (SpiderInstance)</a><br />
- <a href="classes/Spider.html#M000011">start_at (Spider)</a><br />
- <a href="classes/SpiderInstance.html#M000008">teardown (SpiderInstance)</a><br />
+ <a href="classes/IncludedInMemcached.html#M000016"><< (IncludedInMemcached)</a><br />
+ <a href="classes/SpiderInstance.html#M000021">add_url_check (SpiderInstance)</a><br />
+ <a href="files/spec/spider/included_in_memcached_spec_rb.html#M000007">after_specing_memcached (spec/spider/included_in_memcached_spec.rb)</a><br />
+ <a href="classes/RobotRules.html#M000036">allowed? (RobotRules)</a><br />
+ <a href="files/spec/spec_helper_rb.html#M000004">be_static_server_pages (spec/spec_helper.rb)</a><br />
+ <a href="files/spec/spider/included_in_memcached_spec_rb.html#M000006">before_specing_memcached (spec/spider/included_in_memcached_spec.rb)</a><br />
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000012">callback_arguments_on (spec/spider/spider_instance_spec.rb)</a><br />
+ <a href="classes/SpiderInstance.html#M000022">check_already_seen_with (SpiderInstance)</a><br />
+ <a href="classes/SpiderInstance.html#M000028">clear_headers (SpiderInstance)</a><br />
+ <a href="classes/BeStaticServerPages.html#M000033">description (BeStaticServerPages)</a><br />
+ <a href="classes/QueryServlet.html#M000038">do_GET (QueryServlet)</a><br />
+ <a href="classes/LoopingServlet.html#M000037">do_GET (LoopingServlet)</a><br />
+ <a href="classes/BeStaticServerPages.html#M000032">failure_message (BeStaticServerPages)</a><br />
+ <a href="files/spec/spider_spec_rb.html#M000014">find_pages_with_static_server (spec/spider_spec.rb)</a><br />
+ <a href="classes/SpiderInstance.html#M000027">headers (SpiderInstance)</a><br />
+ <a href="classes/IncludedInMemcached.html#M000017">include? (IncludedInMemcached)</a><br />
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000013">it_should_prevent_cycles_with (spec/spider/spider_instance_spec.rb)</a><br />
+ <a href="files/spec/spec_helper_rb.html#M000001">local_require (spec/spec_helper.rb)</a><br />
+ <a href="classes/BeStaticServerPages.html#M000031">matches? (BeStaticServerPages)</a><br />
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000010">mock_failed_http (spec/spider/spider_instance_spec.rb)</a><br />
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000008">mock_http (spec/spider/spider_instance_spec.rb)</a><br />
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000011">mock_redirect_http (spec/spider/spider_instance_spec.rb)</a><br />
+ <a href="files/spec/spider/spider_instance_spec_rb.html#M000009">mock_successful_http (spec/spider/spider_instance_spec.rb)</a><br />
+ <a href="classes/IncludedInMemcached.html#M000015">new (IncludedInMemcached)</a><br />
+ <a href="classes/NextUrlsInSQS.html#M000018">new (NextUrlsInSQS)</a><br />
+ <a href="classes/BeStaticServerPages.html#M000030">new (BeStaticServerPages)</a><br />
+ <a href="classes/RobotRules.html#M000034">new (RobotRules)</a><br />
+ <a href="files/spec/spec_helper_rb.html#M000005">null_logger (spec/spec_helper.rb)</a><br />
+ <a href="classes/SpiderInstance.html#M000024">on (SpiderInstance)</a><br />
+ <a href="classes/RobotRules.html#M000035">parse (RobotRules)</a><br />
+ <a href="classes/NextUrlsInSQS.html#M000019">pop (NextUrlsInSQS)</a><br />
+ <a href="classes/NextUrlsInSQS.html#M000020">push (NextUrlsInSQS)</a><br />
+ <a href="classes/SpiderInstance.html#M000025">setup (SpiderInstance)</a><br />
+ <a href="classes/Spider.html#M000029">start_at (Spider)</a><br />
+ <a href="classes/SpiderInstance.html#M000023">store_next_urls_with (SpiderInstance)</a><br />
+ <a href="classes/SpiderInstance.html#M000026">teardown (SpiderInstance)</a><br />
+ <a href="files/spec/spec_helper_rb.html#M000003">with_memcached (spec/spec_helper.rb)</a><br />
+ <a href="files/spec/spec_helper_rb.html#M000002">with_web_server (spec/spec_helper.rb)</a><br />
</div>
</div>
</body>
View
2 doc/index.html
@@ -19,6 +19,6 @@
<frame src="fr_class_index.html" name="Classes" />
<frame src="fr_method_index.html" name="Methods" />
</frameset>
- <frame src="files/lib/spider_rb.html" name="docwin" />
+ <frame src="files/spec/spec_helper_rb.html" name="docwin" />
</frameset>
</html>
View
22 lib/spider/spider_instance.rb
@@ -53,7 +53,7 @@ def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
@callbacks = {}
@next_urls = [next_urls]
@seen = seen
- @rules = rules || RobotRules.new('Ruby Spider 1.0')
+ @rules = rules || RobotRules.new('Ruby Spider 0.4.4')
@robots_seen = robots_seen
@headers = {}
@setup = nil
@@ -227,12 +227,18 @@ def allowable_url?(a_url, parsed_url) #:nodoc:
# True if the robots.txt for that URL allows access to it.
def allowed?(a_url, parsed_url) # :nodoc:
+ return false unless ['http','https'].include?(parsed_url.scheme)
u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
+ parsed_u = URI.parse(u)
+ return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all?
begin
unless @robots_seen.include?(u)
- open(u, 'User-Agent' => 'Ruby Spider',
- 'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
- @rules.parse(u, url.read)
+ #open(u, 'User-Agent' => 'Ruby Spider',
+ # 'Accept' => 'text/html,text/xml,application/xml,text/plain', :ssl_verify => false) do |url|
+ # @rules.parse(u, url.read)
+ #end
+ get_page(parsed_u) do |r|
+ @rules.parse(u, r.body)
end
@robots_seen << u
end
@@ -248,10 +254,12 @@ def get_page(parsed_url, &block) #:nodoc:
@seen << parsed_url
begin
http = Net::HTTP.new(parsed_url.host, parsed_url.port)
- http.use_ssl = parsed_url.scheme == 'https'
+ if parsed_url.scheme == 'https'
+ http.use_ssl = true
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ end
# Uses start because http.finish cannot be called.
- r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
- @headers))}
+ r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))}
if r.redirect?
get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
else
View
2 spider.gemspec
@@ -14,5 +14,5 @@ spec = Gem::Specification.new do |s|
A Web spidering library: handles robots.txt, scraping, finding more
links, and doing it all over again.
EOF
- s.version = '0.4.3'
+ s.version = '0.4.4'
end

0 comments on commit 6c24a96

Please sign in to comment.
Something went wrong with that request. Please try again.