Added ISpiderManager interface and a test to verify the default SpiderManager conforms to it
pablohoffman committed Sep 3, 2010
1 parent 7cfc379 commit 1b76687
Showing 9 changed files with 63 additions and 37 deletions.
4 changes: 2 additions & 2 deletions scrapy/commands/parse.py
@@ -2,7 +2,7 @@
 from scrapy.http import Request
 from scrapy.item import BaseItem
 from scrapy.utils import display
-from scrapy.utils.spider import iterate_spider_output
+from scrapy.utils.spider import iterate_spider_output, create_spider_for_request
 from scrapy.utils.url import is_url
 from scrapy.exceptions import UsageError
 from scrapy import log
@@ -73,7 +73,7 @@ def get_spider(self, request, opts):
             except KeyError:
                 log.msg('Unable to find spider: %s' % opts.spider, log.ERROR)
         else:
-            spider = self.crawler.spiders.create_for_request(request)
+            spider = create_spider_for_request(self.crawler.spiders, request)
             if spider:
                 return spider
             log.msg('Unable to find spider for: %s' % request, log.ERROR)
17 changes: 17 additions & 0 deletions scrapy/interfaces.py
@@ -33,3 +33,20 @@ def clear():
         """Clear the queue.
 
         This method can return a deferred. """
+
+class ISpiderManager(Interface):
+
+    def from_settings(settings):
+        """Class method to instantiate from settings"""
+
+    def create(spider_name, **spider_args):
+        """Returns a new Spider instance for the given spider name, and using
+        the given spider arguments. If the spider name is not found, it must
+        raise a KeyError."""
+
+    def list():
+        """Return a list with the names of all spiders available in the
+        project"""
+
+    def find_by_request(request):
+        """Returns the list of spiders names that can handle the given request"""
6 changes: 4 additions & 2 deletions scrapy/queue.py
@@ -2,6 +2,7 @@
 
 from scrapy.http import Request
 from scrapy.utils.misc import arg_to_iter
+from scrapy.utils.spider import create_spider_for_request
 from scrapy import log
 
 
@@ -58,7 +59,7 @@ def append_spider(self, spider):
 
     def append_request(self, request, spider=None, **kwargs):
         if spider is None:
-            spider = self._spiders.create_for_request(request, **kwargs)
+            spider = create_spider_for_request(self._spiders, request, **kwargs)
         if spider:
             self.spider_requests.append((spider, [request]))
 
@@ -69,7 +70,8 @@ def append_url(self, url=None, spider=None, **kwargs):
         if url is None:
             raise ValueError("A url is required")
         if spider is None:
-            spider = self._spiders.create_for_request(Request(url), **kwargs)
+            spider = create_spider_for_request(self._spiders, Request(url), \
+                **kwargs)
         if spider:
            requests = arg_to_iter(spider.make_requests_from_url(url))
            self.spider_requests.append((spider, requests))
3 changes: 2 additions & 1 deletion scrapy/shell.py
@@ -13,6 +13,7 @@
 from scrapy.item import BaseItem
 from scrapy.spider import BaseSpider
 from scrapy.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
+from scrapy.utils.spider import create_spider_for_request
 from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
 from scrapy.utils.url import any_to_uri
@@ -58,7 +59,7 @@ def fetch(self, request_or_url, spider=None):
             url = any_to_uri(request_or_url)
             request = Request(url, dont_filter=True)
         if spider is None:
-            spider = self.crawler.spiders.create_for_request(request, \
+            spider = create_spider_for_request(self.crawler.spiders, request, \
                 BaseSpider('default'), log_multiple=True)
         self.crawler.engine.open_spider(spider)
         response = None
35 changes: 6 additions & 29 deletions scrapy/spidermanager.py
@@ -3,14 +3,19 @@
 spiders
 """
 
+from zope.interface import implements
+
 from scrapy import log, signals
+from scrapy.interfaces import ISpiderManager
 from scrapy.utils.misc import walk_modules
 from scrapy.utils.spider import iter_spider_classes
 from scrapy.xlib.pydispatch import dispatcher
 
 
 class SpiderManager(object):
 
+    implements(ISpiderManager)
+
     def __init__(self, spider_modules):
         self.spider_modules = spider_modules
         self._spiders = {}
@@ -28,41 +33,13 @@ def from_settings(cls, settings):
         return cls(settings.getlist('SPIDER_MODULES'))
 
     def create(self, spider_name, **spider_kwargs):
-        """Returns a Spider instance for the given spider name, using the given
-        spider arguments. If the spider name is not found, it raises a
-        KeyError.
-        """
         return self._spiders[spider_name](**spider_kwargs)
 
     def find_by_request(self, request):
-        """Returns list of spiders names that match the given Request"""
         return [name for name, cls in self._spiders.iteritems()
-            if cls.handles_request(request)]
-
-    def create_for_request(self, request, default_spider=None, \
-            log_none=False, log_multiple=False, **spider_kwargs):
-        """Create a spider to handle the given Request.
-
-        This will look for the spiders that can handle the given request (using
-        find_by_request) and return a (new) Spider if (and only if) there is
-        only one Spider able to handle the Request.
-
-        If multiple spiders (or no spider) are found, it will return the
-        default_spider passed. It can optionally log if multiple or no spiders
-        are found.
-        """
-        snames = self.find_by_request(request)
-        if len(snames) == 1:
-            return self.create(snames[0], **spider_kwargs)
-        if len(snames) > 1 and log_multiple:
-            log.msg('More than one spider can handle: %s - %s' % \
-                (request, ", ".join(snames)), log.ERROR)
-        if len(snames) == 0 and log_none:
-            log.msg('Unable to find spider that handles: %s' % request, log.ERROR)
-        return default_spider
+            if cls.handles_request(request)]
 
     def list(self):
-        """Returns list of spiders available."""
         return self._spiders.keys()
 
     def close_spider(self, spider, reason):
1 change: 0 additions & 1 deletion scrapy/spiderqueue.py
@@ -1,5 +1,4 @@
 from zope.interface import implements
-from zope.interface.verify import verifyObject
 
 from scrapy.interfaces import ISpiderQueue
 from scrapy.utils.sqlite import JsonSqlitePriorityQueue
4 changes: 2 additions & 2 deletions scrapy/tests/test_queue.py
@@ -18,8 +18,8 @@ def make_requests_from_url(self, url):
 
 class TestSpiderManager(object):
 
-    def create_for_request(self, request, **kwargs):
-        return TestSpider('create_for_request', **kwargs)
+    def find_by_request(self, request):
+        return ['create_for_request']
 
     def create(self, spider_name, **spider_kwargs):
         return TestSpider(spider_name, **spider_kwargs)
6 changes: 6 additions & 0 deletions scrapy/tests/test_spidermanager/__init__.py
@@ -3,11 +3,14 @@
 import weakref
 import shutil
 
+from zope.interface.verify import verifyObject
 from twisted.trial import unittest
+
 
 # ugly hack to avoid cyclic imports of scrapy.spider when running this test
 # alone
 import scrapy.spider
+from scrapy.interfaces import ISpiderManager
 from scrapy.spidermanager import SpiderManager
 from scrapy.http import Request
 
@@ -28,6 +31,9 @@ def tearDown(self):
         del self.spiderman
         sys.path.remove(self.tmpdir)
 
+    def test_interface(self):
+        verifyObject(ISpiderManager, self.spiderman)
+
     def test_list(self):
         self.assertEqual(set(self.spiderman.list()),
             set(['spider1', 'spider2', 'spider3']))
24 changes: 24 additions & 0 deletions scrapy/utils/spider.py
@@ -1,5 +1,6 @@
 import inspect
 
+from scrapy import log
 from scrapy.item import BaseItem
 from scrapy.utils.misc import arg_to_iter
 
@@ -21,3 +22,26 @@ def iter_spider_classes(module):
            obj.__module__ == module.__name__ and \
            getattr(obj, 'name', None):
             yield obj
+
+
+def create_spider_for_request(spidermanager, request, default_spider=None, \
+        log_none=False, log_multiple=False, **spider_kwargs):
+    """Create a spider to handle the given Request.
+
+    This will look for the spiders that can handle the given request (using
+    the spider manager) and return a (new) Spider if (and only if) there is
+    only one Spider able to handle the Request.
+
+    If multiple spiders (or no spider) are found, it will return the
+    default_spider passed. It can optionally log if multiple or no spiders
+    are found.
+    """
+    snames = spidermanager.find_by_request(request)
+    if len(snames) == 1:
+        return spidermanager.create(snames[0], **spider_kwargs)
+    if len(snames) > 1 and log_multiple:
+        log.msg('More than one spider can handle: %s - %s' % \
+            (request, ", ".join(snames)), log.ERROR)
+    if len(snames) == 0 and log_none:
+        log.msg('Unable to find spider that handles: %s' % request, log.ERROR)
+    return default_spider
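Together with the call-site updates in parse.py, queue.py and shell.py above, this helper can be handed any object that provides ISpiderManager. A rough usage sketch follows; the project module path and URL are made up for illustration, while the rest uses the APIs shown in this commit.

    from scrapy.http import Request
    from scrapy.spider import BaseSpider
    from scrapy.spidermanager import SpiderManager
    from scrapy.utils.spider import create_spider_for_request

    # build a manager from a list of spider modules, as from_settings() would
    spidermanager = SpiderManager(['myproject.spiders'])  # hypothetical module path

    request = Request('http://www.example.com/some/page')  # hypothetical URL

    # returns a new spider only when exactly one spider claims the request;
    # otherwise it falls back to the default and can log the ambiguity
    spider = create_spider_for_request(spidermanager, request,
                                       default_spider=BaseSpider('default'),
                                       log_multiple=True, log_none=True)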
