Skip to content

Commit

Permalink
Fixed bug in url_is_from_spider() when no allowed_domains class attribute is present
Browse files Browse the repository at this point in the history
  • Loading branch information
pablohoffman committed Aug 16, 2010
1 parent a28cf29 commit b563e56
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 2 deletions.
28 changes: 27 additions & 1 deletion scrapy/tests/test_utils_url.py
Expand Up @@ -19,7 +19,22 @@ def test_url_is_from_any_domain(self):
self.assertFalse(url_is_from_any_domain(url, ['testdomain.com']))
self.assertFalse(url_is_from_any_domain(url+'.testdomain.com', ['testdomain.com']))

def test_url_is_from_spider(self):
    # The spider is built with only a name (no allowed_domains argument), so
    # the name is the only domain the URLs can be matched against.
    spider = BaseSpider(name='example.com')
    # URLs on the name domain and on its subdomains belong to the spider.
    self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
    self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
    # URLs on other top-level domains do not.
    self.assertFalse(url_is_from_spider('http://www.example.org/some/page.html', spider))
    self.assertFalse(url_is_from_spider('http://www.example.net/some/page.html', spider))

def test_url_is_from_spider_class_attributes(self):
    # The spider is passed as a class rather than an instance, and the class
    # defines only `name` (no allowed_domains attribute).
    class MySpider(BaseSpider):
        name = 'example.com'

    own_urls = (
        'http://www.example.com/some/page.html',
        'http://sub.example.com/some/page.html',
    )
    foreign_urls = (
        'http://www.example.org/some/page.html',
        'http://www.example.net/some/page.html',
    )

    for page in own_urls:
        self.assertTrue(url_is_from_spider(page, MySpider))
    for page in foreign_urls:
        self.assertFalse(url_is_from_spider(page, MySpider))

def test_url_is_from_spider_with_allowed_domains(self):
spider = BaseSpider(name='example.com', allowed_domains=['example.org', 'example.net'])
self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
Expand All @@ -28,6 +43,17 @@ def test_url_is_from_any_domain(self):
self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', spider))
self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', spider))

def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
    # The spider is passed as a class whose `name` and `allowed_domains`
    # are both plain class attributes.
    class MySpider(BaseSpider):
        name = 'example.com'
        allowed_domains = ['example.org', 'example.net']

    # Hosts under the spider name or under any allowed domain are accepted.
    accepted = (
        'http://www.example.com/some/page.html',
        'http://sub.example.com/some/page.html',
        'http://example.com/some/page.html',
        'http://www.example.org/some/page.html',
        'http://www.example.net/some/page.html',
    )
    for page in accepted:
        self.assertTrue(url_is_from_spider(page, MySpider))

    # A domain that is neither the name nor listed in allowed_domains.
    self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', MySpider))

def test_urljoin_rfc(self):
self.assertEqual(urljoin_rfc('http://example.com/some/path', 'newpath/test'),
'http://example.com/some/newpath/test')
Expand Down
3 changes: 2 additions & 1 deletion scrapy/utils/url.py
Expand Up @@ -22,7 +22,8 @@ def url_is_from_any_domain(url, domains):

def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider.

    The candidate domains are the spider's ``name`` followed by the entries
    of its ``allowed_domains`` attribute, if it has one. Only attribute
    access is performed on ``spider``, so anything exposing ``name`` (and
    optionally ``allowed_domains``) can be passed.
    """
    # allowed_domains is optional: a direct attribute access would raise
    # AttributeError for spiders that never define it. A None value is
    # treated like a missing attribute, and list() lets tuples or other
    # iterables be concatenated with the name.
    extra_domains = getattr(spider, 'allowed_domains', None) or []
    return url_is_from_any_domain(url, [spider.name] + list(extra_domains))

def urljoin_rfc(base, ref, encoding='utf-8'):
"""Same as urlparse.urljoin but supports unicode values in base and ref
Expand Down

0 comments on commit b563e56

Please sign in to comment.