Skip to content

Commit

Permalink
Merge branch 'dev' into add-redis-socket-timeout
Browse files Browse the repository at this point in the history
  • Loading branch information
Madison Bahmer authored Nov 21, 2018
2 parents 075edc4 + 55b784d commit 5db88cb
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 5 deletions.
8 changes: 7 additions & 1 deletion crawler/crawling/distributed_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
standard_library.install_aliases()
from builtins import str
from past.builtins import basestring
from six import string_types
from builtins import object
from scrapy.http import Request
from scrapy.conf import settings
Expand Down Expand Up @@ -543,6 +544,11 @@ def next_request(self):
# extra check to add items to request
if 'useragent' in req.meta and req.meta['useragent'] is not None:
req.headers['User-Agent'] = req.meta['useragent']
if 'cookie' in req.meta and req.meta['cookie'] is not None:
if isinstance(req.meta['cookie'], dict):
req.cookies = req.meta['cookie']
elif isinstance(req.meta['cookie'], string_types):
req.cookies = self.parse_cookie(req.meta['cookie'])

return req

Expand All @@ -569,7 +575,7 @@ def request_from_feed(self, item):
if 'cookie' in item and item['cookie'] is not None:
if isinstance(item['cookie'], dict):
req.cookies = item['cookie']
elif isinstance(item['cookie'], basestring):
elif isinstance(item['cookie'], string_types):
req.cookies = self.parse_cookie(item['cookie'])
return req

Expand Down
50 changes: 46 additions & 4 deletions crawler/tests/test_distributed_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,17 +210,59 @@ def test_next_request(self, t):
for key in out.meta:
self.assertEqual(out.meta[key], self.req.meta[key])

# test request from feed with cookies
feed = {
"url": "http://ex.com",
"crawlid": "abc123",
"appid": "myapp",
"spiderid": "link",
"cookie": "authenticated=true;privacy=10"
}
self.req.meta['cookie'] = "authenticated=true;privacy=10" # add cookie to req since we are not testing this
self.scheduler.find_item = MagicMock(return_value=feed)
out = self.scheduler.next_request()
self.assertEqual(out.url, 'http://ex.com')
for key in out.meta:
self.assertEqual(out.meta[key], self.req.meta[key])
self.assertEqual(out.cookies, self.scheduler.parse_cookie(feed["cookie"]))
self.req.meta['cookie'] = None # reset

# test request from serialized request
exist_req = Request('http://ex.com')
exist_req.meta["crawlid"] = "abc123"
exist_req.meta["appid"] = "myapp"
exist_req.meta["spiderid"] = "link"
exist_item = request_to_dict(exist_req)
exist_item["meta"]["crawlid"] = "abc123"
exist_item["meta"]["appid"] = "myapp"
exist_item["meta"]["spiderid"] = "link"
self.scheduler.find_item = MagicMock(return_value=exist_item)
out = self.scheduler.next_request()
self.assertEqual(out.url, 'http://ex.com')
for key in out.meta:
self.assertEqual(out.meta[key], self.req.meta[key])
self.assertEqual(out.meta[key], exist_req.meta[key])

# test request from serialized request with supplied cookie
exist_req = Request('http://ex.com', cookies={'auth':'101'})
exist_item = request_to_dict(exist_req)
self.scheduler.find_item = MagicMock(return_value=exist_item)
out = self.scheduler.next_request()
self.assertEqual(out.url, 'http://ex.com')
for key in out.meta:
self.assertEqual(out.meta[key], exist_req.meta[key])
self.assertEqual(out.cookies, exist_req.cookies)
self.req.meta['cookie'] = None # reset

# test request from serialized request with meta cookie
exist_req = Request('http://ex.com')
exist_req.meta["crawlid"] = "abc123"
exist_req.meta["appid"] = "myapp"
exist_req.meta["spiderid"] = "link"
exist_req.meta["cookie"] = {'authenticated': False, 'privacy':9}
exist_item = request_to_dict(exist_req)
self.scheduler.find_item = MagicMock(return_value=exist_item)
out = self.scheduler.next_request()
self.assertEqual(out.url, 'http://ex.com')
for key in out.meta:
self.assertEqual(out.meta[key], exist_req.meta[key])
self.assertEqual(out.cookies, exist_req.meta['cookie'])

# test didn't get item
self.scheduler.find_item = MagicMock(return_value=None)
Expand Down
2 changes: 2 additions & 0 deletions docs/topics/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Date: ??/??/????

- Corrected Ansible host list for zookeeper

- Improved cookie handling

- Minor documentation changes/updates

- Added REDIS_SOCKET_TIMEOUT setting to control socket_timeout and socket_connect_timeout
Expand Down

0 comments on commit 5db88cb

Please sign in to comment.