Skip to content

Commit

Permalink
Warns users if they put a URL into allowed_domains and corrects the UR…
Browse files Browse the repository at this point in the history
…L to the domain as per urlparse netloc (scrapy#2250)
  • Loading branch information
jlong49 committed Jan 18, 2017
1 parent 5586fc7 commit f400a3e
Showing 1 changed file with 8 additions and 0 deletions.
8 changes: 8 additions & 0 deletions scrapy/spidermiddlewares/offsite.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from scrapy import signals
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached

try:
    from urllib.parse import urlparse
except ImportError:  # Python 2: the function lives in the top-level urlparse module
    from urlparse import urlparse

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

# Diff hunk @@ -52,6 +53,13 @@ (the lines before this method are collapsed in this view)
def get_host_regex(self, spider):
    """Build the compiled regex used to decide which hosts are on-site.

    Reads ``spider.allowed_domains``. When the attribute is missing or
    empty, an empty pattern is returned, which matches every host.
    Otherwise the pattern matches each listed domain and any of its
    subdomains.

    Entries that are full URLs (``urlparse`` yields a non-empty ``netloc``
    that differs from the entry) are replaced by that ``netloc`` and a
    warning is logged. ``None`` entries are ignored.

    :param spider: object whose optional ``allowed_domains`` attribute is
        an iterable of domain strings (``None`` entries allowed).
    :return: a compiled regular expression object.
    """
    allowed_domains = getattr(spider, 'allowed_domains', None)
    if not allowed_domains:
        return re.compile('')  # allow all by default

    domains = []
    for i, user_domain in enumerate(allowed_domains):
        if user_domain is None:
            # None entries are tolerated and simply skipped; they must not
            # reach urlparse(), which cannot handle them on Python 2.
            continue
        netloc = urlparse(user_domain).netloc
        if netloc and netloc != user_domain:
            # Same logger object as the module-level ``logger``; lazy
            # %-args let the logging framework do the formatting.
            logging.getLogger(__name__).warning(
                "%s is an invalid domain and has automatically been converted to %s",
                user_domain, netloc)
            if isinstance(allowed_domains, list):
                # Keep the historical side effect of correcting the
                # spider's list in place; tuples and other immutable
                # sequences are left untouched instead of raising.
                allowed_domains[i] = netloc
            user_domain = netloc
        domains.append(user_domain)

    regex = r'^(.*\.)?(%s)$' % '|'.join(re.escape(d) for d in domains)
    return re.compile(regex)

Expand Down

0 comments on commit f400a3e

Please sign in to comment.