Skip to content

Commit

Permalink
ignore meta-refresh redirects embedded in <script> tags. related to i…
Browse files Browse the repository at this point in the history
…ssue 18
  • Loading branch information
pablohoffman committed Nov 14, 2011
1 parent ec1ef02 commit 36df87b
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 0 deletions.
9 changes: 9 additions & 0 deletions scrapy/tests/test_utils_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,17 @@ def test_get_meta_refresh(self):
</noSCRIPT>
<body>blahablsdfsal&amp;</body>
</html>""")
r3 = HtmlResponse("http://www.example.com", body="""
<noscript><meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage</noscript>
<script type="text/javascript">
if(!checkCookies()){
document.write('<meta http-equiv="REFRESH" content="0;url=http://www.example.com/newpage">');
}
</script>
""")
self.assertEqual(get_meta_refresh(r1), (5.0, 'http://example.org/newpage'))
self.assertEqual(get_meta_refresh(r2), (None, None))
self.assertEqual(get_meta_refresh(r3), (None, None))

# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
unittest.main()
2 changes: 2 additions & 0 deletions scrapy/utils/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,14 @@ def get_base_url(response):
return _baseurl_cache[response]

# Patterns for markup whose contents must not be scanned for meta-refresh
# tags: <noscript> fallbacks and inline <script> bodies (a script may merely
# mention a meta tag inside a string, e.g. via document.write).
# ``\b[^>]*`` lets the opening tag carry attributes (<noscript class="x">)
# while refusing longer tag names that only share the same prefix.
_noscript_re = re.compile(u'<noscript\\b[^>]*>.*?</noscript>', re.IGNORECASE | re.DOTALL)
_script_re = re.compile(u'<script\\b[^>]*>.*?</script>', re.IGNORECASE | re.DOTALL)
# Memo of get_meta_refresh() results keyed by response object; keys are held
# weakly, so a cached entry does not keep its response alive.
_metaref_cache = weakref.WeakKeyDictionary()
def get_meta_refresh(response):
    """Return the meta-refresh information parsed from ``response``.

    Only the first 4096 characters of the body are examined, and any
    ``<noscript>`` and ``<script>`` blocks found there are removed first so
    that refresh tags embedded in them are ignored. The value produced by
    ``html.get_meta_refresh`` is cached per response object and returned
    unchanged.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]

    # NOTE: the body is truncated before stripping, so a block that opens
    # inside the first 4096 characters but closes after them is not removed.
    head = response.body_as_unicode()[0:4096]
    for pattern in (_noscript_re, _script_re):
        head = pattern.sub(u'', head)
    refresh = html.get_meta_refresh(head, response.url, response.encoding)
    _metaref_cache[response] = refresh
    return refresh
Expand Down

0 comments on commit 36df87b

Please sign in to comment.