Skip to content

Commit

Permalink
Fix redirect URL handling; add comments
Browse files Browse the repository at this point in the history
The "allow 301 only if redirect to self" bit doesn't work with cache, so now we allow any kind of 301
  • Loading branch information
jace committed Jun 3, 2021
1 parent c63adc2 commit 62d3f2e
Showing 1 changed file with 37 additions and 30 deletions.
67 changes: 37 additions & 30 deletions baseframe/forms/validators.py
Expand Up @@ -503,20 +503,29 @@ def check_url(
cache_check = asset_cache.get(cache_key)
except ValueError: # Possible error from a broken pickle
cache_check = None
# Read from cache, but assume cache may be broken
# since Flask-Cache stores data as a pickle,
# which is version-specific
# Read from cache, but assume cache may be broken since Flask-Cache stores data
# as a pickle, which is version-specific
if cache_check and isinstance(cache_check, dict):
rurl = cache_check.get('url')
code = cache_check.get('code')
else:
rurl = None # rurl is the response URL after following redirects
code = None

# This validator is meant to catch typos, but bot protection tools make that
# fairly hard to do, so we only aim to be helpful, not thorough. Cloudflare's
# implementation issues a 301 redirect to self. The `allow_redirects` option
# in `requests` does not recognise this, and will go into a loop fetching the
# same URL. We therefore have our own implementation of `allow_redirects` that
# stops as soon as it encounters a 3xx redirect to self.

# TODO: Also honour the robots.txt protocol and stay off URLs that aren't meant
# to be checked. https://docs.python.org/3/library/urllib.robotparser.html
if not rurl or not code:
try:
rurl = url
for limit in range(30):
# 30 is the default redirect limit in `requests`
for _count in range(30):
r = requests.get(
rurl,
timeout=30,
Expand All @@ -525,16 +534,18 @@ def check_url(
headers={'User-Agent': self.user_agent},
)
code = r.status_code
rurl = r.url
if code >= 300 and code < 400:
# Redirect?
if rurl == url:
# Loop, break immediately
# Redirect. What kind?
if rurl == r.url:
# Redirect to self in a loop, break immediately
rurl = r.url
break
# Not a loop, continue following redirects
rurl = r.url
continue
else:
# Not a redirect, break iterations and check the response
rurl = r.url
break
except (
# Still a relative URL? Must be broken
Expand All @@ -552,31 +563,27 @@ def check_url(
if (
rurl is not None
and code is not None
and (
code
in (
200,
201,
202,
203,
204,
205,
206,
207,
208,
226,
403, # Previously for Cloudflare
999, # For LinkedIn
)
or (code == 301 and rurl == url) # For Cloudflare
and code
in (
200,
201,
202,
203,
204,
205,
206,
207,
208,
226,
301, # For Cloudflare
403, # Previously for Cloudflare
999, # For LinkedIn
)
):
# Cloudflare now returns HTTP 301 (previously 403) for urls behind its bot
# protection. Hence 301 and 403 are both considered acceptable codes, but
# 301 only when the URL does not change
#
# 999 is a non-standard too-many-requests error. We can't look past it to
# check a URL, so we let it pass
# protection. 301 and 403 are both considered acceptable codes. 999 is a
# non-standard too-many-requests error used by LinkedIn. We can't look past
# it to check a URL, so we let it pass.

# The URL works, so now we check if it's in a reject list. This part
# runs _after_ attempting to load the URL as we want to catch redirects.
Expand Down

0 comments on commit 62d3f2e

Please sign in to comment.