Fix redirect URL handling; add comments

The "allow 301 only if redirect to self" bit doesn't work with cache, so now we allow any kind of 301
hasgeek · Jun 3, 2021 · 62d3f2e · 62d3f2e
1 parent c63adc2
commit 62d3f2e
Showing 1 changed file with 37 additions and 30 deletions.
diff --git a/baseframe/forms/validators.py b/baseframe/forms/validators.py
@@ -503,20 +503,29 @@ def check_url(
             cache_check = asset_cache.get(cache_key)
         except ValueError:  # Possible error from a broken pickle
             cache_check = None
-        # Read from cache, but assume cache may be broken
-        # since Flask-Cache stores data as a pickle,
-        # which is version-specific
+        # Read from cache, but assume cache may be broken since Flask-Cache stores data
+        # as a pickle, which is version-specific
         if cache_check and isinstance(cache_check, dict):
             rurl = cache_check.get('url')
             code = cache_check.get('code')
         else:
             rurl = None  # rurl is the response URL after following redirects
             code = None
 
+        # This validator is meant to catch typos, but bot protection tools make that
+        # fairly hard to do, so we only aim to be helpful, not thorough. Cloudflare's
+        # implementation issues a 301 redirect to self. The `allow_redirects` option
+        # in `requests` does not recognise this, and will go into a loop fetching the
+        # same URL. We therefore have our own implementation of `allow_redirects` that
+        # stops as soon as it encounters a 3xx redirect to self.
+
+        # TODO: Also honour the robots.txt protocol and stay off URLs that aren't meant
+        # to be checked. https://docs.python.org/3/library/urllib.robotparser.html
         if not rurl or not code:
             try:
                 rurl = url
-                for limit in range(30):
+                # 30 is the default redirect limit in `requests`
+                for _count in range(30):
                     r = requests.get(
                         rurl,
                         timeout=30,
@@ -525,16 +534,18 @@ def check_url(
                         headers={'User-Agent': self.user_agent},
                     )
                     code = r.status_code
-                    rurl = r.url
                     if code >= 300 and code < 400:
-                        # Redirect?
-                        if rurl == url:
-                            # Loop, break immediately
+                        # Redirect. What kind?
+                        if rurl == r.url:
+                            # Redirect to self in a loop, break immediately
+                            rurl = r.url
                             break
                         # Not a loop, continue following redirects
+                        rurl = r.url
                         continue
                     else:
                         # Not a redirect, break iterations and check the response
+                        rurl = r.url
                         break
             except (
                 # Still a relative URL? Must be broken
@@ -552,31 +563,27 @@ def check_url(
         if (
             rurl is not None
             and code is not None
-            and (
-                code
-                in (
-                    200,
-                    201,
-                    202,
-                    203,
-                    204,
-                    205,
-                    206,
-                    207,
-                    208,
-                    226,
-                    403,  # Previously for Cloudflare
-                    999,  # For LinkedIn
-                )
-                or (code == 301 and rurl == url)  # For Cloudflare
+            and code
+            in (
+                200,
+                201,
+                202,
+                203,
+                204,
+                205,
+                206,
+                207,
+                208,
+                226,
+                301,  # For Cloudflare
+                403,  # Previously for Cloudflare
+                999,  # For LinkedIn
             )
         ):
             # Cloudflare now returns HTTP 301 (previously 403) for urls behind its bot
-            # protection. Hence 301 and 403 are both considered acceptable codes, but
-            # 301 only when the URL does not change
-            #
-            # 999 is a non-standard too-many-requests error. We can't look past it to
-            # check a URL, so we let it pass
+            # protection. 301 and 403 are both considered acceptable codes. 999 is a
+            # non-standard too-many-requests error used by LinkedIn. We can't look past
+            # it to check a URL, so we let it pass.
 
             # The URL works, so now we check if it's in a reject list. This part
             # runs _after_ attempting to load the URL as we want to catch redirects.