From 242b8f1a346e985617dad03230869f7bcc8908cf Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Wed, 17 Jan 2024 16:28:55 -0500 Subject: [PATCH] Add --delay-retry for pausing between retries ... and add Session.try_get to refactor retries handling. --- waybackpack/cdx.py | 6 ++++++ waybackpack/cli.py | 8 ++++++++ waybackpack/session.py | 38 +++++++++++++++++++++++++------------- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/waybackpack/cdx.py b/waybackpack/cdx.py index a82b454..0ccc31e 100644 --- a/waybackpack/cdx.py +++ b/waybackpack/cdx.py @@ -7,6 +7,10 @@ SEARCH_URL = "https://web.archive.org/cdx/search/cdx" +class WaybackpackException(Exception): + pass + + def search( url, from_date=None, to_date=None, uniques_only=False, collapse=None, session=None ): @@ -23,6 +27,8 @@ def search( "collapse": collapse, }, ) + if res is None: + raise WaybackpackException("Difficulty connecting to Wayback Machine CDX API") if res.status_code == 200: cdx = res.json() diff --git a/waybackpack/cli.py b/waybackpack/cli.py index 24aa4ae..5e7f202 100644 --- a/waybackpack/cli.py +++ b/waybackpack/cli.py @@ -113,6 +113,13 @@ def parse_args(): "--delay", type=int, default=0, help="Sleep X seconds between each fetch." ) + parser.add_argument( + "--delay-retry", + type=int, + default=5, + help="Sleep X seconds between each post-error retry.", + ) + args = parser.parse_args() return args @@ -129,6 +136,7 @@ def main(): user_agent=args.user_agent, follow_redirects=args.follow_redirects, max_retries=args.max_retries, + delay_retry=args.delay_retry, ) snapshots = search( diff --git a/waybackpack/session.py b/waybackpack/session.py index 8b0499c..ca65368 100644 --- a/waybackpack/session.py +++ b/waybackpack/session.py @@ -14,18 +14,18 @@ def __init__( follow_redirects=False, user_agent=DEFAULT_USER_AGENT, max_retries=3, + delay_retry=5, ): self.follow_redirects = follow_redirects self.user_agent = user_agent self.max_retries = max_retries + self.delay_retry = delay_retry - def get(self, url, **kwargs): + def try_get(self, url, **kwargs): headers = { "User-Agent": self.user_agent, } - response_is_final = False - retries = 0 - while response_is_final is False: + try: res = requests.get( url, allow_redirects=self.follow_redirects, @@ -34,19 +34,31 @@ def get(self, url, **kwargs): **kwargs ) - if res.status_code != 200: - logger.info("HTTP status code: {0}".format(res.status_code)) - if int(res.status_code / 100) in [4, 5]: # 4XX and 5XX codes - logger.info("Waiting 1 second before retrying.") + return False, res + else: + return True, res + + except requests.exceptions.ConnectionError: + logger.info("Connection error") + return False, None + + def get(self, url, **kwargs): + retries = 0 + while True: + success, res = self.try_get(url, **kwargs) + if success: + if res.status_code != 200: + logger.info("HTTP status code: {0}".format(res.status_code)) + return res + else: + logger.info( + "Waiting {0} second(s) before retrying.".format(self.delay_retry) + ) + time.sleep(self.delay_retry) retries += 1 if retries <= self.max_retries: - logger.info("Waiting 1 second before retrying.") - time.sleep(1) continue else: logger.info("Maximum retries reached, skipping.") return None - else: - response_is_final = True - return res