Skip to content

Commit

Permalink
Add --delay-retry for pausing between retries
Browse files Browse the repository at this point in the history
... and add Session.try_get to refactor retries handling.
  • Loading branch information
jsvine committed Jan 17, 2024
1 parent 2dc24e9 commit 242b8f1
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 13 deletions.
6 changes: 6 additions & 0 deletions waybackpack/cdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
SEARCH_URL = "https://web.archive.org/cdx/search/cdx"


class WaybackpackException(Exception):
    """Package-specific error type for waybackpack.

    Raised by ``search`` when the request to the Wayback Machine CDX API
    yields no response at all (the session returned ``None``).
    """

    pass


def search(
url, from_date=None, to_date=None, uniques_only=False, collapse=None, session=None
):
Expand All @@ -23,6 +27,8 @@ def search(
"collapse": collapse,
},
)
if res is None:
raise WaybackpackException("Difficulty connecting to Wayback Machine CDX API")

if res.status_code == 200:
cdx = res.json()
Expand Down
8 changes: 8 additions & 0 deletions waybackpack/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ def parse_args():
"--delay", type=int, default=0, help="Sleep X seconds between each fetch."
)

parser.add_argument(
"--delay-retry",
type=int,
default=5,
help="Sleep X seconds between each post-error retry.",
)

args = parser.parse_args()
return args

Expand All @@ -129,6 +136,7 @@ def main():
user_agent=args.user_agent,
follow_redirects=args.follow_redirects,
max_retries=args.max_retries,
delay_retry=args.delay_retry,
)

snapshots = search(
Expand Down
38 changes: 25 additions & 13 deletions waybackpack/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,18 @@ def __init__(
follow_redirects=False,
user_agent=DEFAULT_USER_AGENT,
max_retries=3,
delay_retry=5,
):
self.follow_redirects = follow_redirects
self.user_agent = user_agent
self.max_retries = max_retries
self.delay_retry = delay_retry

def get(self, url, **kwargs):
def try_get(self, url, **kwargs):
headers = {
"User-Agent": self.user_agent,
}
response_is_final = False
retries = 0
while response_is_final is False:
try:
res = requests.get(
url,
allow_redirects=self.follow_redirects,
Expand All @@ -34,19 +34,31 @@ def get(self, url, **kwargs):
**kwargs
)

if res.status_code != 200:
logger.info("HTTP status code: {0}".format(res.status_code))

if int(res.status_code / 100) in [4, 5]: # 4XX and 5XX codes
logger.info("Waiting 1 second before retrying.")
return False, res
else:
return True, res

except requests.exceptions.ConnectionError:
logger.info("Connection error")
return False, None

def get(self, url, **kwargs):
    """Fetch ``url``, retrying failed attempts up to ``self.max_retries`` times.

    Each attempt is delegated to ``self.try_get``, which returns a
    ``(success, response)`` pair. A failed attempt is followed by a pause
    of ``self.delay_retry`` seconds, but only when another attempt will
    actually be made.

    Args:
        url: The URL to request.
        **kwargs: Passed through unchanged to ``self.try_get``.

    Returns:
        The response from the first successful attempt, or ``None`` once
        ``self.max_retries`` retries have been used up without success.
    """
    retries = 0
    while True:
        success, res = self.try_get(url, **kwargs)
        if success:
            if res.status_code != 200:
                logger.info("HTTP status code: {0}".format(res.status_code))
            return res

        retries += 1
        if retries > self.max_retries:
            # Give up immediately: sleeping here would only delay the caller,
            # and a "before retrying" message would be misleading because no
            # further attempt follows.
            logger.info("Maximum retries reached, skipping.")
            return None

        logger.info(
            "Waiting {0} second(s) before retrying.".format(self.delay_retry)
        )
        time.sleep(self.delay_retry)

0 comments on commit 242b8f1

Please sign in to comment.