Skip to content

Commit

Permalink
Merge pull request #277 from galgeek/rotary_skip_ytdlp
Browse files Browse the repository at this point in the history
skip ytdlp for selected seeds
  • Loading branch information
galgeek committed Jun 3, 2024
2 parents d51d09c + 25f50c5 commit 42c5e6f
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 4 deletions.
17 changes: 16 additions & 1 deletion brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
brozzler/cli.py - brozzler command line executables
Copyright (C) 2014-2023 Internet Archive
Copyright (C) 2014-2024 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -544,12 +544,27 @@ def dump_state(signum, frame):
finally:
signal.signal(signal.SIGQUIT, dump_state)

def get_skip_av_seeds(skip_av_seeds_file="/opt/local/brozzler/skip_av_seeds.txt"):
    """
    Load the set of seed IDs read from the skip_av_seeds file.

    The file is read as one integer seed ID per line. Blank lines are
    ignored. A line that is not an integer is logged and skipped, so one
    bad entry does not discard every other seed in the file.

    Args:
        skip_av_seeds_file: path of the file to read; defaults to the
            deployment location used so far.

    Returns:
        A set of int seed IDs. The set is empty when the file cannot be
        opened or decoded.
    """
    # TODO: develop UI and refactor
    skip_av_seeds = set()
    try:
        with open(skip_av_seeds_file) as skips:
            for line_number, line in enumerate(skips, start=1):
                entry = line.strip()
                if not entry:
                    # int("") raises ValueError; a stray blank line must not
                    # invalidate the whole file
                    continue
                try:
                    skip_av_seeds.add(int(entry))
                except ValueError:
                    logging.warning(
                        "ignoring invalid seed id %r at %s line %d",
                        entry,
                        skip_av_seeds_file,
                        line_number,
                    )
    except (OSError, ValueError) as e:
        # OSError: file missing or unreadable. ValueError covers
        # UnicodeDecodeError raised while iterating over the file.
        logging.info("running with empty skip_av_seeds: %s", e)
        return set()
    logging.info("running with skip_av_seeds file %s", skip_av_seeds_file)
    return skip_av_seeds

rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds_from_file = get_skip_av_seeds()
worker = brozzler.worker.BrozzlerWorker(
frontier,
service_registry,
skip_av_seeds=skip_av_seeds_from_file,
max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe,
proxy=args.proxy,
Expand Down
4 changes: 3 additions & 1 deletion brozzler/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
brozzler/model.py - model classes representing jobs, sites, and pages, with
related logic
Copyright (C) 2014-2022 Internet Archive
Copyright (C) 2014-2024 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -235,6 +235,8 @@ def populate_defaults(self):
self.last_claimed = brozzler.EPOCH_UTC
if not "scope" in self:
self.scope = {}
if not "skip_ytdlp" in self:
self.skip_ytdlp = None

# backward compatibility
if "surt" in self.scope:
Expand Down
4 changes: 3 additions & 1 deletion brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def __init__(
self,
frontier,
service_registry=None,
skip_av_seeds=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
Expand All @@ -73,6 +74,7 @@ def __init__(
):
self._frontier = frontier
self._service_registry = service_registry
self._skip_av_seeds = skip_av_seeds
self._max_browsers = max_browsers

self._warcprox_auto = warcprox_auto
Expand Down Expand Up @@ -261,7 +263,7 @@ def brozzle_page(
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)

if enable_youtube_dl and ydl.should_ytdlp(page, site):
if enable_youtube_dl and ydl.should_ytdlp(site, page, self._skip_av_seeds):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
outlinks.update(ydl_outlinks)
Expand Down
22 changes: 21 additions & 1 deletion brozzler/ydl.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,36 @@
thread_local = threading.local()


def should_ytdlp(page, site):
def should_ytdlp(site, page, skip_av_seeds=None):
    """
    Decide whether yt-dlp should be run for `page`.

    Args:
        site: the site the page belongs to. Read via `site.skip_ytdlp`
            and `site["metadata"]["ait_seed_id"]`; `site.skip_ytdlp` may
            be updated as a side effect (see below).
        page: the page just browsed. Reads `status_code`, `redirect_url`
            and `url`.
        skip_av_seeds: container of seed IDs for which yt-dlp is skipped,
            or None for "no seeds skipped". NOTE(review): membership is
            tested with the raw `ait_seed_id` value, so its type must
            match the container's elements -- confirm against the caller.

    Returns:
        False when the page status is not 200, when the site is already
        marked `skip_ytdlp`, when the URL contains "chrome-error:", or
        when the site's seed ID is in `skip_av_seeds`; True otherwise.

    Side effects:
        For a site with a truthy `ait_seed_id`, `site.skip_ytdlp` is set
        to True if the seed is skipped, and to False otherwise.
    """
    # called only after we've passed needs_browsing() check

    if page.status_code != 200:
        logging.info("skipping ytdlp: non-200 page status")
        return False
    if site.skip_ytdlp:
        logging.info("skipping ytdlp: site marked skip_ytdlp")
        return False

    ytdlp_url = page.redirect_url if page.redirect_url else page.url

    if "chrome-error:" in ytdlp_url:
        return False

    ytdlp_seed = (
        site["metadata"]["ait_seed_id"]
        if "metadata" in site and "ait_seed_id" in site["metadata"]
        else None
    )

    # A worker constructed without skip_av_seeds passes None here; treat
    # that as an empty skip list instead of failing on `in None`.
    if skip_av_seeds is None:
        skip_av_seeds = ()

    # TODO: develop UI and refactor
    if ytdlp_seed:
        if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
            logging.info("skipping ytdlp: site in skip_av_seeds")
            site.skip_ytdlp = True
            return False
        else:
            site.skip_ytdlp = False

    return True


Expand Down

0 comments on commit 42c5e6f

Please sign in to comment.