From 45f72ea718413c1bcf50dce46fc614c0fc187793 Mon Sep 17 00:00:00 2001 From: Kenneth Belitzky Date: Sun, 10 Aug 2025 22:37:56 -0300 Subject: [PATCH 1/2] feat(content_fetcher): use raw.githubusercontent.com for single-file GitHub fetch with retries/timeouts; fallback to git clone/pull; support STRUCT_DENY_NETWORK, STRUCT_HTTP_TIMEOUT, STRUCT_HTTP_RETRIES (closes #92) --- struct_module/content_fetcher.py | 47 ++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/struct_module/content_fetcher.py b/struct_module/content_fetcher.py index ef948a4..7aec59a 100644 --- a/struct_module/content_fetcher.py +++ b/struct_module/content_fetcher.py @@ -98,7 +98,7 @@ def _fetch_github_file(self, github_path): raise ValueError("Invalid GitHub path. Expected owner/repo/branch/file_path") owner, repo, branch, file_path = match.groups() - return self._clone_or_fetch_github(owner, repo, branch, file_path, https=True) + return self._github_fetch_with_raw_then_git(owner, repo, branch, file_path, use_https=True) def _fetch_github_https_file(self, github_path): """ @@ -111,7 +111,7 @@ def _fetch_github_https_file(self, github_path): raise ValueError("Invalid GitHub path. Expected owner/repo/branch/file_path") owner, repo, branch, file_path = match.groups() - return self._clone_or_fetch_github(owner, repo, branch, file_path, https=True) + return self._github_fetch_with_raw_then_git(owner, repo, branch, file_path, use_https=True) def _fetch_github_ssh_file(self, github_path): """ @@ -124,7 +124,7 @@ def _fetch_github_ssh_file(self, github_path): raise ValueError("Invalid GitHub path. Expected owner/repo/branch/file_path") owner, repo, branch, file_path = match.groups() - return self._clone_or_fetch_github(owner, repo, branch, file_path, https=False) + return self._github_fetch_with_raw_then_git(owner, repo, branch, file_path, use_https=False) def _clone_or_fetch_github(self, owner, repo, branch, file_path, https=True): repo_cache_path = self.cache_dir / f"{owner}_{repo}_{branch}" @@ -146,6 +146,47 @@ def _clone_or_fetch_github(self, owner, repo, branch, file_path, https=True): with file_full_path.open('r') as file: return file.read() + def _github_fetch_with_raw_then_git(self, owner, repo, branch, file_path, use_https=True): + """ + Try lightweight fetch via raw.githubusercontent.com first. If it fails + (network disabled, HTTP error, etc.), fall back to git clone/pull. + If a local cache repo exists already, prefer using git path directly + to avoid surprise network requests. + """ + # Deny network option + if os.getenv("STRUCT_DENY_NETWORK") == "1": + self.logger.debug("Network denied by STRUCT_DENY_NETWORK=1; using git fallback if available") + return self._clone_or_fetch_github(owner, repo, branch, file_path, https=use_https) + + repo_cache_path = self.cache_dir / f"{owner}_{repo}_{branch}" + if repo_cache_path.exists(): + # Keep existing behavior: use git path if cache exists + return self._clone_or_fetch_github(owner, repo, branch, file_path, https=use_https) + + # Attempt raw fetch + raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}" + timeout = float(os.getenv("STRUCT_HTTP_TIMEOUT", "10")) + retries = int(os.getenv("STRUCT_HTTP_RETRIES", "2")) + + last_err = None + for attempt in range(retries + 1): + try: + self.logger.debug(f"Attempting raw fetch: {raw_url} (attempt {attempt+1}/{retries+1})") + resp = requests.get(raw_url, timeout=timeout) + resp.raise_for_status() + return resp.text + except Exception as e: + last_err = e + # simple backoff + try: + import time + time.sleep(min(2 ** attempt, 5)) + except Exception: + pass + + self.logger.warning(f"Raw GitHub fetch failed, falling back to git. Last error: {last_err}") + return self._clone_or_fetch_github(owner, repo, branch, file_path, https=use_https) + def _fetch_s3_file(self, s3_path): """ Fetch a file from an S3 bucket. From a65dbb4d1313cde81d1b513e26aa595a4bd6ab59 Mon Sep 17 00:00:00 2001 From: Kenneth Belitzky Date: Sun, 10 Aug 2025 22:42:31 -0300 Subject: [PATCH 2/2] test(content_fetcher): add coverage for raw GitHub fetch, fallback, deny-network, and existing cache behavior (refs #92) --- tests/test_content_fetcher_more.py | 107 +++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/tests/test_content_fetcher_more.py b/tests/test_content_fetcher_more.py index 88c9c0f..585df2f 100644 --- a/tests/test_content_fetcher_more.py +++ b/tests/test_content_fetcher_more.py @@ -219,3 +219,110 @@ def fake_run(args, check): with pytest.raises(subprocess.CalledProcessError): cf.fetch_content("githubhttps://owner/repo/main/file.txt") + + +def test_github_raw_fetch_success_no_git_calls(monkeypatch, tmp_path): + cf = ContentFetcher(cache_dir=tmp_path / "cache") + # Ensure no existing cache repo + # Prepare requests.get to return content + class Resp: + def __init__(self, text): + self.text = text + def raise_for_status(self): + return None + called = {"http": 0, "git": 0} + def fake_get(url, timeout=None): + called["http"] += 1 + assert url == "https://raw.githubusercontent.com/owner/repo/main/path/to/file.txt" + return Resp("RAW_DATA") + def fake_run(args, check): + called["git"] += 1 + raise AssertionError("git should not be called on raw success") + monkeypatch.setattr("struct_module.content_fetcher.requests.get", fake_get) + monkeypatch.setattr(subprocess, "run", fake_run) + + out = cf.fetch_content("githubhttps://owner/repo/main/path/to/file.txt") + assert out == "RAW_DATA" + assert called["http"] == 1 + assert called["git"] == 0 + + +def test_github_raw_fetch_retries_then_fallback_to_git(monkeypatch, tmp_path): + cf = ContentFetcher(cache_dir=tmp_path / "cache") + repo_dir = tmp_path / "cache" / "owner_repo_main" + file_rel = "path/to/file.txt" + file_full = repo_dir / file_rel + + # Fail HTTP calls + def bad_get(url, timeout=None): + raise Exception("network down") + monkeypatch.setattr("struct_module.content_fetcher.requests.get", bad_get) + + calls = {"clone": 0} + def fake_run(args, check): + if args[:2] == ["git", "clone"]: + calls["clone"] += 1 + repo_dir.mkdir(parents=True, exist_ok=True) + file_full.parent.mkdir(parents=True, exist_ok=True) + file_full.write_text("GIT_DATA") + elif args[:3] == ["git", "-C", str(repo_dir)]: + # no-op for pull after clone path (not expected here) + return None + else: + raise AssertionError(f"Unexpected git call: {args}") + monkeypatch.setattr(subprocess, "run", fake_run) + + # Speed up retries/backoff by setting retries=0 via env + monkeypatch.setenv("STRUCT_HTTP_RETRIES", "0") + + out = cf.fetch_content("githubhttps://owner/repo/main/path/to/file.txt") + assert out == "GIT_DATA" + assert calls["clone"] == 1 + + +def test_github_deny_network_uses_git(monkeypatch, tmp_path): + cf = ContentFetcher(cache_dir=tmp_path / "cache") + repo_dir = tmp_path / "cache" / "owner_repo_main" + file_full = repo_dir / "path.txt" + + # Deny network + monkeypatch.setenv("STRUCT_DENY_NETWORK", "1") + + # HTTP must not be called + def bad_get(url, timeout=None): + raise AssertionError("HTTP should not be invoked when STRUCT_DENY_NETWORK=1") + monkeypatch.setattr("struct_module.content_fetcher.requests.get", bad_get) + + def fake_run(args, check): + if args[:2] == ["git", "clone"]: + repo_dir.mkdir(parents=True, exist_ok=True) + file_full.write_text("OK") + elif args[:3] == ["git", "-C", str(repo_dir)]: + return None + monkeypatch.setattr(subprocess, "run", fake_run) + + out = cf.fetch_content("githubhttps://owner/repo/main/path.txt") + assert out == "OK" + + +def test_github_existing_cache_prefers_git(monkeypatch, tmp_path): + cf = ContentFetcher(cache_dir=tmp_path / "cache") + repo_dir = tmp_path / "cache" / "owner_repo_main" + file_full = repo_dir / "path.txt" + repo_dir.mkdir(parents=True, exist_ok=True) + file_full.write_text("CACHE_DATA") + + # HTTP must not be needed; if called, fail + def bad_get(url, timeout=None): + raise AssertionError("HTTP should not be called when cache exists") + monkeypatch.setattr("struct_module.content_fetcher.requests.get", bad_get) + + pulls = {"count": 0} + def fake_run(args, check): + if args[:3] == ["git", "-C", str(repo_dir)]: + pulls["count"] += 1 + monkeypatch.setattr(subprocess, "run", fake_run) + + out = cf.fetch_content("githubhttps://owner/repo/main/path.txt") + assert out == "CACHE_DATA" + assert pulls["count"] == 1