Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 44 additions & 3 deletions struct_module/content_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _fetch_github_file(self, github_path):
raise ValueError("Invalid GitHub path. Expected owner/repo/branch/file_path")

owner, repo, branch, file_path = match.groups()
return self._clone_or_fetch_github(owner, repo, branch, file_path, https=True)
return self._github_fetch_with_raw_then_git(owner, repo, branch, file_path, use_https=True)

def _fetch_github_https_file(self, github_path):
"""
Expand All @@ -111,7 +111,7 @@ def _fetch_github_https_file(self, github_path):
raise ValueError("Invalid GitHub path. Expected owner/repo/branch/file_path")

owner, repo, branch, file_path = match.groups()
return self._clone_or_fetch_github(owner, repo, branch, file_path, https=True)
return self._github_fetch_with_raw_then_git(owner, repo, branch, file_path, use_https=True)

def _fetch_github_ssh_file(self, github_path):
"""
Expand All @@ -124,7 +124,7 @@ def _fetch_github_ssh_file(self, github_path):
raise ValueError("Invalid GitHub path. Expected owner/repo/branch/file_path")

owner, repo, branch, file_path = match.groups()
return self._clone_or_fetch_github(owner, repo, branch, file_path, https=False)
return self._github_fetch_with_raw_then_git(owner, repo, branch, file_path, use_https=False)

def _clone_or_fetch_github(self, owner, repo, branch, file_path, https=True):
repo_cache_path = self.cache_dir / f"{owner}_{repo}_{branch}"
Expand All @@ -146,6 +146,47 @@ def _clone_or_fetch_github(self, owner, repo, branch, file_path, https=True):
with file_full_path.open('r') as file:
return file.read()

def _github_fetch_with_raw_then_git(self, owner, repo, branch, file_path, use_https=True):
    """
    Fetch a GitHub-hosted file, preferring a lightweight HTTP request to
    raw.githubusercontent.com and falling back to git clone/pull.

    Resolution order:
      1. If STRUCT_DENY_NETWORK=1, skip HTTP entirely and use the git path.
      2. If a cached clone already exists, reuse it via git to avoid
         surprise network requests.
      3. Otherwise try the raw URL with bounded retries and exponential
         backoff, falling back to git on any failure.

    Args:
      owner: Repository owner (user or organization).
      repo: Repository name.
      branch: Branch, tag, or commit-ish to read from.
      file_path: Path of the file within the repository.
      use_https: Forwarded to the git fallback as its ``https`` flag.

    Returns:
      The file contents as a string.

    Environment:
      STRUCT_DENY_NETWORK: "1" disables raw HTTP fetching.
      STRUCT_HTTP_TIMEOUT: per-request timeout in seconds (default "10").
      STRUCT_HTTP_RETRIES: number of retries after the first attempt (default "2").
    """
    import time  # local import; keeps the raw-fetch path self-contained

    # Explicit opt-out of all HTTP traffic.
    if os.getenv("STRUCT_DENY_NETWORK") == "1":
        self.logger.debug("Network denied by STRUCT_DENY_NETWORK=1; using git fallback if available")
        return self._clone_or_fetch_github(owner, repo, branch, file_path, https=use_https)

    repo_cache_path = self.cache_dir / f"{owner}_{repo}_{branch}"
    if repo_cache_path.exists():
        # Keep existing behavior: use git path if cache exists
        return self._clone_or_fetch_github(owner, repo, branch, file_path, https=use_https)

    # Attempt raw fetch with bounded retries.
    raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"
    timeout = float(os.getenv("STRUCT_HTTP_TIMEOUT", "10"))
    retries = int(os.getenv("STRUCT_HTTP_RETRIES", "2"))

    last_err = None
    for attempt in range(retries + 1):
        try:
            self.logger.debug(f"Attempting raw fetch: {raw_url} (attempt {attempt+1}/{retries+1})")
            resp = requests.get(raw_url, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        # Broad on purpose: callers (and the test suite) may inject plain
        # Exception; any failure here should route to the git fallback.
        except Exception as e:
            last_err = e
            # Exponential backoff capped at 5s — but only BETWEEN attempts.
            # Sleeping after the final failure would just delay the fallback.
            if attempt < retries:
                time.sleep(min(2 ** attempt, 5))

    self.logger.warning(f"Raw GitHub fetch failed, falling back to git. Last error: {last_err}")
    return self._clone_or_fetch_github(owner, repo, branch, file_path, https=use_https)

def _fetch_s3_file(self, s3_path):
"""
Fetch a file from an S3 bucket.
Expand Down
107 changes: 107 additions & 0 deletions tests/test_content_fetcher_more.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,110 @@ def fake_run(args, check):

with pytest.raises(subprocess.CalledProcessError):
cf.fetch_content("githubhttps://owner/repo/main/file.txt")


def test_github_raw_fetch_success_no_git_calls(monkeypatch, tmp_path):
    """A successful raw HTTP fetch returns the body and never invokes git."""
    fetcher = ContentFetcher(cache_dir=tmp_path / "cache")

    # No cache repo exists yet, so the fetcher should try the raw URL first.
    counters = {"http": 0, "git": 0}

    class FakeResponse:
        def __init__(self, body):
            self.text = body

        def raise_for_status(self):
            return None

    def fake_get(url, timeout=None):
        counters["http"] += 1
        assert url == "https://raw.githubusercontent.com/owner/repo/main/path/to/file.txt"
        return FakeResponse("RAW_DATA")

    def fake_run(args, check):
        counters["git"] += 1
        raise AssertionError("git should not be called on raw success")

    monkeypatch.setattr("struct_module.content_fetcher.requests.get", fake_get)
    monkeypatch.setattr(subprocess, "run", fake_run)

    result = fetcher.fetch_content("githubhttps://owner/repo/main/path/to/file.txt")
    assert result == "RAW_DATA"
    assert counters["http"] == 1
    assert counters["git"] == 0


def test_github_raw_fetch_retries_then_fallback_to_git(monkeypatch, tmp_path):
    """When every HTTP attempt fails, the fetcher falls back to git clone."""
    fetcher = ContentFetcher(cache_dir=tmp_path / "cache")
    cached_repo = tmp_path / "cache" / "owner_repo_main"
    target_file = cached_repo / "path/to/file.txt"

    # Every HTTP attempt blows up, forcing the git fallback.
    def failing_get(url, timeout=None):
        raise Exception("network down")

    monkeypatch.setattr("struct_module.content_fetcher.requests.get", failing_get)

    git_calls = {"clone": 0}

    def fake_run(args, check):
        if args[:2] == ["git", "clone"]:
            git_calls["clone"] += 1
            cached_repo.mkdir(parents=True, exist_ok=True)
            target_file.parent.mkdir(parents=True, exist_ok=True)
            target_file.write_text("GIT_DATA")
        elif args[:3] == ["git", "-C", str(cached_repo)]:
            # A pull would land here; not expected on the clone path.
            return None
        else:
            raise AssertionError(f"Unexpected git call: {args}")

    monkeypatch.setattr(subprocess, "run", fake_run)

    # Keep the test fast: no extra retries (and thus minimal backoff).
    monkeypatch.setenv("STRUCT_HTTP_RETRIES", "0")

    result = fetcher.fetch_content("githubhttps://owner/repo/main/path/to/file.txt")
    assert result == "GIT_DATA"
    assert git_calls["clone"] == 1


def test_github_deny_network_uses_git(monkeypatch, tmp_path):
    """STRUCT_DENY_NETWORK=1 must route straight to git with no HTTP call."""
    fetcher = ContentFetcher(cache_dir=tmp_path / "cache")
    cached_repo = tmp_path / "cache" / "owner_repo_main"
    target_file = cached_repo / "path.txt"

    monkeypatch.setenv("STRUCT_DENY_NETWORK", "1")

    # Any HTTP request is a test failure under the deny flag.
    def forbidden_get(url, timeout=None):
        raise AssertionError("HTTP should not be invoked when STRUCT_DENY_NETWORK=1")

    monkeypatch.setattr("struct_module.content_fetcher.requests.get", forbidden_get)

    def fake_run(args, check):
        if args[:2] == ["git", "clone"]:
            cached_repo.mkdir(parents=True, exist_ok=True)
            target_file.write_text("OK")
        elif args[:3] == ["git", "-C", str(cached_repo)]:
            return None

    monkeypatch.setattr(subprocess, "run", fake_run)

    assert fetcher.fetch_content("githubhttps://owner/repo/main/path.txt") == "OK"


def test_github_existing_cache_prefers_git(monkeypatch, tmp_path):
    """A pre-existing cache clone is refreshed via git pull, never HTTP."""
    fetcher = ContentFetcher(cache_dir=tmp_path / "cache")
    cached_repo = tmp_path / "cache" / "owner_repo_main"
    cached_repo.mkdir(parents=True, exist_ok=True)
    (cached_repo / "path.txt").write_text("CACHE_DATA")

    # With a cache present, any HTTP request is a test failure.
    def forbidden_get(url, timeout=None):
        raise AssertionError("HTTP should not be called when cache exists")

    monkeypatch.setattr("struct_module.content_fetcher.requests.get", forbidden_get)

    pull_count = {"count": 0}

    def fake_run(args, check):
        if args[:3] == ["git", "-C", str(cached_repo)]:
            pull_count["count"] += 1

    monkeypatch.setattr(subprocess, "run", fake_run)

    result = fetcher.fetch_content("githubhttps://owner/repo/main/path.txt")
    assert result == "CACHE_DATA"
    assert pull_count["count"] == 1
Loading