Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 44 additions & 3 deletions struct_module/content_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _fetch_github_file(self, github_path):
raise ValueError("Invalid GitHub path. Expected owner/repo/branch/file_path")

owner, repo, branch, file_path = match.groups()
return self._clone_or_fetch_github(owner, repo, branch, file_path, https=True)
return self._github_fetch_with_raw_then_git(owner, repo, branch, file_path, use_https=True)

def _fetch_github_https_file(self, github_path):
"""
Expand All @@ -111,7 +111,7 @@ def _fetch_github_https_file(self, github_path):
raise ValueError("Invalid GitHub path. Expected owner/repo/branch/file_path")

owner, repo, branch, file_path = match.groups()
return self._clone_or_fetch_github(owner, repo, branch, file_path, https=True)
return self._github_fetch_with_raw_then_git(owner, repo, branch, file_path, use_https=True)

def _fetch_github_ssh_file(self, github_path):
"""
Expand All @@ -124,7 +124,7 @@ def _fetch_github_ssh_file(self, github_path):
raise ValueError("Invalid GitHub path. Expected owner/repo/branch/file_path")

owner, repo, branch, file_path = match.groups()
return self._clone_or_fetch_github(owner, repo, branch, file_path, https=False)
return self._github_fetch_with_raw_then_git(owner, repo, branch, file_path, use_https=False)

def _clone_or_fetch_github(self, owner, repo, branch, file_path, https=True):
repo_cache_path = self.cache_dir / f"{owner}_{repo}_{branch}"
Expand All @@ -146,6 +146,47 @@ def _clone_or_fetch_github(self, owner, repo, branch, file_path, https=True):
with file_full_path.open('r') as file:
return file.read()

def _github_fetch_with_raw_then_git(self, owner, repo, branch, file_path, use_https=True):
    """
    Fetch a GitHub-hosted file, preferring a lightweight HTTP request to
    raw.githubusercontent.com and falling back to git clone/pull.

    Resolution order:
      1. If STRUCT_DENY_NETWORK=1, skip HTTP entirely and use the git path.
      2. If a cached clone already exists, reuse it via git to avoid
         surprise network requests.
      3. Otherwise try the raw URL with bounded retries and exponential
         backoff, falling back to git on any failure.

    Args:
      owner: Repository owner (user or organization).
      repo: Repository name.
      branch: Branch, tag, or commit-ish to read from.
      file_path: Path of the file within the repository.
      use_https: Forwarded to the git fallback as its ``https`` flag.

    Returns:
      The file contents as a string.

    Environment:
      STRUCT_DENY_NETWORK: "1" disables raw HTTP fetching.
      STRUCT_HTTP_TIMEOUT: per-request timeout in seconds (default "10").
      STRUCT_HTTP_RETRIES: number of retries after the first attempt (default "2").
    """
    import time  # local import; keeps the raw-fetch path self-contained

    # Explicit opt-out of all HTTP traffic.
    if os.getenv("STRUCT_DENY_NETWORK") == "1":
        self.logger.debug("Network denied by STRUCT_DENY_NETWORK=1; using git fallback if available")
        return self._clone_or_fetch_github(owner, repo, branch, file_path, https=use_https)

    repo_cache_path = self.cache_dir / f"{owner}_{repo}_{branch}"
    if repo_cache_path.exists():
        # Keep existing behavior: use git path if cache exists
        return self._clone_or_fetch_github(owner, repo, branch, file_path, https=use_https)

    # Attempt raw fetch with bounded retries.
    raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"
    timeout = float(os.getenv("STRUCT_HTTP_TIMEOUT", "10"))
    retries = int(os.getenv("STRUCT_HTTP_RETRIES", "2"))

    last_err = None
    for attempt in range(retries + 1):
        try:
            self.logger.debug(f"Attempting raw fetch: {raw_url} (attempt {attempt+1}/{retries+1})")
            resp = requests.get(raw_url, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        # Broad on purpose: callers (and the test suite) may inject plain
        # Exception; any failure here should route to the git fallback.
        except Exception as e:
            last_err = e
            # Exponential backoff capped at 5s — but only BETWEEN attempts.
            # Sleeping after the final failure would just delay the fallback.
            if attempt < retries:
                time.sleep(min(2 ** attempt, 5))

    self.logger.warning(f"Raw GitHub fetch failed, falling back to git. Last error: {last_err}")
    return self._clone_or_fetch_github(owner, repo, branch, file_path, https=use_https)

def _fetch_s3_file(self, s3_path):
"""
Fetch a file from an S3 bucket.
Expand Down
107 changes: 107 additions & 0 deletions tests/test_content_fetcher_more.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,110 @@ def fake_run(args, check):

with pytest.raises(subprocess.CalledProcessError):
cf.fetch_content("githubhttps://owner/repo/main/file.txt")


def test_github_raw_fetch_success_no_git_calls(monkeypatch, tmp_path):
    """A successful raw HTTP fetch returns the body and never invokes git."""
    fetcher = ContentFetcher(cache_dir=tmp_path / "cache")

    # No cache repo exists yet, so the fetcher should try the raw URL first.
    counters = {"http": 0, "git": 0}

    class FakeResponse:
        def __init__(self, body):
            self.text = body

        def raise_for_status(self):
            return None

    def fake_get(url, timeout=None):
        counters["http"] += 1
        assert url == "https://raw.githubusercontent.com/owner/repo/main/path/to/file.txt"
        return FakeResponse("RAW_DATA")

    def fake_run(args, check):
        counters["git"] += 1
        raise AssertionError("git should not be called on raw success")

    monkeypatch.setattr("struct_module.content_fetcher.requests.get", fake_get)
    monkeypatch.setattr(subprocess, "run", fake_run)

    result = fetcher.fetch_content("githubhttps://owner/repo/main/path/to/file.txt")
    assert result == "RAW_DATA"
    assert counters["http"] == 1
    assert counters["git"] == 0


def test_github_raw_fetch_retries_then_fallback_to_git(monkeypatch, tmp_path):
    """When every HTTP attempt fails, the fetcher falls back to git clone."""
    fetcher = ContentFetcher(cache_dir=tmp_path / "cache")
    cached_repo = tmp_path / "cache" / "owner_repo_main"
    target_file = cached_repo / "path/to/file.txt"

    # Every HTTP attempt blows up, forcing the git fallback.
    def failing_get(url, timeout=None):
        raise Exception("network down")

    monkeypatch.setattr("struct_module.content_fetcher.requests.get", failing_get)

    git_calls = {"clone": 0}

    def fake_run(args, check):
        if args[:2] == ["git", "clone"]:
            git_calls["clone"] += 1
            cached_repo.mkdir(parents=True, exist_ok=True)
            target_file.parent.mkdir(parents=True, exist_ok=True)
            target_file.write_text("GIT_DATA")
        elif args[:3] == ["git", "-C", str(cached_repo)]:
            # A pull would land here; not expected on the clone path.
            return None
        else:
            raise AssertionError(f"Unexpected git call: {args}")

    monkeypatch.setattr(subprocess, "run", fake_run)

    # Keep the test fast: no extra retries (and thus minimal backoff).
    monkeypatch.setenv("STRUCT_HTTP_RETRIES", "0")

    result = fetcher.fetch_content("githubhttps://owner/repo/main/path/to/file.txt")
    assert result == "GIT_DATA"
    assert git_calls["clone"] == 1


def test_github_deny_network_uses_git(monkeypatch, tmp_path):
    """STRUCT_DENY_NETWORK=1 must route straight to git with no HTTP call."""
    fetcher = ContentFetcher(cache_dir=tmp_path / "cache")
    cached_repo = tmp_path / "cache" / "owner_repo_main"
    target_file = cached_repo / "path.txt"

    monkeypatch.setenv("STRUCT_DENY_NETWORK", "1")

    # Any HTTP request is a test failure under the deny flag.
    def forbidden_get(url, timeout=None):
        raise AssertionError("HTTP should not be invoked when STRUCT_DENY_NETWORK=1")

    monkeypatch.setattr("struct_module.content_fetcher.requests.get", forbidden_get)

    def fake_run(args, check):
        if args[:2] == ["git", "clone"]:
            cached_repo.mkdir(parents=True, exist_ok=True)
            target_file.write_text("OK")
        elif args[:3] == ["git", "-C", str(cached_repo)]:
            return None

    monkeypatch.setattr(subprocess, "run", fake_run)

    assert fetcher.fetch_content("githubhttps://owner/repo/main/path.txt") == "OK"


def test_github_existing_cache_prefers_git(monkeypatch, tmp_path):
    """A pre-existing cache clone is refreshed via git pull, never HTTP."""
    fetcher = ContentFetcher(cache_dir=tmp_path / "cache")
    cached_repo = tmp_path / "cache" / "owner_repo_main"
    cached_repo.mkdir(parents=True, exist_ok=True)
    (cached_repo / "path.txt").write_text("CACHE_DATA")

    # With a cache present, any HTTP request is a test failure.
    def forbidden_get(url, timeout=None):
        raise AssertionError("HTTP should not be called when cache exists")

    monkeypatch.setattr("struct_module.content_fetcher.requests.get", forbidden_get)

    pull_count = {"count": 0}

    def fake_run(args, check):
        if args[:3] == ["git", "-C", str(cached_repo)]:
            pull_count["count"] += 1

    monkeypatch.setattr(subprocess, "run", fake_run)

    result = fetcher.fetch_content("githubhttps://owner/repo/main/path.txt")
    assert result == "CACHE_DATA"
    assert pull_count["count"] == 1
Loading