From 7a9455db88884571faef1f17044003c4e6460836 Mon Sep 17 00:00:00 2001 From: Rodos Date: Fri, 14 Nov 2025 10:17:08 +1100 Subject: [PATCH 1/2] fix: Prevent duplicate attachment downloads Fixes bug where attachments were downloaded multiple times with incremented filenames (file.mov, file_1.mov, file_2.mov) when running backups without --skip-existing flag. I should not have used the --skip-existing flag for attachments, it did not do what I thought it did. The correct approach is to always use the manifest to guide what has already been downloaded and what now needs to be done. --- github_backup/github_backup.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index b0c2aef..d1828d0 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -919,12 +919,6 @@ def download_attachment_file(url, path, auth, as_app=False, fine=False): "error": None, } - if os.path.exists(path): - metadata["success"] = True - metadata["http_status"] = 200 # Assume success if already exists - metadata["size_bytes"] = os.path.getsize(path) - return metadata - # Create simple request (no API query params) request = Request(url) request.add_header("Accept", "application/octet-stream") @@ -1337,10 +1331,10 @@ def download_attachments( attachments_dir = os.path.join(item_cwd, "attachments", str(number)) manifest_path = os.path.join(attachments_dir, "manifest.json") - # Load existing manifest if skip_existing is enabled + # Load existing manifest to prevent duplicate downloads existing_urls = set() existing_metadata = [] - if args.skip_existing and os.path.exists(manifest_path): + if os.path.exists(manifest_path): try: with open(manifest_path, "r") as f: existing_manifest = json.load(f) @@ -1395,9 +1389,6 @@ def download_attachments( filename = get_attachment_filename(url) filepath = os.path.join(attachments_dir, filename) - # Check for collision BEFORE downloading - filepath = resolve_filename_collision(filepath) - # Download and get metadata metadata = download_attachment_file( url, From e4d1c789937fe1ccf7934613ccfbc63fd8b8ab9b Mon Sep 17 00:00:00 2001 From: Rodos Date: Fri, 14 Nov 2025 10:23:29 +1100 Subject: [PATCH 2/2] test: Add pytest infrastructure and attachment tests In making my last fix to attachments, I found it challenging not having tests to ensure there was no regression. Added pytest with minimal setup and isolated configuration. Created a separate test workflow to keep tests isolated from linting. Tests cover the key elements of the attachment logic: - URL extraction from issue bodies - Filename extraction from different URL types - Filename collision resolution - Manifest duplicate prevention --- .github/workflows/test.yml | 33 ++++ pytest.ini | 6 + release-requirements.txt | 1 + tests/__init__.py | 1 + tests/test_attachments.py | 353 +++++++++++++++++++++++++++++++++++++ 5 files changed, 394 insertions(+) create mode 100644 .github/workflows/test.yml create mode 100644 pytest.ini create mode 100644 tests/__init__.py create mode 100644 tests/test_attachments.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..fb43350 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,33 @@ +--- +name: "test" + +# yamllint disable-line rule:truthy +on: + pull_request: + branches: + - "*" + push: + branches: + - "main" + - "master" + +jobs: + test: + name: test + runs-on: ubuntu-24.04 + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + + steps: + - name: Checkout repository + uses: actions/checkout@v5 + with: + fetch-depth: 0 + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + - run: pip install -r release-requirements.txt + - run: pytest tests/ -v diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..a1edb37 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,6 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = -v diff --git a/release-requirements.txt b/release-requirements.txt index b3e9f19..2a9b2ba 100644 --- a/release-requirements.txt +++ b/release-requirements.txt @@ -8,6 +8,7 @@ colorama==0.4.6 docutils==0.22.3 flake8==7.3.0 gitchangelog==3.0.4 +pytest==8.3.3 idna==3.11 importlib-metadata==8.7.0 jaraco.classes==3.4.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..5675dbd --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for python-github-backup.""" diff --git a/tests/test_attachments.py b/tests/test_attachments.py new file mode 100644 index 0000000..07c1b33 --- /dev/null +++ b/tests/test_attachments.py @@ -0,0 +1,353 @@ +"""Behavioral tests for attachment functionality.""" + +import json +import os +import tempfile +from pathlib import Path +from unittest.mock import Mock + +import pytest + +from github_backup import github_backup + + +@pytest.fixture +def attachment_test_setup(tmp_path): + """Fixture providing setup and helper for attachment download tests.""" + from unittest.mock import patch + + issue_cwd = tmp_path / "issues" + issue_cwd.mkdir() + + # Mock args + args = Mock() + args.as_app = False + args.token_fine = None + args.token_classic = None + args.username = None + args.password = None + args.osx_keychain_item_name = None + args.osx_keychain_item_account = None + args.user = "testuser" + args.repository = "testrepo" + + repository = {"full_name": "testuser/testrepo"} + + def call_download(issue_data, issue_number=123): + """Call download_attachments with mocked HTTP downloads. + + Returns list of URLs that were actually downloaded. + """ + downloaded_urls = [] + + def mock_download(url, path, auth, as_app, fine): + downloaded_urls.append(url) + return { + "success": True, + "saved_as": os.path.basename(path), + "url": url, + } + + with patch( + "github_backup.github_backup.download_attachment_file", + side_effect=mock_download, + ): + github_backup.download_attachments( + args, str(issue_cwd), issue_data, issue_number, repository + ) + + return downloaded_urls + + return { + "issue_cwd": str(issue_cwd), + "args": args, + "repository": repository, + "call_download": call_download, + } + + +class TestURLExtraction: + """Test URL extraction with realistic issue content.""" + + def test_mixed_urls(self): + issue_data = { + "body": """ + ## Bug Report + + When uploading files, I see this error. Here's a screenshot: + https://github.com/user-attachments/assets/abc123def456 + + The logs show: https://github.com/user-attachments/files/789/error-log.txt + + This is similar to https://github.com/someorg/somerepo/issues/42 but different. + + You can also see the video at https://user-images.githubusercontent.com/12345/video-demo.mov + + Here's how to reproduce: + ```bash + # Don't extract this example URL: + curl https://github.com/user-attachments/assets/example999 + ``` + + More info at https://docs.example.com/guide + + Also see this inline code `https://github.com/user-attachments/files/111/inline.pdf` should not extract. + + Final attachment: https://github.com/user-attachments/files/222/report.pdf. + """, + "comment_data": [ + { + "body": "Here's another attachment: https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123" + }, + { + "body": """ + Example code: + ```python + url = "https://github.com/user-attachments/assets/code-example" + ``` + But this is real: https://github.com/user-attachments/files/333/actual.zip + """ + }, + ], + } + + # Extract URLs + urls = github_backup.extract_attachment_urls(issue_data) + + expected_urls = [ + "https://github.com/user-attachments/assets/abc123def456", + "https://github.com/user-attachments/files/789/error-log.txt", + "https://user-images.githubusercontent.com/12345/video-demo.mov", + "https://github.com/user-attachments/files/222/report.pdf", + "https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123", + "https://github.com/user-attachments/files/333/actual.zip", + ] + + assert set(urls) == set(expected_urls) + + def test_trailing_punctuation_stripped(self): + """URLs with trailing punctuation should have punctuation stripped.""" + issue_data = { + "body": """ + See this file: https://github.com/user-attachments/files/1/doc.pdf. + And this one (https://github.com/user-attachments/files/2/image.png). + Check it out! https://github.com/user-attachments/files/3/data.csv! + """ + } + + urls = github_backup.extract_attachment_urls(issue_data) + + expected = [ + "https://github.com/user-attachments/files/1/doc.pdf", + "https://github.com/user-attachments/files/2/image.png", + "https://github.com/user-attachments/files/3/data.csv", + ] + assert set(urls) == set(expected) + + def test_deduplication_across_body_and_comments(self): + """Same URL in body and comments should only appear once.""" + duplicate_url = "https://github.com/user-attachments/assets/abc123" + + issue_data = { + "body": f"First mention: {duplicate_url}", + "comment_data": [ + {"body": f"Second mention: {duplicate_url}"}, + {"body": f"Third mention: {duplicate_url}"}, + ], + } + + urls = github_backup.extract_attachment_urls(issue_data) + + assert set(urls) == {duplicate_url} + + +class TestFilenameExtraction: + """Test filename extraction from different URL types.""" + + def test_modern_assets_url(self): + """Modern assets URL returns UUID.""" + url = "https://github.com/user-attachments/assets/abc123def456" + filename = github_backup.get_attachment_filename(url) + assert filename == "abc123def456" + + def test_modern_files_url(self): + """Modern files URL returns filename.""" + url = "https://github.com/user-attachments/files/12345/report.pdf" + filename = github_backup.get_attachment_filename(url) + assert filename == "report.pdf" + + def test_legacy_cdn_url(self): + """Legacy CDN URL returns filename with extension.""" + url = "https://user-images.githubusercontent.com/123456/abc-def.png" + filename = github_backup.get_attachment_filename(url) + assert filename == "abc-def.png" + + def test_private_cdn_url(self): + """Private CDN URL returns filename.""" + url = "https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123" + filename = github_backup.get_attachment_filename(url) + assert filename == "secret.png" + + def test_repo_files_url(self): + """Repo-scoped files URL returns filename.""" + url = "https://github.com/owner/repo/files/789/document.txt" + filename = github_backup.get_attachment_filename(url) + assert filename == "document.txt" + + +class TestFilenameCollision: + """Test filename collision resolution.""" + + def test_collision_behavior(self): + """Test filename collision resolution with real files.""" + with tempfile.TemporaryDirectory() as tmpdir: + # No collision - file doesn't exist + result = github_backup.resolve_filename_collision( + os.path.join(tmpdir, "report.pdf") + ) + assert result == os.path.join(tmpdir, "report.pdf") + + # Create the file, now collision exists + Path(os.path.join(tmpdir, "report.pdf")).touch() + result = github_backup.resolve_filename_collision( + os.path.join(tmpdir, "report.pdf") + ) + assert result == os.path.join(tmpdir, "report_1.pdf") + + # Create report_1.pdf too + Path(os.path.join(tmpdir, "report_1.pdf")).touch() + result = github_backup.resolve_filename_collision( + os.path.join(tmpdir, "report.pdf") + ) + assert result == os.path.join(tmpdir, "report_2.pdf") + + def test_manifest_reserved(self): + """manifest.json is always treated as reserved.""" + with tempfile.TemporaryDirectory() as tmpdir: + # Even if manifest.json doesn't exist, should get manifest_1.json + result = github_backup.resolve_filename_collision( + os.path.join(tmpdir, "manifest.json") + ) + assert result == os.path.join(tmpdir, "manifest_1.json") + + +class TestManifestDuplicatePrevention: + """Test that manifest prevents duplicate downloads (the bug fix).""" + + def test_manifest_filters_existing_urls(self, attachment_test_setup): + """URLs in manifest are not re-downloaded.""" + setup = attachment_test_setup + + # Create manifest with existing URLs + attachments_dir = os.path.join(setup["issue_cwd"], "attachments", "123") + os.makedirs(attachments_dir) + manifest_path = os.path.join(attachments_dir, "manifest.json") + + manifest = { + "attachments": [ + { + "url": "https://github.com/user-attachments/assets/old1", + "success": True, + "saved_as": "old1.pdf", + }, + { + "url": "https://github.com/user-attachments/assets/old2", + "success": True, + "saved_as": "old2.pdf", + }, + ] + } + with open(manifest_path, "w") as f: + json.dump(manifest, f) + + # Issue data with 2 old URLs and 1 new URL + issue_data = { + "body": """ + Old: https://github.com/user-attachments/assets/old1 + Old: https://github.com/user-attachments/assets/old2 + New: https://github.com/user-attachments/assets/new1 + """ + } + + downloaded_urls = setup["call_download"](issue_data) + + # Should only download the NEW URL (old ones filtered by manifest) + assert len(downloaded_urls) == 1 + assert downloaded_urls[0] == "https://github.com/user-attachments/assets/new1" + + def test_no_manifest_downloads_all(self, attachment_test_setup): + """Without manifest, all URLs should be downloaded.""" + setup = attachment_test_setup + + # Issue data with 2 URLs + issue_data = { + "body": """ + https://github.com/user-attachments/assets/url1 + https://github.com/user-attachments/assets/url2 + """ + } + + downloaded_urls = setup["call_download"](issue_data) + + # Should download ALL URLs (no manifest to filter) + assert len(downloaded_urls) == 2 + assert set(downloaded_urls) == { + "https://github.com/user-attachments/assets/url1", + "https://github.com/user-attachments/assets/url2", + } + + def test_manifest_skips_permanent_failures(self, attachment_test_setup): + """Manifest skips permanent failures (404, 410) but retries transient (503).""" + setup = attachment_test_setup + + # Create manifest with different failure types + attachments_dir = os.path.join(setup["issue_cwd"], "attachments", "123") + os.makedirs(attachments_dir) + manifest_path = os.path.join(attachments_dir, "manifest.json") + + manifest = { + "attachments": [ + { + "url": "https://github.com/user-attachments/assets/success", + "success": True, + "saved_as": "success.pdf", + }, + { + "url": "https://github.com/user-attachments/assets/notfound", + "success": False, + "http_status": 404, + }, + { + "url": "https://github.com/user-attachments/assets/gone", + "success": False, + "http_status": 410, + }, + { + "url": "https://github.com/user-attachments/assets/unavailable", + "success": False, + "http_status": 503, + }, + ] + } + with open(manifest_path, "w") as f: + json.dump(manifest, f) + + # Issue data has all 4 URLs + issue_data = { + "body": """ + https://github.com/user-attachments/assets/success + https://github.com/user-attachments/assets/notfound + https://github.com/user-attachments/assets/gone + https://github.com/user-attachments/assets/unavailable + """ + } + + downloaded_urls = setup["call_download"](issue_data) + + # Should only retry 503 (transient failure) + # Success, 404, and 410 should be skipped + assert len(downloaded_urls) == 1 + assert ( + downloaded_urls[0] + == "https://github.com/user-attachments/assets/unavailable" + )