From 7840528fe25f95b7ed4f0aacab602288f1f73c74 Mon Sep 17 00:00:00 2001 From: Rodos Date: Sat, 29 Nov 2025 09:19:23 +1100 Subject: [PATCH] Skip DMCA'd repos which return a 451 response Log a warning and the link to the DMCA notice. Continue backing up other repositories instead of crashing. Closes #163 --- github_backup/github_backup.py | 87 +++++++++++++------- tests/test_http_451.py | 143 +++++++++++++++++++++++++++++++++ 2 files changed, 201 insertions(+), 29 deletions(-) create mode 100644 tests/test_http_451.py diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 14f0ed8..dcf79e8 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -37,6 +37,15 @@ FILE_URI_PREFIX = "file://" logger = logging.getLogger(__name__) + +class RepositoryUnavailableError(Exception): + """Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown).""" + + def __init__(self, message, dmca_url=None): + super().__init__(message) + self.dmca_url = dmca_url + + # Setup SSL context with fallback chain https_ctx = ssl.create_default_context() if https_ctx.get_ca_certs(): @@ -612,6 +621,19 @@ def retrieve_data_gen(args, template, query_args=None, single_request=False): status_code = int(r.getcode()) + # Handle DMCA takedown (HTTP 451) - raise exception to skip entire repository + if status_code == 451: + dmca_url = None + try: + response_data = json.loads(r.read().decode("utf-8")) + dmca_url = response_data.get("block", {}).get("html_url") + except Exception: + pass + raise RepositoryUnavailableError( + "Repository unavailable due to legal reasons (HTTP 451)", + dmca_url=dmca_url + ) + # Check if we got correct data try: response = json.loads(r.read().decode("utf-8")) @@ -1668,40 +1690,47 @@ def backup_repositories(args, output_directory, repositories): continue # don't try to back anything else for a gist; it doesn't exist - download_wiki = args.include_wiki or args.include_everything - if repository["has_wiki"] and download_wiki: - fetch_repository( - repository["name"], - repo_url.replace(".git", ".wiki.git"), - os.path.join(repo_cwd, "wiki"), - skip_existing=args.skip_existing, - bare_clone=args.bare_clone, - lfs_clone=args.lfs_clone, - no_prune=args.no_prune, - ) - if args.include_issues or args.include_everything: - backup_issues(args, repo_cwd, repository, repos_template) + try: + download_wiki = args.include_wiki or args.include_everything + if repository["has_wiki"] and download_wiki: + fetch_repository( + repository["name"], + repo_url.replace(".git", ".wiki.git"), + os.path.join(repo_cwd, "wiki"), + skip_existing=args.skip_existing, + bare_clone=args.bare_clone, + lfs_clone=args.lfs_clone, + no_prune=args.no_prune, + ) + if args.include_issues or args.include_everything: + backup_issues(args, repo_cwd, repository, repos_template) - if args.include_pulls or args.include_everything: - backup_pulls(args, repo_cwd, repository, repos_template) + if args.include_pulls or args.include_everything: + backup_pulls(args, repo_cwd, repository, repos_template) - if args.include_milestones or args.include_everything: - backup_milestones(args, repo_cwd, repository, repos_template) + if args.include_milestones or args.include_everything: + backup_milestones(args, repo_cwd, repository, repos_template) - if args.include_labels or args.include_everything: - backup_labels(args, repo_cwd, repository, repos_template) + if args.include_labels or args.include_everything: + backup_labels(args, repo_cwd, repository, repos_template) - if args.include_hooks or args.include_everything: - backup_hooks(args, repo_cwd, repository, repos_template) + if args.include_hooks or args.include_everything: + backup_hooks(args, repo_cwd, repository, repos_template) - if args.include_releases or args.include_everything: - backup_releases( - args, - repo_cwd, - repository, - repos_template, - include_assets=args.include_assets or args.include_everything, - ) + if args.include_releases or args.include_everything: + backup_releases( + args, + repo_cwd, + repository, + repos_template, + include_assets=args.include_assets or args.include_everything, + ) + except RepositoryUnavailableError as e: + logger.warning(f"Repository {repository['full_name']} is unavailable (HTTP 451)") + if e.dmca_url: + logger.warning(f"DMCA notice: {e.dmca_url}") + logger.info(f"Skipping remaining resources for {repository['full_name']}") + continue if args.incremental: if last_update == "0000-00-00T00:00:00Z": diff --git a/tests/test_http_451.py b/tests/test_http_451.py new file mode 100644 index 0000000..7feca1d --- /dev/null +++ b/tests/test_http_451.py @@ -0,0 +1,143 @@ +"""Tests for HTTP 451 (DMCA takedown) handling.""" + +import json +from unittest.mock import Mock, patch + +import pytest + +from github_backup import github_backup + + +class TestHTTP451Exception: + """Test suite for HTTP 451 DMCA takedown exception handling.""" + + def test_repository_unavailable_error_raised(self): + """HTTP 451 should raise RepositoryUnavailableError with DMCA URL.""" + # Create mock args + args = Mock() + args.as_app = False + args.token_fine = None + args.token_classic = None + args.username = None + args.password = None + args.osx_keychain_item_name = None + args.osx_keychain_item_account = None + args.throttle_limit = None + args.throttle_pause = 0 + + # Mock HTTPError 451 response + mock_response = Mock() + mock_response.getcode.return_value = 451 + + dmca_data = { + "message": "Repository access blocked", + "block": { + "reason": "dmca", + "created_at": "2024-11-12T14:38:04Z", + "html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md" + } + } + mock_response.read.return_value = json.dumps(dmca_data).encode("utf-8") + mock_response.headers = {"x-ratelimit-remaining": "5000"} + mock_response.reason = "Unavailable For Legal Reasons" + + def mock_get_response(request, auth, template): + return mock_response, [] + + with patch("github_backup.github_backup._get_response", side_effect=mock_get_response): + with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: + list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues")) + + # Check exception has DMCA URL + assert exc_info.value.dmca_url == "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md" + assert "451" in str(exc_info.value) + + def test_repository_unavailable_error_without_dmca_url(self): + """HTTP 451 without DMCA details should still raise exception.""" + args = Mock() + args.as_app = False + args.token_fine = None + args.token_classic = None + args.username = None + args.password = None + args.osx_keychain_item_name = None + args.osx_keychain_item_account = None + args.throttle_limit = None + args.throttle_pause = 0 + + mock_response = Mock() + mock_response.getcode.return_value = 451 + mock_response.read.return_value = b'{"message": "Blocked"}' + mock_response.headers = {"x-ratelimit-remaining": "5000"} + mock_response.reason = "Unavailable For Legal Reasons" + + def mock_get_response(request, auth, template): + return mock_response, [] + + with patch("github_backup.github_backup._get_response", side_effect=mock_get_response): + with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: + list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues")) + + # Exception raised even without DMCA URL + assert exc_info.value.dmca_url is None + assert "451" in str(exc_info.value) + + def test_repository_unavailable_error_with_malformed_json(self): + """HTTP 451 with malformed JSON should still raise exception.""" + args = Mock() + args.as_app = False + args.token_fine = None + args.token_classic = None + args.username = None + args.password = None + args.osx_keychain_item_name = None + args.osx_keychain_item_account = None + args.throttle_limit = None + args.throttle_pause = 0 + + mock_response = Mock() + mock_response.getcode.return_value = 451 + mock_response.read.return_value = b"invalid json {" + mock_response.headers = {"x-ratelimit-remaining": "5000"} + mock_response.reason = "Unavailable For Legal Reasons" + + def mock_get_response(request, auth, template): + return mock_response, [] + + with patch("github_backup.github_backup._get_response", side_effect=mock_get_response): + with pytest.raises(github_backup.RepositoryUnavailableError): + list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues")) + + def test_other_http_errors_unchanged(self): + """Other HTTP errors should still raise generic Exception.""" + args = Mock() + args.as_app = False + args.token_fine = None + args.token_classic = None + args.username = None + args.password = None + args.osx_keychain_item_name = None + args.osx_keychain_item_account = None + args.throttle_limit = None + args.throttle_pause = 0 + + mock_response = Mock() + mock_response.getcode.return_value = 404 + mock_response.read.return_value = b'{"message": "Not Found"}' + mock_response.headers = {"x-ratelimit-remaining": "5000"} + mock_response.reason = "Not Found" + + def mock_get_response(request, auth, template): + return mock_response, [] + + with patch("github_backup.github_backup._get_response", side_effect=mock_get_response): + # Should raise generic Exception, not RepositoryUnavailableError + with pytest.raises(Exception) as exc_info: + list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/notfound/issues")) + + assert not isinstance(exc_info.value, github_backup.RepositoryUnavailableError) + assert "404" in str(exc_info.value) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])