diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c30115..f51382d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: - id: bandit args: [-lll, --quiet, --exclude=tests/**] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.4.0 + rev: v4.0.1 hooks: - id: check-case-conflict - id: check-docstring-first @@ -21,16 +21,16 @@ repos: - id: trailing-whitespace exclude: ^tests/sample-filings/ - repo: https://github.com/timothycrosley/isort - rev: 5.8.0 + rev: 5.9.1 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v2.15.0 + rev: v2.19.4 hooks: - id: pyupgrade args: [--py36-plus] - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.8.0 + rev: v1.9.0 hooks: - id: python-no-eval - id: python-check-blanket-noqa @@ -39,7 +39,7 @@ repos: - id: rst-directive-colons - id: rst-inline-touching-normal - repo: https://github.com/psf/black - rev: 21.5b0 + rev: 21.6b0 hooks: - id: black args: [--quiet, --target-version=py36] diff --git a/CHANGELOG.md b/CHANGELOG.md index f991233..12a11d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## 4.2.1 - 6/22/2021 + +### Fixed + +- Anchor links inside of filings are now resolved correctly. Fragments and external links should now function as intended. +- Renamed `requirements.txt` to `requirements-dev.txt` in order to prevent confusion with the dependencies listed in `setup.py`. + ## 4.2.0 - 5/19/2021 ### New diff --git a/MANIFEST.in b/MANIFEST.in index 17cac88..f8e2efb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,3 @@ -include LICENSE CHANGELOG.md README.rst Makefile requirements.txt tox.ini +include LICENSE CHANGELOG.md README.rst Makefile requirements-dev.txt tox.ini recursive-include tests *.py recursive-include docs *.py *.rst *.bat Makefile diff --git a/Makefile b/Makefile index 80eaa43..8f4a6ef 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ build36 build37 build38 build39: clean $(PYTHON_VER) -m venv venv . venv/bin/activate; \ pip install -U pip setuptools wheel; \ - pip install -r requirements.txt; \ + pip install -r requirements-dev.txt; \ pre-commit install test: diff --git a/requirements.txt b/requirements-dev.txt similarity index 100% rename from requirements.txt rename to requirements-dev.txt diff --git a/sec_edgar_downloader/_utils.py b/sec_edgar_downloader/_utils.py index f083e34..a3bb0a5 100644 --- a/sec_edgar_downloader/_utils.py +++ b/sec_edgar_downloader/_utils.py @@ -215,10 +215,14 @@ def get_filing_urls_to_download( return filings_to_fetch -def resolve_relative_urls_in_filing(filing_text: str, base_url: str) -> str: +def resolve_relative_urls_in_filing(filing_text: str, download_url: str) -> str: soup = BeautifulSoup(filing_text, "lxml") + base_url = f"{download_url.rsplit('/', 1)[0]}/" for url in soup.find_all("a", href=True): + # Do not resolve a URL if it is a fragment or it already contains a full URL + if url["href"].startswith("#") or url["href"].startswith("http"): + continue url["href"] = urljoin(base_url, url["href"]) for image in soup.find_all("img", src=True): @@ -247,8 +251,7 @@ def download_and_save_filing( # Only resolve URLs in HTML files if resolve_urls and Path(save_filename).suffix == ".html": - base_url = f"{download_url.rsplit('/', 1)[0]}/" - filing_text = resolve_relative_urls_in_filing(filing_text, base_url) + filing_text = resolve_relative_urls_in_filing(filing_text, download_url) # Create all parent directories as needed and write content to file save_path = ( diff --git a/sec_edgar_downloader/_version.py b/sec_edgar_downloader/_version.py index 0fd7811..aef46ac 100644 --- a/sec_edgar_downloader/_version.py +++ b/sec_edgar_downloader/_version.py @@ -1 +1 @@ -__version__ = "4.2.0" +__version__ = "4.2.1" diff --git a/tests/test_utils.py b/tests/test_utils.py index 38f9abb..d8aa27d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,6 +7,7 @@ def test_resolve_relative_urls(): sample_img = "foobar.jpg" sample_anchor_fragment = "#anchor_link" sample_anchor_html = "external.html" + sample_anchor_full_url = "https://www.sec.gov/" sample_filing_html = f""" @@ -18,6 +19,7 @@ def test_resolve_relative_urls(): another random anchor link yet another random anchor link + yet another random full anchor link """ @@ -25,16 +27,19 @@ def test_resolve_relative_urls(): assert sample_filing_html.count(f'"{sample_img}"') == 2 assert sample_filing_html.count(f'"{sample_anchor_fragment}"') == 2 assert sample_filing_html.count(f'"{sample_anchor_html}"') == 1 + assert sample_filing_html.count(f'"{sample_anchor_full_url}"') == 1 # Example base URL for an actual Apple 10-K filing base_url = "https://www.sec.gov/Archives/edgar/data/0000320193/000032019320000096/" + download_url = f"{base_url}aapl-20200926.htm" # Must cast to a string since we are not writing bytes to a file resolved_filing_html = str( - resolve_relative_urls_in_filing(sample_filing_html, base_url) + resolve_relative_urls_in_filing(sample_filing_html, download_url) ) assert sample_filing_html != resolved_filing_html assert resolved_filing_html.count(f'"{base_url}{sample_img}"') == 2 - assert resolved_filing_html.count(f'"{base_url}{sample_anchor_fragment}"') == 2 + assert resolved_filing_html.count(f'"{sample_anchor_fragment}"') == 2 assert resolved_filing_html.count(f'"{base_url}{sample_anchor_html}"') == 1 + assert sample_filing_html.count(f'"{sample_anchor_full_url}"') == 1 diff --git a/tox.ini b/tox.ini index 3c6385d..29df685 100644 --- a/tox.ini +++ b/tox.ini @@ -11,7 +11,7 @@ python = 3.9: py39 [testenv] -deps = -rrequirements.txt +deps = -rrequirements-dev.txt allowlist_externals = pytest commands = pytest