diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml
index 3b9d301..c3066fa 100644
--- a/.github/workflows/continuous_integration.yml
+++ b/.github/workflows/continuous_integration.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
-      # Prevent SEC rate-limiting by limiting parallel runners
+      # Prevent SEC rate-limiting by limiting number of parallel runners
       max-parallel: 6
       fail-fast: false
       matrix:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6291cfb..f1244c7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,15 +25,19 @@ repos:
     hooks:
       - id: isort
   - repo: https://github.com/asottile/pyupgrade
-    rev: v2.7.4
+    rev: v2.10.0
     hooks:
       - id: pyupgrade
         args: [--py36-plus]
   - repo: https://github.com/pre-commit/pygrep-hooks
-    rev: v1.7.0
+    rev: v1.7.1
     hooks:
       - id: python-no-eval
+      - id: python-check-blanket-noqa
+      - id: python-use-type-annotations
       - id: rst-backticks
+      - id: rst-directive-colons
+      - id: rst-inline-touching-normal
   - repo: https://github.com/psf/black
     rev: 20.8b1
     hooks:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0803a25..c30e50a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog

+## 4.0.2 - 2/18/2021
+
+### Fixed
+
+- Fixed a `RecursionError` that could occur when downloading older filings with the `download_details` flag set to true. Thanks to @neilbartlett for reporting and fixing this bug!
+
 ## 4.0.1 - 1/23/2021

 ### Fixed
diff --git a/requirements.txt b/requirements.txt
index b915a0e..6367e5e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 bs4
+lxml
 pre-commit
 pytest
 pytest-cov
diff --git a/sec_edgar_downloader/_utils.py b/sec_edgar_downloader/_utils.py
index 1e9ed9a..2bae903 100644
--- a/sec_edgar_downloader/_utils.py
+++ b/sec_edgar_downloader/_utils.py
@@ -186,7 +186,7 @@ def get_filing_urls_to_download(


 def resolve_relative_urls_in_filing(filing_text: str, base_url: str) -> str:
-    soup = BeautifulSoup(filing_text, "html.parser")
+    soup = BeautifulSoup(filing_text, "lxml")

     for url in soup.find_all("a", href=True):
         url["href"] = urljoin(base_url, url["href"])
@@ -194,7 +194,7 @@ def resolve_relative_urls_in_filing(filing_text: str, base_url: str) -> str:
     for image in soup.find_all("img", src=True):
         image["src"] = urljoin(base_url, image["src"])

-    if soup.original_encoding is None:
+    if soup.original_encoding is None:  # pragma: no cover
         return soup

     return soup.encode(soup.original_encoding)
diff --git a/sec_edgar_downloader/_version.py b/sec_edgar_downloader/_version.py
index 76ad18b..064c0b3 100644
--- a/sec_edgar_downloader/_version.py
+++ b/sec_edgar_downloader/_version.py
@@ -1 +1 @@
-__version__ = "4.0.1"
+__version__ = "4.0.2"
diff --git a/setup.py b/setup.py
index 44e086e..4a72313 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@
     url="https://github.com/jadchaar/sec-edgar-downloader",
     packages=["sec_edgar_downloader"],
     zip_safe=False,
-    install_requires=["requests", "bs4"],
+    install_requires=["requests", "bs4", "lxml"],
     python_requires=">=3.6",
     classifiers=[
         "Development Status :: 5 - Production/Stable",
diff --git a/tests/test_detail_downloads.py b/tests/test_detail_downloads.py
new file mode 100644
index 0000000..e55dbbf
--- /dev/null
+++ b/tests/test_detail_downloads.py
@@ -0,0 +1,17 @@
+# Regression test for issue 60
+def test_recursion_error_older_filings(downloader):
+    dl, _ = downloader
+
+    filing_type = "10-K"
+    ticker = "AIZ"
+    # 10-K filing details before 2005 for AIZ cause a RecursionError
+    # when resolving relative URLs. This issue can be resolved by
+    # using lxml rather than html.parser as the parser for bs4.
+    num_downloaded = dl.get(
+        filing_type,
+        ticker,
+        download_details=True,
+        include_amends=False,
+        before="2005-03-31",
+    )
+    assert num_downloaded == 2
diff --git a/tox.ini b/tox.ini
index 66807d8..f8c61fc 100644
--- a/tox.ini
+++ b/tox.ini
@@ -30,8 +30,8 @@ changedir = docs
 deps =
     doc8
     sphinx
-    bs4
     sphinx_autodoc_typehints
+    bs4
 allowlist_externals = make
 commands =
     doc8 index.rst ../README.rst --extension .rst --ignore D001
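
Reviewer note: below is a minimal standalone sketch of the patched resolve_relative_urls_in_filing helper, for anyone who wants to try the html.parser-to-lxml swap outside the package. The imports mirror those used in sec_edgar_downloader/_utils.py; the sample HTML and base URL at the bottom are made-up placeholders for illustration, not real EDGAR data.

from urllib.parse import urljoin

from bs4 import BeautifulSoup  # parsing now delegated to lxml


def resolve_relative_urls_in_filing(filing_text, base_url):
    # lxml wraps libxml2's C parser, which handles the deeply nested
    # markup in some pre-2005 filing details that made the pure-Python
    # "html.parser" backend raise a RecursionError (issue 60).
    soup = BeautifulSoup(filing_text, "lxml")

    # Rewrite relative anchor hrefs and image srcs against the base URL.
    for url in soup.find_all("a", href=True):
        url["href"] = urljoin(base_url, url["href"])

    for image in soup.find_all("img", src=True):
        image["src"] = urljoin(base_url, image["src"])

    # Plain-str input has no detected encoding; return the soup as-is.
    if soup.original_encoding is None:
        return soup

    return soup.encode(soup.original_encoding)


# Made-up usage example: relative URLs become absolute.
html = '<a href="exhibit1.htm">Exhibit 1</a> <img src="logo.gif">'
base = "https://www.sec.gov/Archives/edgar/data/0000000000/example-accession/"
print(resolve_relative_urls_in_filing(html, base))

The sketch requires bs4 and lxml, both of which this patch declares in install_requires, so no extra setup should be needed once the package is installed.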