Fix RecursionError when downloading details (#62)
* Fix RecursionError when downloading details.

* Fix function name and revise changelog.
jadchaar committed Feb 18, 2021
1 parent a095892 commit 960be96
Showing 9 changed files with 36 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/continuous_integration.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ${{ matrix.os }}

strategy:
- # Prevent SEC rate-limiting by limiting parallel runners
+ # Prevent SEC rate-limiting by limiting number of parallel runners
max-parallel: 6
fail-fast: false
matrix:
8 changes: 6 additions & 2 deletions .pre-commit-config.yaml
@@ -25,15 +25,19 @@ repos:
hooks:
- id: isort
- repo: https://github.com/asottile/pyupgrade
- rev: v2.7.4
+ rev: v2.10.0
hooks:
- id: pyupgrade
args: [--py36-plus]
- repo: https://github.com/pre-commit/pygrep-hooks
- rev: v1.7.0
+ rev: v1.7.1
hooks:
- id: python-no-eval
- id: python-check-blanket-noqa
- id: python-use-type-annotations
- id: rst-backticks
- id: rst-directive-colons
- id: rst-inline-touching-normal
- repo: https://github.com/psf/black
rev: 20.8b1
hooks:
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

## 4.0.2 - 2/18/2021

### Fixed

- Fixed a `RecursionError` that could occur when downloading older filings with the `download_details` flag set to true. Thanks to @neilbartlett for reporting and fixing this bug!

## 4.0.1 - 1/23/2021

### Fixed
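For context, the user-facing call that exercises the fix described in the new 4.0.2 changelog entry above looks roughly like the sketch below. It assumes the 4.x `Downloader` constructor accepts a download folder path; the folder is illustrative, and the ticker and date are taken from the regression test later in this diff.

```python
from sec_edgar_downloader import Downloader

# Illustrative download folder; any writable location works.
dl = Downloader("./filings")

# download_details=True is the flag referenced in the changelog entry above.
# Requesting older filings (here, pre-2005 10-Ks for AIZ) previously raised
# a RecursionError while the filing detail pages were being processed.
num_downloaded = dl.get(
    "10-K",
    "AIZ",
    download_details=True,
    before="2005-03-31",
)
print(num_downloaded)
```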
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,4 +1,5 @@
bs4
+ lxml
pre-commit
pytest
pytest-cov
4 changes: 2 additions & 2 deletions sec_edgar_downloader/_utils.py
@@ -186,15 +186,15 @@ def get_filing_urls_to_download(


def resolve_relative_urls_in_filing(filing_text: str, base_url: str) -> str:
soup = BeautifulSoup(filing_text, "html.parser")
soup = BeautifulSoup(filing_text, "lxml")

for url in soup.find_all("a", href=True):
url["href"] = urljoin(base_url, url["href"])

for image in soup.find_all("img", src=True):
image["src"] = urljoin(base_url, image["src"])

- if soup.original_encoding is None:
+ if soup.original_encoding is None: # pragma: no cover
return soup

return soup.encode(soup.original_encoding)
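As a standalone illustration of what the patched function does, the sketch below rewrites relative links and image sources against a base URL, parsing with lxml as in the change above. The HTML snippet and base URL are made up; the real function operates on full EDGAR filing detail pages.

```python
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Made-up stand-ins for an EDGAR filing detail page and its base URL.
filing_text = '<a href="ex99.htm">Exhibit 99</a><img src="logo.gif">'
base_url = "https://www.sec.gov/Archives/edgar/data/example/"

soup = BeautifulSoup(filing_text, "lxml")  # lxml instead of html.parser

# Rewrite relative hrefs and image sources to absolute URLs.
for link in soup.find_all("a", href=True):
    link["href"] = urljoin(base_url, link["href"])
for image in soup.find_all("img", src=True):
    image["src"] = urljoin(base_url, image["src"])

print(soup.find("a")["href"])
# https://www.sec.gov/Archives/edgar/data/example/ex99.htm
```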
2 changes: 1 addition & 1 deletion sec_edgar_downloader/_version.py
@@ -1 +1 @@
__version__ = "4.0.1"
__version__ = "4.0.2"
2 changes: 1 addition & 1 deletion setup.py
@@ -19,7 +19,7 @@
url="https://github.com/jadchaar/sec-edgar-downloader",
packages=["sec_edgar_downloader"],
zip_safe=False,
install_requires=["requests", "bs4"],
install_requires=["requests", "bs4", "lxml"],
python_requires=">=3.6",
classifiers=[
"Development Status :: 5 - Production/Stable",
17 changes: 17 additions & 0 deletions tests/test_detail_downloads.py
@@ -0,0 +1,17 @@
# Regression test for issue 60
def test_recursion_error_older_filings(downloader):
dl, _ = downloader

filing_type = "10-K"
ticker = "AIZ"
# 10-K filing details before 2005 for AIZ cause a RecursionError
# when resolving relative URLs. This issue can be resolved by
# using lxml rather than html.parser as the parser for bs4.
num_downloaded = dl.get(
filing_type,
ticker,
download_details=True,
include_amends=False,
before="2005-03-31",
)
assert num_downloaded == 2
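The `downloader` fixture used above comes from the test suite's conftest.py, which is not part of this diff. A minimal sketch of what such a fixture could look like, again assuming the 4.x `Downloader` constructor accepts a download folder path; the actual fixture in the repository may differ.

```python
import pytest

from sec_edgar_downloader import Downloader


@pytest.fixture
def downloader(tmp_path):
    # Hypothetical fixture: yields a Downloader rooted at a temporary
    # directory plus the directory itself, matching the
    # "dl, _ = downloader" unpacking in the test above.
    yield Downloader(str(tmp_path)), tmp_path
```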
2 changes: 1 addition & 1 deletion tox.ini
@@ -30,8 +30,8 @@ changedir = docs
deps =
doc8
sphinx
bs4
sphinx_autodoc_typehints
bs4
allowlist_externals = make
commands =
doc8 index.rst ../README.rst --extension .rst --ignore D001
