Skip to content

Commit

Permalink
Fix URL resolution and rename requirements.txt (#80)
Browse files Browse the repository at this point in the history
* Fix URL resolution and rename requirements.txt to requirements-dev.txt to prevent confusion

* Bump version and update CHANGELOG
  • Loading branch information
jadchaar committed Jun 22, 2021
1 parent 1659719 commit 24d78f8
Show file tree
Hide file tree
Showing 9 changed files with 29 additions and 14 deletions.
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Expand Up @@ -7,7 +7,7 @@ repos:
- id: bandit
args: [-lll, --quiet, --exclude=tests/**]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.4.0
rev: v4.0.1
hooks:
- id: check-case-conflict
- id: check-docstring-first
Expand All @@ -21,16 +21,16 @@ repos:
- id: trailing-whitespace
exclude: ^tests/sample-filings/
- repo: https://github.com/timothycrosley/isort
rev: 5.8.0
rev: 5.9.1
hooks:
- id: isort
- repo: https://github.com/asottile/pyupgrade
rev: v2.15.0
rev: v2.19.4
hooks:
- id: pyupgrade
args: [--py36-plus]
- repo: https://github.com/pre-commit/pygrep-hooks
rev: v1.8.0
rev: v1.9.0
hooks:
- id: python-no-eval
- id: python-check-blanket-noqa
Expand All @@ -39,7 +39,7 @@ repos:
- id: rst-directive-colons
- id: rst-inline-touching-normal
- repo: https://github.com/psf/black
rev: 21.5b0
rev: 21.6b0
hooks:
- id: black
args: [--quiet, --target-version=py36]
Expand Down
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,12 @@
# Changelog

## 4.2.1 - 6/22/2021

### Fixed

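- Anchor links inside filings are now resolved correctly: fragment links (e.g. `#section`) and links that already contain a full `http(s)` URL are left unchanged instead of being prefixed with the filing's base URL, so fragments and external links should now function as intended.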
- Renamed `requirements.txt` to `requirements-dev.txt` in order to prevent confusion with the dependencies listed in `setup.py`.

## 4.2.0 - 5/19/2021

### New
Expand Down
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1,3 +1,3 @@
include LICENSE CHANGELOG.md README.rst Makefile requirements.txt tox.ini
include LICENSE CHANGELOG.md README.rst Makefile requirements-dev.txt tox.ini
recursive-include tests *.py
recursive-include docs *.py *.rst *.bat Makefile
2 changes: 1 addition & 1 deletion Makefile
Expand Up @@ -11,7 +11,7 @@ build36 build37 build38 build39: clean
$(PYTHON_VER) -m venv venv
. venv/bin/activate; \
pip install -U pip setuptools wheel; \
pip install -r requirements.txt; \
pip install -r requirements-dev.txt; \
pre-commit install

test:
Expand Down
File renamed without changes.
9 changes: 6 additions & 3 deletions sec_edgar_downloader/_utils.py
Expand Up @@ -215,10 +215,14 @@ def get_filing_urls_to_download(
return filings_to_fetch


def resolve_relative_urls_in_filing(filing_text: str, base_url: str) -> str:
def resolve_relative_urls_in_filing(filing_text: str, download_url: str) -> str:
soup = BeautifulSoup(filing_text, "lxml")
base_url = f"{download_url.rsplit('/', 1)[0]}/"

for url in soup.find_all("a", href=True):
# Do not resolve a URL if it is a fragment or it already contains a full URL
if url["href"].startswith("#") or url["href"].startswith("http"):
continue
url["href"] = urljoin(base_url, url["href"])

for image in soup.find_all("img", src=True):
Expand Down Expand Up @@ -247,8 +251,7 @@ def download_and_save_filing(

# Only resolve URLs in HTML files
if resolve_urls and Path(save_filename).suffix == ".html":
base_url = f"{download_url.rsplit('/', 1)[0]}/"
filing_text = resolve_relative_urls_in_filing(filing_text, base_url)
filing_text = resolve_relative_urls_in_filing(filing_text, download_url)

# Create all parent directories as needed and write content to file
save_path = (
Expand Down
2 changes: 1 addition & 1 deletion sec_edgar_downloader/_version.py
@@ -1 +1 @@
__version__ = "4.2.0"
__version__ = "4.2.1"
9 changes: 7 additions & 2 deletions tests/test_utils.py
Expand Up @@ -7,6 +7,7 @@ def test_resolve_relative_urls():
sample_img = "foobar.jpg"
sample_anchor_fragment = "#anchor_link"
sample_anchor_html = "external.html"
sample_anchor_full_url = "https://www.sec.gov/"

sample_filing_html = f"""<html>
<head>
Expand All @@ -18,23 +19,27 @@ def test_resolve_relative_urls():
<img src="{sample_img}" />
<a href="{sample_anchor_fragment}">another random anchor link</a>
<a href="{sample_anchor_html}">yet another random anchor link</a>
<a href="{sample_anchor_full_url}">yet another random full anchor link</a>
</body>
</html>
"""

assert sample_filing_html.count(f'"{sample_img}"') == 2
assert sample_filing_html.count(f'"{sample_anchor_fragment}"') == 2
assert sample_filing_html.count(f'"{sample_anchor_html}"') == 1
assert sample_filing_html.count(f'"{sample_anchor_full_url}"') == 1

# Example base URL for an actual Apple 10-K filing
base_url = "https://www.sec.gov/Archives/edgar/data/0000320193/000032019320000096/"
download_url = f"{base_url}aapl-20200926.htm"

# Must cast to a string since we are not writing bytes to a file
resolved_filing_html = str(
resolve_relative_urls_in_filing(sample_filing_html, base_url)
resolve_relative_urls_in_filing(sample_filing_html, download_url)
)

assert sample_filing_html != resolved_filing_html
assert resolved_filing_html.count(f'"{base_url}{sample_img}"') == 2
assert resolved_filing_html.count(f'"{base_url}{sample_anchor_fragment}"') == 2
assert resolved_filing_html.count(f'"{sample_anchor_fragment}"') == 2
assert resolved_filing_html.count(f'"{base_url}{sample_anchor_html}"') == 1
assert sample_filing_html.count(f'"{sample_anchor_full_url}"') == 1
2 changes: 1 addition & 1 deletion tox.ini
Expand Up @@ -11,7 +11,7 @@ python =
3.9: py39

[testenv]
deps = -rrequirements.txt
deps = -rrequirements-dev.txt
allowlist_externals = pytest
commands = pytest

Expand Down

0 comments on commit 24d78f8

Please sign in to comment.