From edea39776b6bf0dd71eda633ca3d7866089aa640 Mon Sep 17 00:00:00 2001 From: Jad Chaar Date: Wed, 14 Jul 2021 21:08:35 -0400 Subject: [PATCH] Prep v4.2.2 release (#86) * Prep v4.2.2 * Prep v4.2.2 --- .pre-commit-config.yaml | 4 +- CHANGELOG.md | 20 +++++++++ Makefile | 2 +- sec_edgar_downloader/Downloader.py | 14 ++++++ sec_edgar_downloader/_utils.py | 19 +++++--- sec_edgar_downloader/_version.py | 2 +- tests/test_detail_downloads.py | 2 +- tests/test_user_input.py | 72 ++++++++++++++++++++++++------ tests/test_utils.py | 13 +++++- 9 files changed, 123 insertions(+), 25 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f51382d..851880c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,11 +21,11 @@ repos: - id: trailing-whitespace exclude: ^tests/sample-filings/ - repo: https://github.com/timothycrosley/isort - rev: 5.9.1 + rev: 5.9.2 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v2.19.4 + rev: v2.21.0 hooks: - id: pyupgrade args: [--py36-plus] diff --git a/CHANGELOG.md b/CHANGELOG.md index 12a11d1..e1be2be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Changelog +## 4.2.2 - 7/14/2021 + +### New + +- CIKs are now automatically zero-padded to 10 digits to ensure that filings are accurately retrieved by the SEC Edgar system. For example, passing either `"0000789019"` or `"789019"` (the CIK for MSFT) to `get()` will yield equivalent results: + +```python +>>> dl.get("10-K", "0000789019", amount=1) +1 +>>> dl.get("10-K", "789019", amount=1) +1 +``` + +### Fixed + +- Updated the `User-Agent` header to comply with new [SEC Edgar Fair Access requirements](https://www.sec.gov/os/accessing-edgar-data). This should resolve the 403 network errors some users are encountering when downloading a significant number of filings. + +### Changed +- A `ValueError` is now raised when a CIK of length >10 or a blank ticker/CIK is passed to `get()`. + ## 4.2.1 - 6/22/2021 ### Fixed diff --git a/Makefile b/Makefile index 8f4a6ef..72cd726 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ .PHONY: auto test docs clean -auto: build38 +auto: build39 build36: PYTHON_VER = python3.6 build37: PYTHON_VER = python3.7 diff --git a/sec_edgar_downloader/Downloader.py b/sec_edgar_downloader/Downloader.py index 2b2e871..8e0bc4c 100644 --- a/sec_edgar_downloader/Downloader.py +++ b/sec_edgar_downloader/Downloader.py @@ -10,6 +10,7 @@ download_filings, get_filing_urls_to_download, get_number_of_unique_filings, + is_cik, validate_date_format, ) @@ -111,6 +112,19 @@ def get( """ ticker_or_cik = str(ticker_or_cik).strip().upper() + # Check for blank tickers or CIKs + if not ticker_or_cik: + raise ValueError("Invalid ticker or CIK. Please enter a non-blank value.") + + # Detect CIKs and ensure that they are properly zero-padded + if is_cik(ticker_or_cik): + if len(ticker_or_cik) > 10: + raise ValueError("Invalid CIK. CIKs must be at most 10 digits long.") + # Pad CIK with 0s to ensure that it is exactly 10 digits long + # The SEC Edgar Search API requires zero-padded CIKs to ensure + # that search results are accurate. Relates to issue #84. + ticker_or_cik = ticker_or_cik.zfill(10) + if amount is None: # If amount is not specified, obtain all available filings. # We simply need a large number to denote this and the loop diff --git a/sec_edgar_downloader/_utils.py b/sec_edgar_downloader/_utils.py index 0e4a1d7..2186402 100644 --- a/sec_edgar_downloader/_utils.py +++ b/sec_edgar_downloader/_utils.py @@ -1,5 +1,4 @@ """Utility functions for the downloader class.""" - import time from collections import namedtuple from datetime import datetime @@ -156,7 +155,7 @@ def get_filing_urls_to_download( resp = client.post( SEC_EDGAR_SEARCH_API_ENDPOINT, json=payload, - headers=generate_random_user_agent(), + headers={"User-Agent": generate_random_user_agent()}, ) resp.raise_for_status() search_query_results = resp.json() @@ -245,7 +244,9 @@ def download_and_save_filing( *, resolve_urls: bool = False, ) -> None: - resp = client.get(download_url, headers=generate_random_user_agent()) + resp = client.get( + download_url, headers={"User-Agent": generate_random_user_agent()} + ) resp.raise_for_status() filing_text = resp.content @@ -322,5 +323,13 @@ def get_number_of_unique_filings(filings: List[FilingMetadata]) -> int: return len({metadata.accession_number for metadata in filings}) -def generate_random_user_agent() -> dict: - return {"User-Agent": f"{fake.first_name()} {fake.last_name()} {fake.email()}"} +def generate_random_user_agent() -> str: + return f"{fake.first_name()} {fake.last_name()} {fake.email()}" + + +def is_cik(ticker_or_cik: str) -> bool: + try: + int(ticker_or_cik) + return True + except ValueError: + return False diff --git a/sec_edgar_downloader/_version.py b/sec_edgar_downloader/_version.py index aef46ac..2e905e4 100644 --- a/sec_edgar_downloader/_version.py +++ b/sec_edgar_downloader/_version.py @@ -1 +1 @@ -__version__ = "4.2.1" +__version__ = "4.2.2" diff --git a/tests/test_detail_downloads.py b/tests/test_detail_downloads.py index e55dbbf..8224b84 100644 --- a/tests/test_detail_downloads.py +++ b/tests/test_detail_downloads.py @@ -1,4 +1,4 @@ -# Regression test for issue 60 +# Regression test for issue #60 def test_recursion_error_older_filings(downloader): dl, _ = downloader diff --git a/tests/test_user_input.py b/tests/test_user_input.py index 8ebfdbf..652ad38 100644 --- a/tests/test_user_input.py +++ b/tests/test_user_input.py @@ -20,9 +20,9 @@ def test_invalid_filing_type(downloader): ticker = "AAPL" expected_msg = f"'{fake_filing_type}' filings are not supported." - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError) as exc_info: dl.get(fake_filing_type, ticker) - assert expected_msg in str(excinfo.value) + assert expected_msg in str(exc_info.value) def test_invalid_ticker(downloader): @@ -47,6 +47,50 @@ def test_invalid_ticker(downloader): assert not filing_type_save_path.exists() +def test_invalid_cik(downloader): + dl, _ = downloader + expected_msg = "Invalid CIK. CIKs must be at most 10 digits long." + + filing_type = "10-K" + cik = "12345678910" + + with pytest.raises(ValueError) as exc_info: + dl.get(filing_type, cik, amount=1) + assert expected_msg in str(exc_info.value) + + +def test_blank_ticker(downloader): + dl, _ = downloader + expected_msg = "Invalid ticker or CIK. Please enter a non-blank value." + + filing_type = "10-K" + ticker = "" + + with pytest.raises(ValueError) as exc_info: + dl.get(filing_type, ticker, amount=1) + assert expected_msg in str(exc_info.value) + + +def test_cik_zero_padding(downloader): + # Regression test for issue #84 + dl, dl_path = downloader + + filing_type = "10-K" + cik = "0000789019" + num_filings_downloaded_full_cik = dl.get(filing_type, cik, amount=1) + + trimmed_cik = "789019" + num_filings_downloaded_trimmed_cik = dl.get(filing_type, trimmed_cik, amount=1) + + assert num_filings_downloaded_full_cik == 1 + assert num_filings_downloaded_trimmed_cik == 1 + + # Both filings should be saved under the directory 0000789019 + # if the CIK is properly padded. If zero-padding was not being done + # properly, we would have two parent directories of 789019 and 0000789019. + assert len(list(dl_path.glob("*"))) == 1 + + def test_invalid_num_filings_to_download(downloader): dl, _ = downloader expected_msg = "Invalid amount. Please enter a number greater than 1." @@ -54,13 +98,13 @@ def test_invalid_num_filings_to_download(downloader): filing_type = "10-K" ticker = "AAPL" - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError) as exc_info: dl.get(filing_type, ticker, amount=-1) - assert expected_msg in str(excinfo.value) + assert expected_msg in str(exc_info.value) - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError) as exc_info: dl.get(filing_type, ticker, amount=0) - assert expected_msg in str(excinfo.value) + assert expected_msg in str(exc_info.value) def test_invalid_before_and_after_dates(downloader): @@ -77,13 +121,13 @@ def test_invalid_before_and_after_dates(downloader): before_date = datetime(2019, 11, 15) incorrect_date_format = "%Y%m%d" - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError) as exc_info: dl.get(filing_type, ticker, after=after_date.strftime(incorrect_date_format)) - assert expected_msg in str(excinfo.value) + assert expected_msg in str(exc_info.value) - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError) as exc_info: dl.get(filing_type, ticker, before=before_date.strftime(incorrect_date_format)) - assert expected_msg in str(excinfo.value) + assert expected_msg in str(exc_info.value) def test_valid_before_and_after_date_combos(downloader): @@ -130,14 +174,14 @@ def test_invalid_before_and_after_date_combos(downloader): # after_date > before_date after_date += timedelta(days=3) expected_msg = "Invalid after and before date combination." - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError) as exc_info: dl.get( filing_type, ticker, after=after_date.strftime(DATE_FORMAT_TOKENS), before=before_date.strftime(DATE_FORMAT_TOKENS), ) - assert expected_msg in str(excinfo.value) + assert expected_msg in str(exc_info.value) def test_pre_default_after_date(downloader): @@ -148,9 +192,9 @@ def test_pre_default_after_date(downloader): invalid_date = DEFAULT_AFTER_DATE - timedelta(days=1) expected_msg = f"Filings cannot be downloaded prior to {DEFAULT_AFTER_DATE.year}." - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError) as exc_info: dl.get(filing_type, ticker, after=invalid_date.strftime(DATE_FORMAT_TOKENS)) - assert expected_msg in str(excinfo.value) + assert expected_msg in str(exc_info.value) def test_non_string_date(downloader): diff --git a/tests/test_utils.py b/tests/test_utils.py index d8aa27d..e1f5ce7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,6 @@ """Test miscellaneous utility functions.""" -from sec_edgar_downloader._utils import resolve_relative_urls_in_filing +from sec_edgar_downloader._utils import is_cik, resolve_relative_urls_in_filing def test_resolve_relative_urls(): @@ -43,3 +43,14 @@ def test_resolve_relative_urls(): assert resolved_filing_html.count(f'"{sample_anchor_fragment}"') == 2 assert resolved_filing_html.count(f'"{base_url}{sample_anchor_html}"') == 1 assert sample_filing_html.count(f'"{sample_anchor_full_url}"') == 1 + + +def test_is_cik(): + # CIKs are 10 digit identifiers that are zero-padded + # if they are shorter than 10 digits long + assert is_cik("0000789019") + assert is_cik("789019") + assert is_cik(789019) + + assert not is_cik("AAPL") + assert not is_cik("")