In [7]:
import requests, datetime, feedparser, time
from urllib.parse import urlencode

UA = "Your Name (your.email@example.com)"
CUTOFF = datetime.date(2023, 1, 1)
FORM_TYPES = {"10-K", "10-Q", "8-K", "DEF 14A", "4"}  # add others as needed

def fetch_filings_for_cik(cik: str, form_types=FORM_TYPES, count=100):
    cik = str(cik).zfill(10)
    all_rows = []
    for form in form_types:
        start = 0
        while True:
            params = {
                "action": "getcompany",
                "CIK": cik,
                "type": form,
                "owner": "include",   # or "exclude"
                "count": count,
                "start": start,
                "output": "atom"
            }
            url = "https://www.sec.gov/cgi-bin/browse-edgar?" + urlencode(params)
            r = requests.get(url, headers={"User-Agent": UA}, timeout=30)
            r.raise_for_status()
            feed = feedparser.parse(r.text)

            # No more entries?
            if not feed.entries:
                break

            stop_here = False
            for e in feed.entries:
                # 'updated' or 'filing-date' both appear; keep both fallbacks
                filed = None
                if "filing-date" in e:
                    filed = datetime.date.fromisoformat(e["filing-date"])
                else:
                    # e.updated is RFC 822; feedparser gives a struct_time in e.updated_parsed
                    filed = datetime.date(*e.updated_parsed[:3])

                if filed < CUTOFF:
                    stop_here = True
                    continue

                all_rows.append({
                    "cik": cik,
                    "form": form,
                    "filed": filed.isoformat(),
                    "title": e.title,
                    "filing_href": e.link,              # 'Filing' page
                    "primary_doc_href": next((l.href for l in e.links if l.get("rel") == "alternate"), e.link)
                })

            if stop_here:
                break
            start += count
            time.sleep(0.2)  # be nice to EDGAR
    # Sort newest→oldest
    all_rows.sort(key=lambda x: x["filed"], reverse=True)
    return all_rows

In [1]:
import pandas as pd

sp100_tickers = pd.read_csv("data/sp100_tickers.csv")
sp100_tickers = sp100_tickers['Symbol'].tolist()
print(sp100_tickers)

['AAPL', 'ABBV', 'ABT', 'ACN', 'ADBE', 'AIG', 'AMD', 'AMGN', 'AMT', 'AMZN', 'AVGO', 'AXP', 'BA', 'BAC', 'BK', 'BKNG', 'BLK', 'BMY', 'BRK-B', 'C', 'CAT', 'CHTR', 'CL', 'CMCSA', 'COF', 'COP', 'COST', 'CRM', 'CSCO', 'CVS', 'CVX', 'DE', 'DHR', 'DIS', 'DUK', 'EMR', 'FDX', 'GD', 'GE', 'GILD', 'GM', 'GOOGL', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'INTU', 'ISRG', 'JNJ', 'JPM', 'KO', 'LIN', 'LLY', 'LMT', 'LOW', 'MA', 'MCD', 'MDLZ', 'MDT', 'MET', 'META', 'MMM', 'MO', 'MRK', 'MS', 'MSFT', 'NEE', 'NFLX', 'NKE', 'NOW', 'NVDA', 'ORCL', 'PEP', 'PFE', 'PG', 'PLTR', 'PM', 'PYPL', 'QCOM', 'RTX', 'SBUX', 'SCHW', 'SO', 'SPG', 'T', 'TGT', 'TMO', 'TMUS', 'TSLA', 'TXN', 'UNH', 'UNP', 'UPS', 'USB', 'V', 'VZ', 'WFC', 'WMT', 'XOM']


In [20]:
import json

ticker_to_cik = {}

data_path = "data/company_tickers.json"
with open(data_path, 'r') as f:
    data = json.load(f)

for c_data in data.values():
    ticker_to_cik[c_data['ticker']] = c_data['cik_str']

In [27]:
import json
from pathlib import Path

output_dir = Path("data/edgar_documents")

for ticker in sp100_tickers:
    outpath = output_dir / f"{ticker}_filings.json"

    # Skip if already downloaded
    if outpath.exists():
        print(f"Skipping {ticker}: already exists → {outpath}")
        continue

    try:
        cik = ticker_to_cik[ticker]
        print(f"Fetching filings for {ticker} (CIK: {cik})...")
        rows = fetch_filings_for_cik(cik)

        with open(outpath, "w", encoding="utf-8") as f:
            json.dump(rows, f, indent=2, ensure_ascii=False)

        print(f"✅ Saved {len(rows)} filings for {ticker} → {outpath}")

        time.sleep(0.3)

    except Exception as e:
        print(f"Error processing {ticker}: {e}")
        continue

Skipping AAPL: already exists → data\edgar_documents\AAPL_filings.json
Skipping ABBV: already exists → data\edgar_documents\ABBV_filings.json
Skipping ABT: already exists → data\edgar_documents\ABT_filings.json
Skipping ACN: already exists → data\edgar_documents\ACN_filings.json
Skipping ADBE: already exists → data\edgar_documents\ADBE_filings.json
Skipping AIG: already exists → data\edgar_documents\AIG_filings.json
Skipping AMD: already exists → data\edgar_documents\AMD_filings.json
Skipping AMGN: already exists → data\edgar_documents\AMGN_filings.json
Skipping AMT: already exists → data\edgar_documents\AMT_filings.json
Skipping AMZN: already exists → data\edgar_documents\AMZN_filings.json
Skipping AVGO: already exists → data\edgar_documents\AVGO_filings.json
Skipping AXP: already exists → data\edgar_documents\AXP_filings.json
Skipping BA: already exists → data\edgar_documents\BA_filings.json
Fetching filings for BAC (CIK: 70858)...
Error processing BAC: 503 Server Error: Service Unav

In [55]:
import os
from pathlib import Path
import json
import requests
from bs4 import BeautifulSoup 

# output dir
ROOT_DIR = "data/edgar_documents"
FILINGS_NAME = "_filings.json"

UA = "Your Name (your.email@example.com)"
session = requests.Session()
session.headers.update({"User-Agent": UA})

def extract_file_link(url):
    resp = session.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, "html.parser")
    tables = soup.select("table.tableFile")
    doc_table = tables[0]
    first_table_row = doc_table.select("tr")[1]
    raw_link = first_table_row.find('a', href=True)['href']
    formatted_raw_link = raw_link.split("doc=")[1] if "doc=" in raw_link else raw_link
    doc_url = "https://www.sec.gov" + formatted_raw_link
    return doc_url

def download_document(url: str, dest: Path) -> bool:
    if dest.exists() and dest.stat().st_size > 0:
        return True
    doc_link = extract_file_link(url)
    resp = session.get(doc_link, timeout=30)
    if resp.status_code == 200:
        dest.parent.mkdir(exist_ok=True)
        dest.write_bytes(resp.content)
        return True
    return False

for ticker in sp100_tickers:
    filing_path = Path(ROOT_DIR + '/' + ticker + FILINGS_NAME)
    if filing_path.exists():
        with open(filing_path, 'r') as f:
            filings = json.load(f)
        
        SAVE_PATH = ROOT_DIR + '/' + ticker
        os.makedirs(SAVE_PATH, exist_ok=True)
        
        total_downloaded = 0
        for filing in filings:
            file_name = filing['form'].replace(" ", '-') + '_' + filing['filed']
            file_path = Path(ROOT_DIR).joinpath(ticker, file_name)
            url = filing['filing_href']
            total_downloaded  += download_document(url, dest=file_path)
        print(f"Completed {ticker}, downloaded: {total_downloaded}/{len(filings)}")

Completed AAPL, downloaded: 167/167
Completed ABBV, downloaded: 232/232
Completed ABT, downloaded: 232/232
Completed ACN, downloaded: 734/734
Completed ADBE, downloaded: 321/321
Completed AIG, downloaded: 438/438
Completed AMD, downloaded: 245/245
Completed AMGN, downloaded: 294/294
Completed AMT, downloaded: 228/228
Completed AMZN, downloaded: 276/276
Completed AVGO, downloaded: 224/224
Completed AXP, downloaded: 362/362
Completed BA, downloaded: 310/310
Completed BK, downloaded: 316/316
Completed BKNG, downloaded: 204/204
Completed BLK, downloaded: 153/153
Completed BMY, downloaded: 258/258
Completed BRK-B, downloaded: 180/180
Completed CAT, downloaded: 301/301
Completed CHTR, downloaded: 285/285
Completed CL, downloaded: 243/243
Completed CMCSA, downloaded: 263/263
Completed COF, downloaded: 395/395
Completed COP, downloaded: 278/278
Completed COST, downloaded: 211/211
Completed CRM, downloaded: 936/936
Completed CSCO, downloaded: 299/299
Completed CVS, downloaded: 223/223
Completed