In [53]:
import os
import re
import httpx

import edgar as edgar
from edgar.core import set_identity
from edgar._companies import Company
from edgar._filings import get_by_accession_number
import pandas as pd
from bs4 import BeautifulSoup
from rich import print
from dotenv import load_dotenv, find_dotenv

# Load the IPython extension shipped with `rich` for this notebook session.
%load_ext rich

# Locate a .env file and load its variables into the process environment;
# SEC_API_KEY is later read from the environment via os.getenv().
load_dotenv(find_dotenv())

# NOTE(review): presumably registers the contact identity that `edgar` sends
# with its SEC EDGAR requests — confirm against edgar.core.set_identity.
set_identity("lakshya@insy695.com")

The rich extension is already loaded. To reload it, use:
  %reload_ext rich


In [54]:
# Root of the sec-api.io service; request paths given to `client` are
# relative to this base URL.
BASE_URL = "https://api.sec-api.io/"

# Single shared HTTP client for the notebook, with a 60-second timeout.
client = httpx.Client(timeout=60.0, base_url=BASE_URL)


In [55]:
def parse_html_filing(filing):
    """Extract the text that follows the last "Item 7" style heading of a filing.

    The filing's HTML is scanned for headings of the form ``>Item 7`` /
    ``>Item 7A`` (separator may be one whitespace character, ``&#160;`` or
    ``&nbsp;``) and ``ITEM 7`` / ``ITEM 7A``, each optionally followed by a
    period. For every distinct heading label only its last occurrence is
    kept. The HTML slice running from the earliest kept heading to the next
    kept heading (or to the end of the document when only one distinct
    heading exists) is converted to plain text.

    Args:
        filing: Object exposing ``html()`` which returns the filing's HTML
            as a string.

    Returns:
        str: Text of the selected slice, with text fragments joined by
        blank lines (``"\\n\\n"``).

    Raises:
        ValueError: If the filing yields no HTML, or no matching heading is
            found in it.
    """
    html_ = filing.html()
    if not html_:
        raise ValueError("Filing has no HTML content to parse")

    heading_re = re.compile(
        r"(>Item(\s|&#160;|&nbsp;)(7A|7)\.{0,1})|(ITEM\s(7A|7)\.{0,1})"
    )

    # Map each normalised heading label ("item 7", "item 7a") to the offset of
    # its LAST occurrence. finditer yields matches in document order, so a
    # later match simply overwrites an earlier one.
    # NOTE(review): earlier occurrences are presumably table-of-contents
    # entries — confirm on a sample filing.
    last_start = {}
    for match in heading_re.finditer(html_):
        label = match.group().lower()
        # Collapse every separator variant (HTML entities, non-breaking
        # space, newline, tab) to a plain space so that the same heading
        # always produces the same label.
        label = re.sub(r"&#160;|&nbsp;|\s", " ", label)
        # Drop the leading ">" of the tag boundary and the trailing period.
        label = re.sub(r"[.>]", "", label)
        last_start[label] = match.start()

    if not last_start:
        raise ValueError("No 'Item 7' / 'Item 7A' heading found in filing HTML")

    starts = sorted(last_start.values())
    begin = starts[0]
    # With a single distinct heading the section runs to the end of the file.
    end = starts[1] if len(starts) > 1 else None
    item_7_raw = html_[begin:end]

    item_7_content = BeautifulSoup(item_7_raw, "lxml").get_text("\n\n")

    return item_7_content

In [58]:
def extract_section(ticker, year):
    """Save Item 7 of a company's 10-K for one report year, in two variants.

    The 10-K whose ``reportDate`` falls in ``year`` is looked up for
    ``ticker``. Item 7 is then obtained twice: once by parsing the filing's
    HTML locally with ``parse_html_filing`` and once by calling the
    ``extractor`` endpoint through the shared ``client``. Both results are
    written as UTF-8 text files:

    * ``extracted/{ticker}_{year}_item_7_extracted.txt`` — API result
    * ``html/{ticker}_{year}_item_7.txt`` — locally parsed result

    Args:
        ticker: Company ticker symbol passed to ``Company``.
        year: Report year to select (compared against the year of
            ``reportDate``).

    Returns:
        The filing object returned by ``get_by_accession_number``.

    Raises:
        ValueError: If no 10-K with the requested report year exists (or the
            HTML parser finds no Item 7 heading).
        httpx.HTTPStatusError: If the extractor endpoint answers with a
            4xx/5xx status.
    """
    company = Company(ticker)
    filings_df = company.get_filings(form="10-K").to_pandas()
    filings_df["year"] = pd.to_datetime(filings_df["reportDate"]).dt.year

    accession_numbers = filings_df.query(f"year == {year}")["accession_number"].values
    if len(accession_numbers) == 0:
        raise ValueError(f"No 10-K filing with report year {year} found for {ticker}")
    accession_number = accession_numbers[0]

    filing = get_by_accession_number(accession_number)

    item_7_content_html = parse_html_filing(filing)

    extractor_params = {
        "url": filing.url,
        "item": "7",
        "type": "text",
        "token": os.getenv("SEC_API_KEY"),
    }

    response = client.get("extractor", params=extractor_params)
    # Fail loudly on 4xx/5xx instead of saving an error payload to disk as
    # if it were the Item 7 text.
    response.raise_for_status()

    item_7_extracted = response.text

    # Save to files; create the output folders on first use.
    os.makedirs("extracted", exist_ok=True)
    os.makedirs("html", exist_ok=True)

    # Explicit UTF-8 so non-ASCII characters in the filing text do not depend
    # on the platform's default encoding.
    with open(
        f"extracted/{ticker}_{year}_item_7_extracted.txt", "w", encoding="utf-8"
    ) as f:
        f.write(item_7_extracted)

    with open(f"html/{ticker}_{year}_item_7.txt", "w", encoding="utf-8") as f:
        f.write(item_7_content_html)

    print(f"{ticker} {year} ITEM 7 saved")

    return filing


In [61]:
# Run the Item 7 extraction for every symbol listed in the constituents file.
companies_code = pd.read_csv("./data/constituents.csv")

for company_code in companies_code["Symbol"]:
    try:
        extract_section(company_code, 2020)
    except Exception as e:
        # Best-effort batch: report which ticker failed and move on to the
        # next one instead of aborting the whole run.
        print(f"Extraction failed for company {company_code}: {e}")

In [59]:
# Single-company run; the returned filing is the cell's displayed output.
extract_section(ticker="NFLX", year=2020)


╭──────────────────────────────────────── 10-K 📊 filing for NETFLIX INC ─────────────────────────────────────────╮
│                                                                                                                 │
│  [1m [0m[1mAccession Number    [0m[1m [0m [1m [0m[1mFiling Date[0m[1m [0m [1m [0m[1mCompany    [0m[1m [0m [1m [0m[1mCIK    [0m[1m [0m                                                   │
│  ────────────────────────────────────────────────────────────                                                   │
│  [1m [0m[1m0001065280-21-000040[0m[1m [0m  2021-01-28    NETFLIX INC   1065280                                                    │
│                                                                                                                 │
│                                                                                                                 │
│  [1m [0m[1mLinks[0m[1m: 🏠 Homepage 📄 Primary Document 📜 Full 