In [5]:
# 1. Imports
import time
from pathlib import Path

import requests
from IPython.display import display, Markdown
from lxml import html


# 2. Load Raw HTML
url = "https://www.sec.gov/Archives/edgar/data/1835632/000183563225000051/q425_8kx212025ex-991.htm"
# Identify this client to the server via the User-Agent header.
headers = {
    "User-Agent": "Kris at Safe Harbor Stocks (kris@safeharborstocks.com)"
}

# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=30)
time.sleep(1.0)  # ✅ throttle to avoid hammering SEC servers
# Fail loudly on 4xx/5xx instead of saving and parsing an error page
# as if it were the exhibit.
response.raise_for_status()
raw_html = response.text

# 3. Save Raw HTML
raw_path = Path("output/raw_exhibit.html")
# Create the output directory on first run; a no-op when it already exists.
raw_path.parent.mkdir(parents=True, exist_ok=True)
with open(raw_path, "w", encoding="utf-8") as f:
    f.write(raw_html)

# 4. Parse HTML Tree
# Build an lxml element tree from the downloaded markup.
tree = html.fromstring(raw_html)

# 5. Confirm Tree Type
# Expected output: <class 'lxml.html.HtmlElement'>
print(f"Parsed tree type: {type(tree)}")

# 6. Extract <body> Element
# The XPath query returns a list; take the first match or abort if empty.
body_elements = tree.xpath('//body')
if body_elements:
    body = body_elements[0]
else:
    raise ValueError("No <body> tag found in HTML.")

# 7. Inspect First-Level Children Tags
print("First-level children in <body>:")
for child in body.iterchildren():
    print(child.tag)

# 8. Extract and Remove <table> Elements
# './/table' matches tables at any depth below <body>, including nested ones.
tables = body.xpath('.//table')
print(f"Found {len(tables)} tables to remove")

for table in tables:
    # Use drop_tree() rather than parent.remove(table): lxml stores the text
    # that follows an element in that element's .tail, so a plain remove()
    # would also discard any text sitting directly after </table>.
    # drop_tree() first re-attaches the tail to the previous sibling (or to
    # the parent's text) and then removes the element with its children.
    if table.getparent() is not None:
        table.drop_tree()

# 9. Checkpoint After Table Removal
print("First-level children in <body> after table removal:")
for child in body.iterchildren():
    print(child.tag)

# 10. Save Cleaned Body HTML
# Serialize the table-free <body> subtree to a pretty-printed str
# (encoding='unicode' makes tostring return text rather than bytes).
cleaned_html = html.tostring(
    body,
    encoding='unicode',
    pretty_print=True,
)

# Write the serialized markup to disk as UTF-8.
with open("output/cleaned_exhibit_body.html", mode="w", encoding="utf-8") as f:
    f.write(cleaned_html)

print("✅ Saved cleaned body HTML to 'output/cleaned_exhibit_body.html'")


Parsed tree type: <class 'lxml.html.HtmlElement'>
First-level children in <body>:
document
Found 10 tables to remove
First-level children in <body> after table removal:
document
✅ Saved cleaned body HTML to 'output/cleaned_exhibit_body.html'
