In [4]:
import re
import time
import requests
from bs4 import BeautifulSoup
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

BASE = "https://encyclopedia.chicagohistory.org"
INDEX = BASE + "/pages/700005.html?entryA"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
}

def fetch(url: str) -> str:
    r = requests.get(url, headers=HEADERS, timeout=30, verify=False, allow_redirects=True)
    # Helpful debug
    print(f"GET {r.status_code} {r.url} (len={len(r.text)})")
    r.raise_for_status()
    return r.text

def collect_entry_links(index_html: str) -> list[str]:
    soup = BeautifulSoup(index_html, "html.parser")
    links = set()

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()

        # Typical entry links on that site look like: /pages/123.html
        if re.match(r"^/pages/\d+\.html$", href):
            links.add(BASE + href)

    return sorted(links)

def extract_entry_text(entry_html: str, url: str) -> str | None:
    soup = BeautifulSoup(entry_html, "html.parser")

    # Title: try a few common patterns
    title = None
    if soup.find("h1"):
        title = soup.find("h1").get_text(" ", strip=True)
    if not title and soup.title:
        title = soup.title.get_text(" ", strip=True)

    # Body: try common containers, then fall back to <article> or largest <div>
    candidates = []
    for selector in [
        ("div", {"id": "content"}),
        ("div", {"id": "main"}),
        ("div", {"class": "content"}),
        ("div", {"class": "entry"}),
        ("article", None),
    ]:
        tag, attrs = selector
        node = soup.find(tag, attrs) if attrs else soup.find(tag)
        if node:
            text = node.get_text("\n", strip=True)
            if text and len(text) > 500:  # ignore tiny nav chunks
                candidates.append(text)

    if not candidates:
        # Fallback: pick the largest div by text length
        divs = soup.find_all("div")
        best = ""
        for d in divs:
            t = d.get_text("\n", strip=True)
            if len(t) > len(best):
                best = t
        if len(best) > 800:
            candidates.append(best)

    if not candidates:
        print(f"  !! Could not extract body from {url}")
        return None

    body = max(candidates, key=len)

    # Light cleanup: remove repeated whitespace
    body = re.sub(r"\n{3,}", "\n\n", body).strip()

    if not title:
        title = url

    return f"{title}\n{url}\n\n{body}\n"

def main():
    index_html = fetch(INDEX)
    entry_urls = collect_entry_links(index_html)

    print(f"\nFound {len(entry_urls)} entry links on the index page.\n")
    if len(entry_urls) == 0:
        print("No entry links found. The index page HTML structure may differ.")
        return

    out_path = "encyclopedia_chicago_entries_A.txt"
    written = 0
    skipped = 0

    with open(out_path, "w", encoding="utf-8") as f:
        for i, url in enumerate(entry_urls, start=1):
            try:
                html = fetch(url)
                entry = extract_entry_text(html, url)
                if entry:
                    f.write("=" * 80 + "\n")
                    f.write(entry + "\n\n")
                    written += 1
                else:
                    skipped += 1
            except Exception as e:
                print(f"  !! Error on {url}: {e}")
                skipped += 1

            if i % 10 == 0:
                print(f"Progress: {i}/{len(entry_urls)} (written={written}, skipped={skipped})")

            time.sleep(0.5)  # be polite

    print(f"\nDone. Wrote {written} entries. Skipped {skipped}. Output: {out_path}")

if __name__ == "__main__":
    main()


GET 200 https://encyclopedia.chicagohistory.org/pages/700005.html?entryA (len=6705)

Found 3 entry links on the index page.

GET 200 https://encyclopedia.chicagohistory.org/pages/700011.html (len=5868)
  !! Could not extract body from https://encyclopedia.chicagohistory.org/pages/700011.html
GET 200 https://encyclopedia.chicagohistory.org/pages/700013.html (len=3749)
  !! Could not extract body from https://encyclopedia.chicagohistory.org/pages/700013.html
GET 200 https://encyclopedia.chicagohistory.org/pages/700031.html (len=7661)
  !! Could not extract body from https://encyclopedia.chicagohistory.org/pages/700031.html

Done. Wrote 0 entries. Skipped 3. Output: encyclopedia_chicago_entries_A.txt
