In [21]:
import requests
from bs4 import BeautifulSoup

WIKIPEDIA_PAGE_URL = 'https://en.wikipedia.org/wiki/Data_science'

# Identify this client to the server with a descriptive User-Agent.
# (This header is identification, not authorization.)
REQUEST_HEADERS = {
    "User-Agent": "UCalgary-Student-Assignment/1.0 (educational-use)"
}

# Seconds to wait for the server before giving up. Without an explicit
# timeout, requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 10

# Lower-case phrases used to filter out unwanted section headings.
EXCLUDED_HEADINGS = ['references', 'external links', 'see also', 'notes']

wikipedia_page_html = requests.get(
    WIKIPEDIA_PAGE_URL,
    headers=REQUEST_HEADERS,
    timeout=REQUEST_TIMEOUT_SECONDS,
)
wikipedia_page_html.raise_for_status()  # Raise on a 4xx/5xx response

parsed_html_document = BeautifulSoup(wikipedia_page_html.text, 'html.parser')

# find() returns None when no matching element exists; fail here with a
# clear message instead of an AttributeError further down the script.
div_content = parsed_html_document.find("div", id="mw-content-text")
if div_content is None:
    raise RuntimeError(
        f"Could not find <div id='mw-content-text'> in {WIKIPEDIA_PAGE_URL}"
    )

# Collect the text of every <h2> section heading inside the content div,
# skipping headings that contain any phrase from EXCLUDED_HEADINGS.
headings = []

for h2_content in div_content.find_all("h2"):
    # Prefer the dedicated headline span; fall back to the whole <h2>
    headline = h2_content.find("span", class_="mw-headline")
    text_source = headline if headline is not None else h2_content
    text = text_source.get_text(" ", strip=True)

    # Remove any edit-link text if present. Because get_text() joins the
    # individual text nodes with " ", a bracketed edit link nested in the
    # <h2> comes out as "[ edit ]", so strip both spellings.
    for edit_marker in ("[ edit ]", "[edit]"):
        text = text.replace(edit_marker, "")
    text = text.strip()

    # Nothing left to record (e.g. an empty heading element)
    if not text:
        continue

    # Skip headings containing any excluded phrase (case-insensitive)
    text_lower = text.lower()
    if any(word in text_lower for word in EXCLUDED_HEADINGS):
        continue

    headings.append(text)

# Persist the collected headings, one per line, then report how many
# were written.
with open("headings.txt", "w", encoding="utf-8") as output_file:
    output_file.write("".join(f"{title}\n" for title in headings))

print(f"Saved {len(headings)} headings to headings.txt")

Saved 5 headings to headings.txt
