In [4]:
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def get_soup(url: str, timeout: int = 20) -> BeautifulSoup:
    """
    1) Get and parse HTML content from a Wikipedia page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; WikiScraper/1.0)"
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def extract_title(soup: BeautifulSoup) -> str:
    """
    2) Extract article title.
    Wikipedia typically uses <h1 id="firstHeading">.
    """
    h1 = soup.find("h1", id="firstHeading")
    if h1 and h1.get_text(strip=True):
        return h1.get_text(strip=True)

    # Fallback
    if soup.title and soup.title.get_text(strip=True):
        return soup.title.get_text(strip=True).replace(" - Wikipedia", "").strip()

    return ""


def extract_text_by_heading(soup: BeautifulSoup) -> dict:
    """
    3) Extract article text for each paragraph with their respective headings.
    Map headings to their paragraphs in a dictionary.

    Strategy:
    - Focus on the main content container: <div id="mw-content-text">.
    - Walk through the content in order and group <p> paragraphs under the
      current heading.
    - Headings considered: h2, h3, h4 (you can extend this).
    """
    content = soup.find("div", id="mw-content-text")
    if not content:
        return {}

  
    parser_output = content.find("div", class_="mw-parser-output") or content

    sections: dict[str, list[str]] = {}
    current_heading = "Introduction"
    sections[current_heading] = []

   
    for elem in parser_output.find_all(["h2", "h3", "h4", "p"], recursive=False):
        if elem.name in ("h2", "h3", "h4"):
        
            headline = elem.find(class_="mw-headline")
            heading_text = (headline.get_text(" ", strip=True) if headline else elem.get_text(" ", strip=True))

            if heading_text.lower() in {"contents", "see also", "references", "external links", "notes", "further reading"}:
                current_heading = heading_text
                sections.setdefault(current_heading, [])
                
                continue

            current_heading = heading_text if heading_text else current_heading
            sections.setdefault(current_heading, [])

        elif elem.name == "p":
            text = elem.get_text(" ", strip=True)

           
            if text:
                sections.setdefault(current_heading, []).append(text)
    return sections


def collect_internal_wikipedia_links(soup: BeautifulSoup, base_url: str) -> set:
    
    internal_links = set()

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()

      
        if not href or href.startswith("#"):
            continue

        if href.startswith("/wiki/"):
         
            if ":" in href:
                continue

            full = urljoin(base_url, href)
            internal_links.add(full)

    return internal_links


def scrape_wikipedia_page(url: str) -> dict:
    """
    5) Wrap all the previous functions into a single function
       that takes as parameters a Wikipedia link.

    Returns a dictionary containing:
    - url
    - title
    - sections (heading -> list of paragraphs)
    - internal_links (set of absolute URLs)
    """
    soup = get_soup(url)


    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"

    title = extract_title(soup)
    sections = extract_text_by_heading(soup)
    links = collect_internal_wikipedia_links(soup, base_url)

    return {
        "url": url,
        "title": title,
        "sections": sections,
        "internal_links": links,
    }


if __name__ == "__main__":
  
    test_url = "https://en.wikipedia.org/wiki/NASA"
    result = scrape_wikipedia_page(test_url)

    print("URL:", result["url"])
    print("Title:", result["title"])
    print("\nHeadings found:", len(result["sections"]))
    print("Internal links found:", len(result["internal_links"]))

    
    print("\n--- Section Preview ---")
    shown = 0
    for heading, paragraphs in result["sections"].items():
        if not paragraphs:
            continue
        print(f"\n## {heading}")
        print(paragraphs[0][:300] + ("..." if len(paragraphs[0]) > 300 else ""))
        shown += 1
        if shown >= 3:
            break

 
    print("\n--- Links Preview ---")
    for i, link in enumerate(list(result["internal_links"])[:10], start=1):
        print(f"{i}. {link}")


URL: https://en.wikipedia.org/wiki/NASA
Title: NASA

Headings found: 1
Internal links found: 1640

--- Section Preview ---

--- Links Preview ---
1. https://en.wikipedia.org/wiki/Combustion
2. https://en.wikipedia.org/wiki/Atoms_for_Peace_Award
3. https://en.wikipedia.org/wiki/List_of_uncrewed_NASA_missions
4. https://en.wikipedia.org/wiki/Brazilian_Space_Agency
5. https://en.wikipedia.org/wiki/Hubble_Legacy_Field
6. https://en.wikipedia.org/wiki/Mir
7. https://en.wikipedia.org/wiki/Voyager_Golden_Record
8. https://en.wikipedia.org/wiki/Proton_(rocket)
9. https://en.wikipedia.org/wiki/Carina_Nebula
10. https://en.wikipedia.org/wiki/Lunar_Prospector
