In [17]:
import os
import re
import json
import requests
from typing import Any

from bs4 import BeautifulSoup, Tag
from urllib.parse import urljoin, unquote

BASE_URL = 'https://pt.wikipedia.org'
MAIN_URL = BASE_URL + '/wiki/Wikip%C3%A9dia:Sabia_que'

OUTPUT_DIR = 'data/pt'

In [10]:
def get_year_links_from_archive(main_page_url):
    """Get the year links from the main archive page."""
    resp = requests.get(main_page_url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    table = soup.find('table', class_='tmbox tmbox-notice }}')
    if not table:
        raise ValueError("Could not find the table in the container div.")

    archive_links = {}
    for b_tag in table.find_all('b'):
        a_tag = b_tag.find('a', href=True)
        if a_tag:
            year = a_tag.text.strip()
            if re.match(r'^\d{4}$', year):
                href = a_tag['href']
                full_url = urljoin(BASE_URL, href)
                exists = 'new' not in a_tag.get('class', [])
                archive_links[year] = {
                    'url': full_url,
                    'exists': exists
                }

    return archive_links

archive = get_year_links_from_archive(MAIN_URL)
archive

{'2006': {'url': 'https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:Sabia_que/Arquivo/2006',
  'exists': True},
 '2007': {'url': 'https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:Sabia_que/Arquivo/2007',
  'exists': True},
 '2008': {'url': 'https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:Sabia_que/Arquivo/2008',
  'exists': True},
 '2009': {'url': 'https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:Sabia_que/Arquivo/2009',
  'exists': True},
 '2010': {'url': 'https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:Sabia_que/Arquivo/2010',
  'exists': True},
 '2011': {'url': 'https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:Sabia_que/Arquivo/2011',
  'exists': True},
 '2012': {'url': 'https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:Sabia_que/Arquivo/2012',
  'exists': True},
 '2013': {'url': 'https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:Sabia_que/Arquivo/2013',
  'exists': True},
 '2014': {'url': 'https://pt.wikipedia.org/wiki/Wikip%C3%A9dia:Sabia_que/Arquivo/2014',
  'exists': True},
 '2015': {'url': 'https://pt.wikipedi

In [21]:
year = '2010'

resp = requests.get(archive[year]['url'])
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

results: list[dict] = []

table = soup.find("table", class_="wikitable")

table

<table class="wikitable">
<tbody><tr>
<th>Texto
</th>
<th>Imagem
</th></tr>
<tr>
<td>... <b><a href="/wiki/Andrew_Wiles" title="Andrew Wiles">Andrew Wiles</a></b> <i>(imagem)</i> foi o matemático a demonstrar o <a href="/wiki/%C3%9Altimo_teorema_de_Fermat" title="Último teorema de Fermat">Último teorema de Fermat</a>?</td>
<td><span typeof="mw:File"><a class="mw-file-description" href="/wiki/Ficheiro:Andrew_wiles1-3.jpg"><img class="mw-file-element" data-file-height="2502" data-file-width="1986" decoding="async" height="63" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Andrew_wiles1-3.jpg/60px-Andrew_wiles1-3.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Andrew_wiles1-3.jpg/120px-Andrew_wiles1-3.jpg 1.5x" width="50"/></a></span>
</td></tr>
<tr>
<td>... em termos de <a href="/wiki/Biomassa" title="Biomassa">biomassa</a>, o <b><a href="/wiki/Krill_ant%C3%A1rtico" title="Krill antártico">Krill antártico</a></b> é a melhor sucedida das espécies animais do plan

In [30]:
def _extract_fact_data(element: Tag, base_url: str) -> dict[str, Any]:
    """Extracts text, links, and relevant links from a BeautifulSoup Tag."""
    fact_text = element.get_text(" ", strip=True)

    links = []
    relevant_links = []
    for a in element.find_all("a", href=True):
        href = a["href"]
        if not href.startswith("/wiki/"):
            continue

        full_url = unquote(urljoin(base_url, href))

        if a.find_parent('b'):
            relevant_links.append(full_url)

        links.append(full_url)

    return {
        "text": fact_text,
        "links": links,
        "relevant_links": relevant_links,
    }


def _post_process_section_title(title: str, year: str) -> str:
    """Post-processes the section title to include the year."""
    if title == str(None):
        return year
    
    if len(title) < 4 or not title[-4:].isdigit():
        return f"{title} de {year}"
    
    return title


def parse_year_facts(year_url: str, year: str) -> list[dict]:
    """Parse the facts from the year page."""
    resp = requests.get(year_url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    results: list[dict] = []

    # If a wikitable exists, it's the only element with facts.
    wikitable = soup.find("table", class_="wikitable")
    if wikitable:
        section_title = _post_process_section_title(str(None), year)
        for tr in wikitable.find_all("tr")[1:]:  # Ignore header row
            td = tr.find("td")
            if td:
                fact_data = _extract_fact_data(td, year_url)
                if fact_data['text']:
                    results.append({
                        "section": section_title,
                        **fact_data
                    })
        return results

    # Otherwise, parse facts from sections.
    section_divs = soup.select("div.mw-heading.mw-heading2, div.mw-heading.mw-heading3, div.mw-heading.mw-heading4")
    
    for section_div in section_divs:
        header = section_div.find(["h2", "h3", "h4"])
        raw_section_title = header.get_text(strip=True) if header else str(None)
        section_title = _post_process_section_title(raw_section_title, year)

        for sib in section_div.find_next_siblings():
            sib_classes = sib.get("class", []) if isinstance(sib, Tag) else []
            is_heading = "mw-heading" in sib_classes and any(f"mw-heading{i}" in sib_classes for i in [2, 3, 4])
            if is_heading:
                break

            # Some facts can be written as paragraphs
            if isinstance(sib, Tag) and sib.name == "p":
                fact_data = _extract_fact_data(sib, year_url)
                if fact_data['text']:
                    results.append({
                        "section": section_title,
                        **fact_data
                    })
                continue

            if not (isinstance(sib, Tag) and sib.name == "ul"):
                continue

            # Most facts are written as items in unordered lists
            for li in sib.find_all("li"):
                fact_data = _extract_fact_data(li, year_url)
                if fact_data['text']:
                    results.append({
                        "section": section_title,
                        **fact_data
                    })

    return results


current_year = '2025'
parse_year_facts(archive[current_year]['url'], current_year)

[{'section': 'Fevereiro de 2025',
  'text': '… a Lâmpada Centenária , criada por Adolphe Alexandre Chaillet e mantida pelo Corpo de Bombeiros de Livermore - Pleasanton , na Califórnia , é a lâmpada incandescente mais antiga do mundo ainda em funcionamento, acesa desde 1901 ?',
  'links': ['https://pt.wikipedia.org/wiki/Lâmpada_Centenária',
   'https://pt.wikipedia.org/wiki/Adolphe_Alexandre_Chaillet',
   'https://pt.wikipedia.org/wiki/Livermore_(Califórnia)',
   'https://pt.wikipedia.org/wiki/Pleasanton_(Califórnia)',
   'https://pt.wikipedia.org/wiki/Califórnia',
   'https://pt.wikipedia.org/wiki/Lâmpada_incandescente',
   'https://pt.wikipedia.org/wiki/1901'],
  'relevant_links': ['https://pt.wikipedia.org/wiki/Lâmpada_Centenária']},
 {'section': 'Fevereiro de 2025',
  'text': '… o porquinho-da-índia não é originário da Índia e sim dos Andes , na América do Sul ?',
  'links': ['https://pt.wikipedia.org/wiki/Porquinho-da-índia',
   'https://pt.wikipedia.org/wiki/Índia',
   'https://pt