# Utiliser les robots pour scraper

In [None]:
import json
import requests
from bs4 import BeautifulSoup

In [None]:
def extract(url):
    """Download *url* and return its title and visible text.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the keys ``"url"``, ``"title"`` and ``"text"``, or
        ``None`` when the request fails, the server answers with an error
        status, the response is not HTML, the markup cannot be parsed, or
        the extracted text is shorter than 200 characters.
    """
    try:
        r = requests.get(url, timeout=15)
    except requests.RequestException:
        return None

    # Error pages (404, 500, ...) are usually HTML as well; without this
    # check their boilerplate would be stored as if it were real content.
    if not r.ok:
        return None

    ctype = (r.headers.get("Content-Type") or "").lower()
    if "text/html" not in ctype:
        return None  # anything that is not HTML is ignored

    # requests falls back to ISO-8859-1 for text/* responses that do not
    # declare a charset, which garbles UTF-8 pages. Only trust r.encoding
    # when the header really declares one; otherwise hand the raw bytes to
    # BeautifulSoup and let it detect the encoding (e.g. from <meta charset>).
    declared_encoding = r.encoding if "charset=" in ctype else None

    try:
        soup = BeautifulSoup(
            r.content, "html.parser", from_encoding=declared_encoding
        )
    except Exception:
        # Best effort: a page that cannot be parsed is simply skipped.
        return None

    # Drop page chrome and non-visible content before extracting the text.
    for tag in soup(["header", "footer", "nav", "script", "style"]):
        tag.decompose()

    title = soup.title.get_text(strip=True) if soup.title else ""
    text = soup.get_text("\n", strip=True)

    # Pages with almost no text are not worth keeping.
    if len(text) < 200:
        return None

    return {"url": url, "title": title, "text": text}

In [None]:
def _fetch_sitemap(sitemap_url):
    """Download one sitemap and parse it as XML; return None on failure."""
    try:
        r = requests.get(sitemap_url, timeout=15)
    except requests.RequestException:
        return None

    # An error response carries no usable sitemap.
    if not r.ok:
        return None

    # Pass the raw bytes so the parser can honour the encoding declared in
    # the XML prolog instead of relying on the HTTP header guess.
    return BeautifulSoup(r.content, "xml")


def _loc_values(soup):
    """Return the stripped text of every <loc> element found in *soup*."""
    return [loc.get_text(strip=True) for loc in soup.find_all("loc")]


def get_all_urls(main_sitemap):
    """Collect the URLs listed by a sitemap index (or a plain sitemap).

    Args:
        main_sitemap: URL of the main sitemap, normally a sitemap index
            whose <loc> entries point to child sitemaps.

    Returns:
        A list with the <loc> values of every child sitemap that could be
        fetched, in document order. If *main_sitemap* is itself a <urlset>,
        its own <loc> values are returned. The list is empty when the main
        sitemap cannot be fetched.
    """
    root = _fetch_sitemap(main_sitemap)
    if root is None:
        return []

    top_level = _loc_values(root)

    # A plain <urlset> already lists the pages; fetching each of them as if
    # it were a child sitemap would only produce an empty result.
    if root.find("sitemapindex") is None and root.find("urlset") is not None:
        return top_level

    all_urls = []
    for child_url in top_level:
        child = _fetch_sitemap(child_url)
        if child is None:
            continue  # skip unreachable child sitemaps, keep the others
        all_urls.extend(_loc_values(child))

    return all_urls

In [None]:
# Sites to crawl: short name -> URL of the site's sitemap index.
SITEMAPS = {
    "esilv": "https://www.esilv.fr/sitemap_index.xml",
    #"emlv": "https://www.emlv.fr/sitemap_index.xml",
    #"iim": "https://www.iim.fr/sitemap_index.xml",
}

# Combined output file that receives the pages of every site.
VF_PATH = "all_sites_VF.jsonl"

# Start every run from an empty combined file.
with open(VF_PATH, "w", encoding="utf-8"):
    pass

for name in SITEMAPS:
    urls = get_all_urls(SITEMAPS[name])

    # Each extracted page goes to the per-site file and to the combined file.
    with open(f"{name}_all_pages.jsonl", "w", encoding="utf-8") as f_site:
        with open(VF_PATH, "a", encoding="utf-8") as f_vf:
            # extract() returns None for pages to skip; filter drops those.
            for doc in filter(None, map(extract, urls)):
                record = json.dumps(doc, ensure_ascii=False) + "\n"
                for sink in (f_site, f_vf):
                    sink.write(record)

    print(f"{name}: {len(urls)} URLs -> fichier: {name}_all_pages.jsonl")

print(f"VF global: {VF_PATH}")


esilv: 7664 URLs -> fichier: esilv_all_pages.jsonl
emlv: 0 URLs -> fichier: emlv_all_pages.jsonl
iim: 0 URLs -> fichier: iim_all_pages.jsonl
VF global: all_sites_VF.jsonl
