In [1]:
# Installation de la bibliothèque si nécessaire (à commenter si déjà installé)
# !pip install beautifulsoup4 requests

import requests
from bs4 import BeautifulSoup
import json
import time

# 1. List of URLs for the Ukraine corpus (35 URLs provided).
# All entries point to bbc.com/news; most are "articles" pages, but the list
# also contains one "live" page and one "videos" page.
urls_ukraine = [
    "https://www.bbc.com/news/articles/cwypp1jwjz1o",
    "https://www.bbc.com/news/live/ce8np3pnljdt",
    "https://www.bbc.com/news/articles/c30jre652gvo",
    "https://www.bbc.com/news/articles/c1dz6wgn2w9o",
    "https://www.bbc.com/news/articles/c0l0k4389g2o",
    "https://www.bbc.com/news/videos/cx2e1kr2drdo",
    "https://www.bbc.com/news/articles/c9vjlj1ezpgo",
    "https://www.bbc.com/news/articles/crmd1xgy38go",
    "https://www.bbc.com/news/articles/c80xjkm0283o",
    "https://www.bbc.com/news/articles/clyzzxggl7go",
    "https://www.bbc.com/news/articles/c0l9954yr9ko",
    "https://www.bbc.com/news/articles/cqlkk9qk7vyo",
    "https://www.bbc.com/news/articles/c5yjj2epvlro",
    "https://www.bbc.com/news/articles/c98nnd01g91o",
    "https://www.bbc.com/news/articles/cgjnd9y52pno",
    "https://www.bbc.com/news/articles/clydz0j1yego",
    "https://www.bbc.com/news/articles/c5y20p7xweko",
    "https://www.bbc.com/news/articles/cn5l1474yv7o",
    "https://www.bbc.com/news/articles/c3e07kxey74o",
    "https://www.bbc.com/news/articles/cg7vdd115vjo",
    "https://www.bbc.com/news/articles/c2k4z9nwz4yo",
    "https://www.bbc.com/news/articles/crew8y7pwd5o",
    "https://www.bbc.com/news/articles/cz680jx511no",
    "https://www.bbc.com/news/articles/clydxj7my84o",
    "https://www.bbc.com/news/articles/cpq4e21nr5vo",
    "https://www.bbc.com/news/articles/cz7n95wzl9lo",
    "https://www.bbc.com/news/articles/cn41jzw9eyko",
    "https://www.bbc.com/news/articles/ce8ng3yjenpo",
    "https://www.bbc.com/news/articles/cp9ky5z2gx0o",
    "https://www.bbc.com/news/articles/cp84nl57v0lo",
    "https://www.bbc.com/news/articles/cly0jjv2lg1o",
    "https://www.bbc.com/news/articles/c17x0x2qrejo",
    "https://www.bbc.com/news/articles/cn4d27dx80no",
    "https://www.bbc.com/news/articles/cde6yld78d6o",
    "https://www.bbc.com/news/articles/cwy170jkekdo"
]

# Collected articles: one dict per page with the keys
# "source", "url", "title" and "content".
corpus_ukraine = []

print("--- Début de la collecte du corpus Ukraine ---")

for url in urls_ukraine:
    try:
        # Fetch the page; the timeout keeps a stalled request from blocking the run.
        response = requests.get(url, timeout=10)

        if response.status_code == 200:
            # Parse the raw bytes instead of response.text. response.text is
            # decoded with the charset requests infers from the HTTP headers,
            # which produced mojibake in titles (e.g. the euro sign showing up
            # as "â¬"). Given bytes, BeautifulSoup detects the encoding from
            # the document itself.
            soup = BeautifulSoup(response.content, 'html.parser')

            # Title: text of the first <h1>, or a placeholder when there is none.
            title_tag = soup.find('h1')
            title = title_tag.get_text(strip=True) if title_tag else "Sans titre"

            # Body: text of every <p> tag. Short paragraphs are dropped to
            # avoid menu/advert fragments; the length is measured on the
            # stripped text, i.e. on exactly what gets stored.
            stripped_texts = (p.get_text(strip=True) for p in soup.find_all('p'))
            paragraphs = [text for text in stripped_texts if len(text) > 20]
            full_text = " ".join(paragraphs)

            # Only keep the page when it yielded a meaningful amount of text.
            if len(full_text) > 100:
                corpus_ukraine.append({
                    "source": "BBC",
                    "url": url,
                    "title": title,
                    "content": full_text
                })
                print(f" Succès : {title[:50]}...")
            else:
                print(f"Contenu trop court ou vide pour : {url}")
        else:
            print(f" Erreur {response.status_code} pour l'URL : {url}")

    except Exception as e:
        # Best-effort collection: report the failure (network error, parsing
        # problem, ...) and carry on with the next URL.
        print(f" Erreur critique sur {url} : {e}")

    finally:
        # Polite pause between requests. Placed in `finally` so the server is
        # also spared right after a failed request, not only after a success.
        time.sleep(1)

# 2. Persist the collected corpus to disk as pretty-printed UTF-8 JSON.
file_name = 'corpus_ukraine.json'
with open(file_name, mode='w', encoding='utf-8') as output_file:
    # ensure_ascii=False keeps non-ASCII characters readable in the file
    # instead of escaping them as \uXXXX sequences.
    output_file.write(
        json.dumps(corpus_ukraine, ensure_ascii=False, indent=4)
    )

# Final summary: how many articles were kept and where they were written.
print(f"\n--- TERMINÉ ---")
print(f"Résultat : {len(corpus_ukraine)} articles sauvegardés dans {file_name}")

# Automatic download of the result file when running inside Google Colab.
# Only the import is guarded: if the google.colab package is missing we are
# not in Colab and simply report where the file is. The download itself runs
# in the `else` branch so that an ImportError raised while downloading is not
# mistaken for "not running in Colab".
try:
    from google.colab import files
except ImportError:
    print(f"Note : Le fichier est disponible localement sous le nom {file_name}")
else:
    files.download(file_name)

--- Début de la collecte du corpus Ukraine ---
 Succès : Stigma of Ukraine's forgotten soldiers who 'died t...
 Succès : EU agrees €90bn loan for Ukraine as Putin tells ...
 Succès : Briton who fought in Ukraine jailed for 13 years b...
 Succès : In this secret missile factory, Ukraine is ramping...
 Succès : Ukraine in maps: Tracking the war with Russia...
 Succès : Could the EU release frozen assets to fund Ukraine...
 Succès : British soldier killed in Ukraine told family to '...
 Succès : Ukraine struggling to keep lights on under Russian...
 Succès : What it would take to stop Putin fighting in Ukrai...
 Succès : One million households without power in Ukraine af...
 Succès : Ukraine's health supplies hit in series of Russian...
 Succès : Turkish car ferry damaged in strike at Ukrainian p...
 Succès : EU backs indefinite freeze on Russia's frozen cash...
 Succès : US wants 'special economic zone' in Ukraine's fron...
 Succès : Poland arrests Russian archaeologist wanted in Ukr..