In [45]:
import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import re

# ============================
# 1. URLs UKRAINE
# ============================]
urls_ukraine = [
    
    "https://www.foxnews.com/politics/melania-trump-says-7-more-ukrainian-children-reunited-families-part-initiative-russia",
    "https://www.foxnews.com/world/ukraines-underground-railroad-rescues-abducted-ukrainian-children-from-russian-reeducation-camps",
    "https://www.foxnews.com/lifestyle/humanitarian-aid-expert-rescue-children-ukraine",
    "https://www.foxnews.com/lifestyle/unicef-gives-aid-russia-ukraine-refugees",
    "https://www.foxnews.com/lifestyle/ukraine-refugee-children-war-help-kids",
    "https://www.foxnews.com/world/thousands-ukrainian-children-forcibly-deported-russia-july-us-security-official-says",
    "https://www.foxnews.com/world/ukrainian-children-death-toll-humanitarian-crisis-food",
    "https://www.foxnews.com/politics/just-evil-top-republican-details-russias-horrific-mass-abductions-ukrainian-children",
    "https://www.foxnews.com/politics/trump-administration-ends-program-track-kidnapped-ukrainian-children-russia-lawmakers-say",
    "https://www.foxnews.com/opinion/sens-klobuchar-grassley-america-cant-ignore-russia-kidnapping-ukrainian-children",
    "https://www.foxnews.com/world/rescuers-continue-search-kyiv-childrens-hospital-hit-russian-missile",
    "https://www.foxnews.com/lifestyle/children-traumatized-war-ukraine-mentors-places",
    "https://www.foxnews.com/lifestyle/ukrainian-children-cancer-evacuated-poland-war",
    "https://www.foxnews.com/world/ukraines-top-prosecutor-speaks-evil-russian-atrocities",
    "https://www.foxnews.com/world/half-ukraine-children-displaced",
    "https://www.foxnews.com/us/tennessee-first-state-receive-child-cancer-patients-ukraine",
    "https://www.foxnews.com/lifestyle/premature-babies-ukraine-rescued-ambulance-kyiv-shelling",
    "https://www.foxnews.com/entertainment/jessica-chastain-shares-footage-life-changing-ukraine-trip-displaced-children-destruction",
    "https://www.foxnews.com/media/ukrainian-mother-fleeing-country-with-her-son-details-escape-i-wish-i-never-experienced-that",
    "https://www.foxnews.com/lifestyle/franklin-graham-ukraine-war-children",
    "https://www.foxnews.com/world/ukraines-foreign-minister-accuses-russia-war-crimes-attacks-school-orphanage",
    "https://www.foxnews.com/world/rescuers-continue-search-kyiv-childrens-hospital-hit-russian-missile",
    "https://www.foxnews.com/world/world-leaders-react-to-bucha-ukraine-massacre-russia-blamed",
    "https://www.foxnews.com/world/strikes-ukraine-hospital-kills-2-day-old-baby-officials-say",
    "https://www.foxnews.com/world/300-dead-mariupol-theater-russian-attacks",
    "https://www.foxnews.com/world/ukraine-children-killed-wounded-russian-invasion",
    "https://www.foxnews.com/world/refugees-fleeing-ukraine-reaches-2-5-million-many-children-un-says",
    "https://www.foxnews.com/world/nearly-50-children-from-russian-occupied-regions-ukraine-arrive-belarus",
    "https://www.foxnews.com/world/ukraine-rescue-team-brings-back-31-children-russia-war",
    "https://www.foxnews.com/world/russian-attacks-on-ukrainian-hospitals-could-be-war-crimes-lawyer-says",
    "https://www.foxnews.com/world/russia-denies-bombed-childrens-hospital-fake-news",
    "https://www.foxnews.com/world/ukrainian-lawmaker-russia-trying-forcibly-deport-civilians-mariupol",
    "https://www.bbc.com/news/articles/c62vk0v9756o",
    "https://edition.cnn.com/2025/12/24/europe/moscow-bomb-police-officers-intl",
    "https://edition.cnn.com/2025/12/23/europe/russian-attack-zelensky-ukraine-talks-intl",
    "https://edition.cnn.com/2025/12/10/politics/ukraine-response-peace-plan",
    "https://www.bbc.com/news/world-europe-60633482",
    "https://amp.cnn.com/cnn/2024/06/13/europe/russia-ukraine-mariupol-hunger-war-crime-intl",
    "https://amp.cnn.com/cnn/2022/03/08/europe/russia-invasion-ukraine-03-08-intl",
    "https://www.theguardian.com/uk-news/2025/jun/27/ukrainians-who-fled-to-uk-being-refused-asylum-on-grounds-it-is-safe-to-return",
    
    
]

corpus_ukraine = []

# ============================
# 2. SESSION + RETRY (TP)
# ============================
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

session = requests.Session()
retry_strategy = Retry(
    total=3,
    backoff_factor=2,
    status_forcelist=[429, 500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)

print("--- Début collecte corpus Ukraine ---")

# ============================
# 3. BOUCLE DE SCRAPING
# ============================
for url in urls_ukraine:
    try:
        r = session.get(url, headers=headers, timeout=(5, 30))

        if r.status_code != 200:
            print(f"Erreur {r.status_code} : {url}")
            continue

        soup = BeautifulSoup(r.text, "html.parser")

        # -------- TITRE --------
        title_tag = soup.find("h1")
        titre = title_tag.get_text(strip=True) if title_tag else "Sans titre"

        # -------- METADATA --------
        source = "Autre"
        author = "N/A"
        publish_date = "N/A"
        article_body = None

        # -------- FOX NEWS --------
        if "foxnews.com" in url:
            source = "Fox News"
            article_body = soup.find("div", class_="article-body")

            author_tag = soup.find("div", class_="author-byline")
            if author_tag:
                author = author_tag.get_text(strip=True).replace("By ", "")

            date_tag = soup.find("time")
            if date_tag:
                publish_date = date_tag.get_text(strip=True)

        # -------- BBC --------
        elif "bbc.com" in url:
            source = "BBC"
            article_body = soup.find("article")

            date_tag = soup.find("time")
            if date_tag:
                publish_date = date_tag.get("datetime")

        # -------- THE GUARDIAN --------
        elif "theguardian.com" in url:
            source = "The Guardian"
            article_body = soup.find("div", class_="article-body-commercial-selector")

            author_tag = soup.find("a", rel="author")
            if author_tag:
                author = author_tag.get_text(strip=True)

            date_tag = soup.find("time")
            if date_tag:
                publish_date = date_tag.get("datetime")

        # ============================
        # 4. EXTRACTION DU TEXTE (TP)
        # ============================
        if article_body:
            paragraphs = [
                p.get_text(strip=True)
                for p in article_body.find_all("p")
                if len(p.get_text()) > 30 and "©" not in p.get_text()
            ]
        else:
            paragraphs = [
                p.get_text(strip=True)
                for p in soup.find_all("p")
                if len(p.get_text()) > 40 and "©" not in p.get_text()
            ]

        texte_final = "\n".join(paragraphs)

        # ============================
        # 5. SAUVEGARDE
        # ============================
        if len(texte_final) > 150:
            corpus_ukraine.append({
                "source": source,
                "url": url,
                "title": titre,
                "publish_date": publish_date,
                "author": author,
                "content": texte_final,
                "scraped_at": datetime.now().isoformat(),
                "keywords": [],
                "summary": "",
                "conflict": "Ukraine"
            })
            print(f"Succès : {titre[:60]}...")
        else:
            print(f"Contenu insuffisant : {url}")

        time.sleep(3)

    except Exception as e:
        print(f"Erreur critique sur {url} : {e}")

# ============================
# 6. SAUVEGARDE JSON
# ============================
file_name = "corpus_ukraine.json"
with open(file_name, "w", encoding="utf-8") as f:
    json.dump(corpus_ukraine, f, indent=4, ensure_ascii=False)

print(f"\n--- TERMINÉ ---")
print(f"{len(corpus_ukraine)} articles sauvegardés dans {file_name}")


--- Début collecte corpus Ukraine ---
Succès : Melania Trump says 7 more Ukrainian children reunited with f...
Succès : Ukraine’s 'Underground Railroad' rescues abducted Ukrainian ...
Succès : Humanitarian aid expert reveals dramatic rescue of children ...
Succès : Half a million children become refugees as Russia-Ukraine wa...
Succès : Ukraine's refugee children: Amid the war, how to help kids s...
Succès : 'Thousands' of Ukrainian children forcibly deported to Russi...
Succès : Ukrainian child death toll mounts, humanitarian crisis worse...
Succès : 'Just evil': Top Republican details Russia's 'horrific' mass...
Succès : Trump administration ends program to track kidnapped Ukraini...
Succès : SENS KLOBUCHAR AND GRASSLEY: America can't ignore Russia kid...
Succès : Rescuers continue the search at Kyiv children's hospital hit...
Succès : Children traumatized by war in Ukraine find mentors from une...
Succès : Ukrainian children battling cancer are evacuated to Poland d...
Succès : Ukra

**Nettoyage du texte extrait**

In [53]:
from collections import Counter

# === 1. Total d’articles ===
total_articles = len(corpus_ukraine)
print(f"Total d'articles dans le corpus : {total_articles}\n")

# === 2. Répartition par source (Version Précise) ===
# On crée une fonction pour nettoyer les noms des sources à partir des URLs
def nettoyer_source(article):
    src = str(article.get('source', '')).strip()
    url = str(article.get('url', '')).lower()
    
    # Si la source est "Autre" ou mal nommée, on regarde l'URL
    if src.lower() in ['autre', 'n/a', '', 'inconnue']:
        if 'cnn.com' in url: return 'CNN'
        if 'bbc.com' in url: return 'BBC'
        if 'theguardian.com' in url: return 'The Guardian'
        if 'foxnews.com' in url: return 'Fox News'
        return 'Source Inconnue'
    return src

# Application du nettoyage
sources_nettoyees = [nettoyer_source(art) for art in corpus_ukraine]
source_counts = Counter(sources_nettoyees)

print("=== RÉPARTITION PAR SOURCE (PRÉCISE) ===")
# Tri par nombre d'articles pour plus de clarté
for source, count in source_counts.most_common():
    pct = count / total_articles * 100
    print(f"{source:<15}: {count:<2} articles ({pct:.1f}%)")
print()

# === 3. Estimation des sujets (Corrigée pour éviter le 100%) ===
sujet_categories_ukraine = {
    # On retire 'support' qui est trop commun et on ajoute des mots plus précis
    'Aide Humanitaire': ['humanitarian aid', 'unicef', 'wfp', 'food relief', 'humanitarian support', 'aid convoy'],
    'Santé/Hôpital': ['hospital', 'medical treatment', 'patients', 'doctor', 'ambulance', 'clinic', 'surgery'],
    'Opérations Militaires': ['missile strike', 'military operation', 'invasion', 'bombing', 'army forces', 'tanks', 'airstrike'],
    'Expérience Civile': ['civilian casualties', 'refugees', 'displaced families', 'killed', 'wounded', 'children killed'],
    'Autre': []
}

sujet_counts = Counter({cat: 0 for cat in sujet_categories_ukraine})

for article in corpus_ukraine:
    # On utilise le contenu pour la recherche
    texte_complet = (article.get('title', '') + " " + article.get('content', '')).lower()
    
    counted = False
    # On cherche des correspondances exactes pour plus de précision
    for cat in ['Aide Humanitaire', 'Santé/Hôpital', 'Opérations Militaires', 'Expérience Civile']:
        keywords = sujet_categories_ukraine[cat]
        if any(word in texte_complet for word in keywords):
            sujet_counts[cat] += 1
            counted = True
            break
            
    if not counted:
        sujet_counts['Autre'] += 1

print("=== ESTIMATION DES SUJETS (Précision Améliorée) ===")
for cat, count in sujet_counts.items():
    pct = (count / total_articles * 100) if total_articles > 0 else 0
    print(f"{cat:<25}: {count:<3} articles ({pct:>5.1f}%)")
print()

# === 4. Longueur des articles ===
lengths = [len(article.get('content', '').split()) for article in corpus_ukraine]
avg_len = sum(lengths) / len(lengths) if lengths else 0

print("=== LONGUEUR DES ARTICLES ===")
print(f"Moyenne de mots par article: {int(avg_len)}")
print(f"Article le plus court: {min(lengths) if lengths else 0} mots")
print(f"Article le plus long: {max(lengths) if lengths else 0} mots\n")

# === 5. Suggestions ===
print("=== SUGGESTIONS ===")
if total_articles > 0 and source_counts.get("Fox News", 0) / total_articles > 0.4:
    print("⚠️ Trop d'articles Fox News (>40%).")
print("✅ Taille de corpus acceptable.")

Total d'articles dans le corpus : 40

=== RÉPARTITION PAR SOURCE (PRÉCISE) ===
Fox News       : 32 articles (80.0%)
CNN            : 5  articles (12.5%)
BBC            : 2  articles (5.0%)
The Guardian   : 1  articles (2.5%)

=== ESTIMATION DES SUJETS (Précision Améliorée) ===
Aide Humanitaire         : 9   articles ( 22.5%)
Santé/Hôpital            : 14  articles ( 35.0%)
Opérations Militaires    : 11  articles ( 27.5%)
Expérience Civile        : 4   articles ( 10.0%)
Autre                    : 2   articles (  5.0%)

=== LONGUEUR DES ARTICLES ===
Moyenne de mots par article: 670
Article le plus court: 273 mots
Article le plus long: 1913 mots

=== SUGGESTIONS ===
⚠️ Trop d'articles Fox News (>40%).
✅ Taille de corpus acceptable.


In [47]:
# --- Dictionnaire pour normaliser les abréviations ---
ABBREV_MAP = {
    r"\bU\.?S\.?\b": "united_states",
    r"\bU\.?K\.?\b": "united_kingdom",
    r"\bU\.?N\.?\b": "united_nations",
    r"\bE\.?U\.?\b": "european_union",
    r"\bICC\b": "icc",
    r"\bICJ\b": "icj",
    
}
def normalize_abbreviations(text: str) -> str:
    text = text or ""
    for pattern, repl in ABBREV_MAP.items():
        text = re.sub(pattern, repl, text)
    return text

In [48]:
# --- Fonction de nettoyage du texte ---
def clean_text(text: str) -> str:
    if not text:
        return ""
    
    # 1. Mise en minuscules
    text = text.lower()
    
    # 2. Normalisation des abréviations
    text = normalize_abbreviations(text)
    
    # 3. Suppression des URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # 4. Suppression des chiffres
    text = re.sub(r'\d+', '', text)
    
    # 5. Suppression de la ponctuation avec regex
    text = re.sub(r'[^\w\s]', '', text)
    
    # 6. Normalisation des espaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # 7. Suppression des guillemets doubles
    text = text.replace('"', '')
    
    # 8. Suppression des marqueurs de métadonnées "Source:" et "Context:"
    text = re.sub(r'\bsource:\b', '', text)
    text = re.sub(r'\bcontext:\b', '', text)
    
    return text

In [49]:

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Stopwords anglais standards
stop_words = set(stopwords.words('english'))

# Liste des mots à **garder** dans notre projet
keep_words = {
    "we", "they", "them", "us", "our", "their", "his", "her", "its",
    "may", "might", "could", "must", "should", "would",
    "not", "no", "never", "without",
    "by", "against", "between", "under", "over"
}

# Supprimer ces mots des stopwords standards
stop_words = stop_words - keep_words
# Fonction pour supprimer les stopwords
def remove_stopwords(text):
    tokens = text.split()
    tokens_clean = [t for t in tokens if t.lower() not in stop_words]
    return " ".join(tokens_clean)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [50]:
# Normalisation lexicale : Stemming et Lemmatisation avec NLTK

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# Initialisation
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Fonction pour convertir les tags POS de NLTK en tags WordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def normalize_text_stem_lemma(text):
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    
    stemmed_tokens = [stemmer.stem(token) for token in tokens]  # A. Stemming
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tags]  # B. Lemmatisation
    
    stemmed_text = " ".join(stemmed_tokens)
    lemmatized_text = " ".join(lemmatized_tokens)
    
    return stemmed_text, lemmatized_text


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [51]:
# --- Chargement du corpus ---
with open('corpus_ukraine.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)

# --- Application des transformations ---
for article in corpus:
    text = article.get("content", "")
    
    # Nettoyage
    text_clean = clean_text(text)
    article["content_clean"] = text_clean
    
    # Suppression stopwords
    text_no_stop = remove_stopwords(text_clean)
    article["content_no_stopwords"] = text_no_stop
    
    # Stemming & Lemmatisation
    stemmed_text, lemmatized_text = normalize_text_stem_lemma(text_no_stop)
    article["content_stemmed"] = stemmed_text
    article["content_lemmatized"] = lemmatized_text

# --- Sauvegarde du corpus nettoyé ---
with open('corpus_ukraine_clean.json', 'w', encoding='utf-8') as f:
    json.dump(corpus, f, indent=4, ensure_ascii=False)

print("Corpus nettoyé et enrichi sauvegardé dans corpus_ukraine_clean.json")

Corpus nettoyé et enrichi sauvegardé dans corpus_ukraine_clean.json
