# Preprocessing


In [1]:
# =================================================================
# SCRIPT DE PRÉTRAITEMENT MULTI-VUES - VERSION MASTER EXPERT
# =================================================================
# Auteur : Expert NLP
# Améliorations :
# 1. LISTE NOIRE EXPERTE (Supprime le bruit : said, 2024, october...)
# 2. RÉPARATION AVANCÉE (Décolle TrumpAnnounced, LadyMelania...)
# 3. TROIS VUES DISTINCTES (Lexicale propre, Structurelle complète, Sentiment nuancé)
# =================================================================

import json
import os
import re 
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# --- DOWNLOAD NLTK RESOURCES ---
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names
# that recent NLTK releases load for word_tokenize/sent_tokenize and pos_tag;
# the legacy 'punkt' and 'averaged_perceptron_tagger' names are kept so older
# installs keep working. nltk.download() skips packages that are up to date.
for _resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
):
    nltk.download(_resource)

# --- 1. CONFIGURATION LINGUISTIQUE ---

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the WordNet POS constant used for lemmatisation.

    Only the first letter of the tag is inspected: 'J' maps to adjective,
    'V' to verb, 'N' to noun and 'R' to adverb. Any other tag (including an
    empty one) falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

# --- 2. LISTES DE FILTRAGE ---

def get_expert_stopwords():
    """Build the strict stopword set used by the lexical view.

    Starts from NLTK's English stopwords and adds journalistic and temporal
    noise words (reporting verbs, calendar terms, vague quantities) so they
    do not dominate word clouds or Word2Vec training.

    Returns:
        set[str]: lowercase words to drop.
    """
    base_stop = set(stopwords.words('english'))

    expert_noise = {
        # Reporting verbs
        'said', 'told', 'reported', 'stated', 'asked', 'added', 'says', 'according',
        'report', 'confirm', 'announced', 'claim', 'claimed',
        # Time: all seven weekdays, so the filter does not depend on which
        # day an article happens to mention.
        'year', 'month', 'day', 'today', 'yesterday',
        'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
        # Months. 'march' and 'may' are deliberately left out: both are also
        # ordinary content words ("a protest march", modal "may").
        'january', 'february', 'april', 'june', 'july', 'august',
        'september', 'october', 'november', 'december',
        '2023', '2024', '2025',
        'time', 'week', 'daily', 'late', 'early', 'ago', 'since',
        # Vague quantities and fillers
        'one', 'two', 'three', 'many', 'much', 'least', 'first', 'last', 'number',
        'several', 'including', 'around', 'part', 'even', 'also', 'would', 'could',
    }
    return base_stop | expert_noise

def get_sentiment_stopwords():
    """Return NLTK's English stopwords minus a small set of words to keep.

    The kept words are the negations 'not', 'no', 'never' and 'against',
    plus 'by' and the forms of "to be" ('was', 'were', 'been', 'is', 'are').
    NOTE(review): the auxiliaries and 'by' are presumably kept for the
    passive-voice / agency analysis of the structural view -- confirm.

    Returns:
        set[str]: lowercase words to drop.
    """
    kept_words = frozenset((
        'not', 'no', 'never', 'by', 'was', 'were', 'been', 'is', 'are', 'against',
    ))
    return {word for word in stopwords.words('english') if word not in kept_words}

# Shared tools, built once when the module/cell is executed.
STOPWORDS_LEXICAL = get_expert_stopwords()  # lexical view: strict blacklist
# The sentiment and structural sets come from the same builder but are kept
# as two separate objects.
# NOTE(review): no code visible in this file reads STOPWORDS_SENTIMENT;
# pipeline_sentiment builds its own filter -- confirm whether that is intended.
STOPWORDS_SENTIMENT = get_sentiment_stopwords()
STOPWORDS_STRUCTURAL = get_sentiment_stopwords()
lemmatizer = WordNetLemmatizer()

# --- 3. FONCTION DE RÉPARATION (CRITIQUE) ---
# Known scraping glitches, applied in order as plain, case-sensitive
# substring replacements. The two Fox News entries strip the source name.
_KNOWN_PATCHES = (
    ("Trumpannounced", "Trump announced"),
    ("trumpannounced", "trump announced"),
    ("LadyMelania", "Lady Melania"),
    ("Ladymelania", "Lady Melania"),
    ("whitehouse", "white house"),
    ("WhiteHouse", "White House"),
    ("Fox News", ""),
    ("FoxNews", ""),
)

# Zero-width boundary between a lowercase and an uppercase ASCII letter.
_CAMEL_BOUNDARY = re.compile(r'(?<=[a-z])(?=[A-Z])')

# Sentence-final punctuation glued to the next capitalised word. The
# look-behind accepts a lowercase letter or a digit, so "2024.The" is
# repaired while "U.S." and "3.5" are left alone.
_GLUED_SENTENCE_END = re.compile(r'(?<=[a-z0-9])([.!?])(?=[A-Z])')


def reparer_mots_colles(text):
    """Separate words that were glued together by scraping errors.

    Three repairs are applied in order:
      1. known glitches from ``_KNOWN_PATCHES`` (plain substring replacement);
      2. a space is inserted at every lowercase-to-uppercase boundary
         ("peaceTalks" -> "peace Talks");
      3. a space is inserted after '.', '!' or '?' when it sits directly
         between a lowercase letter or digit and an uppercase letter
         ("ended.The" -> "ended. The").

    Args:
        text: raw article text; any falsy value (None, "") is accepted.

    Returns:
        str: the repaired text, or "" when ``text`` is falsy.
    """
    if not text:
        return ""

    for glitch, replacement in _KNOWN_PATCHES:
        text = text.replace(glitch, replacement)

    text = _CAMEL_BOUNDARY.sub(' ', text)
    text = _GLUED_SENTENCE_END.sub(r'\1 ', text)

    return text

# --- 4. LES TROIS PIPELINES ---

def pipeline_lexical(text):
    """View 1 -- lexical (for Word2Vec and word clouds): aggressive cleaning.

    The text is repaired, lowercased and tokenised. A token is kept only if
    it is alphanumeric, longer than two characters, not purely numeric and
    not in STOPWORDS_LEXICAL. Surviving tokens are lemmatised with the
    lemmatiser's default part of speech.

    The blacklist is checked again after lemmatisation: otherwise inflected
    forms such as "years" or "reports" pass the first check and come back
    out as the blacklisted lemmas "year" and "report".

    Args:
        text: raw article text (may be empty or None).

    Returns:
        list[str]: lemmas in document order.
    """
    text = reparer_mots_colles(text)

    lemmas = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum() or token.isdigit() or len(token) <= 2:
            continue
        if token in STOPWORDS_LEXICAL:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Second pass: the lemma itself may be a blacklisted word.
        if lemma in STOPWORDS_LEXICAL:
            continue
        lemmas.append(lemma)
    return lemmas

def pipeline_structural_semantique(text):
    """View 2 -- structural (for semantics, syntax, agency analysis).

    The repaired text is split into sentences; each sentence is tokenised
    and POS-tagged with its original casing. Tokens whose lowercase form is
    in STOPWORDS_STRUCTURAL are dropped; nothing else is filtered out here.

    Args:
        text: raw article text (may be empty or None).

    Returns:
        list[list[dict]]: one list per non-empty sentence. Each entry has
        "w" (token as written), "t" (POS tag) and "l" (lemma of the
        lowercased token, lemmatised with the POS mapped from the tag).
    """
    repaired = reparer_mots_colles(text)
    result = []

    for sentence in sent_tokenize(repaired):
        kept = [
            {
                "w": token,
                "t": pos,
                "l": lemmatizer.lemmatize(token.lower(), get_wordnet_pos(pos)),
            }
            for token, pos in pos_tag(word_tokenize(sentence))
            if token.lower() not in STOPWORDS_STRUCTURAL
        ]
        # Sentences made only of stopwords contribute nothing.
        if kept:
            result.append(kept)

    return result

def pipeline_sentiment(text):
    """View 3 -- sentiment (labs 4/9): light cleaning that keeps negations.

    The text is repaired, lowercased and tokenised. Alphanumeric tokens are
    kept unless they are NLTK English stopwords; the negations 'not', 'no',
    'never' and 'against' are always kept.

    The tokenizer splits contractions ("don't" -> "do", "n't"); the "n't"
    piece is rewritten to "not" so the negation is not discarded by the
    alphanumeric check.

    Args:
        text: raw article text (may be empty or None).

    Returns:
        list[str]: lowercase tokens in document order.
    """
    text = reparer_mots_colles(text)

    # Build the drop set once per call instead of re-reading the stopword
    # list for every token.
    kept_negations = {'not', 'no', 'never', 'against'}
    dropped = set(stopwords.words('english')) - kept_negations

    clean_tokens = []
    for token in word_tokenize(text.lower()):
        if token == "n't":
            token = 'not'
        if token.isalnum() and token not in dropped:
            clean_tokens.append(token)
    return clean_tokens

# --- 5. EXÉCUTION ---

def traiter_corpus(filename_in, filename_out, conflict_label):
    """Preprocess one JSON corpus into the three views and save the result.

    The input file is read as a JSON collection of article dicts. For each
    article the lexical, structural and sentiment pipelines are run on its
    'content' field, and the result is written to ``filename_out`` as
    indented UTF-8 JSON.

    Args:
        filename_in: path of the input JSON file.
        filename_out: path of the JSON file to write; its parent directory
            is created when missing.
        conflict_label: label stored in every output record under "conflict".

    Returns:
        None. If ``filename_in`` does not exist, an error message is printed
        and nothing is written.
    """
    print(f"\n🚀 Prétraitement Expert en cours : {conflict_label}...")

    if not os.path.exists(filename_in):
        print(f"❌ Erreur : {filename_in} introuvable.")
        return

    with open(filename_in, 'r', encoding='utf-8') as f:
        articles = json.load(f)

    corpus_pretraite = []

    for art in articles:
        # Same tolerance as 'title' and 'scraped_at': a missing or null
        # 'content' yields empty views instead of aborting the whole corpus.
        content = art.get('content') or ''

        corpus_pretraite.append({
            "title": art.get('title', 'N/A'),
            "conflict": conflict_label,
            "scraped_at": art.get('scraped_at', ''),
            # View 1: clean tokens, journalistic noise removed
            "lexical_view": pipeline_lexical(content),
            # View 2: per-sentence tokens with POS tags
            "structural_view": pipeline_structural_semantique(content),
            # View 3: tokens with negations kept
            "sentiment_view": pipeline_sentiment(content),
        })

    # Make sure the destination directory exists before writing.
    out_dir = os.path.dirname(filename_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(filename_out, 'w', encoding='utf-8') as f:
        json.dump(corpus_pretraite, f, indent=4, ensure_ascii=False)

    print(f"✅ Terminé ! {conflict_label} sauvegardé dans {filename_out}")
    print("   (Nettoyage Expert appliqué)")

# --- LANCEMENT ---
if __name__ == "__main__":
    # Make sure the source files (v1 or nettoye_v1) are present before running.
    # Each job is (input corpus, output file, conflict label).
    jobs = (
        ('corpus/corpus_palestine_nettoye_v1.json', 'corpus/corpus_gaza_pretraiter.json', 'GAZA'),
        ('corpus/corpus_ukraine_nettoye_v1.json', 'corpus/corpus_ukraine_pretraiter.json', 'UKRAINE'),
    )
    for source_path, target_path, label in jobs:
        traiter_corpus(source_path, target_path, label)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



üöÄ Pr√©traitement Expert en cours : GAZA...
‚úÖ Termin√© ! GAZA sauvegard√© dans corpus/corpus_gaza_pretraiter.json
   (Nettoyage Expert appliqu√©)

üöÄ Pr√©traitement Expert en cours : UKRAINE...
‚úÖ Termin√© ! UKRAINE sauvegard√© dans corpus/corpus_ukraine_pretraiter.json
   (Nettoyage Expert appliqu√©)
