# Preprocessing


In [4]:
# =================================================================
# MULTI-VIEW PREPROCESSING SCRIPT - CORRECTED EXPERT VERSION
# =================================================================
# Author: NLP Expert
# Goal: sentence segmentation, preservation of dates/numbers,
# and creation of the views: Lexical, Structural (Semantic) and Sentiment.
# =================================================================

import json
import os
import re 
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# --- Download the NLTK resources used by the tokenizers, tagger and lemmatizer ---
for _resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# --- CONFIGURATION ---

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags starting with ``J``, ``V``, ``N`` and ``R`` map to adjective, verb,
    noun and adverb respectively; any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Unknown tag families are lemmatized as nouns.
    return wordnet.NOUN

def setup_selective_stopwords():
    """Return the NLTK English stopword set minus the words worth keeping.

    The kept words are passive-voice markers and auxiliaries (by, was, were,
    been, is, are), negations (not, no, never) and 'against', so that they
    stay available for the semantic and sentiment analyses.
    """
    kept_words = {'not', 'no', 'never', 'by', 'was', 'were', 'been', 'is', 'are', 'against'}
    return {word for word in stopwords.words('english') if word not in kept_words}

# Shared tools, built once and used by the pipelines below.
standard_stopwords = set(stopwords.words('english'))  # full NLTK English list
custom_stopwords = setup_selective_stopwords()  # same list minus the preserved words
lemmatizer = WordNetLemmatizer()



# --- REPAIR UTILITY FUNCTION ---
def reparer_mots_colles(text):
    """Insert the spaces missing between words glued together by scraping.

    Must be applied BEFORE lower-casing, because both rules rely on capital
    letters to locate the missing space.

    Args:
        text: Raw article text. ``None`` and the empty string are accepted.

    Returns:
        The repaired text, or ``""`` when ``text`` is empty or ``None``.
    """
    if not text:
        return ""

    # Rule 1 - CamelCase: a lowercase letter immediately followed by an
    # uppercase one marks a missing space ("LadyMelania" -> "Lady Melania").
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)

    # Rule 2 - glued sentences: a period immediately followed by an uppercase
    # letter ("End.Start" -> "End. Start"). The character before the period
    # must be a lowercase letter or a digit; otherwise acronyms such as
    # "U.S.A." would be torn apart into "U. S. A." and break sentence splitting.
    text = re.sub(r'(?<=[a-z0-9]\.)(?=[A-Z])', ' ', text)

    return text

# --- THE THREE PREPROCESSING PIPELINES ---
def pipeline_lexical(text):
    """View 1: bag of lemmas for TF-IDF and word clouds (Lab 3, 7).

    Repairs glued words while capitals are still present, lower-cases and
    tokenizes the text, then keeps only alphanumeric tokens (words and
    numbers) longer than one character that are not standard English
    stopwords. Each kept token is lemmatized without a POS hint.

    Returns:
        A flat list of lemma strings.
    """
    repaired = reparer_mots_colles(text)

    lemmas = []
    for token in word_tokenize(repaired.lower()):
        # Drop punctuation / mixed-symbol tokens and single characters.
        if len(token) <= 1 or not token.isalnum():
            continue
        if token in standard_stopwords:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def pipeline_structural_semantique(text):
    """View 2: sentence-level structure for semantic / n-gram analysis.

    The text is repaired, split into sentences, and each sentence is
    tokenized and POS-tagged with its original casing. Tokens whose
    lower-cased form is in the selective stopword set are dropped; every
    other token becomes a dict ``{"w": word, "t": POS tag, "l": lemma}``,
    where the lemma is computed from the lower-cased word and its POS.

    Returns:
        A list of sentences, each a non-empty list of token dicts.
    """
    repaired = reparer_mots_colles(text)

    result = []
    for sentence in sent_tokenize(repaired):
        entries = [
            {
                "w": word,
                "t": tag,
                # POS-aware lemmatization of the lower-cased surface form.
                "l": lemmatizer.lemmatize(word.lower(), get_wordnet_pos(tag)),
            }
            for word, tag in pos_tag(word_tokenize(sentence))
            # Only the selective list is filtered, so passive-voice markers
            # and negations survive.
            if word.lower() not in custom_stopwords
        ]
        # Sentences that end up with no token are not recorded.
        if entries:
            result.append(entries)

    return result

def pipeline_sentiment(text):
    """View 3: token list for polarity analysis (Lab 9, 10).

    Standard English stopwords and non-alphanumeric tokens are removed, but
    negations are preserved so that the emotional meaning is not inverted.
    This includes negations written as contractions: the tokenizer splits
    "didn't" into "did" + "n't", and the "n't" part is kept as "not".

    Args:
        text: Raw article text (``None`` or empty yields an empty list).

    Returns:
        A flat list of lower-cased tokens.
    """
    # 1. Repair glued words while capital letters are still available.
    text = reparer_mots_colles(text)

    # 2. Normalise the typographic apostrophe so that contractions such as
    #    "don’t" are recognised by the tokenizer like "don't".
    text = text.replace('\u2019', "'")

    negations = {'not', 'no', 'never'}
    clean_tokens = []
    for token in word_tokenize(text.lower()):
        # "n't" is not alphanumeric and would be silently discarded below,
        # losing the negation; rewrite it to its full form first.
        if token == "n't":
            token = 'not'
        if token.isalnum() and (token not in standard_stopwords or token in negations):
            clean_tokens.append(token)
    return clean_tokens

# --- MAIN EXECUTION FUNCTION ---

def traiter_corpus(filename_in, filename_out, conflict_label):
    """Build the three preprocessed views for every article of a corpus file.

    Loads the JSON content of ``filename_in`` (expected to be a list of
    article dicts), runs the lexical, structural and sentiment pipelines on
    each article's ``content`` and writes the resulting records to
    ``filename_out`` as indented UTF-8 JSON.

    Args:
        filename_in: Path of the input JSON file.
        filename_out: Path of the JSON file to write.
        conflict_label: Label stored in each record's ``conflict`` field and
            shown in the progress messages.

    Returns:
        None. When ``filename_in`` does not exist an error message is printed
        and nothing is written.
    """
    print(f"\n🚀 Prétraitement Expert en cours : {conflict_label}...")

    if not os.path.exists(filename_in):
        print(f"❌ Erreur : {filename_in} introuvable.")
        return

    with open(filename_in, 'r', encoding='utf-8') as f:
        articles = json.load(f)

    corpus_pretraite = []

    for art in articles:
        # One article without text must not abort the whole corpus: fall back
        # to an empty string, the same way the other fields use defaults.
        content = art.get('content') or ''

        # The three views of the article.
        processed_data = {
            "title": art.get('title', 'N/A'),
            "conflict": conflict_label,
            "scraped_at": art.get('scraped_at', ''),
            # View 1: keywords
            "lexical_view": pipeline_lexical(content),
            # View 2: per-sentence syntax and semantics
            "structural_view": pipeline_structural_semantique(content),
            # View 3: sentiment tokens with negations
            "sentiment_view": pipeline_sentiment(content)
        }
        corpus_pretraite.append(processed_data)

    # ensure_ascii=False keeps accented characters readable in the output file.
    with open(filename_out, 'w', encoding='utf-8') as f:
        json.dump(corpus_pretraite, f, indent=4, ensure_ascii=False)

    print(f"✅ Terminé ! {conflict_label} sauvegardé dans {filename_out}")

# --- RUN ---
# One (input file, output file, conflict label) triple per corpus to preprocess.
_corpus_jobs = (
    ('corpus/corpus_palestine_nettoye_v1.json', 'corpus/corpus_gaza_pretraiter.json', 'GAZA'),
    ('corpus/corpus_ukraine_nettoye_v1.json', 'corpus/corpus_ukraine_pretraiter.json', 'UKRAINE'),
)
for _source_path, _target_path, _label in _corpus_jobs:
    traiter_corpus(_source_path, _target_path, _label)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



🚀 Prétraitement Expert en cours : GAZA...
✅ Terminé ! GAZA sauvegardé dans corpus/corpus_gaza_pretraiter.json

🚀 Prétraitement Expert en cours : UKRAINE...
✅ Terminé ! UKRAINE sauvegardé dans corpus/corpus_ukraine_pretraiter.json


In [None]:
# =================================================================
# MULTI-VIEW PREPROCESSING SCRIPT - EXPERT VERSION (CORRECTED)
# =================================================================
# Author: NLP Expert
# Improvement: automatic un-gluing of words (TrumpAnnounced -> Trump Announced)
# =================================================================

import json
import os
import re 
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# --- Download the required NLTK resources ---
_nltk_resources = ['punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4']
for _name in _nltk_resources:
    nltk.download(_name)

# --- CONFIGURATION LINGUISTIQUE ---

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into a WordNet POS constant.

    ``J*`` -> adjective, ``V*`` -> verb, ``R*`` -> adverb; noun tags and
    every unrecognised tag are returned as noun.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    # 'N...' tags and the fallback case share the same answer.
    return wordnet.NOUN

def setup_selective_stopwords():
    """Return the NLTK English stopwords without the words to preserve.

    Negations, passive-voice markers/auxiliaries and 'against' are removed
    from the stopword set so that they are not filtered out later.
    """
    words_to_keep = {'not', 'no', 'never', 'by', 'was', 'were', 'been', 'is', 'are', 'against'}
    return set(stopwords.words('english')).difference(words_to_keep)

# Shared tools used by the pipelines below.
standard_stopwords = set(stopwords.words('english'))  # full NLTK English list
custom_stopwords = setup_selective_stopwords()  # list minus the preserved words
lemmatizer = WordNetLemmatizer()

# --- REPAIR FUNCTION ---
def reparer_mots_colles(text):
    """Separate words glued together by scraping, BEFORE lower-casing.

    Three passes are applied in order: a table of literal patches for known
    recurring errors, a generic CamelCase split, and the insertion of the
    space missing after a period that directly precedes a capital letter.

    Returns:
        The repaired text, or ``""`` when ``text`` is empty or ``None``.
    """
    if not text:
        return ""

    # Pass 1: literal (case-sensitive, substring) fixes for cases the regexes
    # cannot detect because no capital letter marks the boundary.
    known_fixes = (
        ("Trumpannounced", "Trump announced"),
        ("trumpannounced", "trump announced"),
        ("LadyMelania", "Lady Melania"),
        ("Ladymelania", "Lady Melania"),
        ("whitehouse", "white house"),
        ("WhiteHouse", "White House"),
    )
    repaired = text
    for glued, fixed in known_fixes:
        repaired = repaired.replace(glued, fixed)

    # Pass 2: generic CamelCase boundary (lowercase followed by uppercase),
    # e.g. "TrumpSaid" -> "Trump Said".
    camel_boundary = re.compile(r'(?<=[a-z])(?=[A-Z])')
    repaired = camel_boundary.sub(' ', repaired)

    # Pass 3: period squeezed between a lowercase and an uppercase letter,
    # e.g. "word.Next" -> "word. Next".
    glued_period = re.compile(r'(?<=[a-z])\.(?=[A-Z])')
    return glued_period.sub('. ', repaired)

# --- THE 3 PIPELINES (WITH BUILT-IN REPAIR) ---

def pipeline_lexical(text):
    """View 1: lexical bag of lemmas (TF-IDF).

    The text is repaired first, while capital letters still exist, then
    lower-cased and tokenized. Alphanumeric tokens longer than one character
    that are not standard English stopwords are lemmatized and returned as
    a flat list.
    """
    lowered = reparer_mots_colles(text).lower()

    def is_content_word(token):
        # Reject punctuation, stopwords and single characters.
        return token.isalnum() and token not in standard_stopwords and len(token) > 1

    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(lowered)
        if is_content_word(token)
    ]

def pipeline_structural_semantique(text):
    """View 2: semantic view (sentences with POS tags and lemmas).

    Returns a list of sentences. Each sentence is a non-empty list of dicts
    ``{"w": word, "t": POS tag, "l": lemma}`` for the tokens whose
    lower-cased form is not in the selective stopword set.
    """
    # Repair first: the sentence splitter needs the restored spaces.
    repaired = reparer_mots_colles(text)

    all_sentences = []
    for sentence in sent_tokenize(repaired):
        kept = []
        for word, tag in pos_tag(word_tokenize(sentence)):
            lowered = word.lower()
            if lowered in custom_stopwords:
                continue
            kept.append({
                "w": word,
                "t": tag,
                "l": lemmatizer.lemmatize(lowered, get_wordnet_pos(tag)),
            })

        # Skip sentences left without any token.
        if kept:
            all_sentences.append(kept)

    return all_sentences

def pipeline_sentiment(text):
    """View 3: sentiment token list, with negations preserved.

    Standard English stopwords and non-alphanumeric tokens are removed, but
    'not', 'no' and 'never' are kept. Contracted negations are kept too: the
    tokenizer splits "didn't" into "did" + "n't", and "n't" is stored as
    "not" instead of being discarded by the alphanumeric filter.

    Args:
        text: Raw article text (``None`` or empty yields an empty list).

    Returns:
        A flat list of lower-cased tokens.
    """
    # Repair glued words while capital letters are still available.
    text = reparer_mots_colles(text)

    # Normalise the typographic apostrophe so that "don’t" is tokenized the
    # same way as "don't".
    text = text.replace('\u2019', "'")

    negations = {'not', 'no', 'never'}
    clean_tokens = []
    for token in word_tokenize(text.lower()):
        # Without this rewrite the contraction's negation would be lost,
        # inverting the polarity of the sentence.
        if token == "n't":
            token = 'not'
        if token.isalnum() and (token not in standard_stopwords or token in negations):
            clean_tokens.append(token)
    return clean_tokens

# --- EXECUTION ---

def traiter_corpus(filename_in, filename_out, conflict_label):
    """Preprocess every article of a corpus file into three views.

    Loads the JSON content of ``filename_in`` (expected to be a list of
    article dicts), computes the lexical, structural and sentiment views of
    each article's ``content`` and writes the records to ``filename_out`` as
    indented UTF-8 JSON.

    Args:
        filename_in: Path of the input JSON file.
        filename_out: Path of the JSON file to write.
        conflict_label: Label stored in each record's ``conflict`` field and
            shown in the progress messages.

    Returns:
        None. When ``filename_in`` does not exist an error message is printed
        and nothing is written.
    """
    print(f"\n🚀 Prétraitement Expert en cours : {conflict_label}...")

    if not os.path.exists(filename_in):
        print(f"❌ Erreur : {filename_in} introuvable.")
        return

    with open(filename_in, 'r', encoding='utf-8') as f:
        articles = json.load(f)

    corpus_pretraite = []

    for art in articles:
        # A record without text must not abort the whole run: use an empty
        # string, consistent with the defaults used for the other fields.
        content = art.get('content') or ''

        processed_data = {
            "title": art.get('title', 'N/A'),
            "conflict": conflict_label,
            "scraped_at": art.get('scraped_at', ''),
            "lexical_view": pipeline_lexical(content),
            "structural_view": pipeline_structural_semantique(content),
            "sentiment_view": pipeline_sentiment(content)
        }
        corpus_pretraite.append(processed_data)

    # ensure_ascii=False keeps accented characters readable in the output file.
    with open(filename_out, 'w', encoding='utf-8') as f:
        json.dump(corpus_pretraite, f, indent=4, ensure_ascii=False)

    print(f"✅ Terminé ! {conflict_label} sauvegardé dans {filename_out}")

# --- RUN ---
# Use the CLEANED (v1) files as input: they still contain the capital letters
# that the word-repair step relies on.
traiter_corpus('corpus/corpus_palestine_nettoye_v1.json', 'corpus/corpus_gaza_pretraiter.json', 'GAZA')
# The output path must not contain a space after "corpus": with it the file
# would be written to (or fail on) a different directory named "corpus ".
traiter_corpus('corpus/corpus_ukraine_nettoye_v1.json', 'corpus/corpus_ukraine_pretraiter.json', 'UKRAINE')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



🚀 Prétraitement Expert en cours : GAZA...
✅ Terminé ! GAZA sauvegardé dans corpus/corpus_gaza_pretraiter.json

🚀 Prétraitement Expert en cours : UKRAINE...
✅ Terminé ! UKRAINE sauvegardé dans corpus/corpus_ukraine_pretraiter.json
