In [None]:
# =================================================================
# PROJET NLP HPC : ANALYSE SÉMANTIQUE (CONCORDANCE & EMBEDDINGS)
# =================================================================
# Objectif : Comparer le sens des mots (Word2Vec) et leur contexte (Concordance)
# =================================================================

import json
import nltk
from nltk.text import Text
from gensim.models import Word2Vec
import pandas as pd
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# --- 1. CHARGEMENT DES DONNÉES ---
def charger_corpus_brut(chemin):
    """Read a JSON corpus file and collect the ``lexical_view`` of each entry.

    Args:
        chemin: Path of a UTF-8 encoded JSON file whose top-level value is
            an iterable of mappings, each having a ``'lexical_view'`` key.

    Returns:
        A list with one ``lexical_view`` value per entry, in file order.
        Word2Vec expects each of them to be a list of tokens
        (NOTE(review): the token-list shape is assumed, not checked here).

    Raises:
        KeyError: If an entry has no ``'lexical_view'`` key.
    """
    with open(chemin, mode='r', encoding='utf-8') as handle:
        articles = json.load(handle)

    sentences = []
    for article in articles:
        sentences.append(article['lexical_view'])
    return sentences

# --- 2. ANALYSE DE CONCORDANCE (Le Microscope) ---
def afficher_concordance(tokens, mot_cible, label, lignes=5):
    """Print a keyword-in-context view of one word in a tokenised corpus.

    Args:
        tokens: Flat sequence of tokens; it is wrapped in an
            ``nltk.text.Text`` object.
        mot_cible: Word to look up; shown upper-cased in the header.
        label: Corpus name shown in the header.
        lignes: Maximum number of concordance lines to display.

    Returns:
        None. The header is printed here and the concordance lines are
        printed by NLTK.
    """
    print(f"\n🔍 CONCORDANCE pour '{mot_cible.upper()}' dans {label} :")
    # Each concordance line is limited to 80 characters around the match.
    Text(tokens).concordance(mot_cible, width=80, lines=lignes)

# --- 3. ENTRAÎNEMENT WORD2VEC (L'Intelligence Sémantique) ---
def entrainer_modele(sentences, label, *, vector_size=100, window=5,
                     min_count=5, workers=4, **options):
    """Train a Word2Vec model on a tokenised corpus.

    Args:
        sentences: Iterable of token lists handed to ``Word2Vec``.
        label: Corpus name, used only in the progress message.
        vector_size: Dimensionality of each word vector.
        window: Number of context words considered on each side of a word.
        min_count: Words occurring fewer times than this are ignored.
        workers: Number of training threads.
        **options: Extra keyword arguments forwarded unchanged to
            ``Word2Vec`` (for example ``epochs`` or ``seed``).

    Returns:
        The trained ``Word2Vec`` model.
    """
    print(f"\n🧠 Entraînement du modèle Word2Vec pour {label}...")
    # NOTE(review): on a corpus of only a few dozen documents the library
    # defaults may leave all vectors nearly collinear (similarities ~1.00);
    # passing a larger ``epochs`` through **options is worth trying.
    return Word2Vec(
        sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        **options,
    )

# --- 4. COMPARAISON DES CHAMPS SÉMANTIQUES ---
def _afficher_voisins(model, nom_corpus, mot_pivot):
    """Print the five nearest neighbours of *mot_pivot* in one model."""
    try:
        # Only the lookup can raise KeyError (word absent from the vocabulary).
        voisins = model.wv.most_similar(mot_pivot, topn=5)
    except KeyError:
        print(f"👉 {nom_corpus} : Le mot '{mot_pivot}' n'est pas assez fréquent.")
        return

    print(f"👉 {nom_corpus} associe '{mot_pivot}' à :")
    for mot, score in voisins:
        print(f"   - {mot} (sim: {score:.2f})")


def comparer_voisins(model_gaza, model_ukraine, mot_pivot):
    """Print, side by side, the nearest neighbours of a word in both models.

    Args:
        model_gaza: Model trained on the Gaza corpus; must expose
            ``wv.most_similar``.
        model_ukraine: Model trained on the Ukraine corpus; same interface.
        mot_pivot: Word whose five most similar words are listed.

    Returns:
        None. Everything is written to standard output. A word missing
        from a model's vocabulary produces a notice for that model instead
        of an exception.
    """
    print(f"\n📊 QUEL EST LE SENS DE '{mot_pivot.upper()}' ? (Top 5 Synonymes/Associations)")
    _afficher_voisins(model_gaza, "GAZA", mot_pivot)
    _afficher_voisins(model_ukraine, "UKRAINE", mot_pivot)

# --- EXÉCUTION ---
if __name__ == "__main__":
    # 1. Load both preprocessed corpora (one lexical view per article).
    sentences_gaza = charger_corpus_brut('corpus/corpus_gaza_pretraiter.json')
    sentences_ukraine = charger_corpus_brut('corpus/corpus_ukraine_pretraiter.json')

    # NLTK's concordance works on a single flat token list, so flatten.
    all_tokens_gaza = [w for s in sentences_gaza for w in s]
    all_tokens_ukraine = [w for s in sentences_ukraine for w in s]

    # 2. Concordance: inspect the contexts in which key words are used.
    # "hospital" is compared across both corpora; "terrorist" is queried
    # in Gaza and "soldier" in Ukraine.
    print("=" * 60)
    print("PARTIE 1 : ÉTUDE DES CONTEXTES (CONCORDANCE)")
    print("=" * 60)
    afficher_concordance(all_tokens_gaza, 'hospital', 'GAZA')
    afficher_concordance(all_tokens_ukraine, 'hospital', 'UKRAINE')

    afficher_concordance(all_tokens_gaza, 'terrorist', 'GAZA')
    # "terrorist" is rare in the Ukraine corpus, so "soldier" is used there.
    afficher_concordance(all_tokens_ukraine, 'soldier', 'UKRAINE')

    # 3. Word2Vec: compare the semantic fields learned from each corpus.
    print("\n" + "=" * 60)
    print("PARTIE 2 : CHAMPS SÉMANTIQUES (WORD2VEC)")
    print("=" * 60)

    # One model per corpus.
    w2v_gaza = entrainer_modele(sentences_gaza, 'GAZA')
    w2v_ukraine = entrainer_modele(sentences_ukraine, 'UKRAINE')

    # Key concepts whose neighbours are compared between the two models.
    mots_a_tester = ['war', 'civilian', 'child', 'military', 'leader']

    for mot in mots_a_tester:
        comparer_voisins(w2v_gaza, w2v_ukraine, mot)
        print("-" * 40)

2026-01-02 20:09:30,153 : INFO : collecting all words and their counts
2026-01-02 20:09:30,156 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


PARTIE 1 : √âTUDE DES CONTEXTES (CONCORDANCE)

üîç CONCORDANCE pour 'HOSPITAL' dans GAZA :
Displaying 5 of 213 matches:
3 401st brigade combat team rantisi hospital video footage released israel defen
tem used hamas terrorist connecting hospital militant based oations video idf sp
ed raid israel next school 200 yard hospital hagari show oational tunnel electri
r tunnel lead bulletproof door gaza hospital coverage prof medium need skeptical
und tunnel leading basement rantisi hospital gaza idf idf spokesman intended sho

üîç CONCORDANCE pour 'HOSPITAL' dans UKRAINE :
Displaying 5 of 72 matches:
 10 district strike okhmatdyt child hospital interrupted surgery forced young ca
al outcry russian missile hit child hospital kyiv ukrainian president zelenskyy 
president zelenskyy vow retaliation hospital ukraine largest medical facility ch
ovnir said tuesday missile hit wing hospital building conducted dialysis child k
 reporter estimating overall damage hospital million danielle bell head t

2026-01-02 20:09:30,187 : INFO : collected 5547 word types from a corpus of 29953 raw words and 56 sentences
2026-01-02 20:09:30,194 : INFO : Creating a fresh vocabulary
2026-01-02 20:09:30,213 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1285 unique words (23.17% of original 5547, drops 4262)', 'datetime': '2026-01-02T20:09:30.213310', 'gensim': '4.4.0', 'python': '3.13.1 (tags/v3.13.1:0671451, Dec  3 2024, 19:06:28) [MSC v.1942 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26200-SP0', 'event': 'prepare_vocab'}
2026-01-02 20:09:30,215 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 23180 word corpus (77.39% of original 29953, drops 6773)', 'datetime': '2026-01-02T20:09:30.215697', 'gensim': '4.4.0', 'python': '3.13.1 (tags/v3.13.1:0671451, Dec  3 2024, 19:06:28) [MSC v.1942 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26200-SP0', 'event': 'prepare_vocab'}
2026-01-02 20:09:30,284 : INFO : deleting the raw counts dictionary of 5547 


üß† Entra√Ænement du mod√®le Word2Vec pour UKRAINE...


2026-01-02 20:09:31,344 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 828 unique words (19.27% of original 4296, drops 3468)', 'datetime': '2026-01-02T20:09:31.344610', 'gensim': '4.4.0', 'python': '3.13.1 (tags/v3.13.1:0671451, Dec  3 2024, 19:06:28) [MSC v.1942 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26200-SP0', 'event': 'prepare_vocab'}
2026-01-02 20:09:31,347 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 12355 word corpus (68.88% of original 17937, drops 5582)', 'datetime': '2026-01-02T20:09:31.347591', 'gensim': '4.4.0', 'python': '3.13.1 (tags/v3.13.1:0671451, Dec  3 2024, 19:06:28) [MSC v.1942 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26200-SP0', 'event': 'prepare_vocab'}
2026-01-02 20:09:31,384 : INFO : deleting the raw counts dictionary of 4296 items
2026-01-02 20:09:31,395 : INFO : sample=0.001 downsamples 62 most-common words
2026-01-02 20:09:31,401 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves


üìä QUEL EST LE SENS DE 'WAR' ? (Top 5 Synonymes/Associations)
üëâ GAZA associe 'war' √† :
   - israel (sim: 1.00)
   - said (sim: 1.00)
   - hamas (sim: 1.00)
   - gaza (sim: 1.00)
   - one (sim: 1.00)
üëâ UKRAINE associe 'war' √† :
   - said (sim: 1.00)
   - ukraine (sim: 1.00)
   - russian (sim: 1.00)
   - russia (sim: 1.00)
   - ukrainian (sim: 1.00)
----------------------------------------

üìä QUEL EST LE SENS DE 'CIVILIAN' ? (Top 5 Synonymes/Associations)
üëâ GAZA associe 'civilian' √† :
   - said (sim: 1.00)
   - hamas (sim: 1.00)
   - say (sim: 1.00)
   - hospital (sim: 1.00)
   - gaza (sim: 1.00)
üëâ UKRAINE associe 'civilian' √† :
   - ukraine (sim: 1.00)
   - said (sim: 1.00)
   - child (sim: 1.00)
   - russia (sim: 1.00)
   - ukrainian (sim: 1.00)
----------------------------------------

üìä QUEL EST LE SENS DE 'CHILD' ? (Top 5 Synonymes/Associations)
üëâ GAZA associe 'child' √† :
   - hamas (sim: 1.00)
   - said (sim: 1.00)
   - gaza (sim: 1.00)
   - civilian (s