In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Étape 1.1 : Importer les bibliothèques nécessaires
# Explication : Pandas pour data, Matplotlib/Seaborn pour viz, NLTK/spaCy pour NLP.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import spacy

# Fetch the NLTK resources used below (stop-word list and punkt tokenizer data).
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Load the small English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')

# Materialise the English stop-word list as a set.
stop_words = set(stopwords.words('english'))
print("Setup des libs termin√© !")

# Step 1.2: load the two datasets (Sentiment140 and Generative AI Tweets) from Kaggle.
sentiment140_path = '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv'
genai_path = '/kaggle/input/generative-ai-tweets/GenerativeAI tweets.csv'  # adjust to the exact path

# Sentiment140 is read with explicit column names (names=...), i.e. as a header-less CSV.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
df_sent140 = pd.read_csv(sentiment140_path, encoding='ISO-8859-1', names=column_names)
print("Colonnes Sentiment140 :", df_sent140.columns.tolist())

# Generative AI Tweets: the schema is discovered at load time.
df_genai = pd.read_csv(genai_path)
print("Colonnes Generative AI Tweets :", df_genai.columns.tolist())

# Harmonise the tweet column name to 'text'. Only rename when it would not create a
# second 'text' column, and fail with a clear message when no usable column exists
# (otherwise the column selection below raises an opaque KeyError).
if 'Text' in df_genai.columns and 'text' not in df_genai.columns:
    df_genai = df_genai.rename(columns={'Text': 'text'})
if 'text' not in df_genai.columns:
    raise KeyError(
        "No 'Text'/'text' column found in the Generative AI dataset; "
        f"available columns: {df_genai.columns.tolist()}"
    )
df_genai['target'] = None  # this dataset has no labels; rows stay unlabelled

# Keep only the columns both datasets share, then stack them.
df_sent140_subset = df_sent140[['text', 'target']].copy()
df_genai_subset = df_genai[['text', 'target']].copy()
df = pd.concat([df_sent140_subset, df_genai_subset], ignore_index=True)
print("Datasets combin√©s ! Taille totale :", len(df), "tweets.")

# Step 1.3: ethics — make sure no username column survives in the combined frame.
# (errors='ignore' makes this a no-op when the column is already absent.)
df = df.drop(columns=['user'], errors='ignore')
print("√âthique : Usernames supprim√©s pour privacy.")
print("Biais potentiels : Sentiment140 (2009, biais anglais/USA), Generative AI (hype IA, petit volume, pas de target).")
print("V√©rifiez licences Kaggle (CC-BY) dans README.")
    

In [None]:
# Step 2.1: basic statistics — tweet length, out-of-range lengths, label balance.
# Missing texts count as length 0; calling len() on a NaN would raise TypeError.
df['length'] = df['text'].fillna('').astype(str).str.len()
print("Stats longueur tweets :", df['length'].describe())
outliers = df[(df['length'] < 10) | (df['length'] > 280)]  # 280 = Twitter character limit
print(f"Outliers (trop courts/longs) : {len(outliers)}")

# Sentiment distribution: only rows that carry a target (the Sentiment140 part).
print("Distribution sentiments (Sentiment140 uniquement) :",
      df.dropna(subset=['target'])['target'].value_counts(normalize=True))

# Step 2.2: visualisations — length histogram, sentiment bar plot, word cloud.
plt.figure(figsize=(10, 5))
sns.histplot(df['length'], bins=50)
plt.title('Distribution Longueur Tweets')
plt.show()

plt.figure(figsize=(6, 4))
sns.countplot(x='target', data=df.dropna(subset=['target']))  # labelled rows only
plt.title('Distribution Sentiments (0=N√©gatif, 4=Positif)')
plt.show()

from wordcloud import WordCloud

# Word cloud of frequent words, built from a sample to limit memory use.
# The sample size is capped at the number of available texts (sample() raises when
# n exceeds the population) and items are cast to str so ' '.join cannot fail.
sample_size = 10000
available_texts = df['text'].dropna().astype(str)
sampled_texts = available_texts.sample(n=min(sample_size, len(available_texts)), random_state=42)
sample_text = ' '.join(sampled_texts)
wordcloud = WordCloud(stopwords=stop_words, background_color='white', max_words=200).generate(sample_text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud (√âchantillon 10 000 Tweets)')
plt.show()

# Step 2.3: data-quality and bias checks (duplicated texts, missing values).
duplicates = df['text'].duplicated().sum()
print(f"Duplicates : {duplicates} (supprimez si besoin avec df.drop_duplicates()).")
nan_count = df.isna().sum()
print("NaN par colonne :", nan_count)

# Rows of the combined frame whose text also appears in the Generative AI dataset.
genai_texts = df_genai['text'].dropna()
genai_subset = df.loc[df['text'].isin(genai_texts)]
print("Biais Generative AI : Th√®mes IA souvent positifs (hype). V√©rifiez manuellement si possible.")

In [None]:
# Étape 3.1 : Nettoyage du Texte et Suppression Doublons (Optimisé)
# Explication : Uniformiser, enlever bruit avec regex léger, supprimer doublons, échantillonner.
import re
import random
import pandas as pd

# Drop duplicated tweets (first occurrence is kept) to improve data quality.
df = df.drop_duplicates(subset=['text'])
print(f"Doublons supprim√©s ! Nouvelle taille : {len(df)} tweets.")

# Work on a sample of up to 100,000 tweets. The size is capped at len(df) because
# DataFrame.sample raises when n exceeds the number of rows; the message reports the
# number of rows actually sampled.
sample_size = 100000
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)
print(f"Travail sur un √©chantillon de {len(df_sample)} tweets.")

# Nettoyage l√©ger sans spaCy (plus rapide)
def clean_text_light(text):
    """Lower-case a tweet and strip URLs, @mentions and punctuation/emoji.

    Args:
        text: Raw tweet. ``None`` and NaN (missing values from pandas) are treated
            as an empty tweet; any other non-string value is converted with ``str``.

    Returns:
        The cleaned text. Whitespace is left as-is and no lemmatisation is applied.
    """
    # Missing values reach this function through Series.apply; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # URLs
    text = re.sub(r'@\w+', '', text)  # mentions
    text = re.sub(r'[^\w\s]', '', text)  # everything except word chars and whitespace
    return text

# Run the light cleaner over the sampled tweets and preview raw vs. cleaned text.
cleaned_series = df_sample['text'].apply(clean_text_light)
df_sample['clean_text'] = cleaned_series
cleaning_preview = df_sample[['text', 'clean_text']].head()
print("Nettoyage l√©ger fait ! Aper√ßu :", cleaning_preview)

# Step 3.2: data augmentation on a subset of negative tweets.
# WordNet (plus the Open Multilingual WordNet data) backs the synonym lookup.
from nltk.corpus import wordnet
for wordnet_resource in ('wordnet', 'omw-1.4'):
    nltk.download(wordnet_resource)

def synonym_replacement(text, n=2):
    """Replace up to ``n`` randomly chosen words with a WordNet lemma.

    For each of the ``n`` draws a word position is picked at random (the same
    position may be drawn twice); if WordNet knows the word, it is replaced by the
    first lemma of its first synset.

    Args:
        text: Whitespace-separated text. Empty/blank/None input is returned unchanged.
        n: Number of replacement attempts.

    Returns:
        The text with words re-joined by single spaces.
    """
    if not text or not text.strip():
        return text
    words = text.split()
    for _ in range(n):
        idx = random.randint(0, len(words) - 1)
        synonyms = wordnet.synsets(words[idx])
        if synonyms:
            # WordNet writes multi-word lemmas with underscores ("personal_computer");
            # turn them back into spaces so no artificial '_' tokens enter the text.
            words[idx] = synonyms[0].lemmas()[0].name().replace('_', ' ')
    return ' '.join(words)

def random_swap(text):
    """Exchange two randomly selected words of ``text``.

    Empty, blank or None input is handed back untouched. A single-word text is
    returned with its surrounding whitespace normalised (split + join).
    """
    is_blank = (not text) or (not text.strip())
    if is_blank:
        return text
    tokens = text.split()
    if len(tokens) < 2:
        return ' '.join(tokens)
    first, second = random.sample(range(len(tokens)), 2)
    tokens[first], tokens[second] = tokens[second], tokens[first]
    return ' '.join(tokens)

# Augment a subset of the negative tweets (at most 10,000) instead of all of them.
# Seed Python's RNG first: synonym_replacement and random_swap draw from `random`,
# so without a seed every run would produce a different augmented set.
random.seed(42)
negative_rows = df_sample[df_sample['target'] == 0]
# Cap n at the number of negatives; DataFrame.sample raises when n is larger.
aug_df = negative_rows.sample(n=min(10000, len(negative_rows)), random_state=42).copy()
aug_df = aug_df[aug_df['clean_text'].str.strip().astype(bool)].copy()  # drop blank texts
aug_df['clean_text'] = aug_df['clean_text'].apply(synonym_replacement).apply(random_swap)

# Combine originals and augmented rows; ignore_index avoids duplicated index labels
# (the augmented rows would otherwise reuse the labels of their source rows).
augmented_df = pd.concat([df_sample, aug_df], ignore_index=True)
print("Avant augmentation :", df_sample['target'].value_counts(normalize=True))
print("Apr√®s augmentation :", augmented_df['target'].value_counts(normalize=True))

# Step 3.3: validation split and save.
from sklearn.model_selection import train_test_split

# Keep only labelled rows (the Generative AI tweets have no target).
augmented_df = augmented_df[augmented_df['target'].notna()]
X = augmented_df['clean_text']
y = augmented_df['target']

# NOTE(review): the split happens after augmentation, so an augmented copy and its
# source tweet can land on opposite sides of the split (train/test leakage).
# Consider splitting first and augmenting the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print("Distribution train :", y_train.value_counts(normalize=True))
print("Distribution test :", y_test.value_counts(normalize=True))

augmented_df.to_csv('augmented_tweets_sample.csv', index=False)
print("Dataset √©chantillon pr√™t et sauvegard√© ! √âthique : Biais r√©duit, anonymis√©.")

In [None]:
# Étape 3.1 : Importation des bibliothèques et configuration initiale
# ---------------------------------------------------------------

import re
import string
import nltk
import emoji
from collections import Counter

# Choose the token normaliser without downloading anything: WordNet lemmatisation
# when the corpora are already installed, Porter stemming otherwise.
try:
    # nltk.data.find raises LookupError when a resource is not installed locally.
    nltk.data.find('corpora/wordnet')
    nltk.data.find('corpora/omw-1.4')
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    use_lemmatization = True
    print(" WordNet disponible (lemmatisation activ√©e).")

except LookupError:
    from nltk.stem import PorterStemmer
    lemmatizer = PorterStemmer()
    use_lemmatization = False
    print(" WordNet non disponible (utilisation du stemming Porter).")

# Tweet-aware tokenizer, with a whitespace fallback that exposes the same
# `.tokenize(text)` method so callers do not need to know which one is active.
try:
    from nltk.tokenize import TweetTokenizer
    tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    print(" TweetTokenizer initialis√©.")
except (ImportError, TypeError):
    # ImportError: class missing from this NLTK build; TypeError: constructor does
    # not accept these keyword arguments.
    print(" TweetTokenizer non disponible, utilisation de tokenisation basique.")

    class _WhitespaceTokenizer:
        """Minimal stand-in for TweetTokenizer: splits on whitespace."""

        def tokenize(self, text):
            return text.split()

        def __call__(self, text):
            # Keeps `tweet_tokenizer(text)` working, as with the former lambda fallback.
            return self.tokenize(text)

    tweet_tokenizer = _WhitespaceTokenizer()

# Hand-written stop-word list, grouped by word class and merged into one set.
# Negations such as "not" / "no" are not part of the list.
_PRONOUN_WORDS = (
    "i me my myself we our ours ourselves you you're you've you'll you'd "
    "your yours yourself yourselves he him his himself she she's her hers "
    "herself it it's its itself they them their theirs themselves"
)
_DETERMINER_WORDS = "what which who whom this that that'll these those"
_AUXILIARY_WORDS = (
    "am is are was were be been being have has had having do does did doing"
)
_FUNCTION_WORDS = (
    "a an the and but if or because as until while of at by for with about "
    "against between into through during before after above below to from "
    "up down in out on off over under again further then once"
)

stop_words = set()
for _word_group in (_PRONOUN_WORDS, _DETERMINER_WORDS, _AUXILIARY_WORDS, _FUNCTION_WORDS):
    stop_words.update(_word_group.split())


In [None]:
# Étape 3.2 : Dictionnaires pour la normalisation du langage informel
# ---------------------------------------------------------------

# English contractions (lower-case keys) mapped to their expanded form.
# NOTE(review): several keys are prefixes of longer keys ("can't" / "can't've",
# "he'd" / "he'd've"); any matcher built from this dict has to try the longer key
# first to expand those correctly.
contractions_dict = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'd've": "he would have",
    "he'll": "he will", "he's": "he is", "how'd": "how did", "how'll": "how will",
    "how's": "how is", "i'd": "i would", "i'll": "i will", "i'm": "i am", "i've": "i have",
    "isn't": "is not", "it'd": "it would", "it'll": "it will", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have",
    "must've": "must have", "mustn't": "must not", "needn't": "need not",
    "oughtn't": "ought not", "shan't": "shall not", "she'd": "she would",
    "she'll": "she will", "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "that's": "that is", "there's": "there is",
    "they'd": "they would", "they'll": "they will", "they're": "they are",
    "they've": "they have", "wasn't": "was not", "we'd": "we would",
    "we'll": "we will", "we're": "we are", "we've": "we have", "weren't": "were not",
    "what're": "what are", "what's": "what is", "what've": "what have",
    "where's": "where is", "who'll": "who will", "who's": "who is",
    "won't": "will not", "would've": "would have", "wouldn't": "would not",
    "you'd": "you would", "you'll": "you will", "you're": "you are", "you've": "you have"
}

# Common Twitter abbreviations (lower-case keys) mapped to their long form.
# expand_twitter_abbreviations looks up whitespace-separated words, lower-cased.
# NOTE(review): a few expansions contain contractions themselves ("didn't",
# "don't"), so they are only fully normalised if contractions are expanded afterwards.
twitter_abbreviations = {
    "rt": "retweet", "dm": "direct message", "tbt": "throwback thursday",
    "ama": "ask me anything", "tldr": "too long didn't read", "imo": "in my opinion",
    "imho": "in my humble opinion", "nsfw": "not safe for work", "ftw": "for the win",
    "smh": "shaking my head", "idk": "i don't know", "brb": "be right back",
    "afaik": "as far as i know", "irl": "in real life", "fyi": "for your information",
    "yolo": "you only live once", "omg": "oh my god", "lol": "laugh out loud",
    "wtf": "what the fuck", "tbh": "to be honest", "bfn": "bye for now"
}

# Emoji -> sentiment word used to normalise emotions. Values are padded with spaces
# so the replacement never glues onto neighbouring words.
# The keys are written as escape sequences: the previous literals were mis-encoded
# (UTF-8 bytes read with a legacy code page) and could never match a real emoji.
emotion_normalization = {
    "\u2764\ufe0f": " love ",           # red heart (with variation selector-16)
    "\U0001F60D": " love ",             # smiling face with heart-eyes
    "\U0001F60A": " happy ",            # smiling face with smiling eyes
    "\U0001F602": " laugh ",            # face with tears of joy
    "\U0001F62D": " cry ",              # loudly crying face
    "\U0001F622": " sad ",              # crying face
    "\U0001F621": " angry ",            # pouting face
    "\U0001F914": " thinking ",         # thinking face
    "\U0001F60E": " cool ",             # smiling face with sunglasses
    "\U0001F525": " fire ",             # fire
    "\U0001F44D": " good ",             # thumbs up
    "\U0001F44E": " bad ",              # thumbs down
    "\U0001F389": " celebrate ",        # party popper
    "\U0001F92F": " mind blown ",       # exploding head
    "\U0001F4AF": " hundred percent ",  # hundred points
    "\u2728": " sparkle ",              # sparkles
    "\U0001F495": " love ",             # two hearts
    "\U0001F494": " broken heart ",     # broken heart
}

# Text emoticons mapped to a space-padded sentiment word.
# Keys are case-sensitive (":D", "XD", "xD"), so they only match text that has not
# been lower-cased yet.
# NOTE(review): normalize_emojis_and_emoticons applies these with str.replace, i.e.
# as plain substrings, so letter-only keys such as "XP" or "XD" also match inside
# longer upper-case words.
text_emoticons = {
    ":)": " smile ", ":-)": " smile ", ": )": " smile ", ":D": " big smile ",
    ":-D": " big smile ", ":(": " sad ", ":-(": " sad ", ": (": " sad ",
    ":'(": " cry ", ";-)": " wink ", "; )": " wink ", ";)": " wink ",
    ":P": " tongue ", ":-P": " tongue ", ": P": " tongue ", "XP": " tongue ",
    "xD": " laughing ", "XD": " laughing ", "<3": " love ", "</3": " broken heart "
}

In [None]:
# Étape 3.1 : Fonctions de nettoyage de base
# ---------------------------------------------------------------

def remove_urls(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` URL removed."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_mentions_and_hashtags(text):
    """Drop ``@user`` mentions entirely and strip the ``#`` from hashtags.

    The hashtag word itself is kept ("#python" becomes "python").
    """
    without_mentions = re.sub(r'@\w+', '', text)
    return re.sub(r'#(\w+)', r'\1', without_mentions)

def handle_punctuation(text):
    """Keep sentiment-bearing punctuation and remove every other special character.

    Each run of ``!`` or ``?`` becomes one mark surrounded by spaces; afterwards
    anything that is not a word character, whitespace, ``!`` or ``?`` is deleted.
    """
    for mark_run, spaced_mark in ((r'!+', ' ! '), (r'\?+', ' ? ')):
        text = re.sub(mark_run, spaced_mark, text)
    return re.sub(r'[^\w\s!?]', '', text)

def to_lowercase(text):
    """Return the lower-cased version of ``text``."""
    lowered = text.lower()
    return lowered

In [None]:
# Étape 3.2 : Fonctions de normalisation avancée
# ---------------------------------------------------------------

def reduce_character_repetition(text):
    """Shorten runs of three or more identical characters to exactly two.

    Example: 'soooo' -> 'soo', '!!!!' -> '!!'.

    Digits are left alone: collapsing them would silently change numbers
    ('1000000' would become '100').
    """
    # [^\d\n] = any character the previous '.' matched, except digits.
    return re.sub(r"([^\d\n])\1{2,}", r"\1\1", text)

def normalize_emojis_and_emoticons(text):
    """Turn emojis and text emoticons into plain words.

    Order matters:
      1. emojis listed in ``emotion_normalization`` are replaced by their sentiment
         word. This has to happen before ``emoji.demojize``; once demojize has run
         no emoji character is left and the custom mapping can never match.
      2. every remaining emoji is converted to its name by ``emoji.demojize``
         (space-delimited instead of the default colons).
      3. text emoticons from ``text_emoticons`` are replaced.
    """
    for emoji_char, meaning in emotion_normalization.items():
        text = text.replace(emoji_char, meaning)

    text = emoji.demojize(text, delimiters=(" ", " "))

    for emoticon, meaning in text_emoticons.items():
        text = text.replace(emoticon, meaning)

    return text

def expand_contractions(text):
    """Expand English contractions using ``contractions_dict`` ("don't" -> "do not").

    Matching is case-insensitive and the expansion is inserted exactly as stored in
    the dictionary (lower-case). Compared with a plain alternation of the keys:
      * longer keys are tried first, so "can't've" is not cut short by "can't";
      * a contraction must stand on its own ("kit's" is not read as "it's");
      * the typographic apostrophe (U+2019) is accepted as well as "'".
    """
    def _key_regex(key):
        # Escape the literal parts and let either apostrophe style match.
        return "['\u2019]".join(re.escape(part) for part in key.split("'"))

    ordered_keys = sorted(contractions_dict, key=len, reverse=True)
    contractions_pattern = re.compile(
        r"(?<!\w)(?:{})(?!\w)".format('|'.join(_key_regex(key) for key in ordered_keys)),
        flags=re.IGNORECASE,
    )

    def expand_match(contraction):
        match = contraction.group(0)
        expanded = contractions_dict.get(match.lower().replace('\u2019', "'"))
        return expanded if expanded else match

    return contractions_pattern.sub(expand_match, text)

def expand_twitter_abbreviations(text):
    """Replace known Twitter abbreviations by their long form.

    The text is split on whitespace; each word is looked up lower-cased in
    ``twitter_abbreviations`` and kept unchanged when it is not found. Words are
    re-joined with single spaces.
    """
    return ' '.join(
        twitter_abbreviations.get(token.lower(), token) for token in text.split()
    )

In [None]:
# Étape 3.3 : Tokenization spécialisée
# ---------------------------------------------------------------

def sentiment_aware_tokenization(text):
    """Tokenise a tweet while keeping sentiment punctuation (``!`` / ``?``).

    Uses the module-level ``tweet_tokenizer`` when one is defined. Both shapes of
    that object are supported: an object with a ``.tokenize(text)`` method and a
    plain callable (the whitespace fallback was historically a lambda; calling
    ``.tokenize`` on it raised and every tweet came back as an empty list). With no
    tokenizer at all, a regex splits words and runs of ``!``/``?``.

    Returns:
        A list of tokens; an empty list if tokenisation fails (the error is printed).
    """
    try:
        tokenizer = globals().get('tweet_tokenizer')
        if hasattr(tokenizer, 'tokenize'):
            return tokenizer.tokenize(text)
        if callable(tokenizer):
            return list(tokenizer(text))
        # No tokenizer configured: manual tokenisation.
        return re.findall(r'\b\w+\b|[!?]+', text)
    except Exception as e:
        print(f"Erreur tokenisation: {e}")
        return []

def _normalize_token(token):
    """Lemmatise or stem one token with the module-level ``lemmatizer``.

    Lemmatisation is used when ``use_lemmatization`` is true and the object offers
    ``lemmatize``; otherwise stemming is used when the object offers ``stem`` (the
    Porter fallback). If neither applies, or the normaliser fails, the token is
    returned unchanged.
    """
    try:
        if use_lemmatization and hasattr(lemmatizer, 'lemmatize'):
            return lemmatizer.lemmatize(token)
        if hasattr(lemmatizer, 'stem'):
            return lemmatizer.stem(token)
    except Exception:
        # Best effort: e.g. missing corpus data must not drop the token.
        return token
    return token


def remove_stopwords_and_process(tokens):
    """Filter stop words and short tokens, then lemmatise/stem what remains.

    ``!`` and ``?`` are always kept as sentiment markers. Every other token is kept
    only if it is not in ``stop_words`` and is longer than two characters.

    Previously the stemming branch could only be reached when lemmatisation was
    enabled, so with the Porter fallback tokens were returned unprocessed.
    """
    processed_tokens = []
    for token in tokens:
        if token in ('!', '?'):
            processed_tokens.append(token)
        elif token not in stop_words and len(token) > 2:
            processed_tokens.append(_normalize_token(token))
    return processed_tokens

def create_special_tokens(text):
    """List marker tokens for notable features found in the raw tweet.

    Markers are emitted in a fixed order, each at most once: ``[URL]``,
    ``[MENTION]``, ``[HASHTAG]``, ``[EXCITED]`` (two or more ``!`` in a row) and
    ``[QUESTION]`` (two or more ``?`` in a row).
    """
    detectors = (
        (r'http\S+|www\.\S+', '[URL]'),
        (r'@\w+', '[MENTION]'),
        (r'#\w+', '[HASHTAG]'),
        (r'!{2,}', '[EXCITED]'),
        (r'\?{2,}', '[QUESTION]'),
    )
    return [marker for pattern, marker in detectors if re.search(pattern, text)]

def handle_informal_language(text):
    """Rewrite common tweet slang as standard words.

    Whole-word, case-insensitive replacements applied in order:
    u, ur, plz/pls, thx/thanx, gonna, wanna, gotta.
    """
    slang_rules = (
        (r'\b(u)\b', 'you'),
        (r'\b(ur)\b', 'your'),
        (r'\b(plz|pls)\b', 'please'),
        (r'\b(thx|thanx)\b', 'thanks'),
        (r'\b(gonna)\b', 'going to'),
        (r'\b(wanna)\b', 'want to'),
        (r'\b(gotta)\b', 'got to'),
    )
    result = text
    for slang_pattern, standard_form in slang_rules:
        result = re.sub(slang_pattern, standard_form, result, flags=re.IGNORECASE)
    return result

In [None]:
# ===============================================================
# ÉTAPE 3.5 : PRÉPARATION DU DATAFRAME POUR L'ANALYSE
# ===============================================================

import pandas as pd
from collections import Counter

# Reuse an existing `df_processed` if the session already has one; otherwise start
# from a copy of `df`, or from an empty frame when `df` does not exist either.
if 'df_processed' not in locals():
    if 'df' in locals():
        df_processed = df.copy()
    else:
        df_processed = pd.DataFrame()

# Fail early when there is no text column to clean.
if not ('text' in df_processed.columns):
    raise ValueError("‚ö†Ô∏è La colonne 'text' est introuvable dans ton DataFrame. V√©rifie le nom exact.")

def _get_tweet_cleaning_tools():
    """Return the (TweetTokenizer, PorterStemmer) pair used by clean_tweet.

    The pair is built on first use and cached on the function object, so applying
    clean_tweet to a whole column does not rebuild both objects for every row.
    """
    tools = getattr(_get_tweet_cleaning_tools, '_tools', None)
    if tools is None:
        from nltk.tokenize import TweetTokenizer
        from nltk.stem import PorterStemmer

        tools = (
            TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True),
            PorterStemmer(),
        )
        _get_tweet_cleaning_tools._tools = tools
    return tools


def clean_tweet(text):
    """Clean one tweet and return its stemmed alphabetic tokens joined by spaces.

    Steps: remove links, @mentions and '#' signs, remove emojis, strip ASCII
    punctuation, tokenise with TweetTokenizer (lower-casing, handle stripping,
    length reduction), keep alphabetic tokens only and stem them with Porter.

    Args:
        text: The tweet as a string (callers cast with ``astype(str)`` first).
    """
    import re, string, emoji

    tokenizer, stemmer = _get_tweet_cleaning_tools()

    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # links
    text = re.sub(r'\@\w+|\#','', text)  # mentions and hashtag signs
    text = emoji.replace_emoji(text, replace='')  # emojis
    text = text.translate(str.maketrans('', '', string.punctuation))  # punctuation

    tokens = tokenizer.tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word.isalpha()]
    return " ".join(tokens)

# Apply the cleaning to every row; the cast to str lets missing values go through
# clean_tweet as text instead of raising.
df_processed['cleaned_text'] = df_processed['text'].astype(str).apply(clean_tweet)

print("‚úÖ Texte nettoy√© et stock√© dans la colonne 'cleaned_text'")
print(df_processed[['text', 'cleaned_text']].head())

# Work on a copy for the analysis and record whether sentiment labels are present.
df_analysis = df_processed.copy()
has_sentiment = 'target' in df_analysis.columns

print("\nüìä Donn√©es pr√™tes pour analyse :")
print(f"- {len(df_analysis):,} lignes")
print(f"- Colonnes disponibles : {list(df_analysis.columns)}")


In [None]:
# Full preprocessing of a sample of tweets.

# `preprocess_tweet` is called below and by later cells, but no earlier cell of the
# notebook defines it, so a fresh "Restart & Run All" stops here with a NameError.
# It is assembled from the helpers of steps 3.1-3.3, unless the session already
# provides a definition.
if 'preprocess_tweet' not in globals():
    def preprocess_tweet(text):
        """Run the complete cleaning pipeline on one tweet.

        Returns the processed tokens joined by spaces, followed by the special
        marker tokens ([URL], [MENTION], ...) detected on the raw text. Missing or
        blank input yields an empty string.
        """
        if not isinstance(text, str) or not text.strip():
            return ''
        # Markers are computed on the raw text, before URLs/mentions are removed.
        special_tokens = create_special_tokens(text)
        text = remove_urls(text)
        text = remove_mentions_and_hashtags(text)
        # Emoticon keys are case-sensitive, so normalise them before lower-casing.
        text = normalize_emojis_and_emoticons(text)
        text = to_lowercase(text)
        # Abbreviations first: some expansions contain contractions ("i don't know").
        text = expand_twitter_abbreviations(text)
        text = handle_informal_language(text)
        text = expand_contractions(text)
        text = reduce_character_repetition(text)
        text = handle_punctuation(text)
        tokens = remove_stopwords_and_process(sentiment_aware_tokenization(text))
        return ' '.join(tokens + special_tokens)

print("D√©but du pr√©traitement complet des tweets...")

# Use a sample for testing.
sample_size = 10000
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)

# Apply the preprocessing.
df_sample['cleaned_text'] = df_sample['text'].apply(preprocess_tweet)

# Sanity check.
print(f"Pr√©traitement termin√© pour {len(df_sample)} tweets")
print(f"Textes vides: {(df_sample['cleaned_text'].str.strip() == '').sum()}")

# Show a few before/after examples.
print("\n" + "="*60)
print("EXEMPLES DE PR√âTRAITEMENT COMPLET")
print("="*60)

# Never index past the end of a small sample; str() keeps len() safe on missing text.
for i in range(min(3, len(df_sample))):
    original = str(df_sample['text'].iloc[i])
    cleaned = df_sample['cleaned_text'].iloc[i]

    print(f"\nüéØ EXEMPLE {i+1}:")
    print(f"AVANT:  {original}")
    print(f"APR√àS:  {cleaned}")
    print(f"R√âDUCTION: {len(original)} ‚Üí {len(cleaned)} caract√®res")
    print("-" * 80)

print("\n‚úÖ √âTAPE 3 TERMIN√âE AVEC SUCC√àS!")
print("Tous les √©l√©ments de pr√©traitement sont impl√©ment√©s:")
print("‚úì 3.1 Nettoyage de base")
print("‚úì 3.2 Normalisation avanc√©e")
print("‚úì 3.3 Tokenization sp√©cialis√©e")

In [None]:
# Étape 3.8 : Application du prétraitement COMPLET sur vos datasets
# ---------------------------------------------------------------

print("üîÑ APPLICATION DU PR√âTRAITEMENT SUR VOS DONN√âES R√âELLES...")

# Reprendre le dataset combin√© original (1.6M+ tweets)
print(f"Taille du dataset combin√©: {len(df)} tweets")

# Strategy for handling the large dataset: preprocess it slice by slice.
def process_in_batches(df, batch_size=50000):
    """Preprocess ``df['text']`` in consecutive batches and return all results.

    Args:
        df: DataFrame with a ``text`` column.
        batch_size: Number of rows per batch; must be positive.

    Returns:
        A list with one preprocessed string per row, in row order.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    total_rows = len(df)
    # Ceiling division: `len // size + 1` scheduled an extra, empty batch whenever
    # the row count was an exact multiple of the batch size (or zero).
    total_batches = -(-total_rows // batch_size)
    processed_texts = []

    for i in range(total_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, total_rows)

        print(f"Traitement du lot {i+1}/{total_batches} (tweets {start_idx}-{end_idx})")

        batch_processed = df['text'].iloc[start_idx:end_idx].apply(preprocess_tweet)
        processed_texts.extend(batch_processed)

    return processed_texts

# Run the preprocessing on a sample first (as a test) rather than on the full frame.
sample_size = 50000  # raise this value to process more data
print(f"üîç Pr√©traitement d'un √©chantillon de {sample_size} tweets pour test...")

# The sample size is capped at len(df); random_state makes the draw repeatable.
df_sample_large = df.sample(n=min(sample_size, len(df)), random_state=42)
df_sample_large['cleaned_text'] = df_sample_large['text'].apply(preprocess_tweet)

print("‚úÖ Pr√©traitement de l'√©chantillon termin√©!")

In [None]:
# Étape 3.9 : Analyse détaillée sur vos données prétraitées
# ---------------------------------------------------------------

def analyze_actual_preprocessing(df_processed):
    """Print a report about the effect of preprocessing on ``df_processed``.

    Args:
        df_processed: DataFrame with ``text`` and ``cleaned_text`` columns and,
            optionally, a ``target`` column holding sentiment labels.

    The report covers general counts, length reduction, label distribution (when
    ``target`` exists), special-token counts and up to five before/after examples.
    Rows whose text or cleaned text is missing are skipped where a string is needed
    instead of raising TypeError. Returns None.
    """
    print("\n" + "="*70)
    print("üìä ANALYSE DU PR√âTRAITEMENT SUR VOS DONN√âES R√âELLES")
    print("="*70)

    # 1. General statistics
    total_tweets = len(df_processed)
    if total_tweets == 0:
        # Nothing to analyse; every ratio below would be undefined.
        print(f"   - Total tweets: {total_tweets:,}")
        return

    cleaned_column = df_processed['cleaned_text']
    successful_cleaning = cleaned_column.notna().sum()
    empty_texts = (cleaned_column.str.strip() == '').sum()

    print(f"üìà STATISTIQUES G√âN√âRALES:")
    print(f"   - Total tweets: {total_tweets:,}")
    print(f"   - Tweets trait√©s avec succ√®s: {successful_cleaning:,} ({successful_cleaning/total_tweets*100:.1f}%)")
    print(f"   - Textes vides apr√®s nettoyage: {empty_texts:,} ({empty_texts/total_tweets*100:.1f}%)")

    # 2. Length reduction
    original_lengths = df_processed['text'].str.len()
    cleaned_lengths = cleaned_column.str.len()
    mean_before = original_lengths.mean()
    mean_after = cleaned_lengths.mean()

    print(f"\nüìè ANALYSE DE R√âDUCTION:")
    print(f"   - Longueur moyenne avant: {mean_before:.1f} caract√®res")
    print(f"   - Longueur moyenne apr√®s: {mean_after:.1f} caract√®res")
    print(f"   - R√©duction: {((mean_before - mean_after) / mean_before * 100):.1f}%")

    # 3. Sentiment distribution (labelled rows only)
    if 'target' in df_processed.columns:
        labeled_rows = df_processed.dropna(subset=['target'])  # computed once, not per label
        sentiment_stats = labeled_rows['target'].value_counts()
        print(f"\nüé≠ DISTRIBUTION DES SENTIMENTS (Sentiment140):")
        for sentiment, count in sentiment_stats.items():
            sentiment_label = "N√©gatif" if sentiment == 0 else "Positif" if sentiment == 4 else f"Classe {sentiment}"
            percentage = (count / len(labeled_rows)) * 100
            print(f"   - {sentiment_label}: {count:,} tweets ({percentage:.1f}%)")

    # 4. Special tokens
    special_markers = ('[URL]', '[MENTION]', '[HASHTAG]', '[EXCITED]', '[QUESTION]')

    def count_special_tokens(text):
        # Missing cleaned text carries no markers.
        if not isinstance(text, str):
            return 0
        return sum(marker in text for marker in special_markers)

    special_counts = cleaned_column.apply(count_special_tokens)
    print(f"\nüî§ TOKENS SP√âCIAUX D√âTECT√âS:")
    print(f"   - Tweets avec tokens sp√©ciaux: {(special_counts > 0).sum():,}")
    print(f"   - Moyenne de tokens sp√©ciaux par tweet: {special_counts.mean():.2f}")

    # 5. Concrete examples
    print(f"\nüîç EXEMPLES CONCRETS DE VOS DONN√âES:")

    examples_to_show = 5
    shown = 0
    for idx, row in df_processed.iterrows():
        if shown >= examples_to_show:
            break

        original = row['text']
        cleaned = row['cleaned_text']
        if not isinstance(original, str) or not isinstance(cleaned, str):
            continue

        # Only show rows where preprocessing made a visible difference.
        if len(original) > 50 and len(cleaned) > 10 and len(original) != len(cleaned):
            print(f"\n--- Exemple {shown + 1} ---")
            print(f"AVANT:  {original[:150]}{'...' if len(original) > 150 else ''}")
            print(f"APR√àS:  {cleaned[:150]}{'...' if len(cleaned) > 150 else ''}")
            print(f"R√âDUCTION: {len(original)} ‚Üí {len(cleaned)} caract√®res")
            shown += 1

# Run the report on the preprocessed sample built in the previous cell.
analyze_actual_preprocessing(df_sample_large)

In [None]:
# Étape 3.10 : Préparation finale pour la modélisation avec vos données
# ---------------------------------------------------------------

print("\n" + "="*70)
print("üöÄ PR√âPARATION POUR LES √âTAPES SUIVANTES AVEC VOS DONN√âES")
print("="*70)

# 1. Drop rows whose cleaned text is blank.
df_final = df_sample_large[df_sample_large['cleaned_text'].str.strip() != ''].copy()
print(f"‚úÖ Donn√©es finales apr√®s filtrage: {len(df_final):,} tweets")

# 2. Separate labelled rows (Sentiment140) from unlabelled rows (Generative AI).
if 'target' in df_final.columns:
    df_labeled = df_final.dropna(subset=['target']).copy()
    df_unlabeled = df_final[df_final['target'].isna()].copy()

    print(f"\nüìä S√âPARATION DES DONN√âES:")
    print(f"   - Sentiment140 (avec labels): {len(df_labeled):,} tweets")
    print(f"   - GenerativeAI (sans labels): {len(df_unlabeled):,} tweets")

    # Label distribution. The same mapping as the analysis report is used
    # (0 = negative, 4 = positive, anything else is shown as its class value)
    # so an unexpected label is not silently printed as positive.
    sentiment_dist = df_labeled['target'].value_counts()
    print(f"\nüéØ DISTRIBUTION POUR L'APPRENTISSAGE:")
    for sentiment, count in sentiment_dist.items():
        label = "N√©gatif" if sentiment == 0 else "Positif" if sentiment == 4 else f"Classe {sentiment}"
        print(f"   - {label}: {count:,} tweets")
else:
    df_labeled = pd.DataFrame()
    df_unlabeled = df_final.copy()
    print("‚ö†Ô∏è  Aucune donn√©e √©tiquet√©e trouv√©e")

# 3. Save the preprocessed data. A failed write is reported but does not stop the
# notebook (best effort, as before).
try:
    df_final.to_csv('/kaggle/working/cleaned_tweets_sample.csv', index=False)

    if len(df_labeled) > 0:
        df_labeled.to_csv('/kaggle/working/cleaned_labeled_tweets.csv', index=False)

    if len(df_unlabeled) > 0:
        df_unlabeled.to_csv('/kaggle/working/cleaned_unlabeled_tweets.csv', index=False)

    print(f"\nüíæ DONN√âES SAUVEGARD√âES:")
    print(f"   - /kaggle/working/cleaned_tweets_sample.csv")
    if len(df_labeled) > 0:
        print(f"   - /kaggle/working/cleaned_labeled_tweets.csv")
    if len(df_unlabeled) > 0:
        print(f"   - /kaggle/working/cleaned_unlabeled_tweets.csv")

except Exception as e:
    print(f"‚ö†Ô∏è  Erreur sauvegarde: {e}")

# 4. Final summary.
print("\n" + "="*70)
print("üéâ √âTAPE 3 TERMIN√âE - R√âSUM√â FINAL")
print("="*70)
print(f"üì¶ Donn√©es sources:")
print(f"   - Sentiment140: 1,600,000 tweets (avec sentiments)")
print(f"   - GenerativeAI: {len(df_genai):,} tweets (IA g√©n√©rative)")
print(f"   - Total combin√©: {len(df):,} tweets")

# An empty sample must not turn the summary into a ZeroDivisionError.
success_rate = (len(df_final) / len(df_sample_large) * 100) if len(df_sample_large) else 0.0
print(f"\nüîß Pr√©traitement appliqu√©:")
print(f"   - √âchantillon trait√©: {len(df_sample_large):,} tweets")
print(f"   - Donn√©es finales: {len(df_final):,} tweets")
print(f"   - Taux de succ√®s: {success_rate:.1f}%")



In [None]:
# ===============================================================
# ÉTAPE 4 : ANALYSE EXPLORATOIRE APPROFONDIE (version robuste)
# ===============================================================

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

print("üöÄ D√âBUT DE L'ANALYSE EXPLORATOIRE APPROFONDIE")
print("="*60)

# ---------------------------------------------------------------
# 1️⃣ Vérifier si les données prétraitées existent déjà
# ---------------------------------------------------------------

def get_or_create_df_processed():
    """Return the DataFrame the exploratory analysis should run on.

    Lookup order:
      1. an existing ``df_processed`` global (returned as-is, not copied);
      2. a copy of the ``df`` global, then of the ``df_raw`` global;
      3. the first existing CSV among the default candidate paths;
      4. a CSV path typed by the user at an interactive prompt.

    Returns:
        pandas.DataFrame: the frame found by the first successful lookup.

    Raises:
        FileNotFoundError: when no DataFrame is in memory and no usable CSV
            path could be obtained. This includes the case where no
            interactive prompt is available (batch execution).
    """
    namespace = globals()

    # 1) A previously prepared frame wins.
    if 'df_processed' in namespace:
        print("‚úî Utilisation de df_processed d√©j√† pr√©sent.")
        return namespace['df_processed']

    # 2) Fall back to other in-memory frames, copied so the caller cannot
    #    mutate the source.
    # NOTE(review): nothing here checks that the fallback frame carries the
    # 'cleaned_text' column that later cells read - confirm upstream.
    for name in ('df', 'df_raw'):
        if name in namespace:
            print(f"‚úî '{name}' trouv√© ‚Üí copie vers df_processed.")
            return namespace[name].copy()

    # 3) Automatic CSV discovery. The generic names keep their priority; the
    #    file reported as saved by step 3 is tried afterwards.
    default_paths = [
        "data.csv",
        "tweets.csv",
        "dataset.csv",
        "/kaggle/working/cleaned_tweets_sample.csv",
    ]
    for p in default_paths:
        if os.path.exists(p):
            print(f"‚úî Chargement automatique depuis {p}")
            return pd.read_csv(p)

    # 4) Last resort: ask the user. Non-interactive runs have no stdin, and
    #    input() then raises instead of returning; treat that as "no path".
    try:
        path = input("‚ùì Aucun DataFrame trouv√©. Entrez le chemin de votre fichier CSV : ").strip()
    except (EOFError, RuntimeError):
        path = ""

    if path and os.path.exists(path):
        print(f"‚úî Chargement manuel depuis {path}")
        return pd.read_csv(path)

    raise FileNotFoundError("‚ö†Ô∏è Aucun fichier CSV trouv√©. V√©rifie ton chemin ou ex√©cute d'abord l'√©tape 3.")

# Fetch the working DataFrame
df_processed = get_or_create_df_processed()

# ---------------------------------------------------------------
# 2. Prepare the data for analysis
# ---------------------------------------------------------------
# Work on a copy so the analysis columns added later do not touch df_processed
df_analysis = df_processed.copy()

# Sentiment analysis is only possible when a 'target' column is present
has_sentiment = 'target' in df_analysis.columns

print(f"üìä Donn√©es disponibles : {len(df_analysis):,} tweets")
print(f"üéØ Donn√©es de sentiment disponibles : {has_sentiment}")
print("="*60)

# ---------------------------------------------------------------
# 3. First look at the data
# ---------------------------------------------------------------
print("üîé Aper√ßu des donn√©es :")
display(df_analysis.head())

print("\nüß© Colonnes disponibles :")
print(df_analysis.columns.tolist())

print("\nüßπ Valeurs manquantes :")
print(df_analysis.isna().sum())

# ---------------------------------------------------------------
# 4. Basic visualisation
# ---------------------------------------------------------------
if not has_sentiment:
    print("‚ö†Ô∏è Colonne 'target' non trouv√©e ‚Äî aucune visualisation de sentiments possible.")
else:
    # One bar per distinct value of 'target'
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df_analysis, x='target', palette='Set2', ax=ax)
    ax.set_title("R√©partition des sentiments (0 = n√©gatif, 4 = positif)")
    ax.set_xlabel("Sentiment")
    ax.set_ylabel("Nombre de tweets")
    plt.show()


In [None]:
# 4.1 STATISTICAL ANALYSIS
# ===============================================================

section_rule = "=" * 50
print("\n" + section_rule)
print("4.1 ANALYSE STATISTIQUE")
print(section_rule)

# Per-tweet length features: characters and whitespace-separated words
cleaned_col = df_analysis['cleaned_text']
df_analysis['text_length'] = cleaned_col.str.len()
df_analysis['word_count'] = cleaned_col.str.split().str.len()

length_col = df_analysis['text_length']
print("üìè DISTRIBUTION DES LONGUEURS:")
print(f"   - Longueur moyenne : {length_col.mean():.1f} caract√®res")
print(f"   - Mots moyens : {df_analysis['word_count'].mean():.1f}")
print(f"   - Min/Max : {length_col.min()} / {length_col.max()} caract√®res")

# Corpus-wide word frequencies (the whole corpus is joined, then re-split)
all_words = ' '.join(cleaned_col).split()
word_freq = Counter(all_words)
top_words = word_freq.most_common(20)

print(f"\nüî§ MOTS LES PLUS FR√âQUENTS :")
for rank, (token, occurrences) in enumerate(top_words[:10], start=1):
    print(f"   {rank:2d}. {token:15} : {occurrences:6,}")

# Sentiment distribution; sentiment_counts is read again by the plotting cell
if has_sentiment:
    print(f"\nüé≠ R√âPARTITION DES SENTIMENTS:")
    sentiment_counts = df_analysis['target'].value_counts()
    n_tweets = len(df_analysis)
    for sentiment, count in sentiment_counts.items():
        # Any value other than 0 is reported as positive
        if sentiment == 0:
            label = "N√©gatif"
        else:
            label = "Positif"
        print(f"   - {label}: {count:,} tweets ({count / n_tweets * 100:.1f}%)")


In [None]:
# 4.2 ADVANCED VISUALISATIONS
# ===============================================================
# Draws a 2x3 grid of panels (length histograms, sentiment pie, word clouds)
# and, when a 'date' column exists, a monthly tweet-volume line chart.
# The panels rely on pyplot's implicit "current axes", so statement order matters.

print("\n" + "="*50)
print("4.2 VISUALISATIONS AVANC√âES")
print("="*50)

# Style configuration
plt.style.use('default')
sns.set_palette("husl")

# Create one figure that hosts all the subplots
fig = plt.figure(figsize=(20, 15))

# 1. Distribution of text lengths (characters)
# 'text_length' is the column added by the 4.1 statistics cell
plt.subplot(2, 3, 1)
plt.hist(df_analysis['text_length'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
plt.xlabel('Longueur du texte (caract√®res)')
plt.ylabel('Fr√©quence')
plt.title('Distribution des Longueurs de Textes')
plt.grid(True, alpha=0.3)

# 2. Distribution of the number of words per tweet
plt.subplot(2, 3, 2)
plt.hist(df_analysis['word_count'], bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
plt.xlabel('Nombre de mots')
plt.ylabel('Fr√©quence')
plt.title('Distribution du Nombre de Mots par Tweet')
plt.grid(True, alpha=0.3)

# 3. Sentiment split (only when labels are available)
# sentiment_counts is not defined in this cell: it comes from the 4.1
# statistics cell, which only creates it when has_sentiment is true.
if has_sentiment:
    plt.subplot(2, 3, 3)
    sentiment_labels = ['N√©gatif', 'Positif']
    # Only the counts for target values 0 and 4 are plotted; missing keys count as 0
    sentiment_values = [sentiment_counts.get(0, 0), sentiment_counts.get(4, 0)]
    
    plt.pie(sentiment_values, labels=sentiment_labels, autopct='%1.1f%%', 
            colors=['#ff9999', '#66b3ff'], startangle=90)
    plt.title('R√©partition des Sentiments')

# 4. Word cloud over all tweets (the whole corpus is joined into one string)
plt.subplot(2, 3, 4)
wordcloud = WordCloud(width=800, height=400, background_color='white', 
                      max_words=100, colormap='viridis').generate(' '.join(df_analysis['cleaned_text']))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud - Tous les Tweets')

# 5. Word clouds per sentiment (only when labels are available)
if has_sentiment:
    plt.subplot(2, 3, 5)
    # Here positive_texts / negative_texts are single joined strings
    positive_texts = ' '.join(df_analysis[df_analysis['target'] == 4]['cleaned_text'])
    wordcloud_positive = WordCloud(width=800, height=400, background_color='white', 
                                  max_words=100, colormap='Greens').generate(positive_texts)
    plt.imshow(wordcloud_positive, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud - Sentiments Positifs')

    plt.subplot(2, 3, 6)
    negative_texts = ' '.join(df_analysis[df_analysis['target'] == 0]['cleaned_text'])
    wordcloud_negative = WordCloud(width=800, height=400, background_color='white', 
                                  max_words=100, colormap='Reds').generate(negative_texts)
    plt.imshow(wordcloud_negative, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud - Sentiments N√©gatifs')

plt.tight_layout()
plt.show()

# Temporal analysis (only when a 'date' column is available)
if 'date' in df_analysis.columns:
    print("\nüìÖ ANALYSE TEMPORELLE:")
    try:
        # Overwrites 'date' in place with parsed datetimes and adds a monthly period column
        df_analysis['date'] = pd.to_datetime(df_analysis['date'])
        df_analysis['month'] = df_analysis['date'].dt.to_period('M')
        
        # Tweets per month, in chronological order
        monthly_counts = df_analysis['month'].value_counts().sort_index()
        
        plt.figure(figsize=(12, 6))
        monthly_counts.plot(kind='line', marker='o', color='purple')
        plt.title('√âvolution du Volume de Tweets par Mois')
        plt.xlabel('Mois')
        plt.ylabel('Nombre de Tweets')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
        
        print(f"   - P√©riode couverte: {df_analysis['date'].min()} to {df_analysis['date'].max()}")
        print(f"   - Mois le plus actif: {monthly_counts.idxmax()} ({monthly_counts.max()} tweets)")
        
    except Exception as e:
        # Best effort: any parsing or plotting failure is reported and the cell continues
        print(f"   ‚ö†Ô∏è  Impossible d'analyser les dates: {e}")

In [None]:
# 4.3 PATTERN DETECTION
# ===============================================================

print("\n" + "="*50)
print("4.3 D√âTECTION DES PATTERNS")
print("="*50)

# Linguistic patterns per sentiment
if has_sentiment:
    print("\nüîç PATTERNS LINGUISTIQUES PAR SENTIMENT:")

    # Cleaned texts of each class (target 4 = positive, target 0 = negative)
    is_positive = df_analysis['target'] == 4
    is_negative = df_analysis['target'] == 0
    positive_texts = df_analysis.loc[is_positive, 'cleaned_text']
    negative_texts = df_analysis.loc[is_negative, 'cleaned_text']

    # Word frequencies per class
    positive_words = ' '.join(positive_texts).split()
    negative_words = ' '.join(negative_texts).split()
    positive_freq = Counter(positive_words)
    negative_freq = Counter(negative_words)

    # A word is "distinctive" when it occurs in one class and never in the other
    all_positive_words = set(positive_freq.keys())
    all_negative_words = set(negative_freq.keys())
    distinctive_positive = all_positive_words.difference(all_negative_words)
    distinctive_negative = all_negative_words.difference(all_positive_words)

    print("   Mots distinctifs positifs (Top 10):")
    pos_distinctive_words = sorted(
        ((term, positive_freq[term]) for term in distinctive_positive),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for term, occurrences in pos_distinctive_words[:10]:
        print(f"      - {term}: {occurrences} occurrences")

    print("\n   Mots distinctifs n√©gatifs (Top 10):")
    neg_distinctive_words = sorted(
        ((term, negative_freq[term]) for term in distinctive_negative),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for term, occurrences in neg_distinctive_words[:10]:
        print(f"      - {term}: {occurrences} occurrences")

# N-gram analysis
print("\nüìä ANALYSE DES N-GRAMS:")

def get_top_ngrams(corpus, n=2, top_k=10):
    """Return the ``top_k`` most frequent n-grams of ``corpus``, most frequent first.

    Args:
        corpus: iterable of text documents, passed to ``CountVectorizer``.
        n: size of the n-grams (2 = bigrams, 3 = trigrams).
        top_k: number of n-grams to keep.

    Returns:
        list[tuple[str, int]]: ``(ngram, count)`` pairs sorted by decreasing
        count. Ties keep the vectorizer's own feature order.
    """
    vec = CountVectorizer(ngram_range=(n, n), max_features=top_k)
    X = vec.fit_transform(corpus)
    words = vec.get_feature_names_out()
    # Column sums of the document-term matrix = total occurrences per n-gram
    counts = np.asarray(X.sum(axis=0)).ravel()
    # The vectorizer lists features alphabetically, not by frequency, so the
    # pairs are ranked explicitly before callers slice the head of the list.
    return sorted(zip(words, counts), key=lambda pair: pair[1], reverse=True)

# Bigrams: request 15 from get_top_ngrams and print the first 10 entries of the returned list
try:
    top_bigrams = get_top_ngrams(df_analysis['cleaned_text'], n=2, top_k=15)
    print(f"   Bigrams les plus fr√©quents:")
    for i, (bigram, count) in enumerate(top_bigrams[:10], 1):
        print(f"      {i:2d}. {bigram:20} : {count:4} occurrences")
except Exception as e:
    # Best effort: a failure is reported and the cell continues.
    # In that case top_bigrams is left unassigned by this cell.
    print(f"   ‚ö†Ô∏è  Erreur avec les bigrams: {e}")

# Trigrams: request 10 from get_top_ngrams and print the first 5 entries of the returned list
try:
    top_trigrams = get_top_ngrams(df_analysis['cleaned_text'], n=3, top_k=10)
    print(f"\n   Trigrams les plus fr√©quents:")
    for i, (trigram, count) in enumerate(top_trigrams[:5], 1):
        print(f"      {i:2d}. {trigram:25} : {count:4} occurrences")
except Exception as e:
    # Same best-effort handling as for the bigrams
    print(f"   ‚ö†Ô∏è  Erreur avec les trigrams: {e}")

# Dominant-topic detection
print("\nüéØ D√âTECTION DES TOPICS DOMINANTS:")

# Analyser les mots les plus fr√©quents par cat√©gorie
def analyze_topics_by_frequency(texts, category_name, top_n=15):
    """Rank the informative words of ``texts`` by how often they occur.

    A word is kept only when it is longer than 3 characters and appears more
    than 5 times. ``category_name`` is accepted for the callers' convenience
    but is not used in the computation.

    Returns:
        list[tuple[str, int]]: up to ``top_n`` ``(word, count)`` pairs, most
        frequent first.
    """
    counts = Counter(' '.join(texts).split())

    # Drop short words and words that are too rare to be informative
    kept = [item for item in counts.items() if len(item[0]) > 3 and item[1] > 5]

    ranked = sorted(kept, key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

# Shared layout of one ranked topic line
topic_line = "      {:2d}. {:15} : {:4} occurrences"

# Topics over the whole corpus (top 10 shown)
general_topics = analyze_topics_by_frequency(df_analysis['cleaned_text'], "G√©n√©ral")
print("   Topics dominants g√©n√©raux:")
for rank, (term, occurrences) in enumerate(general_topics[:10], start=1):
    print(topic_line.format(rank, term, occurrences))

# Topics per sentiment (top 8 shown each), when labels are available
if has_sentiment:
    positive_topics = analyze_topics_by_frequency(positive_texts, "Positif")
    negative_topics = analyze_topics_by_frequency(negative_texts, "N√©gatif")

    per_sentiment = (
        ("\n   Topics dominants - Sentiments Positifs:", positive_topics),
        ("\n   Topics dominants - Sentiments N√©gatifs:", negative_topics),
    )
    for header, topics in per_sentiment:
        print(header)
        for rank, (term, occurrences) in enumerate(topics[:8], start=1):
            print(topic_line.format(rank, term, occurrences))

In [None]:
# ÉTAPE 5: PRÉPARATION DES DONNÉES POUR DEEP LEARNING
# ===============================================================

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

print("üöÄ D√âBUT DE LA PR√âPARATION POUR DEEP LEARNING")
print("="*60)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"üéØ Device utilis√©: {device}")

In [None]:
# 5.1 CRÉATION DU VOCABULAIRE
# ===============================================================

print("\n" + "="*50)
print("5.1 CR√âATION DU VOCABULAIRE")
print("="*50)

class Vocabulary:
    """Word <-> integer-id mapping for tweets.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Regular words receive consecutive ids in order
    of first appearance, provided they occur at least ``freq_threshold``
    times.
    """

    def __init__(self, freq_threshold=2):
        self.freq_threshold = freq_threshold
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.vocab_size = 4

    def __len__(self):
        return self.vocab_size

    def build_vocabulary(self, sentences):
        """Add every sufficiently frequent word of ``sentences`` to the vocabulary.

        Words are split on whitespace. Entries that are not strings (for
        instance NaN or None coming from a reloaded CSV) are skipped instead
        of raising.
        """
        print("üî® Construction du vocabulaire...")

        # Count how often each word occurs
        word_freq = Counter()
        for sentence in sentences:
            if isinstance(sentence, str):
                word_freq.update(sentence.split())

        # Register the words that reach the frequency threshold
        for word, freq in word_freq.items():
            if freq >= self.freq_threshold and word not in self.stoi:
                self.stoi[word] = self.vocab_size
                self.itos[self.vocab_size] = word
                self.vocab_size += 1

        print(f"   - Mots uniques trouv√©s: {len(word_freq):,}")
        print(f"   - Mots ajout√©s au vocabulaire: {self.vocab_size - 4:,}")
        print(f"   - Taille finale du vocabulaire: {self.vocab_size:,}")

        # Report the words that were left out for being too rare
        rare_words = [word for word, freq in word_freq.items() if freq < self.freq_threshold]
        print(f"   - Mots rares exclus (<{self.freq_threshold} occ): {len(rare_words):,}")

        if len(rare_words) > 0:
            print(f"   - Exemples de mots rares: {', '.join(rare_words[:10])}")

    def numericalize(self, text):
        """Convert ``text`` into a list of token ids.

        Unknown words map to the ``<UNK>`` id. A non-string ``text`` yields
        an empty list.
        """
        if not isinstance(text, str):
            return []
        unk_id = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_id) for token in text.split()]

# Collect the cleaned texts that feed the vocabulary
print("üìù Pr√©paration des textes pour le vocabulaire...")
all_texts = df_analysis['cleaned_text'].tolist()

# Build the vocabulary (words seen at least twice are kept)
vocab = Vocabulary(freq_threshold=2)
vocab.build_vocabulary(all_texts)

# Sequence-length statistics, used to pick the padded length
sequence_lengths = [len(sentence.split()) for sentence in all_texts]
lengths_arr = np.asarray(sequence_lengths)
p95_length = np.percentile(lengths_arr, 95)

print(f"\nüìè ANALYSE DES LONGUEURS DE S√âQUENCES:")
print(f"   - Longueur moyenne: {lengths_arr.mean():.1f} mots")
print(f"   - Longueur m√©diane: {np.median(lengths_arr):.1f} mots")
print(f"   - Longueur max: {lengths_arr.max()} mots")
print(f"   - Longueur min: {lengths_arr.min()} mots")
print(f"   - 95e percentile: {p95_length:.1f} mots")

# The maximum sequence length is the 95th percentile, truncated to an integer
max_length = int(p95_length)
print(f"üéØ Longueur maximale choisie: {max_length} mots (95e percentile)")

In [None]:
# 5.2 SÉQUENCEMENT DES DONNÉES
# ===============================================================

print("\n" + "="*50)
print("5.2 S√âQUENCEMENT DES DONN√âES")
print("="*50)

def preprocess_sequences(texts, vocab, max_length):
    """Convert texts into fixed-length rows of token ids.

    Each text is numericalized with ``vocab.numericalize``, truncated to
    ``max_length`` ids and right-padded with the ``<PAD>`` id.

    Args:
        texts: iterable of texts.
        vocab: object exposing ``numericalize(text)`` and ``stoi["<PAD>"]``.
        max_length: number of columns of the result.

    Returns:
        numpy.ndarray: int64 array of shape ``(len(texts), max_length)``.
        The shape is 2-D even when ``texts`` is empty or ``max_length`` is 0.
    """
    texts = list(texts)
    pad_id = vocab.stoi["<PAD>"]

    # Preallocate the padded matrix: avoids a large list of lists and keeps
    # a stable dtype and shape for empty input.
    sequences = np.full((len(texts), max_length), pad_id, dtype=np.int64)

    for row, text in enumerate(texts):
        token_ids = vocab.numericalize(text)[:max_length]
        if token_ids:
            sequences[row, :len(token_ids)] = token_ids

    return sequences

# Build the features (X) and the labels (y)
print("üîÑ Conversion des textes en s√©quences num√©riques...")

X_sequences = preprocess_sequences(all_texts, vocab, max_length)

if has_sentiment:
    target = df_analysis['target']

    # Rows without a label (NaN target) must not be trained on: the 0 / non-0
    # mapping below would otherwise turn every unlabelled tweet into a
    # "positive" example.
    labeled_mask = target.notna().to_numpy()
    n_unlabeled = int((~labeled_mask).sum())
    if n_unlabeled:
        print(f"   - Tweets sans label exclus: {n_unlabeled:,}")
        # NOTE: from here on X_sequences is aligned with the labelled rows
        # only (df_analysis[labeled_mask]), not with all_texts.
        X_sequences = X_sequences[labeled_mask]

    # Map the sentiment labels: 0 = negative stays 0, any other value becomes 1
    y = (target[labeled_mask] != 0).astype(int).to_numpy()
    print(f"üéØ Labels de sentiment pr√©par√©s: {len(y)} √©chantillons")
    print(f"   - N√©gatif (0): {np.sum(y == 0)}")
    print(f"   - Positif (1): {np.sum(y == 1)}")
else:
    y = None
    print("‚ö†Ô∏è  Aucun label de sentiment disponible - mode non supervis√©")

print(f"üì¶ Forme des s√©quences: {X_sequences.shape}")
print(f"   - Nombre d'√©chantillons: {X_sequences.shape[0]}")
print(f"   - Longueur des s√©quences: {X_sequences.shape[1]}")

In [None]:
# 5.3 SPLIT DES DONNÉES
# ===============================================================

print("\n" + "="*50)
print("5.3 SPLIT DES DONN√âES")
print("="*50)

# Dataset personnalis√© pour PyTorch
class TweetDataset(Dataset):
    """PyTorch dataset over padded token-id sequences, with optional labels.

    With labels, an item is a ``(sequence, label)`` pair; without labels an
    item is the sequence tensor alone.
    """

    def __init__(self, sequences, labels=None):
        self.labels = None if labels is None else torch.LongTensor(labels)
        self.sequences = torch.LongTensor(sequences)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        if self.labels is None:
            return sequence
        return sequence, self.labels[idx]

# Split the data according to whether labels are available
if not (has_sentiment and y is not None):
    print("üîç MODE NON SUPERVIS√â - Donn√©es sans labels")
    # The full dataset stays available for unsupervised analysis
    full_dataset = TweetDataset(X_sequences)

    # A plain 80/20 split is still produced for evaluation purposes
    indices = np.arange(len(X_sequences))
    train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

    X_train, X_test = X_sequences[train_idx], X_sequences[test_idx]

    train_dataset, test_dataset = TweetDataset(X_train), TweetDataset(X_test)
    val_dataset = None

    print(f"üìä R√âPARTITION DES DONN√âES:")
    print(f"   - Train: {len(X_train):,} √©chantillons")
    print(f"   - Test: {len(X_test):,} √©chantillons")

else:
    print("üéØ DIVISION AVEC LABELS (APPRENTISSAGE SUPERVIS√â)")

    # Two stratified splits: 15% held out for test, then 15% of the
    # remainder held out for validation.
    holdout = dict(test_size=0.15, random_state=42)
    X_temp, X_test, y_temp, y_test = train_test_split(
        X_sequences, y, stratify=y, **holdout
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, stratify=y_temp, **holdout
    )

    print(f"üìä R√âPARTITION DES DONN√âES:")
    n_total = len(X_sequences)
    for part_name, part in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
        print(f"   - {part_name}: {len(part):,} √©chantillons ({len(part)/n_total*100:.1f}%)")

    # Class balance inside each split
    print(f"\nüé≠ DISTRIBUTION DES SENTIMENTS PAR SPLIT:")

    for split_name, split_y in (("Train", y_train), ("Validation", y_val), ("Test", y_test)):
        total = len(split_y)
        neg_count = np.count_nonzero(split_y == 0)
        pos_count = np.count_nonzero(split_y == 1)
        print(f"   {split_name:12}: {neg_count:4} n√©gatifs ({neg_count/total*100:.1f}%) | "
              f"{pos_count:4} positifs ({pos_count/total*100:.1f}%)")

    # Wrap each split in a PyTorch dataset
    train_dataset = TweetDataset(X_train, y_train)
    val_dataset = TweetDataset(X_val, y_val)
    test_dataset = TweetDataset(X_test, y_test)

In [None]:
# DATALOADER PREPARATION
# ===============================================================

print("\n" + "="*50)
print("PR√âPARATION DES DATALOADERS")
print("="*50)

# DataLoader settings
batch_size = 64
print(f"üîß Configuration des DataLoaders:")
print(f"   - Batch size: {batch_size}")
print(f"   - Device: {device}")

# Train and test loaders are built the same way in both modes;
# only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

if has_sentiment:
    # Supervised mode also gets a validation loader
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    print(f"‚úÖ DataLoaders cr√©√©s:")
    print(f"   - Train: {len(train_loader)} batches")
    print(f"   - Validation: {len(val_loader)} batches")
    print(f"   - Test: {len(test_loader)} batches")

else:
    print(f"‚úÖ DataLoaders cr√©√©s (non supervis√©):")
    print(f"   - Train: {len(train_loader)} batches")
    print(f"   - Test: {len(test_loader)} batches")

In [None]:
# VERIFICATION AND TESTS
# ===============================================================

print("\n" + "="*50)
print("V√âRIFICATION ET TESTS")
print("="*50)

# Pull one training batch and inspect it
print("üß™ Test d'un batch d'entra√Ænement...")

data_iter = iter(train_loader)
first_batch = next(data_iter)

if has_sentiment:
    # Supervised mode: a batch is a (sequences, labels) pair
    sequences_batch, labels_batch = first_batch

    print(f"‚úÖ Batch test√© avec succ√®s:")
    print(f"   - Forme des s√©quences: {sequences_batch.shape}")
    print(f"   - Forme des labels: {labels_batch.shape}")
    print(f"   - Device: {sequences_batch.device}")
else:
    # Unsupervised mode: a batch is the sequences tensor alone
    sequences_batch = first_batch

    print(f"‚úÖ Batch test√© avec succ√®s (non supervis√©):")
    print(f"   - Forme des s√©quences: {sequences_batch.shape}")

# Show the first sample of the batch
print(f"\nüîç EXEMPLE D'UN √âCHANTILLON:")
sample_idx = 0
sample_sequence = sequences_batch[sample_idx].cpu().numpy()
if has_sentiment:
    sample_label = labels_batch[sample_idx].cpu().numpy()

# Map the ids back to words, stopping at the first padding token
pad_id = vocab.stoi["<PAD>"]
pad_positions = np.flatnonzero(sample_sequence == pad_id)
first_pad = pad_positions[0] if pad_positions.size else len(sample_sequence)
original_text = [vocab.itos.get(token_id, "<UNK>") for token_id in sample_sequence[:first_pad]]

print(f"   - S√©quence num√©rique: {sample_sequence[:10]}...")
print(f"   - Texte reconstruit: {' '.join(original_text[:10])}...")
if has_sentiment:
    print(f"   - Label: {'Positif' if sample_label == 1 else 'N√©gatif'}")

In [None]:
# SAVING THE IMPORTANT OBJECTS
# ===============================================================

print("\n" + "="*50)
print("SAUVEGARDE DES OBJETS")
print("="*50)

import pickle
import os

# Output folder and the two files written into it
assets_dir = '/kaggle/working/model_assets'
vocab_path = assets_dir + '/vocabulary.pkl'
config_path = assets_dir + '/training_config.pkl'

os.makedirs(assets_dir, exist_ok=True)

try:
    # 1) The vocabulary object itself
    with open(vocab_path, 'wb') as vocab_file:
        pickle.dump(vocab, vocab_file)

    # 2) The preprocessing / training parameters
    config = dict(
        vocab_size=vocab.vocab_size,
        max_length=max_length,
        batch_size=batch_size,
        has_sentiment=has_sentiment,
    )
    with open(config_path, 'wb') as config_file:
        pickle.dump(config, config_file)

    print("üíæ OBJETS SAUVEGARD√âS:")
    print(f"   - Vocabulaire: {vocab_path}")
    print(f"   - Configuration: {config_path}")
    print(f"   - Taille vocabulaire: {vocab.vocab_size:,}")
    print(f"   - Longueur max: {max_length}")

except Exception as e:
    # Best effort: a failed save is reported and does not stop the notebook
    print(f"‚ö†Ô∏è  Erreur lors de la sauvegarde: {e}")

In [None]:
# ÉTAPE 6: CONFIGURATION DES EMBEDDINGS
# ===============================================================

import torch
import torch.nn as nn
import numpy as np
import requests
import os
from pathlib import Path

print("üöÄ D√âBUT DE LA CONFIGURATION DES EMBEDDINGS")
print("="*60)

# Recap of the objects prepared by the previous steps
print(
    "üìä √âTAT DE VOS DONN√âES:",
    f"   - Taille du vocabulaire: {vocab.vocab_size:,} mots",
    f"   - Longueur des s√©quences: {max_length} tokens",
    f"   - Device: {device}",
    sep="\n",
)

In [None]:
# 6.1 CHOIX DES EMBEDDINGS
# ===============================================================

print("\n" + "="*50)
print("6.1 CHOIX DES EMBEDDINGS")
print("="*50)

class EmbeddingManager:
    """Builds an embedding matrix aligned with a vocabulary.

    Supported sources: pre-trained GloVe or FastText text files, a random
    initialisation with a small sentiment bias on a few common Twitter words,
    and a plain random (Xavier-scaled) initialisation.

    Every returned matrix has shape ``(vocab.vocab_size, embedding_dim)`` and
    row ``i`` is the vector of the word whose id is ``i`` in ``vocab.stoi``.
    """

    def __init__(self, vocab, embedding_dim=100):
        self.vocab = vocab
        self.embedding_dim = embedding_dim
        self.embedding_matrix = None

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _read_vectors(self, path, has_header=False):
        """Parse a ``word v1 v2 ...`` text file into a ``{word: float32 vector}`` dict.

        Blank lines, lines whose values are not numeric and lines whose
        vector length differs from the first valid line are skipped.
        """
        embeddings_index = {}
        expected_dim = None
        with open(path, 'r', encoding='utf-8') as f:
            if has_header:
                # First line holds the word count and the dimension
                f.readline()
            for line in f:
                values = line.rstrip().split(' ')
                if len(values) < 2:
                    continue
                try:
                    coefs = np.asarray(values[1:], dtype='float32')
                except ValueError:
                    continue
                if expected_dim is None:
                    expected_dim = len(coefs)
                elif len(coefs) != expected_dim:
                    continue
                embeddings_index[values[0]] = coefs
        return embeddings_index

    def _adopt_dimension(self, embeddings_index, announce):
        """Align ``self.embedding_dim`` with the vectors actually loaded.

        The dimension line is printed when ``announce`` is true or when the
        dimension had to change.
        """
        if not embeddings_index:
            return
        file_dim = len(next(iter(embeddings_index.values())))
        changed = file_dim != self.embedding_dim
        self.embedding_dim = file_dim
        if announce or changed:
            print(f"   - Dimension d'embedding: {self.embedding_dim}")

    def _build_matrix(self, embeddings_index):
        """Copy the pre-trained vectors of the vocabulary words into a matrix.

        A word is looked up as-is, then lower-cased. Words that are not found
        keep an all-zero row.
        """
        embedding_matrix = np.zeros((self.vocab.vocab_size, self.embedding_dim))
        matched_words = 0

        for word, idx in self.vocab.stoi.items():
            vector = embeddings_index.get(word)
            if vector is None:
                vector = embeddings_index.get(word.lower())
            if vector is not None:
                embedding_matrix[idx] = vector
                matched_words += 1

        vocab_size = self.vocab.vocab_size
        coverage = matched_words / vocab_size * 100 if vocab_size else 0.0
        print(f"‚úÖ Couverture du vocabulaire: {matched_words}/{self.vocab.vocab_size} ({coverage:.2f}%)")

        return embedding_matrix

    # ------------------------------------------------------------------
    # GloVe
    # ------------------------------------------------------------------
    def load_glove_embeddings(self, glove_path=None):
        """Load pre-trained GloVe embeddings.

        Args:
            glove_path: optional explicit file path. When it exists it takes
                priority over the default Kaggle locations.

        Returns:
            The embedding matrix, or ``None`` when no GloVe file is found.
        """
        print("üîç Recherche d'embeddings GloVe...")

        # The caller's explicit path first, then the usual Kaggle locations
        possible_paths = [
            glove_path,
            '/kaggle/input/glove6b/glove.6B.100d.txt',
            '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt',
            '/kaggle/input/glove-twitter/glove.twitter.27B.100d.txt',
        ]

        glove_file = next(
            (path for path in possible_paths if path and os.path.exists(path)),
            None,
        )

        if glove_file is None:
            print("‚ùå Aucun fichier GloVe trouv√© - initialisation al√©atoire")
            return None

        print(f"‚úÖ Fichier GloVe trouv√©: {glove_file}")
        return self._load_glove_from_file(glove_file)

    def _load_glove_from_file(self, glove_path):
        """Read a GloVe text file and return the matrix for the vocabulary."""
        print(f"üìñ Chargement des embeddings GloVe depuis {glove_path}...")

        embeddings_index = self._read_vectors(glove_path, has_header=False)
        print(f"   - Embeddings charg√©s: {len(embeddings_index):,} mots")

        # A file with another dimension than configured no longer crashes
        self._adopt_dimension(embeddings_index, announce=False)

        return self._build_matrix(embeddings_index)

    # ------------------------------------------------------------------
    # FastText
    # ------------------------------------------------------------------
    def load_fasttext_embeddings(self):
        """Load FastText embeddings from the first default Kaggle path that exists.

        Returns:
            The embedding matrix, or ``None`` when no FastText file is found.
        """
        print("üîç Recherche d'embeddings FastText...")

        possible_paths = [
            '/kaggle/input/fasttext-wikinews/wiki-news-300d-1M.vec',
            '/kaggle/input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
        ]

        for path in possible_paths:
            if os.path.exists(path):
                print(f"‚úÖ Fichier FastText trouv√©: {path}")
                return self._load_fasttext_from_file(path)

        print("‚ùå Aucun fichier FastText trouv√©")
        return None

    def _load_fasttext_from_file(self, fasttext_path):
        """Read a FastText ``.vec`` file and return the matrix for the vocabulary.

        ``self.embedding_dim`` is updated to the dimension found in the file.
        """
        print(f"üìñ Chargement des embeddings FastText...")

        embeddings_index = self._read_vectors(fasttext_path, has_header=True)
        print(f"   - Embeddings charg√©s: {len(embeddings_index):,} mots")

        self._adopt_dimension(embeddings_index, announce=True)

        return self._build_matrix(embeddings_index)

    # ------------------------------------------------------------------
    # Random initialisations
    # ------------------------------------------------------------------
    def create_twitter_specific_embeddings(self):
        """Random N(0, 0.1) embeddings with a sentiment bias on common Twitter words.

        The ``<PAD>`` row, when the vocabulary has one, is set to zero so
        that padding contributes nothing downstream.
        """
        print("üê¶ Cr√©ation d'embeddings sp√©cifiques Twitter...")

        embedding_matrix = np.random.normal(
            scale=0.1,
            size=(self.vocab.vocab_size, self.embedding_dim)
        )

        # Sentiment bias applied to common Twitter tokens
        twitter_words = {
            'rt': 0.5, 'lol': 0.8, 'omg': 0.7, 'haha': 0.6,
            'love': 0.9, 'happy': 0.8, 'sad': -0.8, 'angry': -0.9,
            'good': 0.7, 'bad': -0.7, 'great': 0.9, 'terrible': -0.9
        }

        biased_words = 0
        for word, sentiment_bias in twitter_words.items():
            if word in self.vocab.stoi:
                idx = self.vocab.stoi[word]
                # Shift the whole vector by a fraction of the sentiment score
                embedding_matrix[idx] += sentiment_bias * 0.1
                biased_words += 1

        # Same convention as initialize_random_embeddings: padding is all zeros
        if '<PAD>' in self.vocab.stoi:
            embedding_matrix[self.vocab.stoi['<PAD>']] = 0.0

        # Report the words actually biased, not the size of the lookup table
        print(f"‚úÖ Embeddings Twitter cr√©√©s: {biased_words} mots initialis√©s avec biais de sentiment")
        return embedding_matrix

    def initialize_random_embeddings(self):
        """Random normal embeddings with a Xavier/Glorot scale and a zero ``<PAD>`` row."""
        print("üé≤ Initialisation al√©atoire des embeddings...")

        # Xavier/Glorot standard deviation: sqrt(2 / (fan_in + fan_out))
        scale = np.sqrt(2.0 / (self.vocab.vocab_size + self.embedding_dim))
        embedding_matrix = np.random.normal(
            scale=scale,
            size=(self.vocab.vocab_size, self.embedding_dim)
        )

        # Padding tokens get an all-zero vector
        if '<PAD>' in self.vocab.stoi:
            pad_idx = self.vocab.stoi['<PAD>']
            embedding_matrix[pad_idx] = np.zeros(self.embedding_dim)

        print(f"‚úÖ Embeddings al√©atoires initialis√©s (Xavier)")
        return embedding_matrix

# Try the embedding sources in order of preference
print("üß™ TEST DES DIFF√âRENTES STRAT√âGIES D'EMBEDDING...")

embedding_manager = EmbeddingManager(vocab, embedding_dim=100)

# 1. GloVe is tried first
glove_embeddings = embedding_manager.load_glove_embeddings()

if glove_embeddings is not None:
    selected_matrix, strategy = glove_embeddings, "GloVe pr√©-entra√Æn√©s"
else:
    # 2. Then FastText
    fasttext_embeddings = embedding_manager.load_fasttext_embeddings()

    if fasttext_embeddings is not None:
        selected_matrix, strategy = fasttext_embeddings, "FastText pr√©-entra√Æn√©s"
    else:
        # 3. Last resort: randomly initialised, Twitter-biased embeddings
        twitter_embeddings = embedding_manager.create_twitter_specific_embeddings()
        selected_matrix, strategy = twitter_embeddings, "Embeddings sp√©cifiques Twitter"

embedding_manager.embedding_matrix = selected_matrix
print(f"üéØ STRAT√âGIE CHOISIE: {strategy}")

print(f"‚úÖ Matrice d'embedding cr√©√©e: {embedding_manager.embedding_matrix.shape}")

In [None]:
# 6.2 ADAPTATION DES EMBEDDINGS
# ===============================================================

print("\n" + "="*50)
print("6.2 ADAPTATION DES EMBEDDINGS")
print("="*50)

class AdaptiveEmbeddingLayer(nn.Module):
    """Embedding lookup with optional pretrained weights, projection and dropout.

    ``forward`` looks up the token embeddings, projects them to ``target_dim``
    with a linear layer when the embedding width differs from it, and applies
    dropout to the result.

    Args:
        vocab_size: Number of rows of the embedding table. When
            ``embedding_matrix`` is given, the matrix's own row count is used.
        embedding_dim: Width of each embedding vector. When
            ``embedding_matrix`` is given, the matrix's own column count is used.
        embedding_matrix: Optional 2-D array-like of pretrained weights. When
            ``None`` the table is created and Xavier-initialised.
        trainable: Whether the embedding weights require gradients.
        dropout: Dropout probability applied to the output of ``forward``.
        padding_idx: Row index of the padding token, or ``None`` for no padding
            row. Defaults to 0, the value previously hard-coded.
        target_dim: Width expected downstream. Defaults to 100, the value
            previously hard-coded.

    Attributes:
        output_dim: Width of the vectors returned by ``forward``.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_matrix=None,
                 trainable=True, dropout=0.1, padding_idx=0, target_dim=100):
        super(AdaptiveEmbeddingLayer, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.trainable = trainable

        # Build the embedding table
        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.FloatTensor(embedding_matrix),
                freeze=not trainable,
                padding_idx=padding_idx
            )
            # Trust the matrix over the declared sizes: the projection below
            # must be sized for the width the table really has.
            self.vocab_size, self.embedding_dim = self.embedding.weight.shape
            print(f"‚úÖ Embeddings pr√©-entra√Æn√©s charg√©s (trainable: {trainable})")
        else:
            self.embedding = nn.Embedding(
                vocab_size,
                embedding_dim,
                padding_idx=padding_idx
            )
            # Xavier initialisation overwrites every row, including the padding
            # row that nn.Embedding had zeroed, so that row is zeroed again.
            nn.init.xavier_uniform_(self.embedding.weight)
            if padding_idx is not None:
                with torch.no_grad():
                    self.embedding.weight[padding_idx].zero_()
            # Honour `trainable` here too; it used to be ignored in this branch.
            self.embedding.weight.requires_grad = bool(trainable)
            print(f"‚úÖ Embeddings al√©atoires initialis√©s (trainable: {trainable})")

        # Dropout for regularisation
        self.dropout = nn.Dropout(dropout)

        # Projection layer, only when the embedding width is not the target width
        self.projection = None
        if self.embedding_dim != target_dim:
            self.projection = nn.Linear(self.embedding_dim, target_dim)
            print(f"‚úÖ Couche de projection ajout√©e: {self.embedding_dim} ‚Üí {target_dim}")

        # Width of what forward() returns
        self.output_dim = target_dim

    def forward(self, x):
        """Return the (optionally projected) embeddings of ``x`` after dropout."""
        embeddings = self.embedding(x)

        if self.projection is not None:
            embeddings = self.projection(embeddings)

        return self.dropout(embeddings)

# Build one AdaptiveEmbeddingLayer per adaptation strategy and report its size
print("üß™ CONFIGURATION DES STRAT√âGIES D'ADAPTATION...")

# Strategy 1: full fine-tuning -- pretrained matrix, weights stay trainable
print("\n1. üîÑ FINE-TUNING COMPLET:")
embedding_finetune = AdaptiveEmbeddingLayer(
    vocab.vocab_size,
    embedding_manager.embedding_dim,
    embedding_matrix=embedding_manager.embedding_matrix,
    trainable=True,
    dropout=0.2,
)
embedding_finetune = embedding_finetune.to(device)
print(
    "   - Embeddings trainables: OUI",
    "   - Dropout: 0.2",
    f"   - Param√®tres √† entra√Æner: {sum(w.numel() for w in embedding_finetune.parameters()):,}",
    sep="\n",
)

# Strategy 2: frozen embeddings -- same pretrained matrix, weights kept fixed
print("\n2. üßä EMBEDDINGS FIG√âS:")
embedding_frozen = AdaptiveEmbeddingLayer(
    vocab.vocab_size,
    embedding_manager.embedding_dim,
    embedding_matrix=embedding_manager.embedding_matrix,
    trainable=False,
    dropout=0.1,
)
embedding_frozen = embedding_frozen.to(device)
# Only parameters that still require gradients are counted for this strategy.
print(
    "   - Embeddings trainables: NON",
    "   - Dropout: 0.1",
    f"   - Param√®tres √† entra√Æner: {sum(w.numel() for w in embedding_frozen.parameters() if w.requires_grad):,}",
    sep="\n",
)

# Strategy 3: learning from scratch -- no pretrained matrix, 100-dimensional table
print("\n3. üé≤ FROM SCRATCH:")
embedding_scratch = AdaptiveEmbeddingLayer(
    vocab.vocab_size,
    100,
    embedding_matrix=None,
    trainable=True,
    dropout=0.3,
)
embedding_scratch = embedding_scratch.to(device)
print(
    "   - Embeddings trainables: OUI",
    "   - Dimension: 100",
    "   - Dropout: 0.3",
    f"   - Param√®tres √† entra√Æner: {sum(w.numel() for w in embedding_scratch.parameters()):,}",
    sep="\n",
)

In [None]:
# COMBINING EMBEDDINGS
# ===============================================================

# Section banner: a blank line, a 50-character rule, the title, and a second rule.
print("\n" + "=" * 50, "COMBINAISON D'EMBEDDINGS", "=" * 50, sep="\n")

class MultiEmbeddingLayer(nn.Module):
    """Combine several embedding layers into one 100-dimensional representation.

    One ``AdaptiveEmbeddingLayer`` is built per entry of ``embedding_configs``.
    ``forward`` feeds the same token indices to every layer, concatenates their
    outputs on the last axis, projects the result to 100 dimensions, then
    applies layer normalisation and dropout.

    Args:
        vocab_size: Vocabulary size passed to every sub-layer.
        embedding_configs: Non-empty sequence of dicts. Each dict must provide
            ``'dim'`` and may provide ``'matrix'`` (default ``None``),
            ``'trainable'`` (default ``True``) and ``'dropout'`` (default 0.1).

    Attributes:
        output_dim: Total width of the concatenated sub-layer outputs, i.e. the
            input width of ``combination_layer``.

    Raises:
        ValueError: If ``embedding_configs`` is empty.
    """

    def __init__(self, vocab_size, embedding_configs):
        super(MultiEmbeddingLayer, self).__init__()

        if not embedding_configs:
            raise ValueError("embedding_configs must contain at least one configuration")

        self.embedding_layers = nn.ModuleList()
        self.output_dim = 0

        for config in embedding_configs:
            layer = AdaptiveEmbeddingLayer(
                vocab_size=vocab_size,
                embedding_dim=config['dim'],
                embedding_matrix=config.get('matrix'),
                trainable=config.get('trainable', True),
                dropout=config.get('dropout', 0.1)
            )
            self.embedding_layers.append(layer)

            # Add the width the layer really emits. A sub-layer with a
            # projection returns `projection.out_features` columns, not
            # config['dim']; summing config['dim'] made combination_layer too
            # narrow and forward() failed on the shape mismatch.
            projection = getattr(layer, 'projection', None)
            if projection is not None:
                self.output_dim += projection.out_features
            else:
                self.output_dim += getattr(layer, 'embedding_dim', config['dim'])

        # Combination layer: concatenated width -> 100
        self.combination_layer = nn.Linear(self.output_dim, 100)
        self.layer_norm = nn.LayerNorm(100)
        self.dropout = nn.Dropout(0.2)

        print(f"‚úÖ Combinaison de {len(embedding_configs)} embeddings")
        print(f"   - Dimension totale: {self.output_dim} ‚Üí 100")

    def forward(self, x):
        """Return the combined 100-dimensional embeddings for token indices ``x``."""
        embeddings = [layer(x) for layer in self.embedding_layers]

        # Concatenate the per-layer embeddings on the feature axis
        combined = torch.cat(embeddings, dim=-1)

        # Project to the target width, normalise, regularise
        output = self.combination_layer(combined)
        output = self.layer_norm(output)
        output = self.dropout(output)

        return output

# Configuration of the embedding combination
print("üîÑ CONFIGURATION DE LA COMBINAISON D'EMBEDDINGS...")

# First entry: the matrix selected by the embedding manager, fine-tuned.
pretrained_config = dict(
    name='GloVe_Twitter',
    dim=embedding_manager.embedding_dim,
    matrix=embedding_manager.embedding_matrix,
    trainable=True,
    dropout=0.1,
)

# Second entry: a 50-dimensional table learned from scratch.
# NOTE(review): despite its name, this entry is fed the same token indices as
# the first one (MultiEmbeddingLayer.forward passes `x` to every layer).
scratch_config = dict(
    name='Character_Level',
    dim=50,
    matrix=None,
    trainable=True,
    dropout=0.2,
)

embedding_configs = [pretrained_config, scratch_config]

multi_embedding = MultiEmbeddingLayer(vocab.vocab_size, embedding_configs)
multi_embedding = multi_embedding.to(device)

print(
    "‚úÖ Couche multi-embeddings cr√©√©e",
    f"   - Dimension de sortie: {multi_embedding.output_dim}",
    f"   - Param√®tres totaux: {sum(w.numel() for w in multi_embedding.parameters()):,}",
    sep="\n",
)

In [None]:
# ===============================================================
# TESTS ET VALIDATION DES COUCHES D'EMBEDDINGS
# ===============================================================

import torch
import torch.nn as nn
import torch.nn.functional as F

# Section banner: a blank line, a 60-character rule, the title, and a second rule.
print("\n" + "=" * 60, "üîé TESTS ET VALIDATION DES COUCHES D'EMBEDDINGS", "=" * 60, sep="\n")

# ---------------------------------------------------------------
# 1. Base parameters
# ---------------------------------------------------------------
# vocab_size: vocabulary size (adapt to the tokenizer in use)
# embedding_dim: base width of the embeddings
vocab_size, embedding_dim = 10000, 100
batch_size, seq_len = 32, 50
device = "cpu" if not torch.cuda.is_available() else "cuda"

# ---------------------------------------------------------------
# 2. The different embedding strategies
# ---------------------------------------------------------------
# NOTE(review): this rebinds `device`, `embedding_finetune`, `embedding_frozen`
# and `embedding_scratch`, which earlier cells bound to AdaptiveEmbeddingLayer
# instances built on the real vocabulary. Later cells see these plain
# nn.Embedding layers instead -- confirm that this is intended.

# a. fine-tunable, b. frozen, c. trained from scratch -- created in that order
embedding_finetune, embedding_frozen, embedding_scratch = (
    nn.Embedding(vocab_size, embedding_dim).to(device) for _ in range(3)
)

# Only the "frozen" table has gradient updates switched off.
embedding_frozen.weight.requires_grad_(False)

# ---------------------------------------------------------------
# 3Ô∏è‚É£ Mod√®le multi-embedding corrig√©
# ---------------------------------------------------------------

class MultiEmbedding(nn.Module):
    """Concatenate several embedding tables and project them to one width.

    One ``nn.Embedding(vocab_size, dim)`` is created per entry of ``emb_dims``.
    ``forward`` concatenates their outputs on the last axis and passes the
    result through a linear layer, layer normalisation and dropout.

    Args:
        vocab_size: Number of rows of every embedding table.
        emb_dims: Widths of the individual embedding tables.
        output_dim: Width of the combined representation (default 100).
        dropout: Dropout probability applied to the output (default 0.2).
    """

    def __init__(self, vocab_size, emb_dims, output_dim=100, dropout=0.2):
        super(MultiEmbedding, self).__init__()

        # One table per requested width, created in the order given
        tables = []
        for width in emb_dims:
            tables.append(nn.Embedding(vocab_size, width))
        self.embeddings = nn.ModuleList(tables)

        # The linear layer consumes the concatenation of all tables
        concatenated_width = sum(emb_dims)
        self.combination_layer = nn.Linear(concatenated_width, output_dim)
        self.layer_norm = nn.LayerNorm(output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the combined embeddings for the token indices ``x``."""
        per_table = []
        for table in self.embeddings:
            per_table.append(table(x))
        joined = torch.cat(per_table, dim=-1)

        # Linear projection, then normalisation, then dropout
        return self.dropout(self.layer_norm(self.combination_layer(joined)))

# Instantiate the multi-embedding model
multi_embedding = MultiEmbedding(
    vocab_size=vocab_size,
    emb_dims=[50, 100, 50],  # concatenated width = 200
    output_dim=embedding_dim  # final output projected to embedding_dim
).to(device)


# ---------------------------------------------------------------
# 4. Build a test batch
# ---------------------------------------------------------------
# Random token ids in [0, vocab_size), shape (batch_size, seq_len)
test_batch = torch.randint(0, vocab_size, (batch_size, seq_len)).to(device)
print(f"\nüì¶ Batch de test g√©n√©r√© : {test_batch.shape}")


# ---------------------------------------------------------------
# 5. Validation loop
# ---------------------------------------------------------------
strategies = [
    ("Fine-tuning", embedding_finetune),
    ("Embeddings fig√©s", embedding_frozen),
    ("From scratch", embedding_scratch),
    ("Multi-embeddings", multi_embedding)
]

print("\nüß™ Lancement des tests...\n")

for name, embedding_layer in strategies:
    # torch.no_grad() only turns off gradient tracking; dropout stays active
    # while a module is in training mode. Run the check in eval mode so the
    # reported norm is deterministic, then restore the previous mode.
    was_training = embedding_layer.training
    embedding_layer.eval()
    try:
        with torch.no_grad():
            output = embedding_layer(test_batch)
    finally:
        embedding_layer.train(was_training)

    # Every strategy must return one embedding_dim-wide vector per token.
    expected_shape = (batch_size, seq_len, embedding_dim)
    assert tuple(output.shape) == expected_shape, (
        f"{name}: expected output shape {expected_shape}, got {tuple(output.shape)}"
    )

    print(f"‚úÖ {name}:")
    print(f"   - Input shape : {test_batch.shape}")
    print(f"   - Output shape : {output.shape}")
    print(f"   - Norme moyenne : {output.norm(dim=-1).mean().item():.4f}\n")

print("üéØ Tous les embeddings ont √©t√© test√©s avec succ√®s !")
print("="*60)


In [None]:
# SAVING THE CONFIGURATION
# ===============================================================

print("\n" + "="*50)
print("SAUVEGARDE DE LA CONFIGURATION")
print("="*50)


def _count_trainable(module):
    """Return the number of parameters of `module` that require gradients."""
    return sum(p.numel() for p in module.parameters() if p.requires_grad)


# Best-effort save: any failure is reported and the notebook keeps running.
try:
    os.makedirs('/kaggle/working/embeddings', exist_ok=True)

    # Save the embedding matrix
    np.save('/kaggle/working/embeddings/embedding_matrix.npy',
            embedding_manager.embedding_matrix)

    # Save the configuration.
    # Every "trainable_params_*" entry counts only parameters that require
    # gradients; previously that filter was applied to the frozen layer alone.
    # NOTE(review): the embedding_* / multi_embedding names are whatever was
    # bound last. The test cell above rebinds them to layers built with
    # vocab_size = 10000, so these counts may not describe the layers built on
    # `vocab` -- confirm which ones are meant.
    embedding_config = {
        'strategy': strategy,
        'embedding_dim': embedding_manager.embedding_dim,
        'vocab_size': vocab.vocab_size,
        # `coverage` is optional: recorded only when an earlier cell defined it
        'coverage': coverage if 'coverage' in locals() else 'N/A',
        'trainable_params_finetune': _count_trainable(embedding_finetune),
        'trainable_params_frozen': _count_trainable(embedding_frozen),
        'trainable_params_scratch': _count_trainable(embedding_scratch),
        'trainable_params_multi': _count_trainable(multi_embedding)
    }

    with open('/kaggle/working/embeddings/embedding_config.pkl', 'wb') as f:
        pickle.dump(embedding_config, f)

    print("üíæ CONFIGURATION SAUVEGARD√âE:")
    print(f"   - Matrice d'embedding: /kaggle/working/embeddings/embedding_matrix.npy")
    print(f"   - Configuration: /kaggle/working/embeddings/embedding_config.pkl")
    print(f"   - Strat√©gie: {strategy}")
    print(f"   - Dimension: {embedding_manager.embedding_dim}")

except Exception as e:
    print(f"‚ö†Ô∏è  Erreur lors de la sauvegarde: {e}")