In [17]:
import pandas as pd
import numpy as np  
import seaborn as sns
import matplotlib.pyplot as pyplot
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
import string
from wordcloud import STOPWORDS

In [18]:

df = pd.read_pickle("df.pkl")

# Traitement des valeurs nuls ou non pertinentes

In [19]:
df.drop_duplicates(inplace=True)


In [20]:
print(df.shape)

(7613, 17)


In [21]:
df.dropna(inplace=True)
print(df.shape) 
df.info()

(5080, 17)
<class 'pandas.core.frame.DataFrame'>
Index: 5080 entries, 31 to 7581
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5080 non-null   int64  
 1   keyword            5080 non-null   object 
 2   location           5080 non-null   object 
 3   text               5080 non-null   object 
 4   target             5080 non-null   int64  
 5   presence_location  5080 non-null   int64  
 6   presence_keyword   5080 non-null   int64  
 7   char_count         5080 non-null   int64  
 8   word_count         5080 non-null   int64  
 9   stopword_count     5080 non-null   int64  
 10  unique_word_count  5080 non-null   int64  
 11  stop_word_count    5080 non-null   int64  
 12  url_count          5080 non-null   int64  
 13  mean_word_length   5080 non-null   float64
 14  punctuation_count  5080 non-null   int64  
 15  hashtag_count      5080 non-null   int64  
 16  mention_count    

## ⚠️ Alternative : Gestion intelligente des valeurs manquantes

Au lieu de supprimer 33% des données avec `dropna()`, considérons cette approche :

In [None]:
# Alternative recommandée : imputation intelligente
df_alternative = df.copy()

# 1. Remplacer les valeurs manquantes par des chaînes vides ou des valeurs par défaut
df_alternative['location'] = df_alternative['location'].fillna("unknown_location")
df_alternative['keyword'] = df_alternative['keyword'].fillna("no_keyword")

print(f"Données conservées : {df_alternative.shape[0]} lignes (vs {df.shape[0]} avec dropna)")
print(f"Pourcentage conservé : {df_alternative.shape[0]/7613*100:.1f}%")

# 2. Créer des indicateurs binaires de présence
df_alternative['has_location'] = df['location'].notnull().astype(int)
df_alternative['has_keyword'] = df['keyword'].notnull().astype(int)

print("\nNouvelles features créées :")
print("- has_location : indique si l'utilisateur a renseigné sa localisation")
print("- has_keyword : indique si un keyword était disponible")

In [22]:
df.head()

Unnamed: 0,id,keyword,location,text,target,presence_location,presence_keyword,char_count,word_count,stopword_count,unique_word_count,stop_word_count,url_count,mean_word_length,punctuation_count,hashtag_count,mention_count
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,1,1,55,5,0,5,0,1,10.2,6,0,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,1,1,67,10,4,10,3,1,5.8,8,2,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,1,1,82,9,1,9,1,1,8.222222,9,1,0
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,1,1,34,7,3,7,3,0,4.0,1,0,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,1,1,76,13,7,13,5,1,4.923077,5,0,0


In [23]:
df['target'].value_counts(normalize=True)

target
0    0.567717
1    0.432283
Name: proportion, dtype: float64

# Proportion target

In [24]:

df_0 = df[df['target'] == 0]
df_1 = df[df['target'] == 1]

# 2. Échantillonner aléatoirement la classe 0 pour avoir autant d'exemples que la classe 1
df_0_sampled = df_0.sample(n=len(df_1), random_state=42)

df_balanced = pd.concat([df_0_sampled, df_1], axis=0).sample(frac=1, random_state=42)  # shuffle le tout


In [25]:

df_balanced['target'].value_counts(normalize=True)

target
1    0.5
0    0.5
Name: proportion, dtype: float64

In [26]:
df_balanced['keyword'].value_counts()


keyword
collision                33
outbreak                 31
sandstorm                31
fatalities               30
emergency%20plan         29
                         ..
razed                     9
detonation                8
epicentre                 7
radiation%20emergency     6
inundation                3
Name: count, Length: 221, dtype: int64

## Nettoyage

In [27]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)      # URLs
    text = re.sub(r"@\w+|#\w+", "", text)           # Mentions & hashtags
    text = re.sub(r"[^a-z\s]", "", text)            # Ponctuation / chiffres
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Nettoyage de base
df_balanced["clean_text"] = df_balanced["text"].apply(clean_text)
df_balanced["clean_location"] = df_balanced["location"].apply(clean_text)
df_balanced["clean_keyword"] = df_balanced["keyword"].apply(clean_text)

# Tokenisation X
df_balanced["tokens"] = df_balanced["clean_text"].apply(lambda x: x.split())

# Suppression des stopwords
df_balanced["tokens"] = df_balanced["tokens"].apply(lambda tokens: [w for w in tokens if w not in stop_words])  
df_balanced["keyword_tokens"] = df_balanced["clean_keyword"].apply(lambda x: [w for w in x.split() if w not in stop_words])
df_balanced["location_tokens"] = df_balanced["clean_location"].apply(lambda x: [w for w in x.split() if w not in stop_words])



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sulivanmoreau/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
df_balanced.head()

Unnamed: 0,id,keyword,location,text,target,presence_location,presence_keyword,char_count,word_count,stopword_count,...,mean_word_length,punctuation_count,hashtag_count,mention_count,clean_text,clean_location,clean_keyword,tokens,keyword_tokens,location_tokens
6818,9765,trapped,10 Steps Ahead. Cloud 9,Bomb head? Explosive decisions dat produced mo...,1,1,1,134,21,6,...,5.428571,1,0,0,bomb head explosive decisions dat produced mor...,steps ahead cloud,trapped,"[bomb, head, explosive, decisions, dat, produc...",[trapped],"[steps, ahead, cloud]"
4842,6896,mass%20murder,"Huntsville, AL",Okay not sure the word 'mass murder' applies d...,1,1,1,121,19,9,...,5.421053,8,0,0,okay not sure the word mass murder applies dur...,huntsville al,massmurder,"[okay, sure, word, mass, murder, applies, war,...",[massmurder],"[huntsville, al]"
5848,8356,ruin,Garrett,like why on earth would you want anybody to be...,0,1,1,101,17,6,...,5.0,2,0,0,like why on earth would you want anybody to be...,garrett,ruin,"[like, earth, would, want, anybody, unhappy, d...",[ruin],[garrett]
1851,2661,crush,"Cleveland, Ohio",My woman crush wedneday goes to the beautiful ...,0,1,1,108,12,3,...,8.083333,8,2,1,my woman crush wedneday goes to the beautiful,cleveland ohio,crush,"[woman, crush, wedneday, goes, beautiful]",[crush],"[cleveland, ohio]"
5729,8176,rescuers,Washington,#News: 'Many deaths' in shipwreck: Rescuers ar...,1,1,1,136,18,6,...,6.611111,14,1,1,many deaths in shipwreck rescuers are trying t...,washington,rescuers,"[many, deaths, shipwreck, rescuers, trying, sa...",[rescuers],[washington]


## Application 

In [29]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# Téléchargements
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialisation
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

# Nettoyage de texte
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)      # URLs
    text = re.sub(r"@\w+|#\w+", "", text)           # Mentions & hashtags
    text = re.sub(r"[^a-z\s]", "", text)            # Ponctuation / chiffres
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Appliquer le nettoyage
for col in ["text", "keyword", "location"]:
    df_balanced[f"clean_{col}"] = df_balanced[col].apply(clean_text)
    df_balanced[f"{col}_tokens"] = df_balanced[f"clean_{col}"].apply(
        lambda x: [w for w in x.split() if w not in stop_words]
    )
    df_balanced[f"{col}_stem"] = df_balanced[f"{col}_tokens"].apply(
        lambda tokens: " ".join([stemmer.stem(t) for t in tokens])
    )
    df_balanced[f"{col}_lemma"] = df_balanced[f"{col}_tokens"].apply(
        lambda tokens: " ".join([lemmatizer.lemmatize(t) for t in tokens])
    )

# Tu peux ensuite créer la version combinée pour la vectorisation
df_balanced["text_combined"] = (
    df_balanced["text_lemma"] + " " +
    df_balanced["keyword_lemma"].fillna("") + " " +
    df_balanced["location_lemma"].fillna("")
)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sulivanmoreau/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sulivanmoreau/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sulivanmoreau/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [30]:
df_balanced.head()

Unnamed: 0,id,keyword,location,text,target,presence_location,presence_keyword,char_count,word_count,stopword_count,...,keyword_tokens,location_tokens,text_tokens,text_stem,text_lemma,keyword_stem,keyword_lemma,location_stem,location_lemma,text_combined
6818,9765,trapped,10 Steps Ahead. Cloud 9,Bomb head? Explosive decisions dat produced mo...,1,1,1,134,21,6,...,[trapped],"[steps, ahead, cloud]","[bomb, head, explosive, decisions, dat, produc...",bomb head explos decis dat produc dead childre...,bomb head explosive decision dat produced dead...,trap,trapped,step ahead cloud,step ahead cloud,bomb head explosive decision dat produced dead...
4842,6896,mass%20murder,"Huntsville, AL",Okay not sure the word 'mass murder' applies d...,1,1,1,121,19,9,...,[massmurder],"[huntsville, al]","[okay, sure, word, mass, murder, applies, war,...",okay sure word mass murder appli war horrend n...,okay sure word mass murder applies war horrend...,massmurd,massmurder,huntsvill al,huntsville al,okay sure word mass murder applies war horrend...
5848,8356,ruin,Garrett,like why on earth would you want anybody to be...,0,1,1,101,17,6,...,[ruin],[garrett],"[like, earth, would, want, anybody, unhappy, d...",like earth would want anybodi unhappi dont pur...,like earth would want anybody unhappy dont pur...,ruin,ruin,garrett,garrett,like earth would want anybody unhappy dont pur...
1851,2661,crush,"Cleveland, Ohio",My woman crush wedneday goes to the beautiful ...,0,1,1,108,12,3,...,[crush],"[cleveland, ohio]","[woman, crush, wedneday, goes, beautiful]",woman crush wedneday goe beauti,woman crush wedneday go beautiful,crush,crush,cleveland ohio,cleveland ohio,woman crush wedneday go beautiful crush clevel...
5729,8176,rescuers,Washington,#News: 'Many deaths' in shipwreck: Rescuers ar...,1,1,1,136,18,6,...,[rescuers],[washington],"[many, deaths, shipwreck, rescuers, trying, sa...",mani death shipwreck rescuer tri save hundr mi...,many death shipwreck rescuer trying save hundr...,rescuer,rescuer,washington,washington,many death shipwreck rescuer trying save hundr...


In [32]:
df_balanced.to_pickle("df_balanced.pkl")

## 🔧 Améliorations du nettoyage textuel

Votre nettoyage est déjà très bon ! Quelques suggestions d'amélioration :

In [None]:
def enhanced_clean_text(text):
    """
    Nettoyage textuel amélioré pour les tweets
    """
    text = str(text).lower()
    
    # 1. Gestion des contractions anglaises courantes
    contractions = {
        "can't": "cannot", "won't": "will not", "n't": " not",
        "'re": " are", "'ve": " have", "'ll": " will", "'d": " would",
        "'m": " am", "it's": "it is", "that's": "that is"
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    
    # 2. Nettoyage des URLs et mentions (garder trace du nombre)
    url_count = len(re.findall(r"http\S+|www\S+", text))
    mention_count = len(re.findall(r"@\w+", text))
    hashtag_count = len(re.findall(r"#\w+", text))
    
    text = re.sub(r"http\S+|www\S+", " URL ", text)      # Remplacer par token
    text = re.sub(r"@\w+", " MENTION ", text)            # Remplacer par token  
    text = re.sub(r"#\w+", " HASHTAG ", text)            # Remplacer par token
    
    # 3. Caractères répétés (ex: "nooooo" → "no")
    text = re.sub(r"(.)\1{2,}", r"\1", text)
    
    # 4. Nettoyage final
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    
    return text, url_count, mention_count, hashtag_count

# Exemple d'utilisation
sample_text = "OMG!!! Can't believe this happened 😱 http://news.com @emergency #disaster"
cleaned, urls, mentions, hashtags = enhanced_clean_text(sample_text)
print(f"Original: {sample_text}")
print(f"Cleaned: {cleaned}")
print(f"URLs: {urls}, Mentions: {mentions}, Hashtags: {hashtags}")

## 📊 Validation du nettoyage

Toujours vérifier l'impact du preprocessing :

In [None]:
# Validation des résultats de nettoyage
print("🔍 Validation du nettoyage")
print("=" * 50)

# 1. Vérifier l'équilibrage des classes
print(f"Distribution des classes après équilibrage :")
print(df_balanced['target'].value_counts(normalize=True))
print()

# 2. Comparer longueurs avant/après nettoyage
print(f"Longueur moyenne avant nettoyage : {df_balanced['text'].str.len().mean():.1f} caractères")
print(f"Longueur moyenne après nettoyage : {df_balanced['text_lemma'].str.len().mean():.1f} caractères")
print()

# 3. Exemples de transformation
print("📝 Exemples de nettoyage :")
for i in range(3):
    original = df_balanced['text'].iloc[i]
    cleaned = df_balanced['text_lemma'].iloc[i]
    print(f"\nOriginal: {original}")
    print(f"Nettoyé: {cleaned}")
    print("-" * 30)

# 4. Statistiques finales
print(f"\n📊 Dataset final :")
print(f"- Nombre de lignes : {len(df_balanced)}")
print(f"- Équilibrage : {df_balanced['target'].value_counts().to_dict()}")
print(f"- Colonnes créées : text_lemma, keyword_lemma, location_lemma, text_combined")