In [31]:
import pandas as pd
from text_mining import TextMining

### Importer le CSV

In [32]:
df = pd.read_csv('../data/train_tweets.csv')

### Traitement des valeurs nuls ou non pertinentes

In [33]:
df.drop_duplicates(inplace=True)


In [34]:
print(df.shape)

(7613, 5)


In [35]:
df.dropna(inplace=True)
print(df.shape) 
df.info()

(5080, 5)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5080 entries, 31 to 7581
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5080 non-null   int64 
 1   keyword   5080 non-null   object
 2   location  5080 non-null   object
 3   text      5080 non-null   object
 4   target    5080 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 238.1+ KB


### Gestion  des valeurs manquantes

Au lieu de supprimer 33% des données avec `dropna()`, considérons cette approche :

In [36]:
def replace_missing_values(series, default_value):
    df[series] = df[series].fillna(default_value)

    print(f"Données conservées de {series} : {df.shape[0]} lignes (vs {df.shape[0]} avec dropna)")
    print(f"Pourcentage conservé : {df.shape[0]/7613*100:.1f}%")

replace_missing_values('location', "")
replace_missing_values('keyword', "")

Données conservées de location : 5080 lignes (vs 5080 avec dropna)
Pourcentage conservé : 66.7%
Données conservées de keyword : 5080 lignes (vs 5080 avec dropna)
Pourcentage conservé : 66.7%


In [37]:
df.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [38]:
df['target'].value_counts(normalize=True)

0    0.567717
1    0.432283
Name: target, dtype: float64

### Proportion target

In [None]:
df_0 = df[df['target'] == 0]
df_1 = df[df['target'] == 1]

df_0_sampled = df_0.sample(n=len(df_1), random_state=42)

df = pd.concat([df_0_sampled, df_1], axis=0).sample(frac=1, random_state=42)  # shuffle le tout


In [40]:
df['target'].value_counts(normalize=True)

1    0.5
0    0.5
Name: target, dtype: float64

In [41]:
df['keyword'].value_counts()

collision                33
outbreak                 31
sandstorm                31
fatalities               30
emergency%20plan         29
                         ..
razed                     9
detonation                8
epicentre                 7
radiation%20emergency     6
inundation                3
Name: keyword, Length: 221, dtype: int64

### Nettoyage

- Conversion en minuscules
- Suppression des URL
- mentions et hashtags
- Suppression de la ponctuation et caractères spéciaux
- Tokenisation
- Suppression des stopwords
- Lemmatisation (ou stemming)
- Vectorisation des textes

In [70]:
data_clean = df.copy()
tm = TextMining(data_clean)
data_clean = (
    tm.lowercase()
        .extract_target_char("#", "hashtags")
        .extract_target_char("@", "mentions")
        .extract_url()
        .clean_regex()
        .tokenize()
        .remove_stopwords()
        .apply_stemmer() # .apply_lemmatizer() ou .apply_stemmer()
        .vectorize(mode="tfidf") # ou mode="bow" ou mode="tfidf"
        .get_df()   
)
tm.export_csv("stemmer_tfidf.csv")
data_clean.head(10)


Unnamed: 0,id,keyword,location,text,target,word_count,stop_word_count,url_count,mean_word_length,char_count,punctuation_count,hashtags,mentions,urls,tokens,vector
6818,9765,trapped,10 steps ahead cloud 9,bomb head explosive decisions dat produced mo...,1,21,6,0,5 428571428571429,134,1,,,,"[bomb, head, explos, decis, dat, produc, dead,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4842,6896,mass 20murder,huntsville al,okay not sure the word mass murder applies d...,1,19,8,1,5 421052631578948,121,8,,,https t co sb3rjqqzix,"[okay, sure, word, mass, murder, appli, war, h...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5848,8356,ruin,garrett,like why on earth would you want anybody to be...,0,17,8,0,5 0,101,2,,,,"[like, earth, would, want, anybodi, unhappi, p...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1851,2661,crush,cleveland ohio,my woman crush wedneday goes to the beautiful,0,12,3,1,8 083333333333334,108,8,loveyouuuu aintsheperty,taykreidler,https t co wemwdtfwic,"[woman, crush, wedneday, goe, beauti, loveyouu...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5729,8176,rescuers,washington,many deaths in shipwreck rescuers are try...,1,18,5,1,6 611111111111111,136,14,news,thenewshype,http t co tx51oybrn6,"[mani, death, shipwreck, rescuer, tri, save, h...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1295,1870,burned,cherry creek denver co,metal cutting sparks brush fire in brighton a...,1,21,6,1,5 666666666666667,139,7,,,http t co rj7m42atws,"[metal, cut, spark, brush, fire, brighton, bru...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5719,8162,rescuers,london,video we re picking up bodies from water re...,1,18,7,1,6 611111111111111,136,13,,,http t co puezv6bd37,"[video, pick, bodi, water, rescuer, search, hu...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4988,7116,military,usa,mike magner discusses a trust betrayed via,0,12,1,1,8 083333333333334,108,10,military veterans environment,youtube,http t co psbxl1hvu3,"[mike, magner, discuss, trust, betray, via, mi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7282,10422,whirlwind,florida,set a new record 7 states in 4 days i don t ...,0,28,12,0,4 0,139,9,,,,"[set, new, record, 7, state, 4, day, even, kno...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6918,9920,trouble,illumination,trouble trouble when i don t get my way,0,9,5,0,4 0,44,5,,,,"[troubl, troubl, get, way]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [43]:
df.head()

Unnamed: 0,id,keyword,location,text,target
6818,9765,trapped,10 Steps Ahead. Cloud 9,Bomb head? Explosive decisions dat produced mo...,1
4842,6896,mass%20murder,"Huntsville, AL",Okay not sure the word 'mass murder' applies d...,1
5848,8356,ruin,Garrett,like why on earth would you want anybody to be...,0
1851,2661,crush,"Cleveland, Ohio",My woman crush wedneday goes to the beautiful ...,0
5729,8176,rescuers,Washington,#News: 'Many deaths' in shipwreck: Rescuers ar...,1


### Validation du nettoyage

Toujours vérifier l'impact du preprocessing :

In [47]:
# Validation des résultats de nettoyage
print("🔍 Validation du nettoyage")
print("=" * 50)

# 1. Vérifier l'équilibrage des classes
print("Distribution des classes après équilibrage :")
print(df['target'].value_counts(normalize=True))
print()

# 2. Comparer longueurs avant/après nettoyage
print(f"Longueur moyenne avant nettoyage : {df['text'].str.len().mean():.1f} caractères")
print(f"Longueur moyenne après nettoyage : {data_clean['text'].str.len().mean():.1f} caractères")
print()

# 3. Exemples de transformation
print("📝 Exemples de nettoyage :")
for i in range(3):
    original = df['text'].iloc[i]
    cleaned = data_clean['text'].iloc[i]
    print(f"\nOriginal: {original}")
    print(f"Nettoyé: {cleaned}")
    print("-" * 30)

# 4. Statistiques finales
print(f"\n Dataset final :")
print(f"- Nombre de lignes : {len(data_clean)}")
print(f"- Équilibrage : {data_clean['target'].value_counts().to_dict()}")

🔍 Validation du nettoyage
Distribution des classes après équilibrage :
1    0.5
0    0.5
Name: target, dtype: float64

Longueur moyenne avant nettoyage : 103.1 caractères
Longueur moyenne après nettoyage : 78.6 caractères

📝 Exemples de nettoyage :

Original: Bomb head? Explosive decisions dat produced more dead children than dead bodies trapped tween buildings on that day in September there
Nettoyé: bomb head  explosive decisions dat produced more dead children than dead bodies trapped tween buildings on that day in september there
------------------------------

Original: Okay not sure the word 'mass murder' applies during this war but it was horrendous none the less. https://t.co/Sb3rjQqzIX
Nettoyé: okay not sure the word  mass murder  applies during this war but it was horrendous none the less 
------------------------------

Original: like why on earth would you want anybody to be unhappy don't purposely ruin somebody else's happiness
Nettoyé: like why on earth would you want anyb