In [3]:
import pandas as pd
from text_minning import TextMining

### Importer le CSV

In [4]:
# Load the tweet dataset; the relative path is resolved against the
# current working directory of the kernel.
df = pd.read_csv('train_tweets_processed.csv')

### Traitement des valeurs nulles ou non pertinentes

In [5]:
# Remove duplicated rows in place. No `subset` is given, so a row is only
# considered a duplicate when every column matches.
df.drop_duplicates(inplace=True)


In [6]:
# Report (rows, columns) after de-duplication.
print(df.shape)

(7561, 10)


In [7]:
# Drop, in place, every row that has a missing value in ANY column, then
# show the remaining shape and the per-column summary.
# NOTE(review): the recorded outputs show the frame going from 7561 to 5061
# rows here, and the fillna-based cell further down runs on the already
# reduced frame, so it cannot keep any of the dropped rows. If the intent
# is to keep them, this dropna() should be removed or limited with
# `subset=` — confirm with the author.
df.dropna(inplace=True)
print(df.shape) 
df.info()

(5061, 10)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5061 entries, 31 to 7581
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   keyword            5061 non-null   object 
 1   location           5061 non-null   object 
 2   text               5061 non-null   object 
 3   target             5061 non-null   int64  
 4   word_count         5061 non-null   int64  
 5   stop_word_count    5061 non-null   int64  
 6   url_count          5061 non-null   int64  
 7   mean_word_length   5061 non-null   float64
 8   char_count         5061 non-null   int64  
 9   punctuation_count  5061 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 434.9+ KB


### Gestion des valeurs manquantes

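Au lieu de supprimer 33 % des données avec `dropna()`, considérons cette approche : remplacer les valeurs manquantes par une valeur par défaut. Attention : le `dropna()` de la cellule précédente a déjà été exécuté, donc le remplacement ci-dessous ne conserve aucune ligne supplémentaire (5061 lignes dans les deux cas) ; pour réellement conserver les données, il doit être exécuté à la place du `dropna()`, et non après.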

In [8]:
def replace_missing_values(series, default_value, frame=None, total_rows=7613):
    """Fill the missing values of one column in place and report the rows kept.

    The report compares the number of rows kept by filling (all of them)
    with the number of rows that a ``dropna()`` on this column would have
    kept, measured *before* the fill so that the comparison is meaningful.

    Args:
        series: Name of the column to fill (a column label, not a Series).
        default_value: Value written in place of each missing entry.
        frame: DataFrame to modify. When omitted, the notebook-level ``df``
            is used, which is what the original implementation always did.
        total_rows: Denominator of the printed percentage. Defaults to 7613,
            the constant previously hard-coded here (presumably the size of
            the raw dataset — confirm).

    Returns:
        None. The column is overwritten on the frame and two lines are
        printed to stdout.
    """
    target = df if frame is None else frame

    # Must be counted before filling: afterwards no value is missing any
    # more and both sides of the comparison would show the same number.
    rows_after_dropna = int(target[series].notna().sum())

    target[series] = target[series].fillna(default_value)

    rows_kept = target.shape[0]
    print(f"Données conservées de {series} : {rows_kept} lignes (vs {rows_after_dropna} avec dropna)")
    print(f"Pourcentage conservé : {rows_kept/total_rows*100:.1f}%")

# Fill the two free-text columns with an empty string.
# NOTE(review): the recorded output reports 5061 rows for both calls, the
# same count as after the earlier dropna() — confirm whether these calls
# still have anything left to fill.
replace_missing_values('location', "")
replace_missing_values('keyword', "")

Données conservées de location : 5061 lignes (vs 5061 avec dropna)
Pourcentage conservé : 66.5%
Données conservées de keyword : 5061 lignes (vs 5061 avec dropna)
Pourcentage conservé : 66.5%


In [9]:
# Preview the first five rows after the missing-value handling.
df.head()

Unnamed: 0,keyword,location,text,target,word_count,stop_word_count,url_count,mean_word_length,char_count,punctuation_count
31,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,5,0,1,10.2,55,6
32,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,10,3,1,5.8,67,8
33,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,9,1,1,8.222222,82,9
34,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,7,3,0,4.0,34,1
35,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,13,5,1,4.923077,76,5


In [10]:
# Share of each class label (normalize=True returns fractions, not counts).
df['target'].value_counts(normalize=True)

0    0.568465
1    0.431535
Name: target, dtype: float64

### Proportion target

In [11]:
# Split the frame by class label.
df_1 = df[df['target'].eq(1)]
df_0 = df[df['target'].eq(0)]

# Undersample class 0 down to the size of class 1 (class 0 is the larger
# one according to the distribution shown just above); seeded for
# reproducibility.
df_0_sampled = df_0.sample(n=df_1.shape[0], random_state=42)

# Stack both classes, then shuffle every row with a fixed seed so the
# classes are not grouped together.
df = pd.concat([df_0_sampled, df_1]).sample(frac=1, random_state=42)


In [12]:
# Re-check the class shares after undersampling.
df['target'].value_counts(normalize=True)

0    0.5
1    0.5
Name: target, dtype: float64

In [13]:
# Number of rows per distinct keyword, most frequent first.
df['keyword'].value_counts()

outbreak                 31
collision                31
evacuation               29
derailed                 29
buildings%20on%20fire    29
                         ..
detonation                8
epicentre                 8
razed                     7
radiation%20emergency     6
inundation                4
Name: keyword, Length: 221, dtype: int64

### Nettoyage

- Conversion en minuscules
- Suppression des URL
- Extraction des mentions et hashtags
- Suppression de la ponctuation et caractères spéciaux
- Tokenisation
- Suppression des stopwords
- Lemmatisation (ou stemming)
- Vectorisation des textes

In [14]:
# Run the cleaning pipeline on a copy so that `df` keeps the raw text for
# the before/after comparison further down.
# NOTE(review): in the recorded output, `mean_word_length` is displayed as
# e.g. "6 611111111111111" and the keyword "forest%20fire" as
# "forest 20fire" — it looks like one of the steps (clean_regex?) is applied
# to columns other than `text`; confirm in TextMining.
data_clean = df.copy()
tm = TextMining(data_clean)
data_clean = (
    tm.lowercase()
        .extract_target_char("#", "hashtags")
        .extract_target_char("@", "mentions")
        .extract_url()
        .clean_regex()
        .tokenize()
        .remove_stopwords()
        .apply_lemmatizer() # use either .apply_lemmatizer() or .apply_stemmer()
        .vectorize(mode="tfidf") # mode="bow" or mode="tfidf"
        .get_df()   
)
# Presumably writes the processed frame to disk — verify against TextMining.
tm.export_csv("lemmatizer_tfidf.csv")
data_clean.head(10)


Unnamed: 0,keyword,location,text,target,word_count,stop_word_count,url_count,mean_word_length,char_count,punctuation_count,hashtags,mentions,urls,tokens,vector
4663,inundated,united states,data overload the growing demand for context ...,0,18,6,1,6 611111111111111,136,12,tech news,,http t co s0ctcqjvjx,"[data, overload, growing, demand, context, str...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
417,arsonist,bleak house,arson suspect linked to 30 fires caught in nor...,1,11,2,1,6 818181818181818,85,5,,,http t co mmgsyahdzb,"[arson, suspect, linked, 30, fire, caught, nor...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4110,hailstorm,kicking horse pass,omg nixon lives that is richard m nixon tric...,1,22,7,1,5 318181818181818,138,12,,,http t co miusvpxqte,"[omg, nixon, life, richard, nixon, tricky, dic...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3420,explode,dallas tx,that s one way to make their heads explode,0,11,3,0,6 0,76,7,,deniseromano megynkelly gop,,"[one, way, make, head, explode]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
899,bloody,level 3 garrison sector g,bloody hell what a day i haven t even really ...,0,26,11,0,4 423076923076923,140,9,,,,"[bloody, hell, day, even, really, done, anythi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1954,cyclone,republic of the philippines,a new tropical cyclone is forming near guam on...,1,17,7,0,4 352941176470588,91,4,,,,"[new, tropical, cyclone, forming, near, guam, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1732,collided,see the barn of bleakness,omg omg omg and have collided in a nuclear acc...,0,17,5,1,6 764705882352941,132,11,justinbieber harrystyles cern harrybecareful,,http t co p4huqundqi,"[omg, omg, omg, collided, nuclear, accident, o...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4030,forest 20fire,redding california usa,view of smoke column rising above the south en...,1,19,5,1,5 526315789473684,123,7,routecomplex,,http t co yqg5pvw5gx,"[view, smoke, column, rising, south, end, blak...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
446,attack,dallas tx,stay vigilent civil liberties are under const...,1,11,2,1,8 727272727272727,107,9,nativehuman myreligion,,https t co wwu070tjej,"[stay, vigilent, civil, liberty, constant, att...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2544,destroy,virginia usa,destroy the free fandom honestly,1,5,1,0,5 6,32,0,,,,"[destroy, free, fandom, honestly]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [15]:
# Show the first rows of the un-cleaned frame for comparison with the
# cleaned preview above.
df.head()

Unnamed: 0,keyword,location,text,target,word_count,stop_word_count,url_count,mean_word_length,char_count,punctuation_count
4663,inundated,United States,#tech Data Overload: The Growing Demand for Co...,0,18,6,1,6.611111,136,12
417,arsonist,Bleak House,Arson suspect linked to 30 fires caught in Nor...,1,11,2,1,6.818182,85,5
4110,hailstorm,Kicking Horse Pass,OMG NIXON LIVES! That is Richard M. Nixon Tric...,1,22,7,1,5.318182,138,12
3420,explode,"Dallas, TX",@deniseromano @megynkelly @GOP That's one way ...,0,11,3,0,6.0,76,7
899,bloody,"Level 3 Garrison, Sector G",Bloody hell what a day. I haven't even really ...,0,26,11,0,4.423077,140,9


### Validation du nettoyage

Toujours vérifier l'impact du preprocessing :

In [16]:
# Sanity checks on the cleaning output: `df` holds the raw text,
# `data_clean` the cleaned text.
print("🔍 Validation du nettoyage")
print("=" * 50)

# 1. Class balance after undersampling
print("Distribution des classes après équilibrage :")
print(df['target'].value_counts(normalize=True), end="\n\n")

# 2. Mean text length before vs. after cleaning
print(f"Longueur moyenne avant nettoyage : {df['text'].str.len().mean():.1f} caractères")
print(f"Longueur moyenne après nettoyage : {data_clean['text'].str.len().mean():.1f} caractères", end="\n\n")

# 3. Side-by-side examples for the first three rows (matched by position)
print("📝 Exemples de nettoyage :")
for i in range(3):
    original, cleaned = df['text'].iloc[i], data_clean['text'].iloc[i]
    print(f"\nOriginal: {original}", f"Nettoyé: {cleaned}", "-" * 30, sep="\n")

# 4. Final summary
print(
    f"\n Dataset final :",
    f"- Nombre de lignes : {len(df)}",
    f"- Équilibrage : {df['target'].value_counts().to_dict()}",
    sep="\n",
)

🔍 Validation du nettoyage
Distribution des classes après équilibrage :
0    0.5
1    0.5
Name: target, dtype: float64

Longueur moyenne avant nettoyage : 103.1 caractères
Longueur moyenne après nettoyage : 78.6 caractères

📝 Exemples de nettoyage :

Original: #tech Data Overload: The Growing Demand for Context and Structure: In a world inundated with information... http://t.co/s0ctCQJvjX #news
Nettoyé: data overload  the growing demand for context and structure  in a world inundated with information 
------------------------------

Original: Arson suspect linked to 30 fires caught in Northern California http://t.co/mmGsyAHDzb
Nettoyé: arson suspect linked to 30 fires caught in northern california
------------------------------

Original: OMG NIXON LIVES! That is Richard M. Nixon Tricky Dicky right there in the picture isn't it. Hiding in Calgary he... http://t.co/MIUsvPxQTE
Nettoyé: omg nixon lives  that is richard m  nixon tricky dicky right there in the picture isn t it  hiding in ca