In [1]:
import pandas as pd
from text_mining import TextMining
import numpy as np

### Importer le CSV

In [2]:
df = pd.read_csv('../data/train_tweets_processed.csv')

### Traitement des valeurs nuls ou non pertinentes

In [3]:
df.drop_duplicates(inplace=True)


In [4]:
print(df.shape)
df.info()

(7613, 11)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7613 entries, 0 to 7612
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 7613 non-null   int64  
 1   keyword            7552 non-null   object 
 2   location           5080 non-null   object 
 3   text               7613 non-null   object 
 4   target             7613 non-null   int64  
 5   word_count         7613 non-null   int64  
 6   stop_word_count    7613 non-null   int64  
 7   url_count          7613 non-null   int64  
 8   mean_word_length   7613 non-null   float64
 9   char_count         7613 non-null   int64  
 10  punctuation_count  7613 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 713.7+ KB


In [5]:
df.dropna(inplace=True)
print(df.shape) 
df.info()

(5080, 11)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5080 entries, 31 to 7581
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5080 non-null   int64  
 1   keyword            5080 non-null   object 
 2   location           5080 non-null   object 
 3   text               5080 non-null   object 
 4   target             5080 non-null   int64  
 5   word_count         5080 non-null   int64  
 6   stop_word_count    5080 non-null   int64  
 7   url_count          5080 non-null   int64  
 8   mean_word_length   5080 non-null   float64
 9   char_count         5080 non-null   int64  
 10  punctuation_count  5080 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 476.2+ KB


### Gestion  des valeurs manquantes

Au lieu de supprimer 33% des données avec `dropna()`, considérons cette approche :

In [6]:
def replace_missing_values(series, default_value):
    df[series] = df[series].fillna(default_value)

    print(f"Données conservées de {series} : {df.shape[0]} lignes (vs {df.shape[0]} avec dropna)")
    print(f"Pourcentage conservé : {df.shape[0]/7613*100:.1f}%")

replace_missing_values('location', "")
replace_missing_values('keyword', "")

Données conservées de location : 5080 lignes (vs 5080 avec dropna)
Pourcentage conservé : 66.7%
Données conservées de keyword : 5080 lignes (vs 5080 avec dropna)
Pourcentage conservé : 66.7%


In [7]:
df.head()

Unnamed: 0,id,keyword,location,text,target,word_count,stop_word_count,url_count,mean_word_length,char_count,punctuation_count
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,5,0,1,10.2,55,6
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,10,3,1,5.8,67,8
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,9,1,1,8.222222,82,9
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,7,3,0,4.0,34,1
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,13,5,1,4.923077,76,5


In [8]:
df['target'].value_counts(normalize=True)

0    0.567717
1    0.432283
Name: target, dtype: float64

### Proportion target

In [9]:
df_0 = df[df['target'] == 0]
df_1 = df[df['target'] == 1]

df_0_sampled = df_0.sample(n=len(df_1), random_state=42)

df = pd.concat([df_0_sampled, df_1], axis=0).sample(frac=1, random_state=42)  # shuffle le tout


In [10]:
df['target'].value_counts(normalize=True)

1    0.5
0    0.5
Name: target, dtype: float64

### Nettoyage

- Conversion en minuscules
- Suppression des URL
- mentions et hashtags
- Suppression de la ponctuation et caractères spéciaux
- Tokenisation
- Suppression des stopwords
- Lemmatisation (ou stemming)
- Vectorisation des textes

In [11]:
data_clean = df.copy()

tm = TextMining(data_clean)
data_clean = (
    tm.lowercase()
        .extract_target_char("#", "hashtags")
        .extract_target_char("@", "mentions")
        .extract_url()
        .clean_regex(['keyword','text', 'location'])
        .tokenize()
        .remove_stopwords()
        .apply_lemmatizer( 'text') # .apply_lemmatizer() ou .apply_stemmer()
        .apply_lemmatizer('keyword') # .apply_lemmatizer() ou .apply_stemmer()
        .get_df()   
)
data_clean = data_clean.drop(columns=['hashtags', 'mentions', 'urls', 'location','id'])

data_clean.to_csv("lemmatizer_tfidf.csv")
data_clean.head(10)


Unnamed: 0,keyword,text,target,word_count,stop_word_count,url_count,mean_word_length,char_count,punctuation_count,tokens
6818,"[t, r, a, p, p, e, d]","[b, o, m, b, , h, e, a, d, , , e, x, p, l, ...",1,21,6,0,5.428571,134,1,"[bomb, head, explosive, decisions, dat, produc..."
4842,"[m, a, s, s, , 2, 0, m, u, r, d, e, r]","[o, k, a, y, , n, o, t, , s, u, r, e, , t, ...",1,19,8,1,5.421053,121,8,"[okay, sure, word, mass, murder, applies, war,..."
5848,"[r, u, i, n]","[l, i, k, e, , w, h, y, , o, n, , e, a, r, ...",0,17,8,0,5.0,101,2,"[like, earth, would, want, anybody, unhappy, p..."
1851,"[c, r, u, s, h]","[m, y, , w, o, m, a, n, , c, r, u, s, h, , ...",0,12,3,1,8.083333,108,8,"[woman, crush, wedneday, goes, beautiful, love..."
5729,"[r, e, s, c, u, e, r, s]","[ , , , m, a, n, y, , d, e, a, t, h, s, , ...",1,18,5,1,6.611111,136,14,"[many, deaths, shipwreck, rescuers, trying, sa..."
1295,"[b, u, r, n, e, d]","[m, e, t, a, l, , c, u, t, t, i, n, g, , s, ...",1,21,6,1,5.666667,139,7,"[metal, cutting, sparks, brush, fire, brighton..."
5719,"[r, e, s, c, u, e, r, s]","[v, i, d, e, o, , , , w, e, , r, e, , p, ...",1,18,7,1,6.611111,136,13,"[video, picking, bodies, water, rescuers, sear..."
4988,"[m, i, l, i, t, a, r, y]","[m, i, k, e, , m, a, g, n, e, r, , d, i, s, ...",0,12,1,1,8.083333,108,10,"[mike, magner, discusses, trust, betrayed, via..."
7282,"[w, h, i, r, l, w, i, n, d]","[s, e, t, , a, , n, e, w, , r, e, c, o, r, ...",0,28,12,0,4.0,139,9,"[set, new, record, 7, states, 4, days, even, k..."
6918,"[t, r, o, u, b, l, e]","[t, r, o, u, b, l, e, , t, r, o, u, b, l, e, ...",0,9,5,0,4.0,44,5,"[trouble, trouble, get, way]"


In [12]:
tm = TextMining(df)
tm.lowercase() \
  .extract_target_char("#", "hashtags") \
  .extract_target_char("@", "mentions") \
  .extract_url() \
  .clean_regex(['keyword','text', 'location']) \
  .tokenize() \
  .remove_stopwords() \
  .apply_lemmatizer() \
  .vectorize(mode="tfidf")

# Colonnes numériques classiques
numeric_features = df.select_dtypes(include="number")  # exemple
X = np.hstack([tm.X_vectors, numeric_features.values])
y = df["target"].values


TypeError: TextMining.apply_lemmatizer() missing 1 required positional argument: 'corpus'

In [None]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4392 entries, 6818 to 2019
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   target             4392 non-null   int64  
 1   word_count         4392 non-null   int64  
 2   stop_word_count    4392 non-null   int64  
 3   url_count          4392 non-null   int64  
 4   mean_word_length   4392 non-null   float64
 5   char_count         4392 non-null   int64  
 6   punctuation_count  4392 non-null   int64  
 7   tokens             4392 non-null   object 
dtypes: float64(1), int64(6), object(1)
memory usage: 308.8+ KB


In [None]:
data_clean.to_pickle('lemmatizer_tfidf.pkl')