In [51]:
import pandas as pd
from text_mining import TextMining
import numpy as np

### Importer le CSV

In [52]:
# Load the pre-processed training tweets; index_col=False tells pandas not to
# use the first CSV column as the index.
df = pd.read_csv(
    filepath_or_buffer='../data/train_tweets_processed.csv',
    index_col=False,
)
# Print column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   keyword            7552 non-null   object 
 1   location           5080 non-null   object 
 2   text               7613 non-null   object 
 3   target             7613 non-null   int64  
 4   word_count         7613 non-null   int64  
 5   stop_word_count    7613 non-null   int64  
 6   mean_word_length   7613 non-null   float64
 7   char_count         7613 non-null   int64  
 8   punctuation_count  7613 non-null   int64  
 9   presence_url       7613 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 594.9+ KB


### Traitement des valeurs nulles ou non pertinentes (suppression des doublons)

In [53]:
# Drop rows that are exact duplicates across all columns. The result is
# rebound instead of using inplace=True, so the step stays chainable and does
# not mutate the object in place. Original row labels are kept (no reset_index).
df = df.drop_duplicates()

In [54]:
# Print the (rows, columns) tuple of the frame, then its per-column summary.
print((len(df.index), len(df.columns)))
df.info()

(7561, 10)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7561 entries, 0 to 7612
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   keyword            7500 non-null   object 
 1   location           5061 non-null   object 
 2   text               7561 non-null   object 
 3   target             7561 non-null   int64  
 4   word_count         7561 non-null   int64  
 5   stop_word_count    7561 non-null   int64  
 6   mean_word_length   7561 non-null   float64
 7   char_count         7561 non-null   int64  
 8   punctuation_count  7561 non-null   int64  
 9   presence_url       7561 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 649.8+ KB


### Gestion des valeurs manquantes

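Au lieu de supprimer environ 33 % des lignes avec `dropna()` (la colonne `location` n'est renseignée que pour 5061 lignes sur 7561), on conserve ici toutes les lignes : la colonne `location` est écartée plus bas, lors de la sélection des colonnes, et les valeurs manquantes de `keyword` sont laissées telles quelles.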

### Proportion target

In [55]:
# Balance the two target classes by random down-sampling, then shuffle.
df_0 = df[df['target'] == 0]
df_1 = df[df['target'] == 1]

# Down-sample whichever class is larger to the size of the smaller one.
# Sampling class 0 with n=len(df_1) directly raises ValueError when class 0 is
# the minority; using the smaller class size avoids that. When class 0 is the
# larger (or equal) class, the result is the same as sampling n=len(df_1).
n_per_class = min(len(df_0), len(df_1))
df_0_sampled = df_0.sample(n=n_per_class, random_state=42)
if len(df_1) == n_per_class:
    df_1_sampled = df_1  # already the minority (or tied) class: keep as is
else:
    df_1_sampled = df_1.sample(n=n_per_class, random_state=42)

# Concatenate both classes and shuffle all rows (fixed seed for reproducibility).
df = pd.concat([df_0_sampled, df_1_sampled], axis=0).sample(frac=1, random_state=42)


In [56]:
# Relative frequency of each target class after balancing.
df.loc[:, 'target'].value_counts(normalize=True)

0    0.5
1    0.5
Name: target, dtype: float64

In [57]:
# Run the project's TextMining pipeline on the "text" column of the balanced frame.
# NOTE(review): step semantics are inferred from the method names; confirm
# against the text_mining module.
tm = TextMining(df, text_column="text")

# Stage 1: case/accent normalisation and extraction of '#', '@' and URL items.
extracted = (
    tm.lowercase()
      .remove_accents()
      .extract_target_char("#", "hashtags")
      .extract_target_char("@", "mentions")
      .extract_url()
)

# Stage 2: regex cleaning of "text", tokenisation and token filtering.
tokenized = (
    extracted.clean_regex(["text"])
             .tokenize()
             .remove_stopwords()
             .remove_short_tokens()
             .apply_lemmatizer()
)

# Stage 3: build the cleaned text and retrieve the resulting DataFrame.
df_clean = tokenized.build_clean_text().get_df()
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6478 entries, 1780 to 4286
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   keyword            6422 non-null   object 
 1   location           4332 non-null   object 
 2   text               6478 non-null   object 
 3   target             6478 non-null   int64  
 4   word_count         6478 non-null   int64  
 5   stop_word_count    6478 non-null   int64  
 6   mean_word_length   6478 non-null   float64
 7   char_count         6478 non-null   int64  
 8   punctuation_count  6478 non-null   int64  
 9   presence_url       6478 non-null   int64  
 10  hashtags           6478 non-null   object 
 11  mentions           6478 non-null   object 
 12  urls               6478 non-null   object 
 13  tokens             6478 non-null   object 
 14  clean_text         6478 non-null   object 
dtypes: float64(1), int64(6), object(8)
memory usage: 809.8+ KB


In [58]:
# Keep every non-object column, plus only the object columns listed below;
# any other object-typed column is dropped. Column order is preserved.
kept_object_columns = ['text', 'hashtags', 'mentions', 'tokens', 'keyword', 'clean_text']

selected_columns = []
for column in df_clean.columns:
    if df_clean[column].dtype != 'object' or column in kept_object_columns:
        selected_columns.append(column)

df_clean = df_clean[selected_columns]


In [59]:


# Preview the first ten rows of the cleaned frame.
df_clean.head(n=10)

Unnamed: 0,keyword,text,target,word_count,stop_word_count,mean_word_length,char_count,punctuation_count,presence_url,hashtags,mentions,tokens,clean_text
1780,crash,motogp indianapolis espargaro layout worrie...,0,11,2,8.909091,108,15,1,,,"[motogp, indianapolis, espargaro, layout, worr...",motogp indianapolis espargaro layout worry little
2806,disaster,rt the devereaux disaster exciting scifi,0,11,1,9.0,109,15,1,"thriller, scifi, kindle",amznfavorites,"[devereaux, disaster, exciting, scifi]",devereaux disaster exciting scifi
7156,war%20zone,this bed looks like a war zone,0,7,3,3.571429,31,1,0,,,"[bed, look, like, war, zone]",bed look like war zone
754,blew%20up,i hate people who tweet receipts but know it...,0,25,11,4.36,134,5,0,,,"[hate, people, tweet, receipt, know, wrong, wo...",hate people tweet receipt know wrong wont take...
5433,police,police kill hatchet wielding gunman who opened...,1,15,1,8.266667,138,11,1,,,"[police, kill, hatchet, wielding, gunman, open...",police kill hatchet wielding gunman opened fir...
271,annihilation,u s national park services tonto national fore...,1,19,3,6.315789,138,11,1,,change,"[national, park, service, tonto, national, for...",national park service tonto national forest st...
1607,collapse,and by doing this you re enabling the possible...,0,17,8,7.0,135,7,0,,"marvel, dccomics, imagecomics, darkhorsecomics...","[enabling, possible, collapse, industry]",enabling possible collapse industry
3249,engulfed,do you feel engulfed with low self image take...,0,12,4,7.583333,102,13,1,,,"[feel, engulfed, low, self, image, take, quiz]",feel engulfed low self image take quiz
6574,survivors,survivors remorse is good,0,4,1,5.5,25,0,0,,,"[survivor, remorse, good]",survivor remorse good
1073,bomb,umm because a gun stopped the gunman with who ...,1,14,7,4.928571,82,2,0,,smallforestelf,"[umm, gun, stopped, gunman, carrying, bomb]",umm gun stopped gunman carrying bomb


In [60]:
# Write the cleaned frame to CSV without the row index.
# NOTE(review): the `tokens` column appears to hold Python lists (see the
# preview above); CSV stores them as their string representation, so they will
# need to be parsed again after reloading. Confirm with downstream consumers.
df_clean.to_csv(
    path_or_buf="../data/clean_train_tweets.csv",
    index=False,
)