In [142]:
import pandas as pd
from text_mining import TextMining
import numpy as np

### Importer le CSV

In [143]:
df = pd.read_csv('../data/train_tweets_processed.csv', index_col=False)

### Traitement des valeurs nuls ou non pertinentes

In [144]:
df.drop_duplicates(inplace=True)


In [145]:
print(df.shape)
df.info()

(7613, 11)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7613 entries, 0 to 7612
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 7613 non-null   int64  
 1   keyword            7552 non-null   object 
 2   location           5080 non-null   object 
 3   text               7613 non-null   object 
 4   target             7613 non-null   int64  
 5   word_count         7613 non-null   int64  
 6   stop_word_count    7613 non-null   int64  
 7   url_count          7613 non-null   int64  
 8   mean_word_length   7613 non-null   float64
 9   char_count         7613 non-null   int64  
 10  punctuation_count  7613 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 713.7+ KB


### Gestion  des valeurs manquantes

Au lieu de supprimer 33% des données avec `dropna()`, considérons cette approche :

In [146]:
def replace_missing_values(series, default_value):
    df[series] = df[series].fillna(default_value)

    print(f"Données conservées de {series} : {df.shape[0]} lignes (vs {df.shape[0]} avec dropna)")
    print(f"Pourcentage conservé : {df.shape[0]/7613*100:.1f}%")

replace_missing_values('location', "")
replace_missing_values('keyword', "")

Données conservées de location : 7613 lignes (vs 7613 avec dropna)
Pourcentage conservé : 100.0%
Données conservées de keyword : 7613 lignes (vs 7613 avec dropna)
Pourcentage conservé : 100.0%


### Proportion target

In [147]:
df_0 = df[df['target'] == 0]
df_1 = df[df['target'] == 1]

df_0_sampled = df_0.sample(n=len(df_1), random_state=42)

df = pd.concat([df_0_sampled, df_1], axis=0).sample(frac=1, random_state=42)  # shuffle le tout


In [148]:
df['target'].value_counts(normalize=True)

0    0.5
1    0.5
Name: target, dtype: float64

In [149]:
tm = TextMining(df, text_column="text")
df_clean = (
    tm.lowercase()
      .remove_accents()
      .extract_target_char("#", "hashtags")
      .extract_target_char("@", "mentions")
      .extract_url()
      .clean_regex(["text"])
      .tokenize()
      .remove_stopwords()
      .remove_short_tokens()
      .apply_lemmatizer()
      .build_clean_text()
      .get_df()
)

# df_clean.to_csv()

In [150]:

df_clean = df_clean[[c for c in df_clean.columns 
                    if df_clean[c].dtype != 'object' or c == 'text']]
df_clean = df_clean.drop(columns='id')
df_clean.head(10)
# df_clean.to_csv("df_clean_v1.csv", index=False)
df_clean.head()

Unnamed: 0,text,target,word_count,stop_word_count,url_count,mean_word_length,char_count,punctuation_count
4676,beyond all bounds till inundation rise,0,6,1,0,5.666667,39,1
2423,three episodes left the end is nigh,0,7,2,0,4.142857,35,0
2568,industry tryna destroy cause he exposed they c...,0,10,2,0,5.7,66,4
2760,nepal rebuilding lives and livelihoods afte...,1,10,2,1,9.4,103,8
7088,diageo s ceo stresses that a board revolt at u...,0,16,5,1,6.375,117,6
