In [142]:
import pandas as pd
from text_mining import TextMining
import numpy as np

### Importer le CSV

In [168]:
# Load the feature table; index_col=False tells pandas not to use the first
# column as the index. df.info() then prints the per-column summary.
features_csv = '../data/lemmatizer_features.csv'
df = pd.read_csv(features_csv, index_col=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4392 entries, 0 to 4391
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   keyword                  4392 non-null   object
 1   location                 4392 non-null   object
 2   text                     4392 non-null   object
 3   target                   4392 non-null   int64 
 4   stop_word_count          4392 non-null   int64 
 5   url_count                4392 non-null   int64 
 6   mean_word_length         4392 non-null   object
 7   char_count               4392 non-null   int64 
 8   punctuation_count        4392 non-null   int64 
 9   hashtags                 1073 non-null   object
 10  mentions                 1222 non-null   object
 11  urls                     2408 non-null   object
 12  tokens                   4392 non-null   object
 13  vector                   4392 non-null   object
 14  has_top10_hashtag        4392 non-null  

In [None]:
# Remove the 'tokens' and 'vector' columns. errors='ignore' keeps this cell
# re-runnable: without it a second execution raises KeyError because the
# columns are already gone.
df = df.drop(columns=['tokens', 'vector'], errors='ignore')
df.info()

# 'mean_word_length' is loaded with object dtype. Normalise the decimal
# separator (first space, then any comma) to '.' before casting to float.
# n= and regex= are passed explicitly: the default of `regex` differs between
# pandas versions, and both patterns are meant as literal characters.
df['mean_word_length'] = (
    df['mean_word_length']
    .astype(str)
    .str.replace(' ', '.', n=1, regex=False)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Fail loudly if the conversion did not produce a float column.
assert df['mean_word_length'].dtype == 'float64'
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4392 entries, 0 to 4391
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   keyword                  4392 non-null   object
 1   location                 4392 non-null   object
 2   text                     4392 non-null   object
 3   target                   4392 non-null   int64 
 4   stop_word_count          4392 non-null   int64 
 5   url_count                4392 non-null   int64 
 6   mean_word_length         4392 non-null   object
 7   char_count               4392 non-null   int64 
 8   punctuation_count        4392 non-null   int64 
 9   hashtags                 1073 non-null   object
 10  mentions                 1222 non-null   object
 11  urls                     2408 non-null   object
 12  has_top10_hashtag        4392 non-null   int64 
 13  has_top10_keyword        4392 non-null   int64 
 14  has_top_bigram           4392 non-null  

dtype('float64')

### Traitement des valeurs nulles ou non pertinentes

In [170]:
# Discard fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()


In [171]:
# Show the (rows, columns) tuple first, then the per-column summary.
print(f"{df.shape}")
df.info()

(4376, 16)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4376 entries, 0 to 4391
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   keyword                  4376 non-null   object 
 1   location                 4376 non-null   object 
 2   text                     4376 non-null   object 
 3   target                   4376 non-null   int64  
 4   stop_word_count          4376 non-null   int64  
 5   url_count                4376 non-null   int64  
 6   mean_word_length         4376 non-null   float64
 7   char_count               4376 non-null   int64  
 8   punctuation_count        4376 non-null   int64  
 9   hashtags                 1062 non-null   object 
 10  mentions                 1218 non-null   object 
 11  urls                     2397 non-null   object 
 12  has_top10_hashtag        4376 non-null   int64  
 13  has_top10_keyword        4376 non-null   int64  
 14  has_top_bigra

### Gestion des valeurs manquantes

Au lieu de supprimer 33% des données avec `dropna()`, considérons cette approche :

In [173]:
def replace_missing_values(series, default_value, frame=None, total_rows=7613):
    """Fill the missing values of one column in place and report what was kept.

    Filling instead of dropping keeps every row. The printed report contrasts
    the resulting row count with the number of rows that a ``dropna``
    restricted to this column would have left.

    Args:
        series: Label of the column to fill (a column name, not a Series).
        default_value: Value substituted for every missing entry.
        frame: DataFrame to modify in place. When omitted, the notebook-level
            ``df`` is used.
        total_rows: Row count used as the 100% baseline of the printed
            percentage. Defaults to 7613, the baseline previously hard-coded
            here (presumably the size of the raw dataset; confirm).

    Returns:
        None. The column is overwritten on the frame and two lines are printed.
    """
    target = df if frame is None else frame

    # Count the non-missing entries BEFORE filling: afterwards nothing is
    # missing any more and the "vs dropna" comparison would be a tautology.
    rows_with_value = int(target[series].count())
    target[series] = target[series].fillna(default_value)

    rows_kept = target.shape[0]
    print(f"Données conservées de {series} : {rows_kept} lignes (vs {rows_with_value} avec dropna)")
    print(f"Pourcentage conservé : {rows_kept/total_rows*100:.1f}%")

# Fill both free-text columns with an empty string, one report per column.
for column_name in ('location', 'keyword'):
    replace_missing_values(column_name, "")

Données conservées de location : 4376 lignes (vs 4376 avec dropna)
Pourcentage conservé : 57.5%
Données conservées de keyword : 4376 lignes (vs 4376 avec dropna)
Pourcentage conservé : 57.5%


### Proportion target

In [176]:
# Balance the two target classes by under-sampling the larger one, then shuffle.
BALANCE_SEED = 42  # fixed seed so both the sampling and the shuffle are reproducible

df_0 = df[df['target'] == 0]
df_1 = df[df['target'] == 1]

# DataFrame.sample raises ValueError when asked for more rows than the class
# holds, so cap the per-class size at the smaller of the two classes.
n_per_class = min(len(df_0), len(df_1))
df_0_sampled = df_0.sample(n=n_per_class, random_state=BALANCE_SEED)
# Class 1 is kept untouched when it is already the smaller (or equal) class,
# which reproduces the previous result exactly in that case.
df_1_sampled = (
    df_1
    if len(df_1) == n_per_class
    else df_1.sample(n=n_per_class, random_state=BALANCE_SEED)
)

df = pd.concat([df_0_sampled, df_1_sampled], axis=0).sample(frac=1, random_state=BALANCE_SEED)  # shuffle everything


In [177]:
# Relative frequency of each target value in the current frame.
class_share = df['target'].value_counts(normalize=True)
class_share

0    0.5
1    0.5
Name: target, dtype: float64

In [178]:
# Run the project's TextMining pipeline on the class-balanced frame, working
# on its "text" column. The calls are chained, so each step must return an
# object exposing the next method; get_df() ends the chain and its result is
# stored as df_clean.
# What each step does is only inferred from its name here; see
# text_mining.TextMining for the actual behaviour.
# NOTE(review): the loaded CSV already contains 'hashtags' and 'mentions'
# columns; extract_target_char("#", "hashtags") / ("@", "mentions") presumably
# recompute them. Confirm that overwriting is intended.
tm = TextMining(df, text_column="text")
df_clean = (
    tm.lowercase()
      .remove_accents()
      .extract_target_char("#", "hashtags")
      .extract_target_char("@", "mentions")
      .extract_url()
      .clean_regex(["text"])
      .tokenize()
      .remove_stopwords()
      .remove_short_tokens()
      .apply_lemmatizer()
      .build_clean_text()
      .get_df()
)

# df_clean.to_csv()

In [180]:

# Keep the 'text' column plus every column whose dtype is not 'object',
# preserving the original column order.
kept_columns = [
    column
    for column in df_clean.columns
    if column == 'text' or df_clean[column].dtype != 'object'
]
df_clean = df_clean[kept_columns]

# Preview the result (the last expression of the cell is what gets displayed).
df_clean.head()

Unnamed: 0,text,target,stop_word_count,url_count,mean_word_length,char_count,punctuation_count,has_top10_hashtag,has_top10_keyword,has_top_bigram,nb_words_in_cooc_class1
374,battle of the goats,0,2,1,7.8,44,5,0,0,0,0
4302,malaysia confirms plane debris washed up on re...,1,4,1,6.357143,102,5,0,0,0,0
2706,warne shocked over australia s epic collapse a...,1,2,1,7.5,135,13,0,0,0,0
3930,nah but srsly b4 u demolish ur partner s face ...,0,5,0,5.478261,148,5,0,0,0,0
3169,probably a dead boring 1st hour and a half aft...,0,7,0,5.055556,108,7,0,0,0,0


In [181]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
export_path = "df_clean_v1.csv"
df_clean.to_csv(export_path, index=False)