In [35]:
import pandas as pd
from text_mining import TextMining
import numpy as np

### Importer le CSV

In [54]:
# Load the prepared dataset. index_col=False tells pandas not to use the
# first CSV column as the index, so a default integer index is created.
csv_path = '../data/final_test.csv'
df = pd.read_csv(csv_path, index_col=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6478 entries, 0 to 6477
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   keyword                  6422 non-null   object 
 1   location                 4332 non-null   object 
 2   text                     6478 non-null   object 
 3   target                   6478 non-null   int64  
 4   stop_word_count          6478 non-null   int64  
 5   mean_word_length         6478 non-null   float64
 6   char_count               6478 non-null   int64  
 7   punctuation_count        6478 non-null   int64  
 8   hashtag_count            6478 non-null   int64  
 9   presence_url             6478 non-null   int64  
 10  hashtags                 1501 non-null   object 
 11  mentions                 1689 non-null   object 
 12  urls                     3497 non-null   object 
 13  tokens                   6478 non-null   object 
 14  clean_text              

### Traitement des valeurs nulles ou non pertinentes

In [55]:
# Remove fully duplicated rows. Rebinding the name (instead of inplace=True)
# avoids mutating the frame object in place and keeps the step chainable;
# the resulting `df` holds the same rows and index labels as before.
df = df.drop_duplicates()

In [56]:
# Show the frame's dimensions, then the per-column non-null counts and dtypes.
row_count, column_count = df.shape
print((row_count, column_count))
df.info()

(6478, 19)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6478 entries, 0 to 6477
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   keyword                  6422 non-null   object 
 1   location                 4332 non-null   object 
 2   text                     6478 non-null   object 
 3   target                   6478 non-null   int64  
 4   stop_word_count          6478 non-null   int64  
 5   mean_word_length         6478 non-null   float64
 6   char_count               6478 non-null   int64  
 7   punctuation_count        6478 non-null   int64  
 8   hashtag_count            6478 non-null   int64  
 9   presence_url             6478 non-null   int64  
 10  hashtags                 1501 non-null   object 
 11  mentions                 1689 non-null   object 
 12  urls                     3497 non-null   object 
 13  tokens                   6478 non-null   object 
 14  clean_text   

### Gestion des valeurs manquantes

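Au lieu de supprimer 33 % des données avec `dropna()` (par exemple, `location` n'est renseignée que pour 4332 lignes sur 6478), considérons cette approche : conserver toutes les lignes et écarter plus loin les colonnes incomplètes (`keyword`, `location`, `hashtags`, `mentions`, `urls`) lors de la sélection finale des colonnes.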

### Proportion target

In [57]:
# Balance the two target classes by downsampling, then shuffle the rows.
# Split the frame by class label.
df_0 = df[df['target'] == 0]
df_1 = df[df['target'] == 1]

# Downsample whichever class is larger to the size of the smaller one.
# Sampling df_0 with n=len(df_1) unconditionally raises ValueError when
# class 0 is the minority (sample() cannot draw more rows than exist
# without replacement), so the common size is computed first.
n_per_class = min(len(df_0), len(df_1))
df_0_sampled = df_0.sample(n=n_per_class, random_state=42)
# Class 1 is left untouched when it is already the smaller (or equal) class,
# which keeps the result identical to the previous behaviour in that case.
if len(df_1) == n_per_class:
    df_1_sampled = df_1
else:
    df_1_sampled = df_1.sample(n=n_per_class, random_state=42)

# Concatenate both halves and shuffle all rows with a fixed seed.
df = pd.concat([df_0_sampled, df_1_sampled], axis=0).sample(frac=1, random_state=42)


In [58]:
# Relative frequency of each target class (the values sum to 1).
target_share = df['target'].value_counts(normalize=True)
target_share

0    0.5
1    0.5
Name: target, dtype: float64

In [61]:
# Run the TextMining pipeline on `df`, using the "text" column as input.
# The steps run in the order written. Their implementations live in the
# project module text_mining (not visible here); judging by the method names
# they lowercase, strip accents, extract hashtags/mentions/URLs, apply regex
# cleaning, tokenize, drop stop words and short tokens, lemmatize, and build
# a clean_text column — TODO confirm against TextMining.
tm = TextMining(df, text_column="text")
df_clean = (
    tm.lowercase()
      .remove_accents()
      .extract_target_char("#", "hashtags")   # "#" marker -> "hashtags" (presumably the output column; verify)
      .extract_target_char("@", "mentions")   # "@" marker -> "mentions" (presumably the output column; verify)
      .extract_url()
      .clean_regex(["text"])
      .tokenize()
      .remove_stopwords()
      .remove_short_tokens()
      .apply_lemmatizer()
      .build_clean_text()
      .get_df()  # result is used as a DataFrame below (.info() is called on it)
)
# df_clean.to_csv()
# The recorded output of this cell shows hashtags, mentions and urls with no
# missing values after the pipeline.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6478 entries, 4249 to 6441
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   keyword                  6422 non-null   object 
 1   location                 4332 non-null   object 
 2   text                     6478 non-null   object 
 3   target                   6478 non-null   int64  
 4   stop_word_count          6478 non-null   int64  
 5   mean_word_length         6478 non-null   float64
 6   char_count               6478 non-null   int64  
 7   punctuation_count        6478 non-null   int64  
 8   hashtag_count            6478 non-null   int64  
 9   presence_url             6478 non-null   int64  
 10  hashtags                 6478 non-null   object 
 11  mentions                 6478 non-null   object 
 12  urls                     6478 non-null   object 
 13  tokens                   6478 non-null   object 
 14  clean_text           

In [60]:
# Keep every column whose dtype is not 'object', plus the 'text' and 'tokens'
# columns, preserving the original column order.
always_kept = ['text', 'tokens']
kept_columns = []
for column_name in df_clean.columns:
    if df_clean[column_name].dtype != 'object' or column_name in always_kept:
        kept_columns.append(column_name)
df_clean = df_clean[kept_columns]

df_clean.head(10)

Unnamed: 0,text,target,stop_word_count,mean_word_length,char_count,punctuation_count,hashtag_count,presence_url,tokens,has_top10_hashtag,has_top10_keyword,has_top_bigram,nb_words_in_cooc_class1
4249,there are people who plotted against me that a...,0,9,4.714286,79,0,0,0,"[people, plotted, still, wondering, survived]",0,0,0,0
91,windstorm board oks rate hike before change,0,1,7.272727,90,8,3,1,"[windstorm, board, ok, rate, hike, change]",0,0,0,0
5349,the road to success is paved with pennies that...,0,9,4.588235,94,2,0,0,"[road, success, paved, penny, flattened, train...",0,0,0,0
1631,just thought i d let you all know it s probabl...,0,14,4.32,132,7,0,0,"[thought, let, know, probably, good, idea, plu...",0,0,0,0
4523,rt america rt rt com eye of super typhoon soud...,1,2,8.214286,128,17,0,1,"[america, com, eye, super, typhoon, soudelor, ...",0,0,0,0
260,families to sue over legionnaires more than 40...,1,7,6.611111,136,10,1,1,"[family, sue, legionnaire, family, affected, f...",0,1,0,1
313,faan orders evacuation of abandoned aircraft a...,0,2,7.0,87,6,0,1,"[faan, order, evacuation, abandoned, aircraft,...",0,0,0,0
995,sorry for screaming at you and from the car i ...,0,10,4.619048,117,8,0,0,"[sorry, screaming, car, kinda, know, people, l...",0,0,0,0
4390,police unions retard justice amp drain gov but...,0,3,4.68,141,9,0,0,"[police, union, retard, justice, amp, drain, g...",0,0,0,0
652,carmike cinemas on antioch shooting we are gra...,1,5,6.666667,137,10,0,1,"[carmike, cinema, antioch, shooting, grateful,...",0,0,0,0


In [None]:
# df_clean.to_csv("test_to_suli.csv", index=False)