In [66]:
# Importation des modules

import pandas as pd
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='french')

#Affichage de toutes les colonnes

pd.set_option('display.max_columns', 500)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lilianmarey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lilianmarey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
#Importation des données

df = pd.read_csv('data/bdd_complete2.csv', sep = ',', encoding = 'latin-1')

In [68]:
# Echantillon de la base
print(df.shape)
df.sample(5)

(499843, 26)


Unnamed: 0.1,Unnamed: 0,Orateur,Date,Nature.de.séance,Président.de.séance,Sujet.débattu,Réplique,Didascalie,Président.de.séance_propre,nom.famille,sexe,age,groupe.sigle,commissions,nb.mandats,cabcollab,duree.pol,clustRFSP,clustVEP,hautdip,naissance_an,majo,profsigni2,ID,Groupe,time_floor
311476,311477,marc le fur,2018-12-05,Programmation 2018-2022 et réforme de la justice,Présidence de M. Richard Ferrand,Discussion des articles,Nous attendons toujours des réponses du minis...,,Présidence de M. Richard Ferrand,Le Fur,M,61,LR,Finances,4,True,27,5 - Dép. sortant,5 - Dép. sortant,ENA,1956,False,cadsuppub,311477,Exp,2018-12-01
413950,413951,richard ferrand,2019-11-15,Projet de loi de finances pour 2020,Présidence de M. Hugues Renson,,Quel est lavis de la commission sur ces amen...,,Présidence de M. Hugues Renson,Ferrand,M,55,LREM,Defense,1,True,22,5 - Dép. sortant,5 - Dép. sortant,Master,1962,True,cadsupprive,413951,Exp,2019-11-01
64179,64180,cecile untermaier,2019-05-17,Transformation de la fonction publique,Présidence de M. Hugues Renson,Discussion des articles,Lessentiel a été dit au sujet de lamendemen...,,Présidence de M. Hugues Renson,Untermaier,F,66,NG,Lois,1,False,6,5 - Dép. sortant,5 - Dép. sortant,Master,1951,False,cadsuppub,64180,Exp,2019-05-01
243598,243599,jean louis touraine,2019-10-02,Bioéthique,Présidence de M. Marc Le Fur,Discussion des articles,Simplement nous avons observé que les démarch...,,Présidence de M. Marc Le Fur,Touraine,M,72,LREM,Aff. Sociales,2,False,28,5 - Dép. sortant,5 - Dép. sortant,Doct,1945,True,proflib,243599,Exp,2019-10-01
327226,327227,martial saddier,2017-11-30,Compétences des collectivités territoriales da...,Présidence de M. Marc Le Fur,Discussion des articles,Il me semblait bien que tel était le cas mon...,,Présidence de M. Marc Le Fur,Saddier,M,48,LR,DvptDurable,3,True,22,5 - Dép. sortant,5 - Dép. sortant,Pro,1969,False,EmployeInter,327227,Exp,2017-11-01


In [69]:
# L'objectif est de nettoyer le texte dit par les intervenants. On s'intéresse donc aux colonnes 'Réplique' et 'Didascalie'

df_texte = df[['Réplique', 'Didascalie']]

In [70]:
df_texte.sample(10)

Unnamed: 0,Réplique,Didascalie
93953,Fixe des objectifs<U+0080>!,
317029,Nous ne rencontrons pas les mêmes<U+0080>!,
78708,Elle est belle la République irréprochable<U...,
371561,Cest pourquoi nous ne voterons donc pas la mo...,(Applaudissements sur les bancs des groupes L...
385494,Et les profs<U+0080>?,
223458,La France fait face depuis plusieurs années a...,
450376,Je pense également à lextension du pouvoir d...,
403798,La parole est à M. Denis Masséglia.,
300055,Vous affirmez que cette majorité a supprimé l...,
200781,Cest dramatique<U+0080>!,


In [71]:
# Première correction : les apostrophes sont codées par '\x92'. On les retire. 
# On supprime aussi les codes '<U+0080>' qui sont avant les points d'exclamation
# On remplace les tirets par des espaces
# On met tout en minuscule

df_texte['Réplique'] = df_texte.apply(
    lambda row: row.Réplique.replace('\x92', '').replace('<U+0080>', '').replace('-', ' ').lower(),
    axis = 1)

df_texte['Didascalie'] = df_texte.apply(
    lambda row: str(row.Didascalie).replace('\x92', '').replace('<U+0080>', '').replace('-', ' ').lower(),
    axis = 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [72]:
# On tokenize les chaines de caractère

df_texte['tokenized_replique'] = df_texte.apply(lambda row: nltk.word_tokenize(row.Réplique, language='french'), 
                                             axis = 1)
df_texte['tokenized_didascalie'] = df_texte.apply(lambda row: nltk.word_tokenize(row.Didascalie, language='french'), 
                                             axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [73]:
# On obtient bien des listes de mots

df_texte.head(10)

# On remplace les nan par des listes vides

df_texte['tokenized_replique'] = df_texte.apply(lambda row: row.tokenized_replique if len(row.tokenized_replique) == 0 or row.tokenized_replique[0] != 'nan' else [], axis = 1)
df_texte['tokenized_didascalie'] = df_texte.apply(lambda row: row.tokenized_didascalie if len(row.tokenized_didascalie) == 0 or row.tokenized_didascalie[0] != 'nan' else [], axis = 1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [74]:
# On retire la ponctuation pour le moment

df_texte['tokenized_replique'] = df_texte.apply(lambda row: [word for word in row.tokenized_replique if word.isalpha()], 
                                             axis = 1)

df_texte['tokenized_didascalie'] = df_texte.apply(lambda row: [word for word in row.tokenized_didascalie if word.isalpha()], 
                                             axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [75]:
# On retire les stopwords

stop_words = set(stopwords.words('french'))

df_texte['tokenized_replique'] = df_texte.apply(lambda row: [word for word in row.tokenized_replique if not word in stop_words], 
                                             axis = 1)

df_texte['tokenized_didascalie'] = df_texte.apply(lambda row: [word for word in row.tokenized_didascalie if not word in stop_words], 
                                             axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [76]:
# Stemming

df_texte['stemmed_replique'] = df_texte.apply(lambda row: [stemmer.stem(word) for word in row.tokenized_replique], 
                                             axis = 1)

df_texte['stemmed_didascalie'] = df_texte.apply(lambda row: [stemmer.stem(word) for word in row.tokenized_didascalie], 
                                             axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [77]:
df_cleaned = df.copy()
df_cleaned['tokenized_replique'] = df_texte['tokenized_replique']
df_cleaned['tokenized_didascalie'] = df_texte['tokenized_didascalie']
df_cleaned['stemmed_replique'] = df_texte['stemmed_replique']
df_cleaned['stemmed_didascalie'] = df_texte['stemmed_didascalie']

In [None]:
df_cleaned.to_csv('data/data_cleaned.csv', sep = ',', encoding = 'latin-1')