# Traitement de données textuelles en Python
## Lemmatisation, étiquetage

In [1]:
import fr_core_news_sm

Cette ligne importe un modèle du français proposé par [spaCy](http://www.spacy.io), une bibliothèque pour le traitement automatique des langues. Pour l'utiliser, il faut charger ce modèle, ce qui renvoie une "fonction" capable de traiter du texte :

In [2]:
# Load the French pipeline; the returned object is later called directly on raw text.
nlp=fr_core_news_sm.load()
print(nlp)

<spacy.lang.fr.French object at 0x7f9b8fb5a730>


In [3]:
texte="Nous allons prendre ce petit texte comme exemple. Que fait Spacy avec ?"
# Run the pipeline on the sample text; the displayed result looks like the input string.
nlp(texte)

Nous allons prendre ce petit texte comme exemple. Que fait Spacy avec ?

De prime abord, il ne s'est pas passé grand-chose. Mais en fait, ce texte a été tokenisé :

In [4]:
# Iterating over the processed text yields its tokens one by one.
for token in nlp(texte):
    print(token)

Nous
allons
prendre
ce
petit
texte
comme
exemple
.
Que
fait
Spacy
avec
?


Il a également été lemmatisé :

In [5]:
# For each token, show its surface form (token.text) next to its lemma (token.lemma_).
for token in nlp(texte):
    print(token.text,token.lemma_)

Nous nous
allons aller
prendre prendre
ce ce
petit petit
texte texte
comme comme
exemple exemple
. .
Que que
fait faire
Spacy Spacy
avec avec
? ?


Mais aussi étiqueté ! 

In [6]:
# For each token, show its surface form next to its part-of-speech tag (token.pos_).
for token in nlp(texte):
    print(token.text,token.pos_)

Nous PRON
allons VERB
prendre VERB
ce DET
petit ADJ
texte NOUN
comme ADP
exemple NOUN
. PUNCT
Que PRON
fait VERB
Spacy PROPN
avec ADP
? PUNCT


On peut aussi savoir si un token est un mot ou pas :

In [7]:
# token.is_alpha is True for words and False for punctuation such as '.' and '?'.
for token in nlp(texte):
    print(token.text,token.is_alpha)

Nous True
allons True
prendre True
ce True
petit True
texte True
comme True
exemple True
. False
Que True
fait True
Spacy True
avec True
? False


### Exercice 1
Reprendre L'Avare et analyser toutes les répliques de manière à en faire des listes de tokens, où chaque token est un dictionnaire donnant sa forme graphique, son lemme, sa partie du discours et indiquant s'il s'agit d'un mot à proprement parler.
```python
{'ACTE PREMIER': {'Scène première': [{'personnage': 'Valère',
  'réplique': [{'formeGraphique': 'Hé',
    'is_alpha': True,
    'lemme': 'hé',
    'pos': 'DET'},
   {'formeGraphique': 'quoi',
    'is_alpha': True,
    'lemme': 'quoi',
    'pos': 'PRON'},
   {'formeGraphique': '!', 
    'is_alpha': False, 
    'lemme': '!', 
    'pos': 'PUNCT'},
   {'formeGraphique': 'charmante',
    'is_alpha': True,
    'lemme': 'charmant',
    'pos': 'ADJ'},
   {'formeGraphique': 'Élise',
    'is_alpha': True,
    'lemme': 'Élise',
    'pos': 'PROPN'},
...
```

In [8]:
import json

# Read the play from disk. The context manager closes the file handle as soon
# as parsing is done. The encoding is stated explicitly because JSON
# interchange is UTF-8 and the text contains accented characters, so the
# result must not depend on the platform's default locale.
with open('LAvare.json', encoding='utf-8') as fichier:
    pièce = json.load(fichier)

In [9]:
len(pièce['ACTE PREMIER']['Scène IV'])

1

In [10]:
# Replace the raw text of every line of dialogue by its list of analysed tokens
# (surface form, part of speech, lemma, and whether the token is alphabetic).
# The isinstance guard makes the cell safe to re-run: a réplique that has
# already been converted to a list is skipped instead of being handed to
# nlp() a second time.
for acte in pièce:
    for scène in pièce[acte]:
        for réplique in pièce[acte][scène]:
            if not isinstance(réplique['réplique'], str):
                continue  # already analysed by a previous run of this cell
            réplique['réplique'] = [
                {
                    'formeGraphique': token.text,
                    'pos': token.pos_,
                    'lemme': token.lemma_,
                    'is_alpha': token.is_alpha,
                }
                for token in nlp(réplique['réplique'])
            ]

In [11]:
pièce['ACTE PREMIER']['Scène première'][0]['réplique']

[{'formeGraphique': 'Hé', 'pos': 'PROPN', 'lemme': 'Hé', 'is_alpha': True},
 {'formeGraphique': 'quoi', 'pos': 'PRON', 'lemme': 'quoi', 'is_alpha': True},
 {'formeGraphique': '!', 'pos': 'PUNCT', 'lemme': '!', 'is_alpha': False},
 {'formeGraphique': 'charmante',
  'pos': 'ADJ',
  'lemme': 'charmant',
  'is_alpha': True},
 {'formeGraphique': 'Élise',
  'pos': 'PROPN',
  'lemme': 'Élise',
  'is_alpha': True},
 {'formeGraphique': ',', 'pos': 'PUNCT', 'lemme': ',', 'is_alpha': False},
 {'formeGraphique': 'vous', 'pos': 'PRON', 'lemme': 'vous', 'is_alpha': True},
 {'formeGraphique': 'devenez',
  'pos': 'VERB',
  'lemme': 'devenir',
  'is_alpha': True},
 {'formeGraphique': 'mélancolique',
  'pos': 'NOUN',
  'lemme': 'mélancolique',
  'is_alpha': True},
 {'formeGraphique': ',', 'pos': 'PUNCT', 'lemme': ',', 'is_alpha': False},
 {'formeGraphique': 'après', 'pos': 'ADP', 'lemme': 'après', 'is_alpha': True},
 {'formeGraphique': 'les', 'pos': 'DET', 'lemme': 'le', 'is_alpha': True},
 {'formeGraph

In [12]:
# Serialise the analysed play to a JSON file.
with open('LAvare-analysé.json', mode='w') as sortie:
    json.dump(pièce, sortie)

### Exercice 2 
Quel est le nombre de tokens dans toute la pièce ?

In [11]:
# Total number of tokens in the play: add up the length of every réplique.
n = sum(
    len(réplique['réplique'])
    for acte in pièce
    for scène in pièce[acte]
    for réplique in pièce[acte][scène]
)
n

24523

### Exercice 3
Quel est le nombre de lemmes différents ?

In [15]:
# Distinct lemmas, kept in order of first appearance. dict.fromkeys()
# deduplicates with constant-time lookups, whereas testing membership in a
# growing list rescans that list for every one of the play's tokens.
lemmes = list(dict.fromkeys(
    token['lemme']
    for acte in pièce
    for scène in pièce[acte]
    for réplique in pièce[acte][scène]
    for token in réplique['réplique']
))
len(lemmes)

2309

### Exercice 4
Quel est le lemme le plus fréquent ? Pour cela, construire un dictionnaire qui associe à chaque lemme sa fréquence, écrire ce dictionnaire dans un fichier csv (quel est le bon séparateur ?), l'ouvrir dans Excel

In [18]:
# Count how many times each lemma occurs in the whole play.
frequence = {}
for acte in pièce:
    for scène in pièce[acte]:
        for réplique in pièce[acte][scène]:
            for token in réplique['réplique']:
                lemme = token['lemme']
                frequence[lemme] = frequence.get(lemme, 0) + 1

# Write one "lemma<TAB>count" line per lemma. Tab is the separator because ','
# is itself one of the lemmas. The with-block guarantees the file is flushed
# and closed even if one of the writes raises.
with open('frequence.csv','w') as f:
    for lemme in frequence:
        f.write('%s\t%d\n'%(lemme,frequence[lemme]))

La bibliothèque `Pandas` permet de manipuler des tableaux semblables aux feuilles de calcul

In [20]:
import pandas as pd

In [21]:
freq_df=pd.DataFrame({'freq':frequence})

In [23]:
freq_df.head(5)

Unnamed: 0,freq
,1
!,290
"""",31
(,17
),16


In [30]:
freq_df.sort_values(by='freq', ascending=False).head(100)

Unnamed: 0,freq
",",1574
de,1057
le,975
.,960
que,802
...,...
"""",31
quel,30
raison,30
maître,30


In [26]:
freq_df.shape

(2340, 1)

In [28]:
# freq_df.sum() returns a one-element Series. Select the 'freq' column so that
# %d receives a plain number instead of relying on an implicit
# Series-to-int conversion.
print('Il y a %d lemmes différents'%freq_df.shape[0])
print('Il y a %d tokens'%freq_df['freq'].sum())

Il y a 2340 lemmes différents
Il y a 24756 tokens


## Vocabulaire spécifique des différentes parties de la pièce

### Exercice 5
Construire un dictionnaire de dictionnaire `lemmeParActe` qui donne le nombre d'occurrences de chacun des lemmes dans chacun des actes, en ne prenant en compte que les tokens qui sont réellement des mots.
```python
{'ACTE PREMIER': {'Adieu': 1,
 'Anselme': 4,
 'Après': 1,
 'Au': 1,
 'Bon': 1,
 'Car': 1,
 'De': 4,
 'Dieu': 3,
 'Dès': 2,
 'Eh': 2,
 'Enfin': 1,
                  
...
                 }
}
```


In [31]:
# For every act, count the occurrences of each lemma, keeping only the tokens
# flagged as alphabetic.
lemmeParActe = {}
for acte, scènes in pièce.items():
    décompte = {}
    lemmeParActe[acte] = décompte
    for répliques in scènes.values():
        for réplique in répliques:
            for token in réplique['réplique']:
                if token['is_alpha']:
                    lemme = token['lemme']
                    décompte[lemme] = décompte.get(lemme, 0) + 1

On peut construire un `DataFrame` à partir de ce dictionnaire. Cette table est appelée `table lexicale` :

In [33]:
table=pd.DataFrame(lemmeParActe)
table.head()

Unnamed: 0,ACTE PREMIER,ACTE SECOND,ACTE TROISIÈME,ACTE QUATRIÈME,ACTE CINQUIÈME
Hé,6.0,3.0,2.0,4.0,4.0
quoi,12.0,3.0,,8.0,7.0
charmante,2.0,,,1.0,1.0
Élise,4.0,,,,1.0
vous,168.0,105.0,202.0,94.0,135.0


In [34]:
table=table.fillna(0)# A lemma that does not appear in an act occurs 0 times in it.
table.head()

Unnamed: 0,ACTE PREMIER,ACTE SECOND,ACTE TROISIÈME,ACTE QUATRIÈME,ACTE CINQUIÈME
Hé,6.0,3.0,2.0,4.0,4.0
quoi,12.0,3.0,0.0,8.0,7.0
charmante,2.0,0.0,0.0,1.0,1.0
Élise,4.0,0.0,0.0,0.0,1.0
vous,168.0,105.0,202.0,94.0,135.0


On peut faire des sommes le long des lignes ou des colonnes :

In [35]:
table.sum()

ACTE PREMIER      4906.0
ACTE SECOND       3951.0
ACTE TROISIÈME    3812.0
ACTE QUATRIÈME    2954.0
ACTE CINQUIÈME    3666.0
dtype: float64

In [36]:
table.sum(axis=1)

Hé             19.0
quoi           30.0
charmante       4.0
Élise           5.0
vous          704.0
              ...  
noce            1.0
allégresse      1.0
holà            2.0
écriture        2.0
payement        1.0
Length: 2278, dtype: float64

Ces décomptes bruts sont difficiles à interpréter car les actes n'ont pas tous la même taille. Une première manière de leur donner plus de sens est de les diviser par le nombre total d'occurrences de lemmes dans l'acte, pour obtenir une fréquence :

In [37]:
# Divide each column (act) by its own total so that raw counts become relative frequencies.
freq=table/table.sum()
freq.head()

Unnamed: 0,ACTE PREMIER,ACTE SECOND,ACTE TROISIÈME,ACTE QUATRIÈME,ACTE CINQUIÈME
Hé,0.001223,0.000759,0.000525,0.001354,0.001091
quoi,0.002446,0.000759,0.0,0.002708,0.001909
charmante,0.000408,0.0,0.0,0.000339,0.000273
Élise,0.000815,0.0,0.0,0.0,0.000273
vous,0.034244,0.026576,0.052991,0.031821,0.036825


Une mesure classique pour estimer l'importance d'un mot dans un document (ici, dans un acte) est tf-idf

* **tf** signifie term frequency, la fréquence d'un terme dans un document. C'est ce que l'on vient de calculer. Plus un mot est fréquent dans un document, plus il est important pour ce document. Sauf s'il est fréquent dans tous les documents...

* **idf** signifie inverse document frequency, l'inverse de la fréquence en document. La fréquence en document est la fraction des documents dans lesquels un mot apparait. Par exemple, Adieu n'apparait que dans 2 des 5 actes. Sa fréquence en document est 2/5. Plus ce nombre est petit, c'est-à-dire moins le mot est présent dans beaucoup de documents, plus il est important dans les documents dans lesquels on le trouve.

En divisant la fréquence tf par la fréquence df, on obtient une mesure de l'importance du mot dans le document. Et diviser par df correspond à multiplier par son inverse :

$$\frac{tf}{df} = tf\cdot\frac{1}{df} = tf\cdot idf$$
Avec $$idf = \frac{1}{df} = \frac{N}{N_{+}}$$
$N$ étant le nombre de documents et $N_{+}$ le nombre de documents où le terme est présent.

Généralement, on ne prend pas ce idf "brut", mais son logarithme :

$$tf \cdot idf = tf\cdot \log\left(\frac{N}{N_{+}}\right)$$

Comment calculer ce df ?

In [38]:
df=table.copy()

In [39]:
df

Unnamed: 0,ACTE PREMIER,ACTE SECOND,ACTE TROISIÈME,ACTE QUATRIÈME,ACTE CINQUIÈME
Hé,6.0,3.0,2.0,4.0,4.0
quoi,12.0,3.0,0.0,8.0,7.0
charmante,2.0,0.0,0.0,1.0,1.0
Élise,4.0,0.0,0.0,0.0,1.0
vous,168.0,105.0,202.0,94.0,135.0
...,...,...,...,...,...
noce,0.0,0.0,0.0,0.0,1.0
allégresse,0.0,0.0,0.0,0.0,1.0
holà,0.0,0.0,0.0,0.0,2.0
écriture,0.0,0.0,0.0,0.0,2.0


In [43]:
df>0

Unnamed: 0,ACTE PREMIER,ACTE SECOND,ACTE TROISIÈME,ACTE QUATRIÈME,ACTE CINQUIÈME
Hé,True,True,True,True,True
quoi,True,True,False,True,True
charmante,True,False,False,True,True
Élise,True,False,False,False,True
vous,True,True,True,True,True
...,...,...,...,...,...
noce,False,False,False,False,True
allégresse,False,False,False,False,True
holà,False,False,False,False,True
écriture,False,False,False,False,True


In [41]:
# Binarise in place: every positive count becomes 1, i.e. "the lemma is present in this act".
df[df>0]=1

In [45]:
df

Unnamed: 0,ACTE PREMIER,ACTE SECOND,ACTE TROISIÈME,ACTE QUATRIÈME,ACTE CINQUIÈME
Hé,1.0,1.0,1.0,1.0,1.0
quoi,1.0,1.0,0.0,1.0,1.0
charmante,1.0,0.0,0.0,1.0,1.0
Élise,1.0,0.0,0.0,0.0,1.0
vous,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...
noce,0.0,0.0,0.0,0.0,1.0
allégresse,0.0,0.0,0.0,0.0,1.0
holà,0.0,0.0,0.0,0.0,1.0
écriture,0.0,0.0,0.0,0.0,1.0


In [47]:
5/df.sum(axis=1)

Hé            1.000000
quoi          1.250000
charmante     1.666667
Élise         2.500000
vous          1.000000
                ...   
noce          5.000000
allégresse    5.000000
holà          5.000000
écriture      5.000000
payement      5.000000
Length: 2278, dtype: float64

In [48]:
import numpy as np

# idf = log(N / N+), where N is the number of documents (one column of `table`
# per act) and N+ the number of acts in which the lemma occurs at least once.
# Both are derived from `table`, so the number of acts is not hard-coded and
# the cell yields the same result when it is run again.
df = np.log(table.shape[1]/(table>0).sum(axis=1))

In [49]:
df

Hé            0.000000
quoi          0.223144
charmante     0.510826
Élise         0.916291
vous          0.000000
                ...   
noce          1.609438
allégresse    1.609438
holà          1.609438
écriture      1.609438
payement      1.609438
Length: 2278, dtype: float64

In [50]:
# Weight every row (lemma) of freq by that lemma's idf; axis=0 aligns df on the row index.
tfidf=freq.multiply(df,axis=0)

In [51]:
tfidf

Unnamed: 0,ACTE PREMIER,ACTE SECOND,ACTE TROISIÈME,ACTE QUATRIÈME,ACTE CINQUIÈME
Hé,0.000000,0.000000,0.0,0.000000,0.000000
quoi,0.000546,0.000169,0.0,0.000604,0.000426
charmante,0.000208,0.000000,0.0,0.000173,0.000139
Élise,0.000747,0.000000,0.0,0.000000,0.000250
vous,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...
noce,0.000000,0.000000,0.0,0.000000,0.000439
allégresse,0.000000,0.000000,0.0,0.000000,0.000439
holà,0.000000,0.000000,0.0,0.000000,0.000878
écriture,0.000000,0.000000,0.0,0.000000,0.000878


In [52]:
tfidf.sort_values(by='ACTE PREMIER', ascending=False)

Unnamed: 0,ACTE PREMIER,ACTE SECOND,ACTE TROISIÈME,ACTE QUATRIÈME,ACTE CINQUIÈME
avaricieux,0.001968,0.000000,0.000000,0.000000,0.000000
soeur,0.001868,0.000000,0.000000,0.000000,0.000250
dot,0.001681,0.000232,0.000000,0.000000,0.000000
avarice,0.001121,0.000232,0.000000,0.000000,0.000000
frère,0.001041,0.000000,0.000000,0.000173,0.000279
...,...,...,...,...,...
frais,0.000000,0.000232,0.000000,0.000000,0.000250
teint,0.000000,0.000407,0.000000,0.000000,0.000000
santé,0.000000,0.000232,0.000240,0.000000,0.000000
visage,0.000000,0.000232,0.000961,0.000000,0.000000


In [53]:
tfidf.sort_values(by='ACTE SECOND', ascending=False)

Unnamed: 0,ACTE PREMIER,ACTE SECOND,ACTE TROISIÈME,ACTE QUATRIÈME,ACTE CINQUIÈME
prêteur,0.000000,0.003259,0.0,0.0,0.000000
mille,0.000625,0.002069,0.0,0.0,0.000279
livre,0.000187,0.001855,0.0,0.0,0.000000
dessus,0.000000,0.001629,0.0,0.0,0.000000
emprunteur,0.000000,0.001629,0.0,0.0,0.000000
...,...,...,...,...,...
fouiller,0.000328,0.000000,0.0,0.0,0.000000
satisfaire,0.000187,0.000000,0.0,0.0,0.000250
poche,0.000328,0.000000,0.0,0.0,0.000000
Tenez,0.000328,0.000000,0.0,0.0,0.000000


### Exercice 6
Quel est le vocabulaire spécifique de chaque personnage ?

In [54]:
# For every speaking character, count the occurrences of each lemma, keeping
# only the tokens flagged as alphabetic.
lemmeParPersonnage = {}
for scènes in pièce.values():
    for répliques in scènes.values():
        for réplique in répliques:
            personnage = réplique['personnage']
            décompte = lemmeParPersonnage.setdefault(personnage, {})
            for token in réplique['réplique']:
                if token['is_alpha']:
                    lemme = token['lemme']
                    décompte[lemme] = décompte.get(lemme, 0) + 1

In [55]:
# Lemma-by-character count table; a lemma a character never uses counts as 0.
table = pd.DataFrame(lemmeParPersonnage).fillna(0)
table.head()

Unnamed: 0,Valère,Élise,Cléante,Harpagon,La Flèche,Maître Simon,Frosine,Maître Jacques,La Merluche,Brindavoine,Mariane,Le commissaire,Anselme
Hé,3.0,0.0,2.0,5.0,1.0,0.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0
quoi,1.0,2.0,4.0,10.0,4.0,0.0,2.0,6.0,0.0,0.0,0.0,0.0,1.0
charmante,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Élise,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
vous,98.0,54.0,129.0,124.0,41.0,13.0,81.0,83.0,2.0,3.0,38.0,16.0,22.0


In [56]:
table.shape #2278 lemmas, 13 characters

(2278, 13)

In [57]:
freq=table/table.sum()

In [58]:
# idf per lemma: log(N / N+), with N the number of characters (columns of
# `table`) and N+ the number of characters who use the lemma at least once.
# N is read from the table rather than hard-coded, so the cell stays correct
# if characters are added to or removed from the table.
df = (table>0).sum(axis=1)
df = np.log(table.shape[1]/df)

In [59]:
tfidf=freq.multiply(df,axis=0)

In [60]:
tfidf.sort_values(by='Valère', ascending=False).head(10)

Unnamed: 0,Valère,Élise,Cléante,Harpagon,La Flèche,Maître Simon,Frosine,Maître Jacques,La Merluche,Brindavoine,Mariane,Le commissaire,Anselme
Élise,0.005014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
de,0.004162,0.003432,0.003552,0.003377,0.004147,0.000889,0.003897,0.003367,0.0,0.008426,0.002448,0.002601,0.002817
dot,0.003659,0.0,0.0,0.001659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tort,0.003659,0.0,0.0,0.0,0.0,0.0,0.0,0.002423,0.0,0.0,0.0,0.0,0.0
à,0.003527,0.002665,0.002598,0.003198,0.002608,0.003712,0.0033,0.002271,0.0,0.0,0.003341,0.002412,0.004496
pour,0.003487,0.002616,0.00323,0.002465,0.002979,0.0,0.003495,0.002887,0.0,0.0,0.002161,0.001894,0.003802
juger,0.002927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003875
engagement,0.002927,0.001866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
que,0.00291,0.002554,0.003319,0.002057,0.002556,0.003557,0.002206,0.002435,0.003905,0.0,0.003108,0.001734,0.002983
le,0.002879,0.003192,0.003189,0.002639,0.003977,0.001779,0.003419,0.003886,0.0,0.002106,0.002637,0.003179,0.002652


In [61]:
table.sum()

Valère            2558.0
Élise             1003.0
Cléante           3087.0
Harpagon          5641.0
La Flèche         1409.0
Maître Simon       180.0
Frosine           2177.0
Maître Jacques    1545.0
La Merluche         41.0
Brindavoine         38.0
Mariane            850.0
Le commissaire     277.0
Anselme            483.0
dtype: float64

In [63]:
table.loc['de']

Valère            133.0
Élise              43.0
Cléante           137.0
Harpagon          238.0
La Flèche          73.0
Maître Simon        2.0
Frosine           106.0
Maître Jacques     65.0
La Merluche         0.0
Brindavoine         4.0
Mariane            26.0
Le commissaire      9.0
Anselme            17.0
Name: de, dtype: float64

Certains personnages ayant un texte très réduit, des mots pourtant très fréquents peuvent ne pas apparaitre dans leur discours. C'est le cas de `de`, par exemple, qui n'est pas prononcé par La Merluche. Cela biaise `df`, et c'est pourquoi `de` apparait comme spécifique à Valère. On peut recommencer en supprimant ces personnages.

In [64]:
# Remove the two characters with the smallest amount of text. errors='ignore'
# keeps the cell re-runnable: on a second run the columns are already gone and
# drop() would otherwise raise KeyError.
table=table.drop(columns=['La Merluche','Brindavoine'], errors='ignore')

In [65]:
table

Unnamed: 0,Valère,Élise,Cléante,Harpagon,La Flèche,Maître Simon,Frosine,Maître Jacques,Mariane,Le commissaire,Anselme
Hé,3.0,0.0,2.0,5.0,1.0,0.0,2.0,6.0,0.0,0.0,0.0
quoi,1.0,2.0,4.0,10.0,4.0,0.0,2.0,6.0,0.0,0.0,1.0
charmante,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Élise,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
vous,98.0,54.0,129.0,124.0,41.0,13.0,81.0,83.0,38.0,16.0,22.0
...,...,...,...,...,...,...,...,...,...,...,...
redonne,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
consentez,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
hyméné,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
ete,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [66]:
# Recompute tf-idf on the reduced table.
# tf: each character's counts divided by that character's total.
freq=table/table.sum()
# idf: log(N / N+), with N the number of remaining characters (columns of
# `table`) and N+ the number of them using the lemma at least once. N is read
# from the table instead of being hard-coded.
df = np.log(table.shape[1]/(table>0).sum(axis=1))
tfidf=freq.multiply(df,axis=0)

In [68]:
tfidf.sort_values(by='Valère', ascending=False).head(10)

Unnamed: 0,Valère,Élise,Cléante,Harpagon,La Flèche,Maître Simon,Frosine,Maître Jacques,Mariane,Le commissaire,Anselme
Élise,0.004687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tort,0.003332,0.0,0.0,0.0,0.0,0.0,0.0,0.002207,0.0,0.0,0.0
dot,0.003332,0.0,0.0,0.001511,0.0,0.0,0.0,0.0,0.0,0.0,0.0
juger,0.002666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003529
engagement,0.002666,0.0017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fille,0.002133,0.001209,0.000196,0.002686,0.0,0.0,0.001392,0.0,0.0,0.0,0.001255
apprenez,0.002032,0.0,0.0,0.00023,0.0,0.0,0.0,0.0,0.0,0.0,0.00269
crime,0.001999,0.0,0.0,0.001209,0.0,0.0,0.0,0.0,0.0,0.0,0.0
charmante,0.001999,0.0,0.000552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chercher,0.001977,0.0,0.000983,0.000179,0.0,0.0,0.0,0.0,0.0,0.0,0.002094


In [69]:
tfidf.sort_values(by='Harpagon', ascending=False).head(10)

Unnamed: 0,Valère,Élise,Cléante,Harpagon,La Flèche,Maître Simon,Frosine,Maître Jacques,Mariane,Le commissaire,Anselme
tu,0.0,0.000604,0.00216,0.008811,0.002151,0.0,0.000278,0.0,0.0,0.0,0.001255
te,0.0,0.0,0.000983,0.005918,0.001436,0.0,0.000465,0.0,0.0,0.0,0.0
pendard,0.0,0.0,0.0,0.003826,0.0,0.0,0.0,0.0,0.0,0.0,0.0
as,0.0,0.001009,0.000655,0.003407,0.000718,0.0,0.0,0.0,0.0,0.0,0.0
voleur,0.0,0.0,0.0,0.003401,0.0,0.0,0.0,0.0,0.0,0.0,0.0
justice,0.0,0.0,0.0,0.003401,0.0,0.0,0.0,0.0,0.0,0.0,0.0
coquin,0.0,0.0,0.0,0.003401,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ton,0.0,0.0,0.0,0.003324,0.0,0.0,0.0,0.0,0.0,0.0,0.003529
fille,0.002133,0.001209,0.000196,0.002686,0.0,0.0,0.001392,0.0,0.0,0.0,0.001255
traître,0.0,0.0,0.001104,0.002418,0.0,0.0,0.0,0.0,0.0,0.0,0.0
