# Topic Modeling

In [1]:
import pandas as pd
from sklearn.decomposition import NMF
from IPython.display import display, HTML

## On définit ici une fonction qui aidera à afficher les résultats

In [2]:
def print_topic_word(df, nWords):
    """Display the highest-weighted words of each topic as side-by-side tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Topic-word matrix: one row per topic, one column per word.
    nWords : int
        Number of top words to show for each topic.

    Returns
    -------
    None
        The tables are rendered in the notebook through ``display(HTML(...))``.
    """
    tables = []
    for i in range(df.shape[0]):
        # Work on the `df` argument, not on a notebook-level global, so the
        # function behaves the same for whatever frame the caller passes.
        topic_column = df.iloc[i:i + 1].transpose()
        # After the transpose the single column carries the row's own label;
        # sorting by it avoids assuming rows are named 'Topic <i>'.
        top_words = topic_column.sort_values(
            by=topic_column.columns[0], ascending=False
        ).head(nWords)
        # Inline display lets the per-topic tables sit next to each other.
        styler = top_words.style.set_table_attributes(
            "style='display:inline;margin-right:20px'"
        )
        tables.append(styler._repr_html_())
    display(HTML(''.join(tables)))

## Lecture des données

In [3]:
# Load the lexical table from CSV; the first column of the file becomes the row index.
lemmesParActe = pd.read_csv('lemmesParActe.csv', index_col=0)

In [4]:
# Show the loaded table as the cell's rich output.
lemmesParActe

Unnamed: 0,Hé,quoi,charmant,Élise,vous,devenir,mélancolique,après,le,obligeante,...,verrez,sain,ete,noce,allégresse,présente,Holà,holà,écriture,payement
ACTE PREMIER,5.0,12.0,3.0,5.0,147.0,3.0,1.0,4.0,172.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACTE SECOND,3.0,3.0,1.0,0.0,95.0,0.0,0.0,4.0,163.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACTE TROISIÈME,2.0,0.0,0.0,0.0,179.0,2.0,0.0,1.0,172.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACTE QUATRIÈME,3.0,8.0,1.0,0.0,84.0,1.0,0.0,1.0,95.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACTE CINQUIÈME,3.0,7.0,1.0,1.0,125.0,0.0,0.0,7.0,137.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0


## Première tentative

In [5]:
n_topics = 3  # number of topics to extract
data = lemmesParActe  # work on the whole lexical table

# `nmf` is the object that performs the factorisation. A fixed random_state
# makes the result identical on every Restart & Run All.
nmf = NMF(n_components=n_topics, max_iter=1000, random_state=0)
doc_topic_weights = nmf.fit_transform(data)  # run the computation

topic_labels = ['Topic %d' % i for i in range(n_topics)]

# Matrix linking documents (rows) to topics (columns).
doc_topic = pd.DataFrame(doc_topic_weights, index=data.index, columns=topic_labels)
display(doc_topic)

# Matrix linking topics (rows) to words (columns).
topic_word = pd.DataFrame(nmf.components_, columns=data.columns, index=topic_labels)
print_topic_word(topic_word, nWords=15)

Unnamed: 0,Topic 0,Topic 1,Topic 2
ACTE PREMIER,1.575457,0.518555,0.246988
ACTE SECOND,18.568785,0.0,0.0
ACTE TROISIÈME,0.0,0.0,2.662023
ACTE QUATRIÈME,0.0,0.368838,0.0
ACTE CINQUIÈME,0.689899,0.433148,0.084072


Unnamed: 0,Topic 0
de,10.029642
le,8.788059
être,7.204607
et,7.179091
que,6.500333
avoir,5.885696
un,5.559334
vous,5.117207
je,4.34442
à,4.251022

Unnamed: 0,Topic 1
de,354.607794
le,275.297273
que,263.811035
être,256.661829
et,244.017429
vous,244.005261
je,233.994818
avoir,190.949131
un,170.155734
à,162.157678

Unnamed: 0,Topic 2
de,72.994085
vous,67.216092
le,64.666735
je,47.259451
que,46.120919
être,44.905495
et,39.200802
un,35.087586
il,30.58657
avoir,27.493543


#### Ces résultats se comprennent de la manière suivante

In [6]:
mot = 'maison'
acte = 'ACTE PREMIER'
# Approximate count rebuilt from the factorisation: sum over every topic of
# (weight of the topic in the act) * (weight of the word in the topic).
# The dot product covers all topics, whatever value n_topics was set to.
doc_topic.loc[acte].dot(topic_word[mot])

1.9638227002449928

In [7]:
# Value stored in the original table for the same act and word, for comparison.
lemmesParActe.loc[acte,mot]

2.0

#### Que peut-on en dire ? Que peut-on faire de mieux ?

# Deuxième essai, sélection des mots par leur *document frequency*

In [8]:
# Document frequency: for each column, the number of rows holding a strictly positive value.
doc_freq = lemmesParActe.gt(0).sum()
doc_freq

Hé          5
quoi        4
charmant    4
Élise       2
vous        5
           ..
présente    1
Holà        1
holà        1
écriture    1
payement    1
Length: 2237, dtype: int64

In [9]:
nDocMin = 2  # minimum number of documents a word must appear in
nDocMax = 4  # maximum number of documents a word may appear in
n_topics = 3  # number of topics to extract

# Keep only the columns whose document frequency lies in [nDocMin, nDocMax]
# (both bounds inclusive), using a single mask on the original columns.
data = lemmesParActe.loc[:, doc_freq.between(nDocMin, nDocMax)]

# A fixed random_state makes the factorisation reproducible across runs.
nmf = NMF(n_components=n_topics, max_iter=1000, random_state=0)
doc_topic_weights = nmf.fit_transform(data)

topic_labels = ['Topic %d' % i for i in range(n_topics)]

# Matrix linking documents (rows) to topics (columns).
doc_topic = pd.DataFrame(doc_topic_weights, index=data.index, columns=topic_labels)
display(doc_topic)

# Matrix linking topics (rows) to words (columns).
topic_word = pd.DataFrame(nmf.components_, columns=data.columns, index=topic_labels)
print_topic_word(topic_word, nWords=20)

Unnamed: 0,Topic 0,Topic 1,Topic 2
ACTE PREMIER,2.664794,0.0,0.0
ACTE SECOND,0.0,1.278971,0.0
ACTE TROISIÈME,0.0,0.0,1.684227
ACTE QUATRIÈME,0.772965,0.0,0.479021
ACTE CINQUIÈME,1.265877,0.240431,0.012738


Unnamed: 0,Topic 0
quoi,4.950979
ciel,3.664826
grand,3.449765
frère,3.220354
soeur,3.001148
sentiment,2.813979
peine,2.643098
dot,2.568946
moins,2.44994
épouser,2.363513

Unnamed: 0,Topic 1
mille,10.900446
grand,10.79732
an,7.257943
livre,6.025471
tant,5.557106
franc,5.369642
trois,5.285352
Frosine,5.279538
moins,4.79725
Dieu,4.654002

Unnamed: 0,Topic 2
monsieur,9.408094
Jacques,6.940139
manger,6.59062
toujours,4.832137
trop,4.623809
grand,3.973662
Frosine,3.759081
quell,3.354959
enfin,3.279658
impertinent,3.276731


# Refaire en ne prenant que les noms, non pas par acte mais par personnage