# Étude des sujets des amendements PLFSS (LDA)

## Calcul des thématiques

Basé sur https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/

In [None]:
#!python -m pip install --upgrade pip

In [None]:
#!pip install -q --upgrade pandas scikit-learn

In [None]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
# autoreload mode 2: re-import edited modules before each cell is executed
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures directly below the cell that produces them
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd
# Load the amendments table from a gzip-compressed CSV (compression is inferred
# from the .gz extension). The `txt_sans_stopword` column is the text used for
# topic modelling below — presumably already stripped of stop words; confirm
# against the notebook that produced this file.
amdt = pd.read_csv('./data/amdt_sans_stopword.csv.gz')

# Analyse des thématiques par LDA

LDA : https://fr.wikipedia.org/wiki/Allocation_de_Dirichlet_latente

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

NUM_TOPICS = 15
# Fixed seed: the online variational Bayes fit starts from a random
# initialisation, so without it the topics (and their numbering) change on
# every Restart & Run All, and the exported model is not reproducible.
RANDOM_STATE = 0

# Stricter vectorizer kept for reference (disabled).
# NOTE(review): scikit-learn only ships an 'english' stop-word list, so
# stop_words='french' would have to be replaced by an explicit list of words;
# the ASCII-only token_pattern also does not match accented French letters.
# vectorizer = CountVectorizer(min_df=5, max_df=0.9,
#                              stop_words='french', lowercase=True,
#                              token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
vectorizer = CountVectorizer()
# Sparse document-term count matrix: one row per amendment, one column per token
data_vectorized = vectorizer.fit_transform(amdt["txt_sans_stopword"])

# Build a Latent Dirichlet Allocation model.
# n_jobs=-1 uses every available CPU core.
lda_model = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
# Document-topic matrix: one row per amendment, one column per topic
lda_Z = lda_model.fit_transform(data_vectorized)


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored by the search. It is currently reduced to a
# single combination; widen `n_components` / `learning_decay` for a real search.
search_params = {
    'n_components': list(range(3, 4)),
    'learning_decay': [.5],  # [.5, .7, .9]
    'max_iter': [10],
    'learning_method': ['online'],
    'n_jobs': [-1],
}

# Base estimator. The seed is fixed so that the cross-validated scores, and
# therefore the selected "best" model, are reproducible across kernel restarts.
lda = LatentDirichletAllocation(random_state=0)

# Cross-validated search; candidates are ranked by LDA's own `score`
# (approximate log-likelihood), since no `scoring` is given.
model = GridSearchCV(lda, param_grid=search_params)

# Run the search on the document-term matrix
model.fit(data_vectorized)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5], 'learning_method': ['online'],
                         'max_iter': [10], 'n_components': [3],
                         'n_jobs': [-1]})

In [None]:
# Estimator selected by the grid search
best_lda_model = model.best_estimator_

# Parameter combination that obtained the best score
winning_params = model.best_params_
print("Best Model's Params: ", winning_params)

# Best mean cross-validated score (log-likelihood for LDA)
mean_cv_score = model.best_score_
print("Best Log Likelihood Score: ", mean_cv_score)

# Perplexity of the selected model on the full document-term matrix
perplexity = best_lda_model.perplexity(data_vectorized)
print("Model Perplexity: ", perplexity)

Best Model's Params:  {'learning_decay': 0.5, 'learning_method': 'online', 'max_iter': 10, 'n_components': 3, 'n_jobs': -1}
Best Log Likelihood Score:  -919354.8796033779
Model Perplexity:  2252.043944176753


In [None]:
# Full cross-validation report: fit/score timings, the parameter values tried,
# and the per-split test scores
model.cv_results_

{'mean_fit_time': array([16.50784359]),
 'std_fit_time': array([0.73690747]),
 'mean_score_time': array([0.13072748]),
 'std_score_time': array([0.01201381]),
 'param_learning_decay': masked_array(data=[0.5],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_learning_method': masked_array(data=['online'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_iter': masked_array(data=[10],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_components': masked_array(data=[3],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_jobs': masked_array(data=[-1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'learning_decay': 0.5,
   'learning_method': 'online',
   'max_iter': 10,
   'n_components': 3,
   'n_jobs': -1}],
 'split0_test_score': array([-994870.41586479]),
 'split1_test_score': array(

In [None]:
# `gscore` was not defined anywhere in the notebook (it only existed as leftover
# kernel state); bind it explicitly to the grid-search report so this cell
# survives Restart & Run All.
gscore = model.cv_results_
# learning_decay of the first parameter combination that was evaluated
gscore['params'][0]['learning_decay']

0.5

In [None]:
# NOTE(review): this disabled cell cannot simply be uncommented:
#   - `model.grid_scores_` is no longer available in recent scikit-learn
#     releases; the same information lives in `model.cv_results_`;
#   - `n_topics` below does not match the grid actually searched above
#     (`range(3, 4)`), and `plt` is never imported in this notebook.
# # Get Log Likelyhoods from Grid Search Output
# n_topics = [10, 15, 20, 25, 30]
# log_likelyhoods_5 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.5]
# log_likelyhoods_7 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.7]
# log_likelyhoods_9 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.9]

# # Show graph
# plt.figure(figsize=(12, 8))
# plt.plot(n_topics, log_likelyhoods_5, label='0.5')
# plt.plot(n_topics, log_likelyhoods_7, label='0.7')
# plt.plot(n_topics, log_likelyhoods_9, label='0.9')
# plt.title("Choosing Optimal LDA Model")
# plt.xlabel("Num Topics")
# plt.ylabel("Log Likelyhood Scores")
# plt.legend(title='Learning decay', loc='best')
# plt.show()

# How to see the dominant topic in each document?

To classify a document as belonging to a particular topic, a logical approach is to see which topic has the highest contribution to that document and assign it.

In the table below, each topic that contributes more than 0.1 to a document is shown in bold green, and the document's most dominant topic is given in its own column.

In [None]:
# Show the parameters of the estimator selected by the grid search
best_lda_model

LatentDirichletAllocation(learning_decay=0.5, learning_method='online',
                          n_components=3, n_jobs=-1)

In [None]:
import numpy as np

# Document-topic matrix from the selected model: one row per document,
# one column per topic
lda_output = best_lda_model.transform(data_vectorized)

# Column labels, one per topic
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
data = amdt["txt_sans_stopword"]
# Row labels, one per document
docnames = ["Doc" + str(i) for i in range(len(data))]

# Weights are rounded to two decimals for display only
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Dominant topic of each document, taken from the UNROUNDED weights: after
# rounding, close weights (e.g. 0.496 vs 0.504) become equal and argmax would
# silently fall back to the lowest topic index.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return the CSS text colour for one cell: green above 0.1, black otherwise."""
    if val > .1:
        shade = 'green'
    else:
        shade = 'black'
    return 'color: {col}'.format(col=shade)

def make_bold(val):
    """Return the CSS font weight for one cell: 700 (bold) above 0.1, 400 otherwise."""
    if val > .1:
        thickness = 700
    else:
        thickness = 400
    return 'font-weight: {weight}'.format(weight=thickness)

# Apply Style to the first 15 documents, on the topic-weight columns only:
# `dominant_topic` holds a topic index, not a weight, so the 0.1 threshold used
# by the styling functions is meaningless for that column.
# NOTE(review): recent pandas versions rename `Styler.applymap` to `Styler.map`;
# switch once the minimum supported pandas version allows it.
df_document_topics = (
    df_document_topic.head(15)
    .style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc0,0.99,0.0,0.0,0
Doc1,0.07,0.93,0.0,1
Doc2,0.03,0.96,0.0,1
Doc3,0.0,0.73,0.26,1
Doc4,0.21,0.57,0.22,1
Doc5,0.47,0.53,0.0,1
Doc6,0.0,0.89,0.11,1
Doc7,0.0,0.89,0.11,1
Doc8,0.42,0.58,0.0,1
Doc9,0.0,0.59,0.41,1


# Review topic distribution across documents

In [None]:
# Count how many documents have each topic as their dominant topic,
# most frequent topic first
df_topic_distribution = (
    df_document_topic['dominant_topic']
    .value_counts()
    .rename_axis('Topic Num')
    .reset_index(name="Num Documents")
)
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,1,2923
1,0,1460
2,2,414


### Affecter un sujet à un texte en utilisant le modèle

In [None]:
text = "Pour les entreprises de moins de 11 salariés, l’employeur est autorisé à attribuer une fois par an, à l’ensemble des salariés qu’il emploie, la prime exceptionnelle de pouvoir d’achat, dans les conditions prévues au V. »II. – En conséquence, compléter cet article par les deux alinéas suivants"
# Encode the single text with the fitted vocabulary, then read its topic
# mixture from the NUM_TOPICS-topic model (the one exported below)
text_counts = vectorizer.transform([text])
x = lda_model.transform(text_counts)[0]
# Show the per-topic weights together with their sum
print(x, x.sum())

[0.00155039 0.00155039 0.00155039 0.02860847 0.00155039 0.00155039
 0.03950717 0.00155039 0.22440802 0.00155039 0.00155039 0.00155039
 0.00155039 0.00155039 0.69042206] 0.9999999999999999


In [None]:
# (number of documents, number of topics)
lda_Z.shape

(4797, 15)

In [None]:
# lda_Z holds, for each document, its membership probability for each of the topics
lda_Z[:1]

array([[4.97513153e-04, 4.97512448e-04, 4.97513049e-04, 4.97512770e-04,
        4.97512929e-04, 4.97513086e-04, 4.97512786e-04, 4.97513071e-04,
        4.97513451e-04, 4.97513135e-04, 4.97512821e-04, 4.97512990e-04,
        4.97513258e-04, 4.97513055e-04, 9.93034818e-01]])

In [None]:
## Export model and data

In [None]:
import pickle

# Objects to persist, keyed by destination path: the fitted vectorizer, the
# document-term matrix, the NUM_TOPICS-topic LDA model and its document-topic
# matrix.
artifacts = {
    "./data/amdt_vectorizer.pickle": vectorizer,
    "./data/amdt_data_vectorized.pickle": data_vectorized,
    "./data/amdt_lda_model.pickle": lda_model,
    "./data/amdt_lda_Z.pickle": lda_Z,
}

# Each file is written inside a `with` block so the handle is flushed and
# closed deterministically instead of being left to the garbage collector.
for path, obj in artifacts.items():
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)