In [20]:
# First: pip install wheel

# Next: Download Numpy and Scipy form Gholke's repo Numpy and SciPy


# Then:

# pip install numpy_package.whl
# pip install scipy_package.whl
# https://towardsdatascience.com/6-tips-to-optimize-an-nlp-topic-model-for-interpretability-20742f3047e2

In [1]:
# Importing modules
import random
import os
from pathlib import Path
from pprint import pprint
import logging
from importlib import reload  # Not needed in Python 2
import tqdm

import numpy as np
import scipy
import pandas as pd
import pickle

import nltk
import spacy
from nltk.corpus import stopwords
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models.callbacks import PerplexityMetric, ConvergenceMetric, CoherenceMetric

reload(logging)
logging.basicConfig(filename='model_callbacks.log',format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')
# logging.basicConfig(filename='gensim.log',
#                     format="%(asctime)s:%(levelname)s:%(message)s",
#                     level=logging.NOTSET)

In [2]:
DATA_DIR = Path('./data')
def load(filename):
    f = open(DATA_DIR/filename,"rb")
    return pickle.load(f)
    
def save(data, filename):
    with open(DATA_DIR/filename, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:

file = open("moties_processed_df.pickle","rb")
df = pickle.load(file)
print(len(df))

29514


In [4]:
# remove moties without text
mask = (df['Text']=='') | (df['Text'].isna())
df = df.loc[~mask]
len(df)

29484

In [5]:
indieners = {indiener[-1].lower() for indiener in df['Indiener_persoon'].str.split() if indiener}
df = df['Text'].reset_index()
df.iloc[2000]['Text']

'2\nTweede Kamer der Staten-Generaal\nVergaderjaar 2009–2010\n31 371 Kredietcrisis\nNr. 293 MOTIE VAN HET LID TONY VAN DIJCK\nVoorgesteld 16 december 2009\nDe Kamer,\ngehoord de beraadslaging,\nbesluit een parlementaire enquête te houden naar het faillissement van de\nDSB Bank waarbij alle betrokkenen onder ede kunnen worden gehoord,\nen gaat over tot de orde van de dag.\nTony van Dijck\nKST138845\n0910tkkst31371-293\nISSN0921-7371\nSduUitgevers\nTweede Kamer, vergaderjaar 2009–2010, 31 371, nr. 293\n’s-Gravenhage2009'

In [6]:
# https://github.com/kapadias/mediumposts/blob/master/natural_language_processing/topic_modeling/notebooks/Evaluate%20Topic%20Models.ipynb
error = []
def remove_indieners(doc):
    try:
        return re.search(pattern, doc).groups()[0]
    except:
        error.append(doc)
        return 'None'

import re
# Remove punctuation
df['Text'] = df['Text'].map(lambda x: re.sub('[,\.!?;]', '', x))

df['Text'] = df['Text'].map(lambda x: re.sub(r'-\n', '', x))
df['Text'] = df['Text'].map(lambda x: re.sub(r'\n', ' ', x))
# Convert the titles to lowercase
df['Text'] = df['Text'].map(lambda x: x.lower())
pattern = re.compile("(?:voorgesteld|gehoord de beraadslaging)(.+) orde van de dag")
# df['Text'] = df['Text'].apply(lambda x: remove_indieners(x))
# df['Text'] = df['Text'].str[64:-123]
# Print out the first rows of papers
df['Text'][0]

'2 tweede kamer der staten-generaal vergaderjaar 2008–2009 23 432 de situatie in het midden-oosten nr 268 motie van het lid pechtold voorgesteld 14 januari 2009 de kamer gehoord de beraadslaging constaterende dat nederland en denemarken binnen de europese unie een voorstel hebben gedaan voor een effectief grenstoezicht op de gazaans-egyptische grens overwegende dat een dergelijk grenstoezicht de invoer van humanitaire hulp en normale goederen mogelijk kan maken en kan voorkomen dat hamas zich via de grens herbewapent verzoekt de regering bij het uitwerken van dit initiatief voorstellen te doen voor een stevige opdracht omvang en mandaat van een dergelijk (militair) grenstoezicht opdat deze daadwerkelijk effectief kan zorg dragen voor humanitaire hulp en ontwapening en gaat over tot de orde van de dag pechtold kst127027 0809tkkst23432-268 issn0921-7371 sduuitgevers tweede kamer vergaderjaar 2008–2009 23 432 nr 268 ’s-gravenhage2009'

In [7]:
len(df.loc[df['Text']=="None"])

0

In [8]:

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data = df['Text'].values.tolist()
data_words = list(sent_to_words(data))

print(data_words[:1][0][:90])

['tweede', 'kamer', 'der', 'staten', 'generaal', 'vergaderjaar', 'de', 'situatie', 'in', 'het', 'midden', 'oosten', 'nr', 'motie', 'van', 'het', 'lid', 'pechtold', 'voorgesteld', 'januari', 'de', 'kamer', 'gehoord', 'de', 'beraadslaging', 'constaterende', 'dat', 'nederland', 'en', 'denemarken', 'binnen', 'de', 'europese', 'unie', 'een', 'voorstel', 'hebben', 'gedaan', 'voor', 'een', 'effectief', 'grenstoezicht', 'op', 'de', 'gazaans', 'egyptische', 'grens', 'overwegende', 'dat', 'een', 'dergelijk', 'grenstoezicht', 'de', 'invoer', 'van', 'humanitaire', 'hulp', 'en', 'normale', 'goederen', 'mogelijk', 'kan', 'maken', 'en', 'kan', 'voorkomen', 'dat', 'hamas', 'zich', 'via', 'de', 'grens', 'herbewapent', 'verzoekt', 'de', 'regering', 'bij', 'het', 'uitwerken', 'van', 'dit', 'initiatief', 'voorstellen', 'te', 'doen', 'voor', 'een', 'stevige', 'opdracht', 'omvang']


In [9]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=20, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], min_count=20, threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [10]:
# NLTK Stop words


nltk.download('stopwords')
stop_words = stopwords.words('dutch')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
stop_words.extend(indieners)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jesse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [12]:
!python -m spacy download nl_core_news_sm

✔ Download and installation successful
You can now load the model via spacy.load('nl_core_news_sm')


In [None]:

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("nl_core_news_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1][0][:30])

** **
#### Step 4: Data transformation: Corpus and Dictionary
** **

The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus. Let’s create them.

In [None]:


# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)


print('len of dictionary before pruning\t: ', len(id2word))
id2word.filter_extremes(no_below=5, no_above=0.12) #minimally 10 occurrences and not in more than 30% of documents
print('len of dictionary after pruning\t: ',len(id2word))
r = random.choice(id2word.keys())
print('example from id2word dict:\t', r, id2word[r])


# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print('example document translated into bag of words ', corpus[:1][0][:30])

texts = [[id2word[word_id] for word_id, freq in doc] for doc in corpus]

In [None]:
save((id2word, corpus, texts), 'corpus_dict_texts_full_remove_indieners_no_above012')

In [None]:

id2word, corpus, texts = load('corpus_dict_texts_full_no_processing_no_above020')

In [None]:
t = df.iloc[20000]['Text']
t

In [None]:
texts[20000]

** **
#### Step 5: Base Model 
** **

We have everything required to train the base LDA model. In addition to the corpus and dictionary, you need to provide the number of topics as well. Apart from that, alpha and eta are hyperparameters that affect sparsity of the topics. According to the Gensim docs, both defaults to 1.0/num_topics prior (we'll use default for the base model).

chunksize controls how many documents are processed at a time in the training algorithm. Increasing chunksize will speed up training, at least as long as the chunk of documents easily fit into memory.

passes controls how often we train the model on the entire corpus (set to 10). Another word for passes might be "epochs". iterations is somewhat technical, but essentially it controls how often we repeat a particular loop over each document. It is important to set the number of "passes" and "iterations" high enough.

In [53]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

** **
The above LDA model is built with 10 different topics where each topic is a combination of keywords and each keyword contributes a certain weightage to the topic.

You can see the keywords for each topic and the weightage(importance) of each keyword using `lda_model.print_topics()`

In [54]:

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.065*"xvi" + 0.043*"jaar" + 0.042*"orde" + 0.038*"ministerie" + '
  '0.033*"xvivaststelling" + 0.033*"sport" + 0.033*"volksgezondheid" + '
  '0.028*"risico" + 0.028*"zorg" + 0.022*"mening"'),
 (1,
  '0.070*"ziekenhuis" + 0.058*"constateren" + 0.034*"nederland" + '
  '0.032*"stellen" + 0.030*"bevolking" + 0.030*"algemeen" + 0.030*"vlissing" + '
  '0.026*"hama" + 0.024*"middel" + 0.024*"positionering"'),
 (2,
  '0.113*"israel" + 0.088*"constateren" + 0.056*"situatie" + '
  '0.051*"middenoosten" + 0.049*"hama" + 0.045*"europees" + 0.035*"gaza" + '
  '0.025*"nederland" + 0.024*"relatie" + 0.024*"gebruiken"'),
 (3,
  '0.061*"mening" + 0.038*"toelating" + 0.038*"constateren" + 0.029*"dagvan" + '
  '0.027*"maand" + 0.026*"nederlands" + 0.025*"bestaan" + 0.025*"recht" + '
  '0.024*"kabinet" + 0.023*"maken"'),
 (4,
  '0.050*"humanitair" + 0.048*"situatie" + 0.040*"internationaal" + '
  '0.039*"hulp" + 0.036*"middenoosten" + 0.036*"geweld" + 0.034*"gaza" + '
  '0.029*"mogelijk" + 0.028*

#### Compute Model Perplexity and Coherence Score

Let's calculate the baseline coherence score

In [57]:

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.3443799985335936


** **
#### Step 6: Hyperparameter tuning
** **
First, let's differentiate between model hyperparameters and model parameters :

- `Model hyperparameters` can be thought of as settings for a machine learning algorithm that are tuned by the data scientist before training. Examples would be the number of trees in the random forest, or in our case, number of topics K

- `Model parameters` can be thought of as what the model learns during training, such as the weights for each word in a given topic.

Now that we have the baseline coherence score for the default LDA model, let's perform a series of sensitivity tests to help determine the following model hyperparameters: 
- Number of Topics (K)
- Dirichlet hyperparameter alpha: Document-Topic Density
- Dirichlet hyperparameter beta: Word-Topic Density

We'll perform these tests in sequence, one parameter at a time by keeping others constant and run them over the two difference validation corpus sets. We'll use `C_v` as our choice of metric for performance comparison 

In [59]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto')
                                        #    eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='u_mass')
    
    return coherence_model_lda.get_coherence()

Let's call the function, and iterate it over the range of topics, alpha, and beta parameter values

In [60]:
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
alpha

NameError: name 'np' is not defined

In [28]:

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 20
max_topics = 23
step_size = 2
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']
corpus_title = ['75% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            a = 50 / k
            b = 0.1
            # get the coherence score for the given parameters
            cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                            k=k, a=a, b=b)
            # Save the model results
            model_results['Validation_Set'].append(corpus_title[i])
            model_results['Topics'].append(k)
            model_results['Alpha'].append(a)
            model_results['Beta'].append(b)
            model_results['Coherence'].append(cv)
            
            pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()




  0%|          | 0/2 [00:00<?, ?it/s][A[A[A


 50%|█████     | 1/2 [00:45<00:45, 45.78s/it][A[A[A


100%|██████████| 2/2 [01:29<00:00, 45.24s/it][A[A[A

IndexError: list index out of range

In [27]:
pd.DataFrame(model_results)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence


In [None]:
# import numpy as np
# import tqdm

# grid = {}
# grid['Validation_Set'] = {}

# # Topics range
# min_topics = 2
# max_topics = 11
# step_size = 1
# topics_range = range(min_topics, max_topics, step_size)

# # Alpha parameter
# alpha = list(np.arange(0.01, 1, 0.3))
# alpha.append('symmetric')
# alpha.append('asymmetric')

# # Beta parameter
# beta = list(np.arange(0.01, 1, 0.3))
# beta.append('symmetric')

# # Validation sets
# num_of_docs = len(corpus)
# corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
#                corpus]

# corpus_title = ['75% Corpus', '100% Corpus']

# model_results = {'Validation_Set': [],
#                  'Topics': [],
#                  'Alpha': [],
#                  'Beta': [],
#                  'Coherence': []
#                 }

# # Can take a long time to run
# if 1 == 1:
#     pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
#     # iterate through validation corpuses
#     for i in range(len(corpus_sets)):
#         # iterate through number of topics
#         for k in topics_range:
#             # iterate through alpha values
#             for a in alpha:
#                 # iterare through beta values
#                 for b in beta:
#                     # get the coherence score for the given parameters
#                     cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
#                                                   k=k, a=a, b=b)
#                     # Save the model results
#                     model_results['Validation_Set'].append(corpus_title[i])
#                     model_results['Topics'].append(k)
#                     model_results['Alpha'].append(a)
#                     model_results['Beta'].append(b)
#                     model_results['Coherence'].append(cv)
                    
#                     pbar.update(1)
#     pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)
#     pbar.close()

** **
#### Step 7: Final Model
** **

Based on external evaluation (Code to be added from Excel based analysis), let's train the final model with parameters yielding highest coherence score

In [76]:
# Set up the callbacks loggers
perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
convergence_logger = ConvergenceMetric(logger='shell')
coherence_logger = CoherenceMetric(corpus=corpus, logger='shell', coherence = 'c_v',texts=texts)
coherence_logger_umass = CoherenceMetric(corpus=corpus, logger='shell', coherence = 'u_mass', texts=texts)

In [77]:
logging.debug("test message")

In [178]:
num_topics = 20

lda_model = gensim.models.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           chunksize=3000,
                                           passes=7)
                                          #  eval_every=5,callbacks=[convergence_logger, perplexity_logger, coherence_logger,coherence_logger_umass])
                                        #    eta=1/num_topics)

In [None]:
import re
import matplotlib.pyplot as plt
p = re.compile("(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")
matches = [p.findall(l) for l in open('gensim.log')]
matches = [m for m in matches if len(m) > 0]
tuples = [t[0] for t in matches]
perplexity = [float(t[1]) for t in tuples]
liklihood = [float(t[0]) for t in tuples]
iter = list(range(0,len(tuples)*10,10))
plt.plot(iter,liklihood,c="black")
plt.ylabel("log liklihood")
plt.xlabel("iteration")
plt.title("Topic Model Convergence")
plt.grid()
plt.savefig("convergence_liklihood.pdf")
plt.close()

In [94]:
# from pprint import pprint

# # Print the Keyword in the 10 topics
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

In [28]:
import pandas as pd
with open('model_callbacks.log') as f:
    lines = [line.split() for line in f.read().splitlines()]
metrics = [[],[],[],[]]

for i in range(len(lines)):
    metrics[i % 4].append(float(lines[i][-1]))
metrics = pd.DataFrame(metrics).T
metrics.columns = ['conv','perp','coh_cv','coh_umass']
metrics.reset_index(inplace=True)
import altair as alt


In [37]:
alt.Chart(metrics).mark_line().encode(
    x= 'index:O',
    y='coh_cv:Q')

** **
#### Step 8: Visualize Results
** **

In [25]:
num_topics = 20

lda_model = gensim.models.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           chunksize=3000,
                                           passes=7)
                                          #  eval_every=5,callbacks=[convergence_logger, perplexity_logger, coherence_logger,coherence_logger_umass])
                                        #    eta=1/num_topics)

In [26]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
from IPython.core.display import display, HTML

# Visualize the topics
pyLDAvis.enable_notebook()
num_topics = 28
LDAvis_data_filepath = os.path.join('./results/ldavis_tuned_'+str(num_topics))

# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, 'results/ldavis_tuned_'+ str(num_topics) +'.html')
display(HTML("<style>.container { max-width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))
display(HTML("<style>.output_area { max-width:100% !important; }</style>"))
display(HTML("<style>.input_area { max-width:100% !important; }</style>"))
LDAvis_prepared


  pickler.file_handle.write(chunk.tostring('C'))


In [27]:
display(HTML("<style>.container { max-width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))
display(HTML("<style>.output_area { max-width:100% !important; }</style>"))
display(HTML("<style>.input_area { max-width:100% !important; }</style>"))
LDAvis_prepared

In [72]:

word = 'financieel'
sum([1 for t in texts if word in t])/len(texts)

0.01899335232668566

** **
#### Closing Notes

We started with understanding why evaluating the topic model is essential. Next, we reviewed existing methods and scratched the surface of topic coherence, along with the available coherence measures. Then we built a default LDA model using Gensim implementation to establish the baseline coherence score and reviewed practical ways to optimize the LDA hyperparameters.

Hopefully, this article has managed to shed light on the underlying topic evaluation strategies, and intuitions behind it.

** **
#### References:
1. http://qpleple.com/perplexity-to-evaluate-topic-models/
2. https://www.amazon.com/Machine-Learning-Probabilistic-Perspective-Computation/dp/0262018020
3. https://papers.nips.cc/paper/3700-reading-tea-leaves-how-humans-interpret-topic-models.pdf
4. https://github.com/mattilyra/pydataberlin-2017/blob/master/notebook/EvaluatingUnsupervisedModels.ipynb
5. https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
6. http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
7. http://palmetto.aksw.org/palmetto-webapp/