In [2]:
import pandas as pd
# Read the train and test splits from the local data/ directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))
# Report how many questions the training split contains.
print(len(train['question_text']))


1306122


In [18]:
# Text-processing stack: gensim for tokenising, stop-words and LDA; NLTK for lemmatising and stemming.
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
# NOTE(review): the star import hides which names come from nltk.stem.porter;
# switch to explicit imports once it is confirmed which ones are actually used.
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random generator once for the session.
np.random.seed(2018)
import nltk
# Fetch the WordNet corpus data (the cell output shows it is skipped when already up to date).
nltk.download('wordnet')

import logging
# NOTE(review): this unconditional import runs before the guarded `try` import further
# down in this cell, so a missing pyLDAvis already fails here.
import pyLDAvis.gensim
import json
import warnings
warnings.filterwarnings('ignore')  # Silence every warning from here on to keep cell output readable

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
#Libraries for Topic Modelling Visualization
# Later cells call pyLDAvis directly, so stop here with an install hint if it is missing.
try:
    import pyLDAvis.gensim
except ImportError:
    # The exception used to be constructed but never raised, which silently
    # discarded the hint and let the notebook carry on without the library.
    raise ValueError("SKIP: please install pyLDAvis")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kaely\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
#method for lemmatization and stemming
def preprocess(text):
    """Turn one raw question into a list of normalised tokens.

    The text is tokenised with ``gensim.utils.simple_preprocess``. Tokens that
    are gensim stop-words, or that have three characters or fewer, are dropped;
    every remaining token is passed through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str or bytes
        Raw document text. Any other value (for example a missing value such
        as NaN or ``None``) is treated as an empty document.

    Returns
    -------
    list of str
        The normalised tokens in their original order; ``[]`` when ``text``
        is not text.
    """
    # Only the "not text" case maps to an empty document. The previous bare
    # ``except`` also swallowed genuine failures (e.g. a missing helper or
    # missing NLTK data) and KeyboardInterrupt, silently producing an empty
    # token list for every row instead of surfacing the problem.
    if not isinstance(text, (str, bytes)):
        return []
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
    ]
# Lazily created, shared NLTK helpers. Building a stemmer and a lemmatizer for
# every single token is pure overhead when the function is mapped over a
# whole corpus, so they are constructed on first use and then reused.
_LEMMA_STEM_TOOLS = {}


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Parameters
    ----------
    text : str
        One token (despite the parameter name, not a whole document).

    Returns
    -------
    str
        The English Snowball stem of the token's WordNet verb lemma.
    """
    if not _LEMMA_STEM_TOOLS:
        _LEMMA_STEM_TOOLS['stemmer'] = SnowballStemmer("english")
        _LEMMA_STEM_TOOLS['lemmatizer'] = WordNetLemmatizer()
    lemma = _LEMMA_STEM_TOOLS['lemmatizer'].lemmatize(text, pos='v')
    return _LEMMA_STEM_TOOLS['stemmer'].stem(lemma)


In [5]:
# Tokenise, filter and normalise every question.
# The column is overwritten with its own processed form, so rows that already
# hold a token list are passed through unchanged. Without this, re-running the
# cell would hand lists to preprocess(), which returns [] for them, and the
# whole column would be wiped without any error.
train['question_text'] = train['question_text'].map(
    lambda question: question if isinstance(question, list) else preprocess(question)
)
print('Pre-processing is done.')

Pre-processing is done.


In [6]:
# Peek at the first ten rows to sanity-check the token lists produced by pre-processing.
train['question_text'].head(10)

0               [quebec, nationalist, provinc, nation]
1                [adopt, encourag, peopl, adopt, shop]
2    [veloc, affect, time, veloc, affect, space, ge...
3                [otto, guerick, magdeburg, hemispher]
4    [convert, montra, helicon, mountain, bike, cha...
5    [gaza, slowli, auschwitz, dachau, treblinka, p...
6    [quora, automat, conserv, opinion, report, lib...
7                   [crazi, wash, wipe, groceri, germ]
8         [thing, dress, moder, differ, dress, modest]
9    [phase, ignor, peopl, love, complet, disregard...
Name: question_text, dtype: object

In [7]:
### building dictionary for LDA
# Map every distinct token in the pre-processed questions to an integer id.
dictionary = gensim.corpora.Dictionary(train['question_text'])

# Preview the first eleven (id, token) pairs.
for shown, (token_id, token) in enumerate(dictionary.iteritems()):
    if shown == 11:
        break
    print(token_id, token)
print('Dictionary is built.')

0 nation
1 nationalist
2 provinc
3 quebec
4 adopt
5 encourag
6 peopl
7 shop
8 affect
9 geometri
10 space
Dictionary is built.


In [8]:
# Prune the vocabulary in place. no_below=5 removes tokens that occur in fewer
# than 5 documents; no_above is a fraction of the corpus, so 1 removes nothing
# for being too frequent.
# NOTE(review): keep_n is left at its default, which additionally caps the
# vocabulary size — confirm that cap is intended.
dictionary.filter_extremes(no_below=5, no_above=1)

In [9]:
#Generate Bag of Words on the data set
# Convert each token list with the dictionary's doc2bow.
bow_corpus = list(map(dictionary.doc2bow, train['question_text']))
# Spot-check one arbitrary document.
bow_corpus[4310]

[(143, 1), (230, 1), (1613, 1), (2073, 1), (2567, 2), (2609, 1), (2670, 1)]

#generate TFIDF matrix
from pprint import pprint

from gensim import corpora, models

# Fit the TF-IDF model on the bag-of-words corpus, then apply it to that same corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Show only the first re-weighted document (nothing is printed for an empty corpus).
first_doc = next(iter(corpus_tfidf), None)
if first_doc is not None:
    pprint(first_doc)

#run LDA using Bag of Words
#Set Parameters
#change num_topics if want to change the number of topics generated
num_topics = 5

lda_model_bow = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=2,
    workers=2,
)

# Print each topic returned by print_topics(-1) with its index.
for topic_id, topic_words in lda_model_bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_words))

#Assumption: there are at most 100 topics, so candidate topic counts of 20, 40, 60, 80 and 100 are considered.

#Set up Topic Model based on bow_corpus

In [22]:
# Fit a 20-topic LdaModel on the bag-of-words corpus. This rebinds
# lda_model_bow, replacing the 5-topic LdaMulticore model of the same name.
lda_model_bow = LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=20,
    iterations=50,
)

To check whether the generated topics are human-interpretable, we will use both the u_mass score and the c_v score to evaluate them.

In [43]:
# Compute Coherence Score using u_mass
# This measure is built from the bag-of-words corpus and the dictionary.
coherence_model_lda_umas = CoherenceModel(
    model=lda_model_bow,
    corpus=bow_corpus,
    dictionary=dictionary,
    coherence='u_mass',
)
coherence_lda_umas = coherence_model_lda_umas.get_coherence()
print('\nCoherence Score: ', coherence_lda_umas)


Coherence Score:  -5.822717611510448


In [37]:
# Show the pipeline behind the u_mass coherence model: the functions used for
# segmentation, probability estimation, confirmation measure and aggregation.
print(coherence_model_lda_umas)

Coherence_Measure(seg=<function s_one_pre at 0x0000021512384268>, prob=<function p_boolean_document at 0x0000021512384488>, conf=<function log_conditional_probability at 0x000002151242E1E0>, aggr=<function arithmetic_mean at 0x000002151242E950>)


# Visualize Topic Models.
# NOTE: preparing the visualization takes a very long time to run.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim.prepare(lda_model_bow, bow_corpus, dictionary)
lda_vis

# Could not get this to run using this method.

In [None]:
# Compute Coherence Score using c_v
# c_v is estimated from the tokenised documents, so it must be given `texts`
# (lists of tokens). The earlier, commented-out attempt passed the bag-of-words
# corpus as `texts`, and with the computation disabled the print below failed
# with a NameError.
# NOTE: this is expensive on the full corpus; try it on a sample of the
# questions first if the run time is a concern.
coherence_model_lda_cv = CoherenceModel(
    model=lda_model_bow,
    texts=train['question_text'].tolist(),
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda_cv = coherence_model_lda_cv.get_coherence()
print('\nCoherence Score: ', coherence_lda_cv)


#View the pipeline parameters for one coherence model
#By pipeline parameters, we mean the functions being used to calculate segmentation, probability estimation, confirmation measure and aggregation
print(coherence_model_lda_cv)

The last step is to find the optimal number of topics. We need to build many LDA models with different values of the number of topics (k) and pick the one that gives the highest coherence value. Choosing a ‘k’ that marks the end of a rapid growth of topic coherence usually offers meaningful and interpretable topics. Picking an even higher value can sometimes provide more granular sub-topics. If you see the same keywords being repeated in multiple topics, it’s probably a sign that the ‘k’ is too large.

Finding the optimal number of topics using the u_mass score

In [49]:
def compute_coherence_values(dictionary, corpus, limit, start=2, step=3):
    """
    Train one LDA model per candidate topic count and score it with u_mass coherence.

    Candidate topic counts are ``range(start, limit, step)``, so ``limit``
    itself is never used as a topic count.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : Upper bound (exclusive) on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models, one per candidate topic count
    coherence_values : u_mass coherence of each model, in the same order as model_list
    """
    model_list, coherence_values = [], []
    for topic_count in range(start, limit, step):
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=topic_count)
        scorer = CoherenceModel(
            model=lda,
            corpus=corpus,
            dictionary=dictionary,
            coherence='u_mass',
        )
        model_list.append(lda)
        coherence_values.append(scorer.get_coherence())
    return model_list, coherence_values

In [None]:
#Create a model list and plot Coherence score against a number of topics
import matplotlib.pyplot as plt

# Define the sweep once so the models that are trained and the x-axis that is
# plotted cannot drift apart. Topic counts tried: range(start, limit, step).
start = 20
limit = 100
step = 20
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary, corpus=bow_corpus, start=start, limit=limit, step=step
)

# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The trailing comma makes this a one-element tuple. A bare parenthesised string
# is iterated character by character, which labelled the line just "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

The above plot shows that the coherence score increases with the number of topics, with a decline between 15 and 20. Now, choosing the number of topics still depends on your requirements, because models with around 33 topics have good coherence scores but may have repeated keywords in their topics. Topic coherence gives you a good picture so that you can make a better decision.

Reference:
https://markroxor.github.io/gensim/static/notebooks/topic_coherence_tutorial.html#topic=1&lambda=1&term=
https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/   

#run LDA using TFIDF
#change num_topics if want to change the number of topics generated
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=20,
    id2word=dictionary,
    passes=2,
    workers=4,
)

# Print each topic returned by print_topics(-1) with its index.
for topic_id, topic_words in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_words))