Tutorial: https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python

In [25]:
import os.path
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [14]:
import vaex
import vaex.jupyter.model as vjm
import numpy as np

In [54]:
from data_pipeline.modeling.my_topicmodeling import lda_topic_model
import pyLDAvis



# load data

In [12]:
def load_data(path, file_name):
    """
    Input: path and file_name
    Purpose: loading text file
    Output: list of paragraphs/ documents and title
    (initial 100 words considered as title of document)
    """
    
    documents_list = []
    titles = []
    
    with open(os.path.join(path, file_name), 'r') as fin:
        for line in fin.readlines():
            text = line.strip()
            documents_list.append(text)
    print('Total Number of Documets', len(documents_list))
    
    titles.append(text[0:min(len(text), 100)])
    
    return documents_list, titles

In [7]:
path = './data/'
file_name = 'articles+4.txt'
data = load_data(path, file_name)

Total Number of Documets 4551


# preprocessing
* tokenize the text articles
* remove stop words
* perform stemming on text articles

In [30]:
def preprocess_data(doc_set):
    """
    Input: document list
    Purpose: preprocess text (tokenize, remove stopwords, stemming)
    Ouput: preprocess text
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if i not in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts

In [31]:
data_clean = preprocess_data(data[0])

# Prepare corpus
* Vectorization

In [43]:
def prepare_corpus(doc_clean):
    """
    Input: clean document
    Purpose: create term dictionary of our corpus and converting
    list of documents (corpus) into Document Term Matrix
    Output: term dictionary and Document Term Matrix
    """
    # creating the term dictionary of our corpus, where every unique
    # term is assigned an index. 
    dictionary = corpora.Dictionary(doc_clean)
    # converting list of documents (corpus) into Document Term Matrix 
    # using dictionary prepared above
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary, doc_term_matrix

In [55]:
dictionary, doc_term_matrix = prepare_corpus(data_clean)

INFO:MainThread:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:MainThread:gensim.corpora.dictionary:built Dictionary(49792 unique tokens: ['11', '14bn', '1bn', '2005', '2007']...) from 4551 documents (total 2382025 corpus positions)


# Create an LSA model using Gensim

In [44]:
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    """
    Input: clean document, number of topics and number of words
    associated with each topic
    Purpose: create LSA model using gensim
    Output: return LSA model
    """
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)

    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, 
                        num_topics=number_of_topics, 
                        id2word = dictionary)  
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel

In [48]:
lsa_model = create_gensim_lsa_model(data_clean, 7, 10)

INFO:MainThread:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:MainThread:gensim.corpora.dictionary:built Dictionary(49792 unique tokens: ['11', '14bn', '1bn', '2005', '2007']...) from 4551 documents (total 2382025 corpus positions)
INFO:MainThread:gensim.models.lsimodel:using serial LSI version on this node
INFO:MainThread:gensim.models.lsimodel:updating model with new documents
INFO:MainThread:gensim.models.lsimodel:preparing a new chunk of documents
INFO:MainThread:gensim.models.lsimodel:using 100 extra samples and 2 power iterations
INFO:MainThread:gensim.models.lsimodel:1st phase: constructing (49792, 107) action matrix
INFO:MainThread:gensim.models.lsimodel:orthonormalizing (49792, 107) action matrix
INFO:MainThread:gensim.models.lsimodel:2nd phase: running dense svd on (107, 4551) matrix
INFO:MainThread:gensim.models.lsimodel:computing the final decomposition
INFO:MainThread:gensim.models.lsimodel:keeping 7 factors (discarding 33.285% of ene

[(0, '0.361*"trump" + 0.272*"say" + 0.233*"said" + 0.166*"would" + 0.160*"clinton" + 0.140*"peopl" + 0.136*"one" + 0.126*"campaign" + 0.123*"year" + 0.110*"time"'), (1, '0.389*"citi" + 0.370*"v" + 0.356*"h" + 0.355*"2016" + 0.354*"2017" + 0.164*"unit" + 0.159*"west" + 0.157*"manchest" + 0.116*"apr" + 0.112*"dec"'), (2, '0.612*"trump" + 0.264*"clinton" + -0.261*"eu" + -0.148*"say" + -0.137*"would" + 0.135*"donald" + -0.134*"leav" + -0.134*"uk" + 0.119*"republican" + -0.110*"cameron"'), (3, '-0.400*"min" + 0.261*"eu" + -0.183*"goal" + -0.152*"ball" + -0.132*"play" + 0.128*"said" + 0.128*"say" + -0.126*"leagu" + 0.122*"leav" + -0.122*"game"'), (4, '0.404*"bank" + -0.305*"eu" + -0.290*"min" + 0.189*"year" + -0.164*"leav" + -0.153*"cameron" + 0.143*"market" + 0.140*"rate" + -0.139*"vote" + -0.133*"say"'), (5, '-0.310*"bank" + 0.307*"say" + 0.221*"peopl" + -0.203*"trump" + -0.166*"1" + -0.164*"min" + -0.163*"0" + -0.152*"market" + -0.152*"eu" + 0.138*"like"'), (6, '0.570*"say" + 0.237*"min" 

In [96]:
import re

In [142]:
t = lsa_model.print_topic(1)
n = re.findall('\d.\d{3}', t)
n = list(map(lambda x: float(x), n))
sum(n)

  n = re.findall('\d.\d{3}', t)


2.532

# Determine the number of topics

In [38]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input: 
        dictionary: Gensim dictionary
        corpus: Gensim corpus
        text: List of input texts
        stop: Max num of topics
    Purpose: compute c_v coherence for various number of topics
    Output: 
        model_list: List of LSA topic models
        coherence_values: Coherence values corresponding to the 
                LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    
    for num_topics in range(start, stop, step):
        # generate LSA model
        # train the model
        model = LsiModel(doc_term_matrix, 
                         num_topics=num_topics,
                         id2word=dictionary)
        model_list.append(model)
        coherence_values = CoherenceModel(model=model, 
                                          text=doc_clean, 
                                          dictionary=dictionary,
                                          coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

# Plot the coherence score values

In [46]:
def plot_graph(doc_clean, start, stop, step):
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

In [53]:
# start,stop,step=2,12,1
# plot_graph(data_clean,start,stop,step)

In [52]:
# LSA Model
number_of_topics=7
words=10
document_list,titles=load_data("","./data/articles+4.txt")
clean_text=preprocess_data(document_list)
model=create_gensim_lsa_model(clean_text,number_of_topics,words)


Total Number of Documets 4551


INFO:MainThread:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:MainThread:gensim.corpora.dictionary:built Dictionary(49792 unique tokens: ['11', '14bn', '1bn', '2005', '2007']...) from 4551 documents (total 2382025 corpus positions)
INFO:MainThread:gensim.models.lsimodel:using serial LSI version on this node
INFO:MainThread:gensim.models.lsimodel:updating model with new documents
INFO:MainThread:gensim.models.lsimodel:preparing a new chunk of documents
INFO:MainThread:gensim.models.lsimodel:using 100 extra samples and 2 power iterations
INFO:MainThread:gensim.models.lsimodel:1st phase: constructing (49792, 107) action matrix
INFO:MainThread:gensim.models.lsimodel:orthonormalizing (49792, 107) action matrix
INFO:MainThread:gensim.models.lsimodel:2nd phase: running dense svd on (107, 4551) matrix
INFO:MainThread:gensim.models.lsimodel:computing the final decomposition
INFO:MainThread:gensim.models.lsimodel:keeping 7 factors (discarding 33.298% of ene

[(0, '0.361*"trump" + 0.272*"say" + 0.233*"said" + 0.166*"would" + 0.160*"clinton" + 0.140*"peopl" + 0.136*"one" + 0.126*"campaign" + 0.123*"year" + 0.110*"time"'), (1, '0.389*"citi" + 0.370*"v" + 0.356*"h" + 0.355*"2016" + 0.354*"2017" + 0.164*"unit" + 0.159*"west" + 0.157*"manchest" + 0.116*"apr" + 0.112*"dec"'), (2, '0.612*"trump" + 0.264*"clinton" + -0.261*"eu" + -0.148*"say" + -0.137*"would" + 0.135*"donald" + -0.134*"leav" + -0.134*"uk" + 0.119*"republican" + -0.110*"cameron"'), (3, '-0.400*"min" + 0.261*"eu" + -0.183*"goal" + -0.152*"ball" + -0.132*"play" + 0.128*"said" + 0.128*"say" + -0.126*"leagu" + 0.122*"leav" + -0.122*"game"'), (4, '-0.404*"bank" + 0.305*"eu" + 0.290*"min" + -0.189*"year" + 0.164*"leav" + 0.153*"cameron" + -0.143*"market" + -0.140*"rate" + 0.139*"vote" + 0.133*"say"'), (5, '0.310*"bank" + -0.307*"say" + -0.221*"peopl" + 0.203*"trump" + 0.166*"1" + 0.164*"min" + 0.164*"0" + 0.152*"market" + 0.152*"eu" + -0.138*"like"'), (6, '-0.570*"say" + -0.237*"min" + 0.

# Visualizing the topics

In [59]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lsa_model, doc_term_matrix, dictionary)

AttributeError: 'LsiModel' object has no attribute 'inference'

In [80]:
# flatten a list
flat_text = sum(data_clean, [])

In [182]:
string = ' '.join(flat_text)

# second tutorial:
https://www.kdnuggets.com/2018/08/topic-modeling-lsa-plsa-lda-lda2vec.html

## LSA

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

In [73]:
# raw document to tf-idf matrix
vectorizer = TfidfVectorizer(stop_words='english',
                             use_idf=True,
                             smooth_idf=True
                            )

In [74]:
# SVD to reduce dimentionality
svd_model = TruncatedSVD(n_components=100,
                         algorithm='randomized',
                         n_iter=10
)

In [75]:
# pipeline of tf-idf + SVD, fit to and applied to documents
svd_transformer = Pipeline(
    [('tfidef', vectorizer),
     ('svd', svd_model)])

In [83]:
# svd matrix can be later used to compare documents, words or queries
svd_matrix = svd_transformer.fit_transform(flat_text)

In [84]:
len(svd_matrix)

2382025

## PLSA
Rarely used in real world.

## LDA
Latent Dirichlet Allocation. It is a Bayesian version of pLSA.


I like to draw an analogy between the Dirichlet Distribution and the normal distribution, since most people understand the normal distribution.

The normal distribution is a probability distribution over all the real numbers. It is described by a mean and a variance. The mean is the expected value of this distribution, and the variance tells us how much we can expect samples to deviate from the mean. If the variance is very high, then you’re going to see values that are both much smaller than the mean and much larger than the mean. If the variance is small, then the samples will be very close to the mean. If the variance goes close to zero, all samples will be almost exactly at the mean.

The dirichlet distribution is a probability distribution as well - but it is not sampling from the space of real numbers. Instead it is sampling over a probability simplex.

And what is a probability simplex? It’s a bunch of numbers that add up to 1. For example:

(0.6, 0.4)
(0.1, 0.1, 0.8)
(0.05, 0.2, 0.15, 0.1, 0.3, 0.2)

These numbers represent probabilities over K distinct categories. In the above examples, K is 2, 3, and 6 respectively. That’s why they are also called categorical distributions.

https://www.quora.com/What-is-an-intuitive-explanation-of-the-Dirichlet-distribution

Infinite Mixture Models with Nonparametric Bayes and the Dirichlet Process: http://blog.echen.me/2012/03/20/infinite-mixture-models-with-nonparametric-bayes-and-the-dirichlet-process/

In [158]:
# from gensim.corpora.dictionary.Dictionary import load_from_text, doc2bow
from gensim.corpora import Dictionary
from gensim.corpora import MmCorpus
from gensim.models.ldamodel import LdaModel

In [170]:
# extract 100 LDA topics, updating once every 10,000 
lda = LdaModel(id2word=dictionary, 
               num_topics=100, 
               update_every=1, 
               chunksize=10000, 
               passes=1)

INFO:MainThread:gensim.models.ldamodel:using symmetric alpha at 0.01
INFO:MainThread:gensim.models.ldamodel:using symmetric eta at 0.01
INFO:MainThread:gensim.models.ldamodel:using serial LDA version on this node


In [188]:
# use LDA model: transform new doc to bag-of-words
# then apply lda
# doc_bow = Dictionary.doc2bow(document=string.split())

## Lda2Vec
lda2vec is an extension of word2vec and LDA that jointly learns word, document, and topic vectors.