In [5]:
import pandas as pd
import numpy as np
import sys
sys.path.append('./lib/')
from utility import get_text,ProcessPipeline
### Part 2 Package
from gensim import corpora, models
from IPython.display import Image
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
### read pickle file
with open('data/news.pickle', 'rb') as handle:
    texts = pickle.load(handle)

In [8]:
pipeline = ProcessPipeline(texts)

textsProcessed = pipeline.run()

In [10]:
# double check
textsProcessed[0][:10]

['forum',
 'address',
 'one',
 'press',
 'issu',
 'lifetim',
 'global',
 'energi',
 'climat',
 'chang']

### model topic (LDA)

In [11]:
from gensim import corpora, models
import gensim
from gensim.models import CoherenceModel

In [14]:
"""
Create a dictionary from ‘textsProcessed’ containing the number of times a word appears in the training set.
The Dictionary() function traverses texts, assigning a unique integer id to each unique token while also collecting word counts and relevant statistics. 
    To see each token’s unique integer id, try print(dictionary.token2id).
""" 
dictionary = corpora.Dictionary(textsProcessed)
print(dictionary.token2id)

{'C': 0, 'CO': 1, 'acidif': 2, 'activ': 3, 'address': 4, 'also': 5, 'among': 6, 'anthropogen': 7, 'articl': 8, 'avail': 9, 'beli': 10, 'caus': 11, 'chang': 12, 'climat': 13, 'concern': 14, 'consensu': 15, 'consumpt': 16, 'continu': 17, 'contribut': 18, 'cyclon': 19, 'degre': 20, 'develop': 21, 'drought': 22, 'earth’': 23, 'econom': 24, 'electr': 25, 'emiss': 26, 'energi': 27, 'etc': 28, 'event': 29, 'expect': 30, 'extrem': 31, 'fire': 32, 'forest': 33, 'forum': 34, 'fuell': 35, 'global': 36, 'grid': 37, 'grow': 38, 'ice': 39, 'includ': 40, 'increas': 41, 'indian': 42, 'india’': 43, 'issu': 44, 'known': 45, 'level': 46, 'life': 47, 'lifetim': 48, 'link': 49, 'live': 50, 'loss': 51, 'mean': 52, 'million': 53, 'no': 54, 'not': 55, 'one': 56, 'particularli': 57, 'polar': 58, 'potenti': 59, 'press': 60, 'problem': 61, 'public': 62, 'read': 63, 'research': 64, 'rise': 65, 'sea': 66, 'solar': 67, 'strongli': 68, 'surfac': 69, 'temperatur': 70, 'timelin': 71, 'transform': 72, 'u': 73, 'undoubt

In [15]:
# transform to bag-of-words ( first element is word ind, second one is number)
corpus = [dictionary.doc2bow(text) for text in textsProcessed]
corpus

[[(0, 1),
  (1, 2),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 2),
  (12, 2),
  (13, 2),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 2),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 4),
  (27, 4),
  (28, 1),
  (29, 2),
  (30, 1),
  (31, 2),
  (32, 1),
  (33, 1),
  (34, 2),
  (35, 1),
  (36, 5),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 4),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 2),
  (49, 2),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 3),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 2),
  (78, 1),
  (79, 1)],
 [],
 [],
 [(17, 1),
  (21, 1),
  (27, 3),
  (67, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1)

In [16]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)

In [17]:
ldamodel.print_topics(num_topics=3, num_words=10)

[(0,
  '0.006*"energi" + 0.006*"increas" + 0.006*"delay" + 0.006*"©" + 0.006*"data" + 0.006*"solut" + 0.006*"market" + 0.006*"barchart" + 0.006*"host" + 0.006*"minut"'),
 (1,
  '0.033*"energi" + 0.025*"increas" + 0.021*"global" + 0.017*"emiss" + 0.013*"develop" + 0.013*"solar" + 0.013*"electr" + 0.013*"temperatur" + 0.009*"meet" + 0.009*"current"'),
 (2,
  '0.032*"renew" + 0.032*"energi" + 0.032*"georgia" + 0.032*"power" + 0.013*"continu" + 0.013*"solar" + 0.013*"cost" + 0.013*"procur" + 0.013*"resourc" + 0.013*"biomass"')]

In [18]:
# Evaluate LDA model based on Perplexity & Coherece Score
from gensim.models import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=ldamodel, texts=textsProcessed, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -5.611251297096412

Coherence Score:  0.5730086767966257


In [19]:
# Check each document and its topics
for i in range(len(corpus)):
    tmpTopic = ldamodel[corpus[i]]
    print(f"For doc {i}, corresponding topics & probability are {tmpTopic}")

For doc 0, corresponding topics & probability are [(1, 0.99369097)]
For doc 1, corresponding topics & probability are [(0, 0.33333334), (1, 0.33333334), (2, 0.33333334)]
For doc 2, corresponding topics & probability are [(0, 0.33333334), (1, 0.33333334), (2, 0.33333334)]
For doc 3, corresponding topics & probability are [(2, 0.9852645)]
For doc 4, corresponding topics & probability are [(1, 0.9912958)]
For doc 5, corresponding topics & probability are [(0, 0.33333334), (1, 0.33333334), (2, 0.33333334)]
For doc 6, corresponding topics & probability are [(0, 0.33333334), (1, 0.33333334), (2, 0.33333334)]
For doc 7, corresponding topics & probability are [(0, 0.33333334), (1, 0.33333334), (2, 0.33333334)]
For doc 8, corresponding topics & probability are [(0, 0.33333334), (1, 0.33333334), (2, 0.33333334)]
For doc 9, corresponding topics & probability are [(0, 0.031224895), (1, 0.93797684), (2, 0.030798212)]


In [20]:
# Visualize the topics
from pyLDAvis import gensim as pyLDAvis_gensim
import pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis_gensim.prepare(ldamodel, corpus, dictionary)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [21]:
# find the optimal topic
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Func:
        Compute c_v coherence for various number of topics
    Input:
        dictionary : Gensim dictionary
        corpus : Gensim corpus
        texts : List of input texts
        limit : Max num of topics
    Returns:
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary)
#         model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [22]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=textsProcessed, start=2, limit=40, step=6)

In [24]:
coherence_values

[0.6083147290238292,
 0.7592616782222168,
 0.7461513746140788,
 0.8377298706821122,
 0.8549202857093936,
 0.8147574302525207,
 0.8215957770726989]

In [25]:
from gensim import corpora, models
import gensim
from gensim.models import CoherenceModel
from pyLDAvis import gensim as pyLDAvis_gensim
import pyLDAvis

class LDA:
    def __init__(self,texts,num_topics=3):
        ### input texts: list of texts eg: [["jason","while"],["bay","wheel"]]
        self.texts = texts
        self.dictionary = corpora.Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        
        self.ldamodel = gensim.models.ldamodel.LdaModel(self.corpus, num_topics=num_topics, id2word = self.dictionary, passes=20)
        
    def get_topics(self,num_topics=3,num_words=5):
        topics = self.ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
        print(f"topics: {topics}")
    def get_doc_topics(self,docId=0,print_topics=False):
        tmpTopic = self.ldamodel[self.corpus[docId]]
        if print_topics: print(f"For doc {docId}, corresponding topics & probability are {tmpTopic}")
        return tmpTopic
    
    def get_perplexity(self):
        return self.ldamodel.log_perplexity(self.corpus)  # a measure of how good the model is. lower the better.

    def get_coherence(self):
        # Compute Coherence Score
        coherence_model_lda = CoherenceModel(model=self.ldamodel, texts=self.texts, dictionary=self.dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)
        return self.coherence_lda

    def plot(self):
        # Visualize the topics
        pyLDAvis.enable_notebook()
        vis = pyLDAvis_gensim.prepare(self.ldamodel,self.corpus, self.dictionary)
        return vis

### done