In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import os
import re
import pickle

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from tqdm import tqdm

import pyLDAvis.gensim

from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel

  from collections import Iterable
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
path = os.path.join("..","data","./newdata_clean.xlsx")
n_cpu = 5
batch_size = 10000
max_k = 40
max_features = 256  # only consider the top max_features ordered by term frequency across the corpus.
loadpath = "processed_data_lda_wo_html"
#loadpath = "processed_data_not_rmsw"

n_topic = 80

In [3]:
with open(loadpath, "rb") as f:
    output = pickle.load(f)
clean_data = output["clean_data"]
reduced_data = output["reduced_data"]
token_data = output["token_data"]

## LDA Topic Model
如果不移除 stopword 的話效果很差，主題的字都會是 of, for, it...
`dictionary.filter_extremes()` 過濾掉 token 出現次數少於15個句子，或是出現在超過一半的句子中。 
ref: [Topic Modeling and Latent Dirichlet Allocation (LDA) in Python](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)

In [4]:
import gensim
from gensim.models import LdaMulticore
import pprint
print("Data length: {}".format(len(token_data)))
dictionary = gensim.corpora.Dictionary(token_data)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

bow_corpus = [dictionary.doc2bow(doc) for doc in token_data]
print(bow_corpus[4310]) # just print

Data length: 36939
[(1, 1), (4, 1), (5, 1), (10, 1), (11, 3), (12, 1), (24, 1), (31, 3), (33, 1), (47, 1), (53, 1), (63, 1), (76, 2), (80, 2), (108, 1), (112, 1), (114, 2), (140, 1), (158, 1), (174, 1), (203, 1), (220, 1), (227, 1), (232, 1), (234, 1), (236, 3), (246, 2), (252, 1), (265, 2), (266, 1), (268, 2), (297, 1), (303, 2), (320, 1), (333, 1), (334, 1), (386, 1), (402, 1), (408, 1), (410, 1), (431, 1), (441, 2), (442, 2), (544, 1), (574, 1), (627, 1), (659, 1), (719, 1), (764, 1), (808, 1), (913, 2), (963, 1), (974, 1), (980, 1), (1041, 1), (1066, 1), (1069, 2), (1251, 1), (1274, 1), (1339, 2), (1667, 1), (1719, 1), (1812, 1), (2092, 1), (2135, 1)]


### Visualize

In [5]:
def print_topic_example(reduced_data, token_data, model, dictionary, n_topic):
    print("Model:", model)
    total_len = len(reduced_data)
    print("total_len",total_len)

    topic_distribution = [0 for i in range(n_topic)]
    topic_result = []
    example = [[] for i in range(n_topic)]
    for s, token in tqdm(zip(reduced_data, token_data), total=total_len):
        bow_vector = dictionary.doc2bow(token)
        rank = model[bow_vector]
        if len(rank) == 0:
            continue
        index, score = max(rank, key=lambda tup: tup[1])
        #print(index,score)
        topic_distribution[index] += 1
        topic_result.append(index)
        example[index].append((s, score))
        #print(s)
        #print("Score: {}  Topic: {}\n".format(score, model.print_topic(index, 5)))
    print("topic_distribution: {}".format(topic_distribution))
    
    for idx, topic in model.print_topics(-1):
        print('Topic: {} | {} datas\nWord: {}'.format(idx, topic_distribution[idx], topic))
        result = sorted(example[idx], key=lambda tup: -tup[1])[:5]
        for s, score in result:
            print("{} | {}\n".format(s,score))
        print()
        print("====================")
    
    return topic_result

### LDA using BOW

In [6]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=n_topic, id2word=dictionary, passes=2, workers=n_cpu)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.040*"reply" + 0.036*"developer" + 0.026*"app" + 0.026*"review" + 0.021*"write" + 0.020*"google" + 0.019*"want" + 0.018*"editor" + 0.016*"contact" + 0.014*"like"
Topic: 1 
Words: 0.031*"use" + 0.022*"video" + 0.021*"device" + 0.019*"capture" + 0.015*"version" + 0.013*"vlc" + 0.012*"program" + 0.011*"please" + 0.010*"purchase" + 0.010*"app"
Topic: 2 
Words: 0.039*"download" + 0.023*"product" + 0.020*"link" + 0.020*"key" + 0.020*"purchase" + 0.019*"powerdirector" + 0.018*"powerdvd" + 0.018*"please" + 0.018*"ultra" + 0.017*"photodirector"
Topic: 3 
Words: 0.023*"download" + 0.014*"powerdvd" + 0.013*"get" + 0.012*"program" + 0.011*"use" + 0.011*"please" + 0.011*"new" + 0.011*"powerdirector" + 0.010*"work" + 0.010*"version"
Topic: 4 
Words: 0.026*"powerdirector" + 0.014*"use" + 0.013*"feature" + 0.013*"program" + 0.013*"video" + 0.012*"purchase" + 0.012*"download" + 0.010*"pip" + 0.010*"try" + 0.010*"thank"
Topic: 5 
Words: 0.030*"download" + 0.029*"product" + 0.024*"key" 

In [7]:
bow_topic_result = print_topic_example(reduced_data, token_data, lda_model, dictionary, n_topic)

  0%|          | 88/36939 [00:00<00:42, 876.43it/s]

Model: LdaModel(num_terms=3478, num_topics=80, decay=0.5, chunksize=2000)
total_len 36939


100%|██████████| 36939/36939 [00:48<00:00, 768.10it/s]


topic_distribution: [161, 193, 1070, 107, 222, 378, 201, 696, 378, 303, 474, 497, 102, 178, 1139, 261, 354, 245, 959, 498, 179, 305, 298, 730, 258, 139, 232, 211, 410, 277, 148, 425, 655, 427, 100, 113, 323, 370, 871, 1686, 1711, 284, 365, 240, 125, 131, 515, 262, 615, 160, 719, 376, 266, 213, 682, 894, 150, 1346, 63, 145, 304, 528, 271, 563, 923, 169, 276, 170, 199, 311, 648, 1421, 328, 284, 173, 1753, 210, 411, 1178, 984]
Topic: 0 | 161 datas
Word: 0.040*"reply" + 0.036*"developer" + 0.026*"app" + 0.026*"review" + 0.021*"write" + 0.020*"google" + 0.019*"want" + 0.018*"editor" + 0.016*"contact" + 0.014*"like"
﻿<html><div dir="auto"> Thank u so much iam very glad its been fixed iam soooooo thank ful iam so happy thank u guys great job and hope u guys grow best wishes&nbsp;</div> <br> <div class="gmail_quote"> <div dir="ltr"> On Mon, 14 Jan 2019, 12:30 pm  &lt;<a href="mailto:noreply@google.com"> noreply@google.com</a>  wrote:<br> </div> <blockquote class="gmail_quote" style="margin:0 0

In [8]:
visual = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(visual)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### LDA using TF-IDF

In [9]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
print(corpus_tfidf[4310])

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=n_topic, iterations=100000, id2word=dictionary, passes=2, workers=n_cpu)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))

# Compute Perplexity
per_score = lda_model_tfidf.log_perplexity(corpus_tfidf)
print('Perplexity: ', per_score)  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=token_data, dictionary=dictionary, coherence='c_v', processes=n_cpu)
coherence_lda = coherence_model_lda.get_coherence() # high is better
print('Coherence Score: ', coherence_lda)

[(1, 0.03167906274638008), (4, 0.05968777406530077), (5, 0.07222822116151202), (10, 0.040443009184883257), (11, 0.15954562838219177), (12, 0.04859589956958277), (24, 0.07193005471181863), (31, 0.1703003208531593), (33, 0.028934071348542197), (47, 0.0488459655168579), (53, 0.053634690951270936), (63, 0.05374512038840211), (76, 0.0865111728116621), (80, 0.12634402810353038), (108, 0.0511671022385774), (112, 0.057175724958900095), (114, 0.06001196434858863), (140, 0.02922130108810969), (158, 0.1303232830278721), (174, 0.03284660598771906), (203, 0.045525216659558415), (220, 0.08044749132910173), (227, 0.10103679008811892), (232, 0.055966722269845476), (234, 0.06566735100029075), (236, 0.16372273628252496), (246, 0.11724486387731832), (252, 0.09770160474729414), (265, 0.18171711222882117), (266, 0.100509915566908), (268, 0.13798893886991023), (297, 0.0995975353008924), (303, 0.14848800981156712), (320, 0.09793814969418894), (333, 0.10952426839118118), (334, 0.11341178827747259), (386, 0.09

Perplexity:  -12.736608774908643
Coherence Score:  0.39729621506074003


In [10]:
tfidf_topic_result = print_topic_example(reduced_data, token_data, lda_model_tfidf, dictionary, n_topic)

  0%|          | 99/36939 [00:00<00:37, 988.28it/s]

Model: LdaModel(num_terms=3478, num_topics=80, decay=0.5, chunksize=2000)
total_len 36939


100%|██████████| 36939/36939 [00:49<00:00, 741.54it/s]

topic_distribution: [59, 33, 202, 571, 82, 12, 27, 278, 266, 12, 30, 119, 26, 434, 15, 18, 908, 54, 92, 122, 814, 1633, 391, 567, 265, 3662, 12, 41, 47, 542, 147, 303, 576, 482, 212, 10266, 99, 894, 163, 2441, 113, 31, 13, 169, 255, 95, 142, 99, 2, 53, 305, 20, 931, 156, 49, 22, 20, 465, 472, 4, 515, 37, 451, 248, 211, 75, 363, 378, 161, 583, 116, 21, 103, 2, 67, 1057, 71, 550, 136, 1461]
Topic: 0 | 59 datas
Word: 0.028*"film" + 0.018*"refuse" + 0.015*"gmail" + 0.013*"disappointed" + 0.013*"frustrate" + 0.012*"camera" + 0.011*"presentation" + 0.010*"james" + 0.010*"grateful" + 0.009*"throw"
I find myself here once again, sending a complaining email. I have removed & reinstalled TOO MANY TIMES TO COUNT. does not help. I'm a yearly paid subscriber, and now with this re install it will not let me open all the stuff, that should come with paid yearly. app is updated, phone is updated, cache is cleared, no open apps in back ground. what else ya wonna throw at me to try. <br><br>Attach File 




In [11]:
visual = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
pyLDAvis.display(visual)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
f, ax = plt.subplots(1, 2, figsize=(10, 5))
ax[0].hist(bow_topic_result, rwidth=0.8)   # , c=label_subset_color
ax[0].set_title('BOW Topic Distribution')
ax[1].hist(tfidf_topic_result, rwidth=0.8)   # , c=label_subset_color
ax[1].set_title('TFIDF Topic Distribution')