In [None]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import os
import re
import pickle

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from tqdm import tqdm

In [1]:
# Notebook-wide settings.
# Location of the source spreadsheet, built from plain path components
# (the original embedded a redundant "./" inside the last component).
path = os.path.join("..", "data", "newdata_clean.xlsx")
# Worker-process count handed to LdaMulticore / CoherenceModel further down.
n_cpu = 15
batch_size = 10000
max_k = 40
max_features = 256  # only consider the top max_features ordered by term frequency across the corpus.
# Pickle file read by the loading cell; the commented line is an alternative input.
loadpath = "processed_data_lda_wo_html"
#loadpath = "processed_data_not_rmsw"

NameError: name 'os' is not defined

In [None]:
# Read the pickled preprocessing output and unpack the three corpora used below.
# NOTE(review): pickle.load executes arbitrary code from the file — only point
# `loadpath` at files you produced yourself.
with open(loadpath, "rb") as f:
    output = pickle.load(f)
clean_data, reduced_data, token_data = (
    output[key] for key in ("clean_data", "reduced_data", "token_data")
)

## LDA Topic Model
如果不移除 stopword 的話效果很差，主題的字都會是 of, for, it...
`dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)` 會過濾掉出現在少於 15 個句子、或出現在超過一半句子中的 token，之後最多只保留出現頻率最高的 100000 個 token。 
ref: [Topic Modeling and Latent Dirichlet Allocation (LDA) in Python](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)

In [None]:
import gensim
from gensim.models import LdaMulticore
import pprint
# Build the gensim vocabulary from the tokenised documents and convert every
# document to a bag-of-words vector.
print("Data length: {}".format(len(token_data)))
dictionary = gensim.corpora.Dictionary(token_data)
# Drop tokens found in fewer than 15 documents or in more than 50% of them,
# then keep at most the 100000 most frequent remaining tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

bow_corpus = [dictionary.doc2bow(doc) for doc in token_data]
# Peek at one document as a sanity check. The index is clamped so that a corpus
# shorter than 4311 documents (or an empty one) does not raise IndexError.
if bow_corpus:
    print(bow_corpus[min(4310, len(bow_corpus) - 1)])

### LDA using BOW

In [None]:
# Fit a 10-topic LDA model on the raw bag-of-words counts, then list the
# top words of every topic (-1 asks print_topics for all topics).
lda_model = LdaMulticore(
    corpus=bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=n_cpu,
)
for topic_id, topic_words in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_words))

### LDA using TF-IDF

In [None]:
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel

# Re-weight the bag-of-words corpus with TF-IDF.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
# Peek at one re-weighted document; the index is clamped so a corpus shorter
# than 4311 documents (or an empty one) does not raise IndexError.
if bow_corpus:
    print(corpus_tfidf[min(4310, len(bow_corpus) - 1)])

# One entry per trained model, in the order of `test`.
model_list = []
# NOTE: despite its name this list stores the raw value returned by
# `log_perplexity`, i.e. the per-word likelihood bound (higher is better).
perplexity = []
coherence = []

n_topic = 80
test = [n_topic]  # topic counts to try; currently a single value
for i in test:
    print(i)
    lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=i, iterations=100000, id2word=dictionary, passes=2, workers=n_cpu)
    model_list.append(lda_model_tfidf)

    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} \nWord: {}'.format(idx, topic))
    print("=========================================================================================")

    # `log_perplexity` returns the per-word likelihood bound, not the perplexity
    # itself; gensim defines perplexity as 2 ** (-bound). The original printed the
    # bound under the label "Perplexity" with a "lower is better" note, which is
    # the wrong direction for the bound.
    per_score = lda_model_tfidf.log_perplexity(corpus_tfidf)
    print('Per-word likelihood bound: ', per_score)  # higher (closer to 0) is better
    print('Perplexity: ', 2 ** (-per_score))  # lower is better
    perplexity.append(per_score)  # stored value is unchanged: the bound

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=token_data, dictionary=dictionary, coherence='c_v', processes=n_cpu)
    coherence_lda = coherence_model_lda.get_coherence() # high is better
    print('Coherence Score: ', coherence_lda)
    coherence.append(coherence_lda)
# Disabled snippet: wrapped in a bare triple-quoted string so none of it runs.
# It would pickle the collected scores/models and plot the two score lists.
# NOTE(review): if re-enabled, `x = range(1,500)` must have the same length as
# `perplexity` for `ax[0].plot(x, perplexity)` to work — confirm before use.
'''
with open("perplexity", "wb") as f:
    pickle.dump(perplexity, f)
with open("coherence", "wb") as f:
    pickle.dump(coherence, f)
with open("model_list", "wb") as f:
    pickle.dump(model_list, f)


f, ax = plt.subplots(1, 2, figsize=(10, 5))
x = range(1,500)
ax[0].plot(x,perplexity)   # , c=label_subset_color
ax[0].set_title('BOW Topic Distribution')
ax[1].plot(coherence)   # , c=label_subset_color
ax[1].set_title('coherence')
'''

### Visualize

In [None]:
def print_topic_example(reduced_data, token_data, model, dictionary, n_topic, n_examples=5):
    """Assign every document to its best-scoring topic and print a per-topic report.

    Args:
        reduced_data: Sized sequence of display texts, aligned with ``token_data``.
        token_data: Iterable of token lists; each one is converted with
            ``dictionary.doc2bow`` before being scored.
        model: Object supporting ``model[bow]`` (returning ``(topic_id, score)``
            pairs) and ``model.print_topics(-1)`` (returning ``(topic_id, text)``
            pairs).
        dictionary: Object exposing ``doc2bow(tokens)``.
        n_topic: Minimum length of the printed ``topic_distribution`` list. The
            list grows automatically when the model assigns a topic id that is
            ``>= n_topic``, so an undersized value no longer raises IndexError.
        n_examples: How many of the highest-scoring documents to print per topic.

    Returns:
        The winning topic id of each document, in input order. Documents for
        which the model returns no topic at all are skipped.
    """
    # Local import keeps the function self-contained within the notebook.
    from collections import Counter, defaultdict

    total_len = len(reduced_data)
    print("total_len",total_len)

    topic_counts = Counter()        # topic id -> number of documents won
    examples = defaultdict(list)    # topic id -> [(text, score), ...]
    topic_result = []
    # NOTE(review): documents are scored from raw bag-of-words counts; when
    # `model` was trained on TF-IDF weights the input scale differs — confirm
    # this is intended.
    for s, token in tqdm(zip(reduced_data, token_data), total=total_len):
        bow_vector = dictionary.doc2bow(token)
        rank = model[bow_vector]
        if len(rank) == 0:
            continue
        index, score = max(rank, key=lambda tup: tup[1])
        topic_counts[index] += 1
        topic_result.append(index)
        examples[index].append((s, score))

    # Keep the original list-shaped printout, widened if a larger topic id appeared.
    n_slots = max([n_topic] + [topic_id + 1 for topic_id in topic_counts])
    topic_distribution = [topic_counts[topic_id] for topic_id in range(n_slots)]
    print("topic_distribution: {}".format(topic_distribution))

    for idx, topic in model.print_topics(-1):
        print('Topic: {} | {} datas\nWord: {}'.format(idx, topic_counts[idx], topic))
        # Highest score first; ties keep their input order.
        result = sorted(examples.get(idx, []), key=lambda tup: -tup[1])[:n_examples]
        for s, score in result:
            print("{} | {}\n".format(s,score))
        print()
        print("====================")

    return topic_result

In [None]:
# Report on the BOW model. Its topic count comes from the model itself:
# `n_topic` (80) describes the TF-IDF model, while `lda_model` was trained
# with num_topics=10.
bow_topic_result = print_topic_example(reduced_data, token_data, lda_model, dictionary, lda_model.num_topics)

In [None]:
# Report on the first TF-IDF model of the sweep. The topic count is read from
# the model so it stays correct if the sweep list no longer equals [n_topic].
lda_model_tfidf = model_list[0]
print(lda_model_tfidf)
tfidf_topic_result = print_topic_example(reduced_data, token_data, lda_model_tfidf, dictionary, lda_model_tfidf.num_topics)

In [None]:
# Interactive topic browser for the TF-IDF model.
import pyLDAvis
try:
    # pyLDAvis 3.x renamed the gensim adapter module.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older pyLDAvis releases ship the adapter as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensimvis

visual = gensimvis.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
# Must stay the last expression of the cell so the notebook renders it.
pyLDAvis.display(visual)

In [None]:
# Side-by-side view of how many documents each model assigns to each topic.
f, ax = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (ax[0], bow_topic_result, lda_model.num_topics, 'BOW Topic Distribution'),
    (ax[1], tfidf_topic_result, lda_model_tfidf.num_topics, 'TFIDF Topic Distribution'),
)
for axis, topic_ids, num_topics, title in panels:
    # One bin per integer topic id (edges at k - 0.5). The default of 10 bins
    # merges several topics into one bar once a model has more than 10 topics.
    axis.hist(topic_ids, bins=np.arange(num_topics + 1) - 0.5, rwidth=0.8)
    axis.set_title(title)
    axis.set_xlabel('topic id')
    axis.set_ylabel('documents')