In [1]:
from nlprocessing import ProcessCorpus
import pandas as pd
from nltk.corpus import words
import gensim

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
data1 = pd.read_csv('../data/comments.csv', sep='|').dropna()
data2 = pd.read_csv('../data/more_comments.csv').dropna()
data = data1.append(data2, ignore_index=True)
data = data[~data.duplicated()]
data = data[data['subreddit'] != 'politics']
test = data['text'].sample(n=10000)

In [3]:
corpus = ProcessCorpus()
corpus.fit(test, frequency=2)

In [9]:
for i in corpus.corpus_filtered:
    for j in i:
        if j == 'like':
            print('like')

In [4]:
[comment for comment in corpus.gensim_corpus_filtered if comment != []]

[[(1, 1)],
 [(13, 1)],
 [(21, 1)],
 [(29, 1), (31, 1), (37, 1)],
 [(46, 1)],
 [(65, 1)],
 [(74, 1), (77, 1)],
 [(89, 1), (90, 1), (97, 1)],
 [(100, 1)],
 [(107, 1)],
 [(107, 1)],
 [(120, 1)],
 [(123, 1), (124, 1)],
 [(125, 1)],
 [(135, 1)],
 [(142, 1)],
 [(155, 1)],
 [(170, 1)],
 [(176, 1)],
 [(179, 1)],
 [(13, 1), (77, 1), (183, 1)],
 [(195, 1), (198, 1), (208, 1)],
 [(223, 1), (229, 1), (238, 2), (240, 1), (250, 1)],
 [(46, 1), (261, 1)],
 [(264, 1)],
 [(265, 1)],
 [(276, 1)],
 [(284, 1)],
 [(289, 1)],
 [(291, 1)],
 [(284, 1)],
 [(300, 1)],
 [(303, 1)],
 [(284, 1), (319, 1)],
 [(195, 1), (324, 1)],
 [(343, 1)],
 [(349, 1), (350, 1)],
 [(355, 1)],
 [(77, 1), (360, 1), (361, 1), (367, 1)],
 [(183, 1)],
 [(379, 1)],
 [(390, 1)],
 [(398, 1)],
 [(408, 1)],
 [(428, 1)],
 [(432, 1), (433, 1)],
 [(1, 1), (435, 1)],
 [(439, 1)],
 [(46, 1), (442, 1)],
 [(183, 1), (428, 1), (450, 1)],
 [(176, 1), (467, 1), (476, 2), (478, 1), (481, 1)],
 [(284, 1), (490, 1), (496, 2), (503, 1), (504, 2), (508, 

In [25]:
model = gensim.models.LdaModel(
    corpus=corpus.gensim_corpus_filtered,
    id2word=corpus.gensim_dictionary,
    num_topics=5,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

In [33]:
coherence = gensim.models.CoherenceModel(
    model=model,
    texts=corpus.corpus_filtered,
    corpus=corpus.gensim_corpus_filtered
)
coherence.get_coherence()

0.5755995285087667

In [36]:
def find_perplexity_coherence(n_topics):
    model = gensim.models.LdaModel(
        corpus=corpus.gensim_corpus_filtered,
        id2word=corpus.gensim_dictionary,
        num_topics=5,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True
    )
    coherence = gensim.models.CoherenceModel(
        model=model,
        texts=corpus.corpus_filtered,
        corpus=corpus.gensim_corpus_filtered
    )
    return [n_topics, model.log_perplexity(corpus.gensim_corpus_filtered), coherence.get_coherence()]

In [41]:
optimization = []
for i in range(20,32,2):
    optimization.append(find_perplexity_coherence(i))
optimization

[[20, -7.8239480355989075, 0.5912482399871739],
 [22, -7.833763480892517, 0.5841896704168112],
 [24, -7.83101400146077, 0.5972759201660975],
 [26, -7.837975234461614, 0.6055011334720193],
 [28, -7.844886814728603, 0.6241151163076838],
 [30, -7.842480232454846, 0.6185333114742362]]

In [40]:
optimization1=optimization.copy()

In [26]:
# model = gensim.models.LdaModel.load('mdl/model2/full_corpus_model')

In [27]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus.gensim_corpus, corpus.gensim_dictionary)
vis