# Latent Dirichlet Allocation (LDA)

Sklearn example from https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
from tokenator import tokenize_and_lemmatize

### Load data from the 20 news groups data

In [21]:
df = pd.read_pickle('/Users/hfeiss/dsi/capstone-2/data/clean/clean.pkl')
documents = df['description'].str.join(' ')
print(documents[0])

< p > N o n - W i t n e s s   N a r r a t i v e   b y   c c w   o n   2 0 0 8 - 0 1 - 1 4   ( o k a y   t o   p u b l i s h ) :     A c o t t   L o v e l a n d   I n j u r e d   o n   S l i d e s   o f   M e a d o w   C r e e k ,   O h i o p y l e   P A     P o s t e d :   S a t   J a n   1 2 ,   2 0 0 8   8 : 3 9   p m       B e s t   W i s h e s   F o r   A   S p e e d y   R e c o v e r y   T o   S c o t   L o v e l a n d       I   w a n t e d   t o   u p d a t e   e v e r y o n e   o n   S c o t t ' s   s t a t u s ,   t o   s t o p   a n y   r u m o r s ,   b r i n g   e v e r y o n e   u p   t o   s p e e d ,   a n d   a l l o w   e v e r y o n e   t o   s e n d   t h e i r   b e s t   h o p e s   a n d   p r a y e r s   o n   f o r   h i s   s p e e d y   r e c o v e r y .     T o d a y ,   1 / 1 2 / 0 8   S c o t t   f l i p p e d   o v e r   i n   t h e   C l a s s   V   S l i d e s   r a p i d   o f   M e a d o w   R u n   i n   O h i o p y l e   a n d   s u s t a i n e d   a 

### Prepare the data for LDA

In [22]:
num_features = 1000
vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             max_df=0.55,
                             max_features=num_features,
                             token_pattern=None,
                             tokenizer=tokenize_and_lemmatize)
# LDA can only uses raw term counts
tf = vectorizer.fit_transform(documents)

NameError: name 'TfidfVectorizer' is not defined

In [0]:
tf_feature_names = vectorizer.get_feature_names() #theses are the words in our bag of words

### Build the LDA model

In [0]:
num_topics = 10

# Run LDA
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1)
lda.fit(tf)

In [0]:
temp = lda.components_[0]
print(temp.shape)
temp.argsort()[:10].shape

### Display the top ten words for each topic

In [26]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))

num_top_words = 10
display_topics(lda, tf_feature_names, num_top_words)

NameError: name 'tf_feature_names' is not defined

### Model evaluation

Model [perplexity](https://en.wikipedia.org/wiki/Perplexity) is often used in LDA to evaluate how well a model predicts a sample.

In [0]:
print("Model perplexity: {0:0.3f}".format(lda.perplexity(tf)))

## Show how to do LDA in gensim

Example from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

If you don't already have gensim installed:  
`$ pip install -U gensim`

### Imports

In [0]:
import numpy as np

#gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
# nltk.download('wordnet')

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

### Load data

In [0]:
documents = data_text

### Preprocess data

In [0]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [0]:
processed_docs = df['description'].apply(tokenize_and_lemmatize)

In [0]:
processed_docs[:10]

### Bag of words

In [0]:
#create dictionary
id2word = gensim.corpora.Dictionary(processed_docs)

#create corpus
texts = processed_docs

#Term Document Frequency
bow_corpus = [id2word.doc2bow(text) for text in texts]

In [0]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in bow_corpus]

In [0]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=id2word, passes=2, workers=2)

### View topics in the LDA model

In [0]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[bow_corpus]

### Compute Model Perplexity and Coherence Score (interpretability of the model)



In [0]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Visualize the topics-keywords

In [0]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
vis