# Latent Dirichlet Allocation (LDA)

Sklearn example from https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
from tokenator import tokenize_and_lemmatize

In [40]:
df = pd.read_pickle('/Users/hfeiss/dsi/capstone-2/data/clean/clean.pkl')
documents = df['description']
print(documents[0])

<p>Non-Witness Narrative by ccw on 2008-01-14 (okay to publish):  Acott Loveland Injured on Slides of Meadow Creek, Ohiopyle PA  Posted: Sat Jan 12, 2008 8:39 pm   Best Wishes For A Speedy Recovery To Scot Loveland   I wanted to update everyone on Scott's status, to stop any rumors, bring everyone up to speed, and allow everyone to send their best hopes and prayers on for his speedy recovery.  Today, 1/12/08 Scott flipped over in the Class V Slides rapid of Meadow Run in Ohiopyle and sustained a sharp blow to his head and/or neck. He was recovered by boaters in the pool at the bottom, where he complained of neck pain and numbness in his right arm. Paramedics were called to the scene, and it was decided that he would be life flighted to a trauma hospital in Pittsburgh.  There is good and bad news to follow: The bad is that Scott sustained a fracture to one of the vertebrae in his neck. The good news is that he has feeling and motion in all of limbs. They are waiting for an MRI to decide

In [41]:
num_features = 1000
vectorizer = CountVectorizer(ngram_range=(1, 2),
                             max_df=0.55,
                             max_features=num_features,
                             token_pattern=None,
                             tokenizer=tokenize_and_lemmatize)
# LDA can only uses raw term counts
tf = vectorizer.fit_transform(documents)

In [42]:
tf_feature_names = vectorizer.get_feature_names() #theses are the words in our bag of words

In [43]:
num_topics = 10

# Run LDA
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online',random_state=42, n_jobs=-1)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=5,
                          mean_change_tol=0.001, n_components=10, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [44]:
temp = lda.components_[0]
print(temp.shape)
temp.argsort()[:10].shape

(1000,)


(10,)

In [45]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))

num_top_words = 10
display_topics(lda, tf_feature_names, num_top_words)

Topic 0:
dam hydraulic low head catch kayak head dam low head fire near
Topic 1:
go know run try rope come pull help think take
Topic 2:
5 4 6 fareast medium fareast new colorful 107 new new black
Topic 3:
run group drop pin paddler downstream swim day state attempt
Topic 4:
black boy 5pt 22em medium snake chris scout 300 7
Topic 5:
new aolmail cap indent 35 green 35 35 steve white word
Topic 6:
strainer 25 pin tube pin strainer stop verdana 6 tubing 6 25
Topic 7:
man county drown sheriff trip guide park die accident office
Topic 8:
search body man county strong canoe find miss police kayaker
Topic 9:
would title 16px repeat h1 block 100 20px float decoration


### Model evaluation

Model [perplexity](https://en.wikipedia.org/wiki/Perplexity) is often used in LDA to evaluate how well a model predicts a sample.

In [46]:
print("Model perplexity: {0:0.3f}".format(lda.perplexity(tf)))

Model perplexity: 544.671


## Show how to do LDA in gensim

Example from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

If you don't already have gensim installed:  
`$ pip install -U gensim`

### Imports

In [60]:
import numpy as np

#gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
# nltk.download('wordnet')

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

### Preprocess data

In [61]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [62]:
processed_docs = df['description'].apply(tokenize_and_lemmatize)

In [63]:
processed_docs[:10]

0    [witness, ccw, 2008, 01, 14, okay, acott, love...
1    [description, chatooga, form, northern, south,...
2    [13, 1978, dr, walt, blackadar, die, routine, ...
3    [january, 14, 1983, ted, davis, lose, life, co...
4    [dinkey, creek, steep, iv, v, tributary, north...
5    [unidentified, 28, old, man, suffer, shoulder,...
6    [libre, franklin, libre, franklin, dekalb, cou...
7    [complete, email, direct, uc, davis, outdoor, ...
8    [pair, young, kayaker, low, james, town, today...
9    [aolmail, entry, meta, aolmail, clearfix, smal...
Name: description, dtype: object

### Bag of words

In [64]:
#create dictionary
id2word = gensim.corpora.Dictionary(processed_docs)

#create corpus
texts = processed_docs

#Term Document Frequency
bow_corpus = [id2word.doc2bow(text) for text in texts]

In [65]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in bow_corpus]

, 1),
  ('hiker', 1),
  ('history', 1),
  ('information', 1),
  ('knock', 1),
  ('live', 1),
  ('location', 3),
  ('material', 1),
  ('meet', 4),
  ('monitor', 1),
  ('overcome', 2),
  ('person', 1),
  ('plus', 1),
  ('possible', 1),
  ('probably', 1),
  ('rig', 1),
  ('school', 11),
  ('see', 4),
  ('set', 1),
  ('skirt', 1),
  ('storm', 2),
  ('struggle', 2),
  ('sun', 1),
  ('sunday', 2),
  ('take', 1),
  ('talk', 1),
  ('upper', 1),
  ('vehicle', 1),
  ('wave', 1),
  ('wear', 2),
  ('wood', 3),
  ('22', 1),
  ('31', 1),
  ('50', 2),
  ('73', 1),
  ('9', 2),
  ('accord', 7),
  ('approximately', 1),
  ('avid', 1),
  ('capsize', 5),
  ('death', 3),
  ('debris', 4),
  ('difficulty', 1),
  ('grieve', 2),
  ('math', 2),
  ('perform', 1),
  ('rage', 1),
  ('ride', 1),
  ('short', 2),
  ('spend', 2),
  ('stage', 5),
  ('student', 2),
  ('teacher', 7),
  ('town', 4),
  ('week', 1),
  ('device', 1),
  ('even', 3),
  ('fish', 7),
  ('flotation', 1),
  ('responder', 1),
  ('swift', 1),
  ('wir

In [66]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=id2word, passes=2, workers=2)

### View topics in the LDA model

In [67]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[bow_corpus]

[(0,
  '0.007*"county" + 0.007*"man" + 0.006*"search" + 0.005*"fall" + 0.004*"body" '
  '+ 0.004*"dam" + 0.004*"strong" + 0.004*"kayaker" + 0.004*"drown" + '
  '0.004*"accident"'),
 (1,
  '0.008*"man" + 0.007*"search" + 0.006*"county" + 0.006*"fall" + 0.006*"body" '
  '+ 0.006*"strong" + 0.005*"kayaker" + 0.005*"new" + 0.005*"drown" + '
  '0.005*"find"'),
 (2,
  '0.006*"run" + 0.006*"county" + 0.006*"man" + 0.006*"body" + 0.005*"day" + '
  '0.004*"group" + 0.004*"accident" + 0.004*"kayak" + 0.004*"kayaker" + '
  '0.004*"go"'),
 (3,
  '0.005*"dam" + 0.004*"body" + 0.004*"kayaker" + 0.004*"run" + 0.004*"rope" + '
  '0.004*"people" + 0.004*"day" + 0.003*"woman" + 0.003*"victim" + 0.003*"go"'),
 (4,
  '0.005*"run" + 0.004*"go" + 0.004*"drown" + 0.004*"know" + 0.003*"state" + '
  '0.003*"life" + 0.003*"5" + 0.003*"guide" + 0.003*"find" + 0.003*"people"'),
 (5,
  '0.006*"run" + 0.005*"group" + 0.005*"go" + 0.004*"victim" + 0.004*"foot" + '
  '0.004*"near" + 0.004*"pull" + 0.004*"creek" + 0.0

### Compute Model Perplexity and Coherence Score (interpretability of the model)



In [68]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.272960764534632

Coherence Score:  0.3694681462573802


### Visualize the topics-keywords

In [69]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [70]:
import joblib
joblib.dump(vis, '/Users/hfeiss/dsi/capstone-2/models/LDAvis.joblib')

['/Users/hfeiss/dsi/capstone-2/models/LDAvis.joblib']