In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from tokenator import tokenize_and_lemmatize


#gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
df = pd.read_pickle('/Users/hfeiss/dsi/capstone-2/data/clean/clean.pkl')
documents = df['description']
print(documents[0])

<p>Non-Witness Narrative by ccw on 2008-01-14 (okay to publish):  Acott Loveland Injured on Slides of Meadow Creek, Ohiopyle PA  Posted: Sat Jan 12, 2008 8:39 pm   Best Wishes For A Speedy Recovery To Scot Loveland   I wanted to update everyone on Scott's status, to stop any rumors, bring everyone up to speed, and allow everyone to send their best hopes and prayers on for his speedy recovery.  Today, 1/12/08 Scott flipped over in the Class V Slides rapid of Meadow Run in Ohiopyle and sustained a sharp blow to his head and/or neck. He was recovered by boaters in the pool at the bottom, where he complained of neck pain and numbness in his right arm. Paramedics were called to the scene, and it was decided that he would be life flighted to a trauma hospital in Pittsburgh.  There is good and bad news to follow: The bad is that Scott sustained a fracture to one of the vertebrae in his neck. The good news is that he has feeling and motion in all of limbs. They are waiting for an MRI to decide

In [3]:
num_features = 1000

vectorizer = CountVectorizer(ngram_range=(1, 2),
                             max_df=0.55,
                             max_features=num_features,
                             token_pattern=None,
                             tokenizer=tokenize_and_lemmatize)

tf = vectorizer.fit_transform(documents)

In [4]:
tf_feature_names = vectorizer.get_feature_names()

In [5]:
num_topics = 6

lda = LatentDirichletAllocation(n_components=num_topics,
                                max_iter=5,
                                learning_method='online',
                                random_state=42,
                                n_jobs=-1)

lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=5,
                          mean_change_tol=0.001, n_components=6, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [6]:
temp = lda.components_[0]
print(temp.shape)
temp.argsort()[:10].shape

(1000,)


(10,)

In [7]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))

num_top_words = 10
display_topics(lda, tf_feature_names, num_top_words)

Topic 0:
man county drown dam old park near canoe accident life
Topic 1:
go run know rope try group pull come help take
Topic 2:
black new 5 4 medium 6 fareast fareast new colorful new black
Topic 3:
run group pin state day swim drop paddler fork high
Topic 4:
search body miss county find strong team recovery effort man
Topic 5:
would title 16px repeat block 100 20px h1 decoration float


### Model evaluation

Model [perplexity](https://en.wikipedia.org/wiki/Perplexity) is often used in LDA to evaluate how well a model predicts a sample.

In [8]:
print("Model perplexity: {0:0.3f}".format(lda.perplexity(tf)))

Model perplexity: 558.373


### Preprocess data

In [9]:
processed_docs = df['description'].apply(tokenize_and_lemmatize)

In [10]:
processed_docs[:10]

0    [witness, ccw, 2008, 01, 14, okay, acott, love...
1    [description, chatooga, form, northern, south,...
2    [13, 1978, dr, walt, blackadar, die, routine, ...
3    [january, 14, 1983, ted, davis, lose, life, co...
4    [dinkey, creek, steep, iv, v, tributary, north...
5    [unidentified, 28, old, man, suffer, shoulder,...
6    [libre, franklin, libre, franklin, dekalb, cou...
7    [complete, email, direct, uc, davis, outdoor, ...
8    [pair, young, kayaker, low, james, town, today...
9    [aolmail, entry, meta, aolmail, clearfix, smal...
Name: description, dtype: object

### Bag of words

In [11]:
#create dictionary
id2word = gensim.corpora.Dictionary(processed_docs)

#create corpus
texts = processed_docs

#Term Document Frequency
bow_corpus = [id2word.doc2bow(text) for text in texts]

In [12]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in bow_corpus]

[[('01', 1),
  ('14', 1),
  ('2008', 2),
  ('381', 1),
  ('8', 1),
  ('accident', 1),
  ('acott', 1),
  ('action', 1),
  ('ahead', 1),
  ('aid', 1),
  ('allow', 1),
  ('appropriate', 1),
  ('arm', 1),
  ('bad', 2),
  ('blow', 1),
  ('boater', 1),
  ('bridge', 1),
  ('bring', 1),
  ('call', 1),
  ('ccw', 1),
  ('complain', 1),
  ('creek', 1),
  ('decide', 2),
  ('derek', 1),
  ('feel', 1),
  ('flighted', 1),
  ('flip', 1),
  ('follow', 2),
  ('fracture', 1),
  ('good', 2),
  ('head', 1),
  ('hope', 1),
  ('hospital', 1),
  ('injure', 1),
  ('jan', 1),
  ('jeff', 1),
  ('jesse', 1),
  ('life', 1),
  ('likely', 1),
  ('limb', 1),
  ('loveland', 2),
  ('maintain', 1),
  ('meadow', 3),
  ('mental', 1),
  ('motion', 1),
  ('mri', 1),
  ('neck', 3),
  ('news', 2),
  ('numbness', 1),
  ('ohiopyle', 2),
  ('okay', 1),
  ('overy', 1),
  ('pain', 1),
  ('paramedic', 1),
  ('physical', 1),
  ('pittsburgh', 1),
  ('pm', 1),
  ('pool', 1),
  ('post', 1),
  ('prayer', 1),
  ('progress', 1),
  ('quick

In [13]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=6, id2word=id2word, passes=2, workers=8)

### View topics in the LDA model

In [14]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[bow_corpus]

[(0,
  '0.008*"man" + 0.007*"county" + 0.006*"body" + 0.005*"search" + 0.005*"fall" '
  '+ 0.005*"drown" + 0.005*"near" + 0.004*"find" + 0.004*"float" + '
  '0.004*"life"'),
 (1,
  '0.006*"dam" + 0.005*"group" + 0.005*"man" + 0.005*"go" + 0.005*"pull" + '
  '0.004*"victim" + 0.004*"rope" + 0.004*"run" + 0.004*"downstream" + '
  '0.004*"kayaker"'),
 (2,
  '0.006*"county" + 0.005*"4" + 0.005*"body" + 0.005*"foot" + 0.004*"fall" + '
  '0.004*"6" + 0.004*"5" + 0.004*"strong" + 0.004*"go" + 0.004*"drown"'),
 (3,
  '0.009*"run" + 0.005*"kayaker" + 0.005*"kayak" + 0.005*"group" + '
  '0.004*"body" + 0.004*"go" + 0.004*"fall" + 0.004*"man" + 0.004*"drown" + '
  '0.004*"know"'),
 (4,
  '0.005*"run" + 0.004*"new" + 0.004*"5" + 0.004*"search" + 0.003*"4" + '
  '0.003*"body" + 0.003*"kayaker" + 0.003*"man" + 0.003*"strong" + '
  '0.003*"paddler"'),
 (5,
  '0.005*"man" + 0.005*"county" + 0.004*"guide" + 0.004*"group" + '
  '0.004*"accident" + 0.004*"pull" + 0.003*"run" + 0.003*"kayaker" + '
  '0.00

### Compute Model Perplexity and Coherence Score (interpretability of the model)



In [15]:
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))

coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=processed_docs,
                                     dictionary=id2word,
                                     coherence='c_v')

coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.234232660496486

Coherence Score:  0.3668232318047428


### Visualize the topics-keywords

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
vis

In [None]:
joblib.dump(vis, models + '/LDAvis.joblib')

In [None]:
joblib.load(models + '/LDAvis.joblib')
pyLDAvis.save_html(vis, models + '/LDAvis.html')