In [134]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [135]:
import sys
print(sys.version)

3.6.5 |Anaconda, Inc.| (default, Apr 29 2018, 16:14:56) 
[GCC 7.2.0]


In [136]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
#print stopwords

In [137]:
all_tweets = pd.read_csv('train_tweets.csv')
#train_tweets = all_tweets[all_tweets['source']=='@@LinkedInHelp']
len(train_tweets)

79

In [138]:
train_tweets = (train_tweets.drop_duplicates(subset='text'))


In [139]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
train_tweets.loc[:,"tokens"] = train_tweets.loc[:,"text"].apply(tokenizer.tokenize)

In [140]:
data_words = train_tweets.loc[:,'tokens']
data_words.head()


9315    [been, suffering, as, a, owner, since, 2015, w...
9316    [protip, if, this, is, how, you, do, linkedin,...
9317    [pathetic, couldn, t, cancel, my, premium, sub...
9318    [guys, when, people, are, professional, and, c...
9319    [heads, up, pls, help, in, knocking, off, this...
Name: tokens, dtype: object

In [141]:

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0:10]]])

<gensim.interfaces.TransformedCorpus object at 0x7f654b9b0e80>




In [142]:
#for i in range(10):
#    print(trigram_mod[bigram_mod[data_words[i]]])

In [143]:
 # Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [144]:
data_no_stopwords = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_no_stopwords)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

data_processed = data_words_bigrams

In [145]:
# Create Dictionary
id2word = corpora.Dictionary(data_processed)

# Create Corpus
texts = data_processed

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

'/home/rcarns/flaskapps/squeakywheel'

In [147]:
mallet_path = './mallet-2.0.8/bin/mallet' 
#ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

In [148]:
'''# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)'''

"# Show Topics\npprint(ldamallet.show_topics(formatted=False))\n\n# Compute Coherence Score\ncoherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')\ncoherence_ldamallet = coherence_model_ldamallet.get_coherence()\nprint('\nCoherence Score: ', coherence_ldamallet)"

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [150]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.016*"help" + 0.013*"get" + 0.013*"hey" + 0.013*"hi" + 0.011*"linkedin" + '
  '0.011*"thanks" + 0.008*"would" + 0.008*"support" + 0.008*"account" + '
  '0.008*"one"'),
 (1,
  '0.013*"linkedin" + 0.011*"someone" + 0.011*"issue" + 0.011*"please" + '
  '0.011*"account" + 0.009*"help" + 0.007*"hi" + 0.007*"send" + 0.007*"case" + '
  '0.007*"office"'),
 (2,
  '0.008*"help" + 0.008*"post" + 0.008*"vous" + 0.008*"followed" + '
  '0.008*"keep" + 0.008*"si" + 0.008*"de" + 0.008*"able" + 0.008*"al" + '
  '0.008*"dear"'),
 (3,
  '0.016*"linkedin" + 0.010*"search" + 0.007*"support" + 0.007*"hey" + '
  '0.007*"wrong" + 0.007*"years" + 0.007*"day" + 0.007*"connection" + '
  '0.007*"still" + 0.007*"jobs"'),
 (4,
  '0.017*"account" + 0.017*"contact" + 0.011*"issue" + 0.011*"email" + '
  '0.011*"guys" + 0.011*"contacts" + 0.007*"get" + 0.007*"someone" + '
  '0.007*"process" + 0.007*"team"')]


In [151]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_processed, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.867594566977201

Coherence Score:  0.47766590137660286


In [152]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
