In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy

import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

import os, re, operator, warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

Read in the CSV file, then prepare the text for processing by concatenating all the comments in the dataframe into a single document in which comments are separated by the newline character.

In [2]:
# Load the comments table from CSV into a DataFrame (path is relative to the notebook).
df = pd.read_csv('../capstone/csv/comments_metadata_sentiment.csv')

In [3]:
# Join the values of the 'comment' column into one string, separated by newlines.
text = df['comment'].str.cat(sep='\n')

Import spaCy's English language pipeline, then process the text and save the result as the "doc" variable.

In [4]:
# NOTE(review): `English` is not used in any cell visible here — confirm before removing.
from spacy.lang.en import English
# Load the 'en_core_web_md' model; `nlp` is the pipeline object used in the cells below.
nlp = spacy.load('en_core_web_md')
# Because of the size of "text" we need to raise the max length. Before running, make sure
# your computer has enough free RAM (roughly 1GB per 1,000,000 characters).
nlp.max_length = 10000000

In [5]:
# Domain-specific stop words (plus a few markup/whitespace fragments seen in the
# data) that should be ignored when building the topic-model vocabulary.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python (u'love' u'hate' would become the single entry u'lovehate').
my_stop_words = [u'school', u'schools', u'PS', u'P.S.', u'read_<', u'teacher', u'teachers', u'Teacher', u'Teachers', u'child', u'children',
                 u'parent', u'parents', u'Parent', u'Parents', u'guardian', u'guardians', u'new', u'city', u'york', u'student', u'students',
                 u'class', u'hi', u'be', u'like', u'great', u'know', u'go', u'Go', u'kid', u'kids', u'href="http://schools.nyc.gov_Offices',
                 u'\xa0 \n', u'\xa0\n', u'\xa0 ', u'<_/P', u'<_/p', u'n\n', u'son', u'daughter', u'u', u'ur', u'ms.', u'Ms.', u'mr.',
                 u'Mr.', u' ', u'Student', u'Students', u'New', u'city', u'City', u'come', u'Say', u'day', u'year', u'York', u'>', u'=',
                 u'good', u'Good', u'bad', u'Bad', u'Ms', u'ms', u'say', u'says', u'saying', u'i.s.', u'I.S.', u'good', u'bad', u'love',
                 u'hate']
# Flag each entry's vocabulary lexeme as a stop word so that `token.is_stop`
# is True for it when the text is processed.
for stopword in my_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

In [6]:
# Run the spaCy pipeline over the whole concatenated text; `doc` is indexed token by token below.
doc = nlp(text)

KeyboardInterrupt: 

In [None]:
# Inspect the `ent_type` attribute of the token at index 4.
doc[4].ent_type

In [None]:
# we add some words to the stop word list
texts, article = [], []
for word in doc:
    # if the word is not a stop word, punctuation, or number
    if word.text != '\n' and not word.is_stop and not word.is_punct and not word.like_num and not word.like_email and not word.like_url:
        # append lemmatized word to article container
        article.append(word.lemma_)
        # if newline then signals end of the document
    if word.text == '\n':
        texts.append(article)
        article = []

---

**The pipeline begins with a DataFrame populated with comments in the `comment` column.**

In [None]:
# Show the comment at position 1 (the second row) of the DataFrame.
df.comment.iloc[1]

**The pandas Series is then joined: the user comments are concatenated with the newline character between them to mark separate documents.**

In [None]:
# Preview the first 3,000 characters of the concatenated text.
text[:3000]

**A stop-word list is created, then the text is passed into spaCy's pipeline to create the "doc". The doc object is a sequence of spaCy tokens, each carrying properties assigned by the pipeline.**

In [None]:
# Show the first token of the doc and the Python type spaCy uses for it.
first_token = doc[0]
print('{} {}\n'.format('Doc Index 0:', first_token))
print('{} {}'.format('Data Type: ', type(first_token)))

In [None]:
# Display the tokens from index 370 up to (not including) index 517.
doc[370:517]

**The doc object is then iterated through, and if a word is not a stop word, punctuation, or number, its lemma is appended to a list called article. When the loop hits a newline character, the article list is appended to a list called texts, which becomes a list of lists containing the lemmatized words.**

In [None]:
# Lemma string of the token at index 372.
doc[372].lemma_

In [None]:
# Check whether the article at index 1 still contains the word "teacher".
texts[1]

In [None]:
# First element of `article` as left behind by the tokenisation loop.
article[0]

In [None]:
# First token of the doc.
doc[0]

In [None]:
# article at index 2 (the third one), all of its lemmatized words
texts[2][:]

In [None]:
# Report how many lemmas `article` holds, then show its first four entries.
article_length = len(article)
print('{} {}'.format('Length of article:', article_length))
print(article[:4])

---

In [None]:
# Build a gensim Phrases model from the tokenised comments; it is applied to `texts` in the next cell.
bigram = gensim.models.Phrases(texts)

In [None]:
# Apply the phrase model to every article and rebind `texts` to the result.
# NOTE(review): because `texts` is overwritten with its own transform, running this cell
# a second time applies the phrase model to already-transformed output.
texts = [bigram[line] for line in texts]

In [None]:
# looking at "texts" we can see the bigrams joined with an underscore
# texts

In [None]:
# create gensim.corpora.dictionary.Dictionary
dictionary = Dictionary(texts)
# create a corpus using bag of words
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
# looking at "corpus" we can see a list of lists containing tuples for each word's index and count
# corpus

**LDA TOPIC MODEL**

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic; a fixed random_state makes the topics reproducible
# across kernel restarts.
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, random_state=42)

In [None]:
# Display the LDA topics using show_topics' default arguments.
ldamodel.show_topics()

**HDP TOPIC MODEL**

In [None]:
# Fit an HDP model on the same corpus; no topic count is passed to it.
# A fixed random_state keeps the result reproducible across kernel restarts.
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary, random_state=42)

In [None]:
# Display the HDP topics using show_topics' default arguments.
hdpmodel.show_topics()

**LSI TOPIC MODEL**

In [None]:
# Fit a 10-topic LSI model on the same bag-of-words corpus and dictionary.
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [None]:
# Display five of the LSI topics.
lsimodel.show_topics(num_topics=5)

**TOPIC COHERENCE** <br><br>
A coherence value is a way to compare topic models using a numerical value. Topic Coherence measures score a single topic by measuring the degree of semantic similarity between high scoring words in the topic.

In [None]:
def _topic_words(model):
    """Return one list of words per topic from model.show_topics(formatted=False), dropping the weights."""
    return [[term for term, _ in terms] for _, terms in model.show_topics(formatted=False)]


lsitopics = _topic_words(lsimodel)
hdptopics = _topic_words(hdpmodel)
ldatopics = _topic_words(ldamodel)

In [None]:
def _coherence_of(topics):
    """Coherence of `topics` measured against the notebook's `texts` and `dictionary` with window_size=10."""
    scorer = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, window_size=10)
    return scorer.get_coherence()


# LSI and HDP topic lists are capped at their first ten entries; the LDA list is used as is.
lsi_coherence = _coherence_of(lsitopics[:10])
hdp_coherence = _coherence_of(hdptopics[:10])
lda_coherence = _coherence_of(ldatopics)

In [None]:
def evaluate_bar_graph(coherences, indices):
    """
    Draw one bar per coherence value on the current matplotlib figure.

    coherences: list of coherence values, one per model
    indices: tick labels for the bars; must have the same length as coherences
             (enforced with an assert).
    """
    assert len(coherences) == len(indices)
    # one x position per bar: 0, 1, ..., len(coherences) - 1
    positions = np.arange(len(coherences))
    plt.bar(positions, coherences, width=0.2, tick_label=indices, align='center')
    plt.xlabel('Models')
    plt.ylabel('Coherence Value')

In [None]:
# Plot the three coherence scores side by side, labelled by model.
evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence], ['LSI', 'HDP', 'LDA'])

**pyLDAvis**

In [None]:
# Enable pyLDAvis' notebook mode (per the function name, for in-notebook rendering).
pyLDAvis.enable_notebook()
# Build the visualisation for the LDA model; as the cell's last expression it is the cell output.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)