In [None]:
"""
Use LDA and pyLDAvis for topic modeling visualization

"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from collections import Counter

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from wordcloud import WordCloud

from gensim import matutils, models
import scipy.sparse

%load_ext autoreload
%autoreload 2
import sys
sys.path.append('/Users/katiehuang/Desktop/metis/projects/onl_ds5_project_4/py')
from word_cloud import *
import importlib

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Let's read in our document-term matrix
# speech_df: the speeches DataFrame; its `transcript` column is vectorized below.
speech_df = pd.read_pickle('../dump/speech_clean_lemma')
# data: the pickled document-term matrix.
data = pd.read_pickle('../dump/data_dtm_lemma.pkl')
# Transpose into a term-document matrix and display its shape.
# NOTE(review): `tdm` is reassigned further down once the stop-word list is extended.
tdm = data.transpose()
tdm.shape

(36156, 441)

# Visualization with pyLDAvis

In [3]:
# import pyLDAvis
# import pyLDAvis.sklearn
# pyLDAvis.enable_notebook()
# # from sklearn.datasets import fetch_20newsgroups
# # from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.decomposition import LatentDirichletAllocation


In [4]:
# newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
# docs_raw = newsgroups.data
# print(len(docs_raw))

### Define function

In [5]:
def generate_pyLDAvis(df,column_name,n_components):
    """Fit an LDA model on one text column and prepare its pyLDAvis view.

    Input:
        df: DataFrame holding one document per row.
        column_name: column of interest (e.g. 'transcript', 'nouns').
        n_components: number of LDA topics to fit.
    Output:
        The object returned by pyLDAvis.sklearn.prepare (the pyLDAvis graph
        data for the fitted model).
    """
    documents = df[column_name].tolist()

    # Raw term counts: English stop words removed, tokens restricted to
    # 3+ ASCII letters, terms kept only if they appear in at least 10
    # documents and in no more than half of them.
    tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                    stop_words = 'english',
                                    lowercase = True,
                                    token_pattern = r'\b[a-zA-Z]{3,}\b',
                                    max_df = 0.5,
                                    min_df = 10)
    dtm_tf = tf_vectorizer.fit_transform(documents)

    # random_state is fixed so repeated calls use the same seed.
    lda_tf = LatentDirichletAllocation(n_components=n_components, random_state=0)
    lda_tf.fit(dtm_tf)

    return pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [6]:
# Baseline: 5 topics fitted on the full transcripts.
generate_pyLDAvis(speech_df,'transcript',5)

### Test with Tf-idf
(TF-IDF weighting turns out to give poor results as input to the LDA model)

In [7]:
# Build the count-based document-term matrix with the same vectorizer settings
# as generate_pyLDAvis, keeping the pieces at notebook level for the cells below.
transcripts = speech_df.transcript.tolist()
docs_raw = transcripts
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
# Printed as (number of documents, vocabulary size).
print(dtm_tf.shape)

(441, 3682)


In [8]:
# Configure the TF-IDF vectorizer from the count vectorizer's parameters so both
# share the same tokenization and document-frequency cut-offs.
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)

(441, 3682)


In [9]:
# Fit two LDA models with the same topic count and seed so that the only
# difference between them is the input matrix (raw counts vs. TF-IDF).
n_components = 5
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=n_components, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=n_components, random_state=0)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(n_components=5, random_state=0)

In [10]:
# pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [11]:
# Visualize the LDA model that was fitted on the TF-IDF matrix.
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

In [12]:
# Wow. Indeed Tf-idf is not good for LDA!

## 2. Refine topic modeling
Create gensim corpus and dictionary

In [13]:
import pickle

# NOTE: despite the .txt extension, this file is opened in binary mode and unpickled.
# pickle.load can execute arbitrary code, so only load dumps from a trusted source.
# `common_words` is concatenated onto a list of stop words in the next cell.
with open("../dump/common_words.txt", "rb") as f:   # Unpickling
    common_words = pickle.load(f)

In [14]:
# Bag of words with CountVectorizer
# add_stop_words selected from after lemmatization
# will also remove common_words (most commonly used words in all speeches)
# will also remove boring words (words that do not add much insight to topic modeling)
add_stop_words = ['like','youre','ive','im','really','id','ve','just','dont','thi','wa',
                  'say','know','make','people']

boring_words = ['say','like','just','dont','don','im',
                  'ive','youll','youve','things','thing','youre','right','really','lot',
                  'make','know','people','way','day','class']


# The combined list is reused below when the document-term matrix is rebuilt.
add_stop_words = add_stop_words + common_words + boring_words

# ENGLISH_STOP_WORDS.union(...) returns a frozenset. CountVectorizer documents
# stop_words as 'english', a list, or None, and scikit-learn releases with
# parameter validation reject a frozenset, so pass an explicit list.
stop_words = list(text.ENGLISH_STOP_WORDS.union(add_stop_words))

cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(speech_df.transcript)

In [15]:
# If add_stop_words is modified, update tdm
# NOTE(review): cv_dtm is not defined in this notebook; presumably it comes from
# the `word_cloud` star import — verify.
data_dtm = cv_dtm(speech_df,'transcript',add_stop_words)
# Transpose so the matrix is term-document, replacing the `tdm` loaded earlier.
tdm = data_dtm.transpose()

In [16]:
# Shape check for the rebuilt document-term matrix.
data_dtm.shape

(441, 36121)

In [17]:
# We're going to put the term-document matrix into a new gensim format
# From df --> sparse matrix --> gensim corpus
# Sparse2Corpus treats each column as a document by default, which is why the
# transposed (term-document) matrix is passed rather than data_dtm.
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [18]:
# Gensim also requires a dictionary of all the terms and their respective locations in the term-document matrix
# {location (vocabulary index): word}
# NOTE(review): the indices come from `cv`, while `tdm` was built by cv_dtm;
# this assumes both produce the same vocabulary and ordering — confirm.
id2word = dict((v, k) for k, v in cv.vocabulary_.items())
len(id2word)

36121

In [19]:
def get_lda_topics(model, num_topics):
    """Return the top 10 words of each LDA topic as a pd.DataFrame.

    Each row is one topic, labelled 'Topic #01', 'Topic #02', ...; the columns
    hold the words in the order `model.show_topic` returns them.
    """
    top_words_by_topic = {
        'Topic #' + '{:02d}'.format(topic_idx + 1): [
            entry[0] for entry in model.show_topic(topic_idx, topn = 10)
        ]
        for topic_idx in range(num_topics)
    }
    return pd.DataFrame(top_words_by_topic).transpose()

### A. Nouns only

In [20]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Tokenize a string of text and return only its nouns, joined by spaces.

    A word is kept when the tag assigned by pos_tag starts with 'NN'.
    '''
    tagged_tokens = pos_tag(word_tokenize(text))
    kept_words = []
    for token, tag in tagged_tokens:
        if tag[:2] == 'NN':
            kept_words.append(token)
    return ' '.join(kept_words)

In [21]:
# Apply the nouns function to the transcripts to filter only on nouns
# The result is stored as a new 'nouns' column on speech_df.
speech_df['nouns'] = speech_df.transcript.apply(nouns)

In [22]:
# 5 topics fitted on the noun-only version of each transcript.
generate_pyLDAvis(speech_df,'nouns',5)

### B. Nouns and adjectives

In [23]:
# Let's create a function to pull out nouns and adjectives from a string of text
from nltk import word_tokenize, pos_tag

def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.

    A word is kept when the tag assigned by pos_tag starts with 'NN' or 'JJ'.
    Returns the kept words joined by single spaces, in their original order.
    '''
    tagged_tokens = pos_tag(word_tokenize(text))
    kept_words = [word for (word, tag) in tagged_tokens if tag[:2] in ('NN', 'JJ')]
    return ' '.join(kept_words)

In [24]:
# Apply the nouns_adj function to the transcripts to keep only nouns and adjectives
# The result is stored as a new 'nouns_adj' column on speech_df.
speech_df['nouns_adj'] = speech_df.transcript.apply(nouns_adj)
speech_df.head()

Unnamed: 0,speaker,year,transcript,length,nouns,nouns_adj
0,SIDDHARTHA MUKHERJEE,2018,i wish someone have tell me at my own commence...,14487,i someone commencement requirement graduation ...,i someone own commencement requirement graduat...
1,ABBY WAMBACK,2018,failure be not something to be ashamed of its ...,15866,failure something something power failure octa...,failure something something power failure high...
2,JON B. FISHER,2018,thank you very much my father commencement spe...,8544,thank father commencement speaker martin luthe...,thank much father commencement speaker great m...
3,MINDY KALING,2018,good morning to the class of the faculty the p...,15391,morning class faculty parent grandparents hono...,good morning class faculty parent grandparents...
4,JESMYN WARD,2018,persist be patient be well good morning it be ...,14063,persist morning honor pleasure share day presi...,persist patient good morning honor pleasure sh...


In [25]:
# 5 topics fitted on the nouns-and-adjectives version of each transcript.
generate_pyLDAvis(speech_df,'nouns_adj',5)