In [None]:
"""
Use gensim library to build document/word vectors 
Perform topic identification and document comparison

"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from collections import Counter

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud

from gensim import matutils, models
import scipy.sparse

%load_ext autoreload
%autoreload 2
import sys
sys.path.append('/Users/katiehuang/Desktop/metis/projects/onl_ds5_project_4/py')
from word_cloud import *
import importlib

## 1. Load data

In [2]:
# Read the pickled speeches table and the pickled document-term matrix
speech_df = pd.read_pickle('../dump/speech_clean_lemma')
data = pd.read_pickle('../dump/data_dtm_lemma.pkl')

# Transpose into a term-document matrix and report its dimensions
tdm = data.T
tdm.shape

(36156, 441)

In [3]:
# Preview the first rows of the term-document matrix
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,431,432,433,434,435,436,437,438,439,440
aa,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
aahhhh,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaron,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aback,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abalthus,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Preview the first rows of the speeches table
speech_df.head()

Unnamed: 0,speaker,year,transcript,length
0,SIDDHARTHA MUKHERJEE,2018,i wish someone have tell me at my own commence...,14487
1,ABBY WAMBACK,2018,failure be not something to be ashamed of its ...,15866
2,JON B. FISHER,2018,thank you very much my father commencement spe...,8544
3,MINDY KALING,2018,good morning to the class of the faculty the p...,15391
4,JESMYN WARD,2018,persist be patient be well good morning it be ...,14063


## 2. Prepare for topic modeling
Create gensim corpus and dictionary

In [5]:
import pickle

# Unpickle the common-words list saved earlier.
# NOTE(review): pickle.load can run arbitrary code - only load files you trust.
with open("../dump/common_words.txt", "rb") as fh:
    common_words = pickle.load(fh)

In [6]:
# Bag of words with CountVectorizer.
# Extra stop words layered on top of scikit-learn's English list:
#   * add_stop_words: selected after lemmatization
#   * common_words:   most commonly used words in all speeches (loaded above)
#   * boring_words:   words that do not add much insight to topic modeling
add_stop_words = [
    'like', 'youre', 'ive', 'im', 'really', 'id', 've', 'just', 'dont', 'thi', 'wa',
    'say', 'know', 'make', 'people',
]

boring_words = [
    'say', 'like', 'just', 'dont', 'don', 'im',
    'ive', 'youll', 'youve', 'things', 'thing', 'youre', 'right', 'really', 'lot',
    'make', 'know', 'people', 'way', 'day', 'class',
]

# Merge the three lists; dict.fromkeys drops repeated words while keeping
# first-seen order, so the result stays a plain list without duplicates.
add_stop_words = list(dict.fromkeys(add_stop_words + common_words + boring_words))

stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Pass a list: scikit-learn releases that validate constructor parameters
# reject a frozenset for `stop_words`.
cv = CountVectorizer(stop_words=list(stop_words))
data_cv = cv.fit_transform(speech_df.transcript)

In [7]:
# Rebuild tdm whenever add_stop_words changes.
# cv_dtm is not defined in this notebook (presumably supplied by the
# `from word_cloud import *` star-import - verify).
data_dtm = cv_dtm(speech_df, 'transcript', add_stop_words)
tdm = data_dtm.T

In [8]:
# Convert the term-document matrix into a gensim corpus:
# DataFrame --> scipy sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
# documents_columns=True is gensim's default: each matrix column is one document
corpus = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)

In [9]:
# gensim also needs a dictionary mapping each term's position in the
# term-document matrix to the term itself: {location: word}
# NOTE(review): the ids come from `cv`, while the corpus was built from cv_dtm's
# output; this assumes both share the same vocabulary ordering - TODO confirm.
id2word = {term_id: term for term, term_id in cv.vocabulary_.items()}
len(id2word)

36121

## 3. Topic modeling - LDA

Latent Dirichlet Allocation (**LDA**): specifically designed for text data with a probabilistic approach.  
Use two probability values: **P(word | topics)** and **P(topics | documents)**  
The two values are calculated from an initial random assignment, after which they are recalculated for each word in each document to decide its topic assignment. This iterative procedure repeats until the probabilities converge.


Input:
* document-term matrix
* number of topics
* number of iterations.

In [10]:
def get_lda_topics(model, num_topics, topn=10):
    """Collect the top words of each topic into a DataFrame.

    Parameters
    ----------
    model : object
        Any object exposing ``show_topic(topic_id, topn=...)`` that returns a
        sequence whose items have the word at index 0 (e.g. a gensim LdaModel).
    num_topics : int
        Number of topics to tabulate; topic ids ``0 .. num_topics - 1`` are queried.
    topn : int, optional
        How many words to request per topic (default 10, the previous fixed value).

    Returns
    -------
    pandas.DataFrame
        One row per topic, labelled 'Topic #01', 'Topic #02', ...; the columns
        hold the words in the order returned by ``show_topic``.
    """
    top_words = {}
    for topic_id in range(num_topics):
        label = 'Topic #' + '{:02d}'.format(topic_id + 1)
        # Keep only the word from each entry; the accompanying weight is dropped
        top_words[label] = [entry[0] for entry in model.show_topic(topic_id, topn=topn)]

    # Dict keys become columns, so transpose to get one row per topic
    return pd.DataFrame(top_words).transpose()

### A. All text

In [11]:
# LDA for num_topics = 2
# Two parameters must be chosen: the number of topics and the number of passes.
num_topics = 2
# random_state is fixed so the topics are reproducible across reruns
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics,
                      passes=10, random_state=42)
get_lda_topics(lda, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,start,ask,remember,change,little,let,didnt,thank,write,try
Topic #02,mean,women,let,talk,human,try,ask,feel,help,man


In [12]:
# LDA for num_topics = 3
num_topics = 3
# random_state is fixed so the topics are reproducible across reruns
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics,
                      passes=10, random_state=42)
get_lda_topics(lda, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,ask,mean,little,remember,feel,let,start,write,try,didnt
Topic #02,mean,try,write,little,remember,let,ask,parent,change,talk
Topic #03,start,let,ask,believe,help,man,change,didnt,university,company


In [13]:
# LDA for num_topics = 4
num_topics = 4
# random_state is fixed so the topics are reproducible across reruns
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics,
                      passes=10, random_state=42)
get_lda_topics(lda, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,human,mean,man,talk,change,ask,education,let,place,country
Topic #02,ask,start,try,mean,little,let,remember,help,parent,better
Topic #03,dream,start,let,remember,feel,little,ask,write,didnt,help
Topic #04,write,use,little,remember,ask,women,let,mean,help,believe


### B. Nouns only

In [14]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Tokenize a string and return its nouns joined by single spaces.

    A token counts as a noun when its part-of-speech tag (from pos_tag)
    starts with 'NN'.
    '''
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(token for token, tag in tagged_tokens if tag.startswith('NN'))

In [15]:
# Store the nouns-only version of every transcript in a new 'nouns' column
speech_df['nouns'] = speech_df['transcript'].apply(nouns)
speech_df.head()

Unnamed: 0,speaker,year,transcript,length,nouns
0,SIDDHARTHA MUKHERJEE,2018,i wish someone have tell me at my own commence...,14487,i someone commencement requirement graduation ...
1,ABBY WAMBACK,2018,failure be not something to be ashamed of its ...,15866,failure something something power failure octa...
2,JON B. FISHER,2018,thank you very much my father commencement spe...,8544,thank father commencement speaker martin luthe...
3,MINDY KALING,2018,good morning to the class of the faculty the p...,15391,morning class faculty parent grandparents hono...
4,JESMYN WARD,2018,persist be patient be well good morning it be ...,14063,persist morning honor pleasure share day presi...


In [16]:
# Create dtm_n (document-term matrix with nouns only)
# list(...) because scikit-learn releases that validate constructor parameters
# reject a frozenset for `stop_words`.
cv_n = CountVectorizer(stop_words=list(stop_words))
data_cv_n = cv_n.fit_transform(speech_df.nouns)
# NOTE(review): id2word_n is later derived from cv_n while the corpus comes from
# dtm_n; this assumes cv_dtm yields the same vocabulary ordering - TODO confirm.
dtm_n = cv_dtm(speech_df, 'nouns', add_stop_words)

In [17]:
# gensim corpus for the nouns-only matrix; dtm_n is transposed first because
# documents_columns=True (gensim's default) treats each column as a document
corpus_n = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(dtm_n.T), documents_columns=True)

# Vocabulary dictionary {id: word}, inverted from cv_n's {word: id} mapping
id2word_n = {term_id: term for term, term_id in cv_n.vocabulary_.items()}

In [18]:
# Let's start with 2 topics
num_topics = 2
# random_state is fixed so the topics are reproducible across reruns
lda_n = models.LdaModel(corpus=corpus_n, num_topics=num_topics, id2word=id2word_n,
                        passes=10, random_state=42)
get_lda_topics(lda_n, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,year,parent,place,story,word,friends,women,family,course,fact
Topic #02,parent,man,education,dream,women,university,place,state,country,year


In [19]:
# Now try 3 topics
num_topics = 3
# random_state is fixed so the topics are reproducible across reruns
lda_n = models.LdaModel(corpus=corpus_n, num_topics=num_topics, id2word=id2word_n,
                        passes=10, random_state=42)
get_lda_topics(lda_n, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,place,year,word,kind,parent,sense,person,story,dream,fact
Topic #02,parent,question,dream,family,year,university,education,president,friends,state
Topic #03,man,women,education,place,men,state,parent,course,country,year


In [20]:
# Now try 4 topics
num_topics = 4
# random_state is fixed so the topics are reproducible across reruns
lda_n = models.LdaModel(corpus=corpus_n, num_topics=num_topics, id2word=id2word_n,
                        passes=10, random_state=42)
get_lda_topics(lda_n, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,women,parent,year,question,word,place,friends,course,story,success
Topic #02,place,country,war,man,family,state,parent,word,moment,history
Topic #03,dream,education,university,man,parent,mother,state,women,family,students
Topic #04,education,place,parent,country,course,experience,kind,state,person,university


### C. Nouns and adjectives

In [21]:
# Let's create a function to pull out nouns and adjectives from a string of text
from nltk import word_tokenize, pos_tag

def nouns_adj(text):
    '''Tokenize a string and return its nouns and adjectives joined by single spaces.

    A token is kept when its part-of-speech tag (from pos_tag) starts with
    'NN' or 'JJ'.
    '''
    tagged_tokens = pos_tag(word_tokenize(text))
    kept = [token for token, tag in tagged_tokens if tag.startswith(('NN', 'JJ'))]
    return ' '.join(kept)

In [22]:
# Store the nouns-and-adjectives version of every transcript in a new 'nouns_adj' column
speech_df['nouns_adj'] = speech_df['transcript'].apply(nouns_adj)
speech_df.head()

Unnamed: 0,speaker,year,transcript,length,nouns,nouns_adj
0,SIDDHARTHA MUKHERJEE,2018,i wish someone have tell me at my own commence...,14487,i someone commencement requirement graduation ...,i someone own commencement requirement graduat...
1,ABBY WAMBACK,2018,failure be not something to be ashamed of its ...,15866,failure something something power failure octa...,failure something something power failure high...
2,JON B. FISHER,2018,thank you very much my father commencement spe...,8544,thank father commencement speaker martin luthe...,thank much father commencement speaker great m...
3,MINDY KALING,2018,good morning to the class of the faculty the p...,15391,morning class faculty parent grandparents hono...,good morning class faculty parent grandparents...
4,JESMYN WARD,2018,persist be patient be well good morning it be ...,14063,persist morning honor pleasure share day presi...,persist patient good morning honor pleasure sh...


In [23]:
# Add boring words that do not add much insight to topic modeling

boring_words = [
    'say', 'like', 'just', 'dont', 'don', 'im',
    'ive', 'youll', 'youve', 'things', 'thing', 'youre', 'right', 'really', 'lot',
    'make', 'know', 'people', 'way', 'day', 'class',
]
# De-duplicate while merging (first-seen order kept) so that re-running this
# cell leaves add_stop_words unchanged instead of appending the words again.
add_stop_words = list(dict.fromkeys(add_stop_words + boring_words))
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [24]:
# Create dtm_na (document-term matrix with nouns and adjectives only)
# list(...) because scikit-learn releases that validate constructor parameters
# reject a frozenset for `stop_words`.
cv_na = CountVectorizer(stop_words=list(stop_words))
data_cv_na = cv_na.fit_transform(speech_df.nouns_adj)
# NOTE(review): id2word_na is later derived from cv_na while the corpus comes from
# dtm_na; this assumes cv_dtm yields the same vocabulary ordering - TODO confirm.
dtm_na = cv_dtm(speech_df, 'nouns_adj', add_stop_words)

In [25]:
# gensim corpus for the nouns+adjectives matrix; dtm_na is transposed first because
# documents_columns=True (gensim's default) treats each column as a document
corpus_na = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(dtm_na.T), documents_columns=True)

# Vocabulary dictionary {id: word}, inverted from cv_na's {word: id} mapping
id2word_na = {term_id: term for term, term_id in cv_na.vocabulary_.items()}

In [26]:
# Let's start with 2 topics
num_topics = 2
# random_state is fixed so the topics are reproducible across reruns
lda_na = models.LdaModel(corpus=corpus_na, num_topics=num_topics, id2word=id2word_na,
                         passes=10, random_state=42)
get_lda_topics(lda_na, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,little,parent,place,man,question,important,year,big,moment,young
Topic #02,women,education,best,university,dream,little,parent,year,big,success


In [27]:
# Now try 3 topics
num_topics = 3
# random_state is fixed so the topics are reproducible across reruns
lda_na = models.LdaModel(corpus=corpus_na, num_topics=num_topics, id2word=id2word_na,
                         passes=10, random_state=42)
get_lda_topics(lda_na, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,little,parent,women,best,man,word,big,place,family,year
Topic #02,dream,education,state,women,little,fear,university,moment,fact,hard
Topic #03,human,question,little,parent,country,important,year,place,company,state


In [28]:
# Now try 4 topics
num_topics = 4
# random_state is fixed so the topics are reproducible across reruns
lda_na = models.LdaModel(corpus=corpus_na, num_topics=num_topics, id2word=id2word_na,
                         passes=10, random_state=42)
get_lda_topics(lda_na, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,man,question,women,little,state,education,human,year,university,country
Topic #02,little,dream,important,parent,best,success,year,place,hard,big
Topic #03,education,parent,human,place,young,little,women,dream,family,better
Topic #04,parent,little,word,year,women,friends,old,big,moment,home


In [29]:
# Finally try 10 topics, with more training passes (30 instead of 10)
num_topics = 10
# random_state is fixed so the topics are reproducible across reruns
lda_na = models.LdaModel(corpus=corpus_na, num_topics=num_topics, id2word=id2word_na,
                         passes=30, random_state=42)
get_lda_topics(lda_na, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,human,peace,war,education,future,place,state,culture,experience,company
Topic #02,dream,rule,money,percent,parent,big,true,challenge,arts,friends
Topic #03,word,little,best,human,high,future,year,place,question,moment
Topic #04,parent,little,generation,year,degree,law,point,moment,story,dream
Topic #05,little,parent,dream,advice,year,course,best,speech,better,education
Topic #06,company,business,big,parent,government,little,country,success,place,family
Topic #07,fear,family,little,big,hard,young,man,state,place,best
Topic #08,women,man,little,science,men,important,wonder,year,best,kind
Topic #09,women,university,education,parent,state,human,young,men,little,country
Topic #10,question,success,best,matter,heart,year,business,place,point,answer


In [31]:
# lda_na.print_topics()

## 4. Topic modeling - NMF

Besides LDA, there are other matrix factorization techniques such as Latent Semantic Indexing (**LSI**) and Non-negative Matrix Factorization (**NMF**).

NMF is similar to Principal component analysis (**PCA**). Vectors are non-negative; by factoring them into the lower-dimensional form, coefficients are also non-negative.