In [None]:
"""
Use gensim library to build document/word vectors 
Perform topic identification and document comparison with LDA

"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from collections import Counter

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud

from gensim import matutils, models
import scipy.sparse

%load_ext autoreload
%autoreload 2
import sys
sys.path.append('/Users/katiehuang/Desktop/metis/projects/onl_ds5_project_4/py')
from word_cloud import *
import importlib

## 1. Load data

In [2]:
# Let's read in our document-term matrix
speech_df = pd.read_pickle('../dump/speech_clean_lemma')
data = pd.read_pickle('../dump/data_dtm_lemma.pkl')
tdm = data.transpose()
tdm.shape

(36156, 441)

In [37]:
tdm.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,431,432,433,434,435,436,437,438,439,440
aa,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
aahhhh,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaron,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aback,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abalthus,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
abandon,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
abandonment,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abate,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abbot,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abbreviation,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
speech_df.head()

Unnamed: 0,speaker,year,transcript,length
0,SIDDHARTHA MUKHERJEE,2018,i wish someone have tell me at my own commence...,14487
1,ABBY WAMBACK,2018,failure be not something to be ashamed of its ...,15866
2,JON B. FISHER,2018,thank you very much my father commencement spe...,8544
3,MINDY KALING,2018,good morning to the class of the faculty the p...,15391
4,JESMYN WARD,2018,persist be patient be well good morning it be ...,14063


## 2. Prepare for topic modeling
Create gensim corpus and dictionarry

In [5]:
import pickle

with open("../dump/common_words.txt", "rb") as f:   # Unpickling
    common_words = pickle.load(f)

In [6]:
# Bag of words with CountVectorizer
# add_stop_words selected from after lemmatization
# will also remove common_words (most commonly used words in all speeches)
# will also remove boring words (words that do not add much insight to topic modeling)
add_stop_words = ['like','youre','ive','im','really','id','ve','just','dont','thi','wa',
                  'say','know','make','people']

boring_words = ['say','like','just','dont','don','im',
                  'ive','youll','youve','things','thing','youre','right','really','lot',
                  'make','know','people','way','day','class']


add_stop_words = add_stop_words + common_words + boring_words

stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(speech_df.transcript)

In [7]:
# If add_stop_words is modified, update tdm
data_dtm = cv_dtm(speech_df,'transcript',add_stop_words)
tdm = data_dtm.transpose()

In [8]:
# We're going to put the term-document matrix into a new gensim format
# From df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [9]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
# {dictionsry of location: word}
id2word = dict((v, k) for k, v in cv.vocabulary_.items())
len(id2word)

36121

## 3. Topic modeling - LDA

Latent Dirichlet Allocation (**LDA**): specifically designed for text data with a probabilistic approach.  
Use two probability values: **P(word | topics)** and **P(topics | documents)**  
The two values arre calculated based on an initial random assignment, after which they are repeated for each word in each document to decide their topic assignment. In the iterative procedure, the probabilities are calculated until converged.


Input:
* document-term matrix
* number of topics
* number of iterstions.

In [10]:
def get_lda_topics(model, num_topics):
    """Print lda topics with pd.DataFrame"""
    
    word_dict = {}
    for i in range(num_topics):
        words = model.show_topic(i, topn = 10)
        word_dict['Topic #' + '{:02d}'.format(i+1)] = [i[0] for i in words]
        
    return pd.DataFrame(word_dict).transpose()

### A. All text

In [11]:
# We need to specify two parameters: the number of topics and the number of passes
num_topics = 2
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, passes=10)
get_lda_topics(lda, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,ask,mean,help,write,let,try,start,talk,remember,better
Topic #02,little,mean,let,ask,remember,women,start,try,thank,feel


In [12]:
# LDA for num_topics = 3
num_topics = 3
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, passes=10)
get_lda_topics(lda, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,remember,try,little,talk,ask,help,start,mean,write,feel
Topic #02,human,man,state,mean,education,country,women,believe,help,change
Topic #03,ask,start,let,little,mean,try,remember,parent,thank,didnt


In [13]:
# LDA for num_topics = 4
num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, passes=10)
get_lda_topics(lda, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,remember,little,let,ask,change,start,mean,talk,try,didnt
Topic #02,write,mean,ask,question,try,let,feel,place,experience,little
Topic #03,women,start,ask,remember,believe,thank,help,mean,let,write
Topic #04,dream,help,mean,start,little,ask,try,let,thank,change


### B. Nouns only

In [14]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [15]:
# Apply the nouns function to the transcripts to filter only on nouns
speech_df['nouns'] = speech_df.transcript.apply(nouns)
speech_df.head()

Unnamed: 0,speaker,year,transcript,length,nouns
0,SIDDHARTHA MUKHERJEE,2018,i wish someone have tell me at my own commence...,14487,i someone commencement requirement graduation ...
1,ABBY WAMBACK,2018,failure be not something to be ashamed of its ...,15866,failure something something power failure octa...
2,JON B. FISHER,2018,thank you very much my father commencement spe...,8544,thank father commencement speaker martin luthe...
3,MINDY KALING,2018,good morning to the class of the faculty the p...,15391,morning class faculty parent grandparents hono...
4,JESMYN WARD,2018,persist be patient be well good morning it be ...,14063,persist morning honor pleasure share day presi...


In [16]:
# Create dtm_n (document-term matrix with nouns only)
cv_n = CountVectorizer(stop_words=stop_words)
data_cv_n = cv_n.fit_transform(speech_df.nouns)
dtm_n = cv_dtm(speech_df,'nouns',add_stop_words)
# dtm_n

In [17]:
# Create the gensim corpus
corpus_n = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(dtm_n.transpose()))

# Create the vocabulary dictionary
id2word_n = dict((v, k) for k, v in cv_n.vocabulary_.items())

In [18]:
# Let's start with 2 topics
num_topics = 2
lda_n = models.LdaModel(corpus=corpus_n, num_topics=num_topics, id2word=id2word_n, passes=10)
get_lda_topics(lda_n, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,parent,man,education,place,question,women,year,word,friends,country
Topic #02,dream,university,year,women,place,parent,family,course,state,word


In [19]:
# Let's start with 3 topics
num_topics = 3
lda_n = models.LdaModel(corpus=corpus_n, num_topics=num_topics, id2word=id2word_n, passes=10)
get_lda_topics(lda_n, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,parent,friends,year,question,family,place,person,story,point,home
Topic #02,women,dream,university,education,year,president,state,country,word,place
Topic #03,man,education,war,state,place,country,word,men,kind,peace


In [20]:
# Let's start with 4 topics
num_topics = 4
lda_n = models.LdaModel(corpus=corpus_n, num_topics=num_topics, id2word=id2word_n, passes=10)
get_lda_topics(lda_n, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,parent,year,dream,person,family,advice,moment,friends,place,home
Topic #02,parent,sense,idea,generation,word,place,year,country,friends,president
Topic #03,women,question,state,education,year,parent,university,course,company,dream
Topic #04,man,women,place,men,success,word,course,career,university,country


In [21]:
# Test for network analysis
topic_df = get_lda_topics(lda_n, num_topics)
topic_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,parent,year,dream,person,family,advice,moment,friends,place,home
Topic #02,parent,sense,idea,generation,word,place,year,country,friends,president
Topic #03,women,question,state,education,year,parent,university,course,company,dream
Topic #04,man,women,place,men,success,word,course,career,university,country


In [23]:
topic_df.to_pickle('../dump/topic_df')

In [24]:
# Let's start with 10 topics
num_topics = 10
lda_n = models.LdaModel(corpus=corpus_n, num_topics=num_topics, id2word=id2word_n, passes=30)
get_lda_topics(lda_n, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,team,state,story,family,man,war,president,year,football,moment
Topic #02,women,success,wonder,science,men,dream,career,question,arts,team
Topic #03,dream,question,idea,kind,country,course,president,place,kid,parent
Topic #04,question,dream,book,women,year,parent,speech,writer,point,heart
Topic #05,man,word,god,soul,place,hand,question,men,year,heart
Topic #06,point,future,experience,moment,commencement,matter,friends,days,home,place
Topic #07,dream,person,course,place,moment,year,community,women,experience,art
Topic #08,parent,women,friends,year,company,family,word,talk,story,place
Topic #09,university,education,career,business,company,parent,music,family,success,dream
Topic #10,country,education,state,place,war,sense,man,society,university,parent


In [25]:
# Let's start with 20 topics
num_topics = 20
lda_n = models.LdaModel(corpus=corpus_n, num_topics=num_topics, id2word=id2word_n, passes=40)
get_lda_topics(lda_n, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,advice,parent,voice,moment,friends,book,mother,women,marriage,happiness
Topic #02,women,word,man,men,place,year,friends,law,mind,parent
Topic #03,degree,story,family,guy,game,room,coach,man,plan,trinity
Topic #04,course,book,art,science,gift,number,film,power,hours,year
Topic #05,dream,advice,question,arts,humanities,art,place,story,commencement,disease
Topic #06,parent,university,students,family,career,person,friends,education,success,year
Topic #07,station,guy,luck,radio,group,friend,face,career,phone,story
Topic #08,wonder,science,religion,music,cells,arts,play,curiosity,awe,paradigm
Topic #09,dream,company,parent,university,education,business,person,kind,opportunity,year
Topic #10,fear,future,moment,practice,point,capacity,experience,heart,place,ones


### B. Nouns and adjective

In [26]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [27]:
# Apply the nouns function to the transcripts to filter only on nouns
speech_df['nouns_adj'] = speech_df.transcript.apply(nouns_adj)
speech_df.head()

Unnamed: 0,speaker,year,transcript,length,nouns,nouns_adj
0,SIDDHARTHA MUKHERJEE,2018,i wish someone have tell me at my own commence...,14487,i someone commencement requirement graduation ...,i someone own commencement requirement graduat...
1,ABBY WAMBACK,2018,failure be not something to be ashamed of its ...,15866,failure something something power failure octa...,failure something something power failure high...
2,JON B. FISHER,2018,thank you very much my father commencement spe...,8544,thank father commencement speaker martin luthe...,thank much father commencement speaker great m...
3,MINDY KALING,2018,good morning to the class of the faculty the p...,15391,morning class faculty parent grandparents hono...,good morning class faculty parent grandparents...
4,JESMYN WARD,2018,persist be patient be well good morning it be ...,14063,persist morning honor pleasure share day presi...,persist patient good morning honor pleasure sh...


In [28]:
# Add boring words that do not add much insight to topic modeling

boring_words = ['say','like','just','dont','don','im',
                  'ive','youll','youve','things','thing','youre','right','really','lot',
                  'make','know','people','way','day','class']
add_stop_words = add_stop_words + boring_words
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [29]:
# Create dtm_n (document-term matrix with nouns only)
cv_na = CountVectorizer(stop_words=stop_words)
data_cv_na = cv_na.fit_transform(speech_df.nouns_adj)
dtm_na = cv_dtm(speech_df,'nouns_adj',add_stop_words)
# dtm_na

In [30]:
# Create the gensim corpus
corpus_na = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(dtm_na.transpose()))

# Create the vocabulary dictionary
id2word_na = dict((v, k) for k, v in cv_na.vocabulary_.items())

In [31]:
# Let's start with 2 topics
num_topics=2
lda_na = models.LdaModel(corpus=corpus_na, num_topics=num_topics, id2word=id2word_na, passes=10)
get_lda_topics(lda_na, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,little,parent,dream,important,year,best,family,big,friends,hard
Topic #02,human,women,man,country,state,place,little,education,young,word


In [32]:
# Let's start with 3 topics
num_topics=3
lda_na = models.LdaModel(corpus=corpus_na, num_topics=num_topics, id2word=id2word_na, passes=10)
get_lda_topics(lda_na, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,little,dream,women,parent,success,fear,place,course,year,best
Topic #02,little,parent,year,big,family,best,important,young,friends,better
Topic #03,human,state,man,women,education,country,place,parent,question,university


In [33]:
# Let's start with 4 topics
num_topics=4
lda_na = models.LdaModel(corpus=corpus_na, num_topics=num_topics, id2word=id2word_na, passes=10)
get_lda_topics(lda_na, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,place,question,little,human,parent,women,young,best,war,year
Topic #02,parent,women,education,little,university,best,year,course,place,country
Topic #03,parent,little,word,university,education,young,place,future,fear,kind
Topic #04,little,big,important,man,dream,family,parent,year,best,true


In [34]:
# Let's start with 10 topics
num_topics=10
lda_na = models.LdaModel(corpus=corpus_na, num_topics=num_topics, id2word=id2word_na, passes=30)
get_lda_topics(lda_na, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,question,university,human,experience,sense,universe,little,parent,education,state
Topic #02,best,word,place,parent,friends,advice,little,sense,share,better
Topic #03,man,men,soul,true,old,government,heart,truth,problems,god
Topic #04,little,dream,parent,big,family,best,year,success,course,important
Topic #05,mother,story,act,god,big,man,men,women,woman,father
Topic #06,word,old,little,young,moment,question,guy,hand,sound,fear
Topic #07,education,war,human,state,women,peace,unite,man,country,men
Topic #08,future,parent,idea,generation,big,little,america,point,past,present
Topic #09,women,human,society,country,place,state,social,public,better,little
Topic #10,question,person,deaf,university,hard,education,year,president,important,students


In [35]:
# Let's start with 20 topics
num_topics=20
lda_na = models.LdaModel(corpus=corpus_na, num_topics=num_topics, id2word=id2word_na, passes=40)
get_lda_topics(lda_na, num_topics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #01,university,science,wonder,education,country,percent,problems,knowledge,challenge,state
Topic #02,little,important,best,moment,hard,point,advice,family,big,parent
Topic #03,man,war,better,young,hand,father,little,football,family,place
Topic #04,women,university,education,advice,man,passion,place,little,students,mit
Topic #05,purpose,place,little,kid,sense,big,parent,better,career,best
Topic #06,company,parent,little,old,guy,kind,different,career,big,year
Topic #07,music,risk,musicians,failure,dream,home,musician,moments,word,weekend
Topic #08,western,west,human,europe,war,state,education,countries,unite,difficult
Topic #09,women,men,little,state,woman,country,fact,important,kind,best
Topic #10,unite,nations,dream,opportunity,thank,parent,women,word,title,generation


In [36]:
# lda_na.print_topics()