In [None]:
"""
(Topic modeling continued)
Use Tf-idf and NMF

"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from collections import Counter

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

from wordcloud import WordCloud

from gensim import matutils, models
import scipy.sparse

%load_ext autoreload
%autoreload 2
import sys
# NOTE(review): absolute, machine-specific path — on any other machine this entry
# will not exist; consider a path relative to the project root instead.
sys.path.append('/Users/katiehuang/Desktop/metis/projects/onl_ds5_project_4/py')
# NOTE(review): star-import hides which helper names come from word_cloud;
# presumably the module lives in the directory appended above — confirm.
from word_cloud import *
import importlib

In [2]:
# Load the cleaned, lemmatized transcripts and the pre-built document-term matrix
speech_df = pd.read_pickle('../dump/speech_clean_lemma')
data = pd.read_pickle('../dump/data_dtm_lemma.pkl')

# Transposed view of the document-term matrix; show its dimensions
tdm = data.T
tdm.shape

(36156, 441)

## 4. Topic modeling - NMF

Besides LDA, other topic-modeling techniques are based on matrix factorization, such as Latent Semantic Indexing (**LSI**) and Non-negative Matrix Factorization (**NMF**).

NMF is similar to Principal Component Analysis (**PCA**) in that both are dimension-reduction techniques, but NMF models are easier to interpret. NMF requires the input vectors to be non-negative; when they are factored into the lower-dimensional form, the resulting coefficients are also non-negative.

- Documents are expressed as combinations of topics
- Images are expressed as combinations of patterns

NMF learns a set of components, each with the same dimension as a sample, and approximates every sample as a non-negative combination of them:  
> **sample ≈ features × components**  
> (transcript ≈ topic weights × topics)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# TF-IDF vectorizer with scikit-learn defaults (no stop-word list is passed here)
tfidf = TfidfVectorizer()

In [5]:
# Learn the vocabulary from the transcripts and build their TF-IDF matrix
# (presumably a SciPy CSR sparse matrix, going by the name — confirm)
csr_mat = tfidf.fit_transform(speech_df['transcript'])

In [6]:
# print(csr_mat)

In [7]:
# Dense view of the TF-IDF matrix, one row per transcript
csr_mat.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
# Vocabulary learned by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back only on older releases. list(...) keeps `words` a plain Python list.
words = list(tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out')
             else tfidf.get_feature_names())

In [9]:
# Wrap the dense TF-IDF matrix in a DataFrame: one row per speech, one column per word.
# get_feature_names() was removed in scikit-learn 1.2, so use the replacement when
# it exists and fall back only on older releases.
tfidf_feature_names = (tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out')
                       else tfidf.get_feature_names())
tfidf_dtm = pd.DataFrame(csr_mat.toarray(), columns=tfidf_feature_names)
tfidf_dtm.index = speech_df.index
# Display every column except the last one
tfidf_dtm.iloc[:,:-1]

Unnamed: 0,aa,aahhhh,aaron,aback,abalthus,abandon,abandonment,abate,abbot,abbreviation,...,ôi,ômay,ôsobriety,ôtell,ôthe,ôwe,ôwhat,ôyou,ôyouõre,über
0,0.0,0.0,0.0,0.0,0.0,0.012061,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
437,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
439,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
# Assemble the stop-word set: hand-picked filler words, a second hand-picked
# "boring" list, and a pickled common-word list, on top of sklearn's English set.
import pickle

boring_words = [
    'say', 'like', 'just', 'dont', 'don', 'im',
    'ive', 'youll', 'youve', 'things', 'thing', 'youre', 'right', 'really', 'lot',
    'make', 'know', 'people', 'way', 'day', 'class',
    'little', 'maybe', 'niagara', 'university', 'dartmouth', 'woman', 'womens', 'wellesley',
    'shirtwaist', 'scripps', 'aidan', 'tuskegee', 'dr', 'colleges', 'guy', 'dave',
    'arts', 'montgomery', 'girls',
]

add_stop_words = [
    'like', 'youre', 'ive', 'im', 'really', 'id', 've', 'just', 'dont', 'didnt', 'thi', 'wa',
    'say', 'know', 'make', 'people',
]

# NOTE: unpickling can execute arbitrary code — only load dumps this project produced.
with open("../dump/common_words.txt", "rb") as f:
    common_words = pickle.load(f)

# Order is kept as filler words, then common words, then boring words
add_stop_words = add_stop_words + common_words + boring_words

stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [153]:
def tfidf_dtm(df,column_name,add_stop_words=None):
    """
    Build a TF-IDF document-term matrix for one text column of a DataFrame.

    Input:
        df: DataFrame holding the corpus (Ex: speech_df)
        column_name: name of the text column (Ex: 'transcript')
        add_stop_words: optional iterable of extra words to drop, in addition
            to the notebook-level `stop_words` set
    Output: Document-Term Matrix as a DataFrame
        (rows: documents, columns: words), indexed like `df`

    """
    # Start from the notebook-level stop-word set and honour the caller's extras
    # (the argument used to be accepted but silently ignored).
    all_stop_words = set(stop_words)
    if add_stop_words:
        all_stop_words.update(add_stop_words)

    # scikit-learn >= 1.2 validates `stop_words` and rejects sets/frozensets,
    # so pass a (sorted, hence deterministic) list.
    tfidf = TfidfVectorizer(stop_words=sorted(all_stop_words))
    data_tfidf = tfidf.fit_transform(df[column_name])

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on releases that predate get_feature_names_out().
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    return pd.DataFrame(data_tfidf.toarray(), columns=feature_names, index=df.index)

In [157]:
# TF-IDF document-term matrix for the transcripts, built with the helper above
doc_word = tfidf_dtm(speech_df,'transcript')
# Display every column except the last one
doc_word.iloc[:,:-1]

Unnamed: 0,aa,aahhhh,aaron,aback,abalthus,abandon,abandonment,abate,abbot,abbreviation,...,ôi,ômay,ôsobriety,ôtell,ôthe,ôwe,ôwhat,ôyou,ôyouõre,über
0,0.0,0.0,0.0,0.0,0.0,0.021631,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
437,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
439,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [155]:
# Factor the document-term matrix into 6 topics with NMF
# (Following Vinny's lecture)
nmf_model = NMF(n_components=6, max_iter=800)

# fit_transform returns the per-document topic weights
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(441, 6)

In [156]:
# Use components in the NMF model to find the top 10 words for each topic:
# argsort orders each topic's word weights ascending, and the reversed slice
# keeps the 10 largest, strongest first
topics = nmf_model.components_.argsort(axis=1)[:,-1:-11:-1]

# Create topic_word df (one row per topic, one column per rank)
words = doc_word.columns
topic_words = [[words[index] for index in topic] for topic in topics]
# Label rows from the actual number of topics rather than a hard-coded 6, so the
# cell keeps working if the model is refit with a different n_components
pd.DataFrame(topic_words,index=['Topic #' + '{:02d}'.format(i) for i in range(len(topic_words))])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,write,try,happen,advice,start,ask,talk,thank,parent,remember
Topic #01,america,government,war,country,nations,unite,education,state,human,nation
Topic #02,women,men,factory,workers,mean,lean,mother,triangle,privilege,feminist
Topic #03,company,business,purpose,start,career,technology,dream,build,journey,passion
Topic #04,dream,team,coach,play,game,football,win,tennis,ball,baseball
Topic #05,fear,experience,art,human,mind,present,feel,practice,science,simply


In [151]:
# Number of vocabulary entries currently held in `words`
len(words)

36102

### K-means (clustering)

In [13]:
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

In [14]:
# Reduce the input to 50 latent dimensions before clustering.
# TruncatedSVD's default solver is randomized; pin the seed so a fresh
# Restart & Run All yields the same projection.
svd = TruncatedSVD(n_components=50, random_state=42)

In [15]:
# KMeans initialisation is random; pin the seed so the clustering step is
# repeatable for a given input
kmeans = KMeans(n_clusters=6, random_state=42)

In [16]:
# Chain the SVD reduction and KMeans so both run in a single fit/predict call
pipeline = make_pipeline(svd,kmeans)

In [17]:
# Fit the SVD + KMeans pipeline on the TF-IDF matrix
pipeline.fit(csr_mat)

Pipeline(steps=[('truncatedsvd', TruncatedSVD(n_components=50)),
                ('kmeans', KMeans(n_clusters=6))])

In [18]:
# Cluster label for each row of csr_mat (i.e. each transcript)
labels = pipeline.predict(csr_mat)

In [19]:
# Speaker names as a plain list, in the same row order as speech_df
speakers = speech_df['speaker'].tolist()

In [20]:
# Pair each speaker with the assigned cluster and print them ordered by label
df = pd.DataFrame(dict(label=labels, speaker=speakers))
print(df.sort_values(by='label'))

     label              speaker
314      0      ESTELLE PARSONS
258      0         CARL SCHRAMM
181      0  ARRIANNA HUFFINGTON
412      0         SUSAN SONTAG
386      0           CARL SAGAN
..     ...                  ...
245      4           JERRY YANG
169      4         FRED ARMISEN
52       4          RON SUSKIND
69       4             ED HELMS
49       5           JACK BLACK

[441 rows x 2 columns]


### NMF (DataCamp)

In [None]:
from sklearn.decomposition import NMF

In [None]:
# 6-component NMF, fitted on the TF-IDF matrix
model = NMF(n_components=6)
model.fit(csr_mat)

In [None]:
from sklearn.preprocessing import normalize

# Component weights for every row of csr_mat under the fitted NMF model
nmf_features = model.transform(csr_mat)

# Normalize the NMF features (normalize() defaults to row-wise L2 scaling)
norm_features = normalize(nmf_features)

In [None]:
# Learned component matrix of the fitted NMF model (one row per component)
print(model.components_)

In [None]:
# NMF feature weights per speech, with speaker names as row labels
df = pd.DataFrame(nmf_features,index=speakers)
df

#### Recommender: cosine similarity

In [None]:
# Recommender using cosine similarity:
# the rows of norm_features were normalized, so the dot product of every row
# with one reference row scores how similar each speech is to that reference

df = pd.DataFrame(data=norm_features, index=speakers)
transcript = df.iloc[0]  # reference speech: the first row
similarities = df @ transcript

In [None]:
# Find speeches most similar to that of SIDDHARTHA MUKHERJEE
# (presumably the speaker in row 0, which is used as the reference — TODO confirm)
# nlargest() with no argument returns the top 5 entries
print(similarities.nlargest())

In [None]:
# The 5 least similar speeches (nsmallest() defaults to n=5)
print(similarities.nsmallest())

### NMF (Vinny's lecture)

In [21]:
# Raw term counts with sklearn's built-in English stop words removed.
# NOTE: this rebinds doc_word, which held a TF-IDF DataFrame earlier in the notebook.
vectorizer = CountVectorizer(stop_words='english')
doc_word = vectorizer.fit_transform(speech_df['transcript'])
doc_word.shape

(441, 36168)

In [28]:
# Show the summary repr of the count matrix
doc_word

<441x36168 sparse matrix of type '<class 'numpy.int64'>'
	with 234140 stored elements in Compressed Sparse Row format>

In [22]:
# 6-topic NMF on the raw count matrix; fit_transform returns the
# per-document topic weights
nmf_model = NMF(n_components=6)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape



(441, 6)

In [23]:
# get_feature_names() was removed in scikit-learn 1.2; use the replacement when
# available and keep `words` a plain list either way
words = list(vectorizer.get_feature_names_out() if hasattr(vectorizer, 'get_feature_names_out')
             else vectorizer.get_feature_names())
# Spot-check one vocabulary entry by its column index
words[27030]

'say'

In [24]:
# Column indices of the 10 highest-weighted words per topic, strongest first:
# sort each row ascending, then walk backwards from the end for 10 entries
topics = np.argsort(nmf_model.components_, axis=1)[:, :-11:-1]

In [25]:
# Translate the per-topic word indices into the words themselves
topic_words = [[words[index] for index in topic] for topic in topics]
# Label rows from the actual number of topics rather than a hard-coded 6, so the
# cell keeps working if the model is refit with a different n_components
pd.DataFrame(topic_words,index=['Topic #' + '{:02d}'.format(i) for i in range(len(topic_words))])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,just,like,know,think,im,dont,youre,people,want,make
Topic #01,world,people,make,new,human,state,country,war,time,right
Topic #02,say,know,tell,years,come,people,want,think,dont,look
Topic #03,love,man,life,soul,god,men,make,like,come,live
Topic #04,life,make,time,work,years,live,know,learn,want,school
Topic #05,wonder,science,religion,question,make,cells,arts,ask,think,knowledge


In [26]:
# Give each speech its dominant topic: the column of doc_topic with the largest value.
# NOTE: this adds a 'topic' column to speech_df in place.
speech_df['topic'] = doc_topic.argmax(axis=1)

In [27]:
# First 10 speeches when ordered by assigned topic
speech_df.sort_values('topic').head(10)

Unnamed: 0,speaker,year,transcript,length,topic
440,GARRISON KEILLOR,0,its an honor to be with so many smart people a...,8063,0
339,SETH MACFARLANE,2004,thank you very much i tell you there be nowher...,15994,0
117,JIMMY IOVINE,2013,to all of todays graduate i cant imagine whats...,14963,0
119,TRACY CHEVALIER,2013,greet president krislov graduate students fami...,16863,0
120,JON LOVETT,2013,i recently turn thirty which i know seem like ...,10804,0
329,JON STEWART,2004,thank you mr president i have forget how crush...,9613,0
122,SHARYN ALFONSI,2013,ole miss journalism and integrate market commu...,15057,0
345,WILL FERRELL,2003,this be not the worcester mass boat show be it...,10594,0
324,THOMAS L FRIEDMAN,2005,it be an honor to stand before you this mornin...,22392,0
127,ED HELMS,2013,hello knox collegethank you students faculty p...,10506,0
