In [None]:
"""
(Topic modeling continued)
Use Tf-idf and NMF

"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from collections import Counter

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

from wordcloud import WordCloud

from gensim import matutils, models
import scipy.sparse

%load_ext autoreload
%autoreload 2
import sys
sys.path.append('/Users/katiehuang/Desktop/metis/projects/onl_ds5_project_4/py')
from word_cloud import *
import importlib

In [3]:
# Let's read in our transcripts and document-term matrix
# Load the pickled speeches frame (it carries the 'transcript' and 'speaker'
# columns used below) and the pickled document-term matrix.
speech_df = pd.read_pickle('../dump/speech_clean_lemma')
data = pd.read_pickle('../dump/data_dtm_lemma.pkl')
# Transpose the document-term matrix so terms are rows and documents are columns.
tdm = data.transpose()
# Display the (rows, columns) of the transposed matrix.
tdm.shape

(36156, 441)

## 4. Topic modeling - NMF

Besides LDA, topics can also be extracted with matrix factorization techniques such as Latent Semantic Indexing (**LSI**) and Non-negative Matrix Factorization (**NMF**).

NMF is similar to Principal Component Analysis (**PCA**) in that both reduce dimensionality, but NMF models are easier to interpret. The input vectors must be non-negative, and when they are factored into the lower-dimensional form, the resulting coefficients are non-negative as well.

- Documents are expressed as combinations of topics
- Images are expressed as combinations of patterns

An NMF model learns components, each with the same dimension as a sample. Every sample is then approximated by its NMF features multiplied by the components:  
> **sample ≈ features × components**  
> (transcript ≈ topic weights × topics)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# TF-IDF vectorizer with default settings (no stop-word list is passed here).
tfidf = TfidfVectorizer()

In [6]:
# Learn the vocabulary from the transcripts and encode them as a TF-IDF matrix
# (one row per transcript).
csr_mat = tfidf.fit_transform(speech_df['transcript'])

In [24]:
# print(csr_mat)

In [8]:
# Dense view of the TF-IDF matrix for a quick visual check.
csr_mat.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Vocabulary learned by the TF-IDF vectorizer, kept as a plain list.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when it exists and fall back on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    words = list(tfidf.get_feature_names_out())
else:
    words = tfidf.get_feature_names()

In [109]:
# TF-IDF document-term frame: one row per speech (indexed like speech_df),
# one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when it exists and fall back on older releases.
tfidf_dtm = pd.DataFrame(
    csr_mat.toarray(),
    columns=(tfidf.get_feature_names_out()
             if hasattr(tfidf, 'get_feature_names_out')
             else tfidf.get_feature_names()),
    index=speech_df.index,
)
# Display every column except the last one.
tfidf_dtm.iloc[:,:-1]

Unnamed: 0,aa,aahhhh,aaron,aback,abalthus,abandon,abandonment,abate,abbot,abbreviation,...,ôi,ômay,ôsobriety,ôtell,ôthe,ôwe,ôwhat,ôyou,ôyouõre,über
0,0.0,0.0,0.0,0.0,0.0,0.012061,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
437,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
439,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# Handle stop words
# Handle stop words
# First hand-picked list of extra stop words.
add_stop_words = ['like','youre','ive','im','really','id','ve','just','dont','thi','wa',
                  'say','know','make','people']

# Second hand-picked list; it overlaps with the first, which is harmless because
# everything ends up in a set below.
boring_words = ['say','like','just','dont','don','im',
                  'ive','youll','youve','things','thing','youre','right','really','lot',
                  'make','know','people','way','day','class']

# NOTE: despite the .txt extension the file is opened in binary mode and read
# with pickle. Unpickling can execute arbitrary code, so only load a file you
# produced yourself.
import pickle
with open("../dump/common_words.txt", "rb") as f:   # Unpickling
    common_words = pickle.load(f)
    
# Merge the three sources; list concatenation assumes common_words is a list
# -- TODO confirm against the cell that wrote the file.
add_stop_words = add_stop_words + common_words + boring_words

# Final stop-word set: scikit-learn's built-in English list plus the extras.
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
# stop_words

In [118]:
def tfidf_dtm(df,column_name,add_stop_words=None):
    """
    Build a TF-IDF document-term matrix from one text column of a DataFrame.

    Input:
        df: corpus DataFrame (Ex: speech_clean_2)
        column_name: name of the column holding the text (Ex: 'transcript')
        add_stop_words: optional extra stop words (a string or an iterable of
            strings) removed in addition to the notebook-level `stop_words`.
            Leaving it out gives the same result as before.
    Output: Document-Term Matrix (rows: documents, columns: words),
        indexed like `df`

    """
    # Start from the notebook-level stop-word set and add any caller extras.
    # (This parameter used to be accepted but silently ignored.)
    active_stop_words = set(stop_words)
    if add_stop_words:
        if isinstance(add_stop_words, str):
            # A bare string would otherwise be split into single characters.
            add_stop_words = [add_stop_words]
        active_stop_words.update(add_stop_words)

    # scikit-learn documents `stop_words` as 'english', a list or None, and
    # recent releases reject other containers such as a frozenset, so pass a list.
    tfidf_vec = TfidfVectorizer(stop_words=list(active_stop_words))
    data_tfidf = tfidf_vec.fit_transform(df[column_name])

    # get_feature_names() was removed in scikit-learn 1.2; use the replacement
    # when it exists and fall back on older releases.
    if hasattr(tfidf_vec, 'get_feature_names_out'):
        terms = tfidf_vec.get_feature_names_out()
    else:
        terms = tfidf_vec.get_feature_names()

    # The local result has its own name so it does not shadow this function.
    dtm = pd.DataFrame(data_tfidf.toarray(), columns=terms, index=df.index)

    return dtm

In [119]:
# Rebuild the TF-IDF matrix with the stop-word-aware helper and display every
# column except the last one.
tfidf_dtm(speech_df,'transcript').iloc[:,:-1]

Unnamed: 0,aa,aahhhh,aaron,aback,abalthus,abandon,abandonment,abate,abbot,abbreviation,...,ôi,ômay,ôsobriety,ôtell,ôthe,ôwe,ôwhat,ôyou,ôyouõre,über
0,0.0,0.0,0.0,0.0,0.0,0.021622,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
437,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
439,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### K-means (clustering)

In [17]:
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

In [18]:
# Reduce the TF-IDF matrix to 50 dimensions before clustering.
# TruncatedSVD's default solver is randomized, so fix the seed to make the
# reduced features (and everything derived from them) repeatable across runs.
svd = TruncatedSVD(n_components=50, random_state=42)

In [19]:
# Cluster into 6 groups. KMeans picks its starting centroids at random, so fix
# the seed to keep the cluster labels repeatable across runs.
kmeans = KMeans(n_clusters=6, random_state=42)

In [20]:
# Chain the two steps: SVD dimensionality reduction first, then k-means.
pipeline = make_pipeline(svd,kmeans)

In [25]:
# Fit both pipeline steps on the TF-IDF matrix.
pipeline.fit(csr_mat)

Pipeline(steps=[('truncatedsvd', TruncatedSVD(n_components=50)),
                ('kmeans', KMeans(n_clusters=6))])

In [27]:
# Cluster label for every transcript, predicted on the same matrix the
# pipeline was fitted on.
labels = pipeline.predict(csr_mat)

In [31]:
# Speaker names as a plain list, in the row order of speech_df.
speakers = speech_df['speaker'].tolist()

In [32]:
# Pair every speaker with the cluster label of their speech, then print the
# table ordered by cluster.
df = pd.DataFrame(dict(label=labels, speaker=speakers))
print(df.sort_values(by='label'))

     label           speaker
343      0    YVONNE THORTON
160      0      RONAN FARROW
161      0      SANJAY GUPTA
378      0  CHARLES W COLSON
164      0         NEIL HOWE
..     ...               ...
327      4       EARL BAKKEN
141      4    LOUIS B SUSMAN
305      4    CHRIS MATTHEWS
193      5     CYNTHIA ENLOE
106      5     HOWARD GORDON

[441 rows x 2 columns]


### NMF (DataCamp)

In [33]:
from sklearn.decomposition import NMF

In [44]:
# Factor the TF-IDF matrix into 6 non-negative components.
# random_state pins the randomized parts of the fit so re-runs give the same
# components.
model = NMF(n_components=6, random_state=42)
model.fit(csr_mat)



NMF(n_components=6)

In [52]:
# Project every transcript onto the fitted NMF components.
nmf_features = model.transform(csr_mat)
# nmf_features

# Normalize the NMF features
# (normalize is called with its defaults; the result feeds the similarity
# recommender further down.)
from sklearn.preprocessing import normalize
norm_features = normalize(nmf_features)

In [46]:
# Learned components: one row per component, one column per vocabulary term.
print(model.components_)

[[4.21556158e-04 6.19164339e-04 0.00000000e+00 ... 0.00000000e+00
  2.06031355e-04 5.05794754e-04]
 [0.00000000e+00 0.00000000e+00 8.06709914e-04 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [6.96972481e-04 1.02116101e-03 1.84263374e-04 ... 6.74867788e-04
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 8.61977402e-05
  2.23903489e-04 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 5.90565900e-04 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.16586241e-03 0.00000000e+00 1.21690093e-03 ... 6.12141228e-04
  7.25312323e-04 3.22254312e-04]]


In [47]:
# NMF feature weights per speech, with rows labelled by speaker.
# Note: this rebinds `df`, which previously held the cluster-label table.
df = pd.DataFrame(nmf_features,index=speakers)
df

Unnamed: 0,0,1,2,3,4,5
SIDDHARTHA MUKHERJEE,0.031866,0.169803,0.040845,0.075233,0.095660,0.194245
ABBY WAMBACK,0.170827,0.135767,0.025368,0.117366,0.044826,0.026893
JON B. FISHER,0.097421,0.044180,0.089495,0.008672,0.112430,0.207049
MINDY KALING,0.143041,0.077636,0.139013,0.125458,0.012049,0.055104
JESMYN WARD,0.187328,0.052544,0.000000,0.041931,0.084480,0.192366
...,...,...,...,...,...,...
CARRIE CHAPMAN,0.087265,0.244183,0.000000,0.044966,0.100617,0.050038
FRANKLIN D ROOSEVELT,0.000000,0.260075,0.007759,0.089358,0.174171,0.079200
OPRAH WINFREY,0.116104,0.010120,0.185028,0.145698,0.073371,0.050137
RALPH WALDO,0.071360,0.278450,0.001799,0.083856,0.141080,0.000000


#### Recommender: cosine similarity

In [57]:
# Recommender using cosine similarity

# Cosine-similarity recommender: score every speech against a reference
# speech by taking the dot product of their normalized NMF feature rows.
df = pd.DataFrame(data=norm_features, index=speakers)
transcript = df.iloc[0]  # reference speech: the first row
similarities = df @ transcript

In [59]:
# Find speeches most similar to that of SIDDHARTHA MUKHERJEE
# Highest similarity scores; the reference speech itself scores 1.0 and comes
# first. nlargest() without an argument returns the top five.
print(similarities.nlargest())

SIDDHARTHA MUKHERJEE    1.000000
DAVID BRODER            0.985673
JOHN LEGEND             0.977784
JIM STEEN               0.969246
JANET YELLEN            0.964553
dtype: float64


In [61]:
# The five speeches least similar to the reference speech.
print(similarities.nsmallest())

JAY LENO           0.194285
JACK BLACK         0.246147
RICHARD COSTOLO    0.254357
SANDRA BULLOCK     0.283517
MAYA RUDOLPH       0.294764
dtype: float64


### NMF (Vinny's lecture)

In [62]:
# Raw term counts instead of TF-IDF. Only scikit-learn's built-in English stop
# words are removed here; the custom `stop_words` set is not applied.
vectorizer = CountVectorizer(stop_words = 'english')
doc_word = vectorizer.fit_transform(speech_df.transcript)
# Display (number of documents, vocabulary size).
doc_word.shape

(441, 36168)

In [98]:
# Factor the count matrix into 6 topics. doc_topic holds one row per document
# and one column per topic.
# random_state pins the randomized parts of the fit so the topics, and the
# per-speech topic assignment derived from them, are repeatable across runs.
nmf_model = NMF(n_components=6, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape



(441, 6)

In [99]:
# Vocabulary of the count vectorizer, kept as a plain list so it can be
# indexed by column position.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when it exists and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
# Spot-check one vocabulary entry by its column index.
words[27030]

'say'

In [110]:
# For every topic (a row of components_), the column indices of its 10 largest
# weights, ordered from largest to smallest.
topics = np.argsort(nmf_model.components_, axis=1)[:, :-11:-1]

In [102]:
# Translate the term indices of every topic into the actual words, then show
# them as a table with one labelled row per topic.
topic_words = [[words[term_idx] for term_idx in row] for row in topics]
pd.DataFrame(topic_words, index=[f'Topic #{n:02d}' for n in range(6)])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,just,like,know,think,im,dont,youre,people,want,make
Topic #01,world,people,make,new,human,state,country,war,time,right
Topic #02,say,know,tell,years,come,people,want,think,dont,look
Topic #03,love,man,life,soul,god,men,make,like,come,live
Topic #04,life,make,time,work,years,live,know,learn,want,school
Topic #05,wonder,science,religion,question,make,cells,arts,ask,think,knowledge


In [103]:
# Assign each speech to the topic with the largest weight in its doc_topic row.
# This adds (or overwrites) a 'topic' column on speech_df in place.
speech_df['topic'] = doc_topic.argmax(axis=1)

In [106]:
# Show the first ten speeches after ordering by assigned topic.
speech_df.sort_values('topic').head(10)

Unnamed: 0,speaker,year,transcript,length,topic
440,GARRISON KEILLOR,0,its an honor to be with so many smart people a...,8063,0
117,JIMMY IOVINE,2013,to all of todays graduate i cant imagine whats...,14963,0
119,TRACY CHEVALIER,2013,greet president krislov graduate students fami...,16863,0
120,JON LOVETT,2013,i recently turn thirty which i know seem like ...,10804,0
329,JON STEWART,2004,thank you mr president i have forget how crush...,9613,0
122,SHARYN ALFONSI,2013,ole miss journalism and integrate market commu...,15057,0
324,THOMAS L FRIEDMAN,2005,it be an honor to stand before you this mornin...,22392,0
339,SETH MACFARLANE,2004,thank you very much i tell you there be nowher...,15994,0
126,DREW HOUSTON,2013,thank you chairman reed and congratulations to...,12132,0
130,NEIL DEGRASSE,2013,i thank you for this warm introduction and the...,6151,0
