In [None]:
"""
Refine topic modeling

- Model: NMF (Tf-idf)
- Data: divide all transcripts into subgroups
- Goal: see the evolution of topic trends

"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from collections import Counter

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

from wordcloud import WordCloud

from gensim import matutils, models
import scipy.sparse

import re
import string

import nltk
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer



# Reload imported modules automatically before each cell is executed,
# so edits to the project's .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
import sys
# NOTE(review): absolute, machine-specific path — the notebook will not find the
# project modules on any other machine. Consider a path relative to the notebook
# (the data cells below already use '../dump/...') — confirm the folder layout first.
sys.path.append('/Users/katiehuang/Desktop/metis/projects/onl_ds5_project_4/py')
# NOTE(review): star imports hide where names come from; helpers used later in this
# notebook (e.g. wash, tfidf_dtm) presumably come from these two modules — confirm.
from word_cloud import *
from clean import *
import importlib

In [2]:
# Load the pickled inputs: the raw speech table, the 'speech_clean_lemma' table
# and the document-term matrix
speech_df_raw = pd.read_pickle('../dump/speech_df')
speech_df = pd.read_pickle('../dump/speech_clean_lemma')
data = pd.read_pickle('../dump/data_dtm_lemma.pkl')

# Transpose the document-term matrix into a term-document matrix and show its size
tdm = data.T
tdm.shape

(36156, 441)

## 1. Divide the transcripts based on time
Aim to divide them into 5 parts of roughly 90 speeches each (every range includes its first year and excludes its last).
1. Pre 2003
2. 2003-2009
3. 2009-2012
4. 2012-2015
5. 2015-

In [3]:
# Order the speeches chronologically to inspect the range of years
sorted_year = speech_df.sort_values(by='year')
sorted_year

Unnamed: 0,speaker,year,transcript,length
440,GARRISON KEILLOR,0,its an honor to be with so many smart people a...,8063
439,RALPH WALDO,1838,in this refulgent summer it have be a luxury t...,40403
438,OPRAH WINFREY,1918,thank you wallis annenberg and a special thank...,15301
437,FRANKLIN D ROOSEVELT,1932,for me as for you this be a day of honorable a...,16017
436,CARRIE CHAPMAN,1936,i bring a message to sweet briar college and e...,22942
...,...,...,...,...
4,JESMYN WARD,2018,persist be patient be well good morning it be ...,14063
3,MINDY KALING,2018,good morning to the class of the faculty the p...,15391
2,JON B. FISHER,2018,thank you very much my father commencement spe...,8544
1,ABBY WAMBACK,2018,failure be not something to be ashamed of its ...,15866


In [4]:
# The row with index label 440 has year 0 in the table above; set it to 2015
# and re-sort so the frame is chronological again.
# NOTE(review): this only changes sorted_year. The per-period cells further down
# filter speech_df_raw, which is not corrected here — verify its year for this row.
sorted_year.loc[440, 'year'] = 2015
sorted_year = sorted_year.sort_values(by='year')

In [5]:
# Split the speeches into five time slots; each slot includes its first year
# and excludes the first year of the following slot
speech_df_1 = sorted_year[sorted_year['year'].lt(2003)]
speech_df_2 = sorted_year[sorted_year['year'].ge(2003) & sorted_year['year'].lt(2009)]
speech_df_3 = sorted_year[sorted_year['year'].ge(2009) & sorted_year['year'].lt(2012)]
speech_df_4 = sorted_year[sorted_year['year'].ge(2012) & sorted_year['year'].lt(2015)]
speech_df_5 = sorted_year[sorted_year['year'].ge(2015)]

# Number of speeches that fall into each slot
tuple(
    len(slot)
    for slot in (speech_df_1, speech_df_2, speech_df_3, speech_df_4, speech_df_5)
)

(87, 90, 94, 101, 69)

## 2. Preparation for topic modeling

In [27]:
def topic_modeling(speech_df_x, column_name, num_topic, num_words=10, random_state=42):
    """
    Fit an NMF topic model on the Tf-idf matrix of one text column and
    return the top words of every topic.

    Parameters
    ----------
    speech_df_x : DataFrame
        Speeches to model.
    column_name : str
        Text column to use (e.g. 'transcript' for all text, 'nouns' for nouns only).
    num_topic : int
        Number of NMF components (topics) to extract.
    num_words : int, default 10
        Number of highest-weighted words listed per topic.
    random_state : int or None, default 42
        Seed forwarded to NMF so repeated runs give the same topics;
        pass None to get the unseeded behaviour back.

    Returns
    -------
    DataFrame
        One row per topic ('Topic #00', 'Topic #01', ...) and one column per
        rank (column 0 holds the highest-weighted word).

    Raises
    ------
    ValueError
        If num_words is smaller than 1.
    """
    if num_words < 1:
        raise ValueError('num_words must be at least 1')

    # Tf-idf document-term matrix built by the project helper.
    # Assumes tfidf_dtm returns a DataFrame whose columns are the vocabulary — confirm.
    doc_word = tfidf_dtm(speech_df_x, column_name)

    # Modeling (only the fitted components are needed, not the doc-topic matrix)
    nmf_model = NMF(n_components=num_topic, max_iter=1000, random_state=random_state)
    nmf_model.fit(doc_word)

    # Rank the vocabulary of every topic by descending weight and keep the top words
    ranked = nmf_model.components_.argsort(axis=1)[:, ::-1]
    topics = ranked[:, :num_words]

    # Create topic_word df
    words = doc_word.columns
    topic_words = [[words[index] for index in topic] for topic in topics]
    topic_labels = ['Topic #' + '{:02d}'.format(i) for i in range(num_topic)]
    topic_word_df = pd.DataFrame(topic_words, index=topic_labels)

    return topic_word_df

## 3. Topic modeling for each time frame

### A. Pre 2003

In [7]:
# Raw speeches given before 2003 (independent copy, so later edits don't touch speech_df_raw)
# NOTE(review): speech_df_raw did not receive the year correction applied to sorted_year;
# a row whose year is recorded as 0 would land in this group — verify.
first = speech_df_raw.loc[speech_df_raw['year'] < 2003].copy()

In [8]:
# Run the project's wash helper on the 'transcript' column of this period
# (presumably it also produces the 'nouns' column used below — confirm in the helper module)
speech_df_x = wash(first, 'transcript').copy()

#### All text

In [9]:
# Six topics from the full transcript text of this period
topic_modeling(speech_df_x, column_name='transcript', num_topic=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,women,talk,write,try,word,feel,hear,music,mean,children
Topic #01,football,coach,beat,war,win,british,german,study,outwork,pay
Topic #02,peace,war,human,economic,society,country,political,man,problems,nation
Topic #03,tennis,sport,dream,play,team,remember,mentor,player,percent,friendships
Topic #04,journey,company,technology,path,leadership,law,business,media,tool,engage
Topic #05,bet,wheelbarrow,jump,success,ski,game,successful,fail,money,proud


#### Nouns only

In [10]:
# Six topics from the nouns-only text of this period
topic_modeling(speech_df_x, column_name='nouns', num_topic=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,women,word,children,music,parent,book,friends,course,students,education
Topic #01,tennis,sport,team,mentor,player,percent,friendships,dream,women,numberone
Topic #02,journey,company,technology,business,path,media,law,leadership,career,gut
Topic #03,football,coach,war,bomb,ship,man,players,study,code,play
Topic #04,bet,jump,wheelbarrow,success,game,money,cameras,agency,story,crowd
Topic #05,peace,war,society,country,man,nation,freedom,nations,vision,generation


### B. 2003-2009

In [11]:
# Raw speeches with 2003 <= year < 2009 (independent copy)
second = speech_df_raw.loc[
    (speech_df_raw['year'] >= 2003) & (speech_df_raw['year'] < 2009)
].copy()

In [12]:
# Run the project's wash helper on the 'transcript' column of this period
# (presumably it also produces the 'nouns' column used below — confirm in the helper module)
speech_df_x = wash(second, 'transcript').copy()

#### All text

In [13]:
# Six topics from the full transcript text of this period
topic_modeling(speech_df_x, column_name='transcript', num_topic=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,mean,best,write,talk,feel,try,believe,ask,future,word
Topic #01,community,company,selfconfidence,business,purpose,confidence,help,grow,dream,advice
Topic #02,failure,fear,corridor,experience,nightmares,choose,enable,imagination,parent,graduation
Topic #03,art,science,botany,visual,book,shell,genus,seashells,paint,artist
Topic #04,degree,doctor,father,mother,swiss,sister,education,doctorate,saxophone,perform
Topic #05,game,percent,rule,ask,dream,country,doors,play,kid,moveable


#### Nouns only

In [14]:
# Six topics from the nouns-only text of this period
topic_modeling(speech_df_x, column_name='nouns', num_topic=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,community,education,degree,dream,mother,family,sister,opportunity,thank,doctor
Topic #01,game,doors,kid,ball,door,husband,neighborhood,coach,country,moments
Topic #02,fear,failure,idea,parent,age,mind,sense,experience,word,generation
Topic #03,company,selfconfidence,business,confidence,purpose,career,engineer,leadership,team,grow
Topic #04,art,science,book,shell,paint,botany,seashells,genus,artist,course
Topic #05,percent,rule,environment,sciences,box,fraternity,governments,challenge,fund,obesity


### C. 2009-2012

In [15]:
# Raw speeches with 2009 <= year < 2012 (independent copy)
third = speech_df_raw.loc[
    (speech_df_raw['year'] >= 2009) & (speech_df_raw['year'] < 2012)
].copy()

In [16]:
# Run the project's wash helper on the 'transcript' column of this period
# (presumably it also produces the 'nouns' column used below — confirm in the helper module)
speech_df_x = wash(third, 'transcript').copy()

#### All text

In [17]:
# Six topics from the full transcript text of this period
topic_modeling(speech_df_x, column_name='transcript', num_topic=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,mean,dream,start,hard,happen,talk,remember,ask,parent,friends
Topic #01,women,genius,necessary,conversation,lean,particular,inside,belong,balance,success
Topic #02,bird,dream,obstacles,shingle,polluters,trigonometry,compliance,thuddy,grammar,instructors
Topic #03,education,change,government,human,future,knowledge,value,public,challenge,country
Topic #04,rhythm,screen,write,wisdom,leisure,garden,sleep,key,productive,commercial
Topic #05,money,father,listen,decision,dream,absolutely,suppose,dollars,film,read


#### Nouns only

In [18]:
# Six topics from the nouns-only text of this period
topic_modeling(speech_df_x, column_name='nouns', num_topic=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,education,company,change,country,place,parent,future,question,business,government
Topic #01,women,genius,conversation,success,workforce,balance,voice,belong,worlds,girl
Topic #02,bird,dream,obstacles,polluters,thuddy,drift,compliance,grammar,instructors,bust
Topic #03,money,play,father,decision,film,dream,dollars,tv,line,eye
Topic #04,sleep,garden,leisure,screen,loan,fact,wisdom,lack,fear,success
Topic #05,policy,grid,commencement,dream,mule,blue,man,diploma,months,address


### D. 2012-2015

In [19]:
# Raw speeches with 2012 <= year < 2015 (independent copy)
fourth = speech_df_raw.loc[
    (speech_df_raw['year'] >= 2012) & (speech_df_raw['year'] < 2015)
].copy()

In [20]:
# Run the project's wash helper on the 'transcript' column of this period
# (presumably it also produces the 'nouns' column used below — confirm in the helper module)
speech_df_x = wash(fourth, 'transcript').copy()

#### All text

In [21]:
# Six topics from the full transcript text of this period
topic_modeling(speech_df_x, column_name='transcript', num_topic=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,write,fear,start,parent,worry,change,happen,remember,ask,try
Topic #01,advice,silent,hater,silence,voice,ignorance,women,hiphop,library,quiet
Topic #02,generation,challenge,global,women,young,success,country,future,place,build
Topic #03,easier,substitute,diminish,phone,communication,retreat,entangle,dimension,schlep,instantaneous
Topic #04,dream,mother,help,baseball,reach,goal,hard,kid,idiots,education
Topic #05,perceive,best,treat,change,theres,hope,reset,flight,logic,distinct


#### Nouns only

In [22]:
# Six topics from the nouns-only text of this period
topic_modeling(speech_df_x, column_name='nouns', num_topic=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,parent,moment,story,fear,money,plan,chance,person,room,year
Topic #01,advice,hater,voice,ignorance,women,silence,hiphop,book,marriage,pigeon
Topic #02,generation,women,success,country,challenge,place,history,century,education,problems
Topic #03,substitute,phone,communication,someones,dimension,distract,entangle,responses,expense,emphasis
Topic #04,dream,mother,help,baseball,goal,education,friends,teachers,idiots,road
Topic #05,team,cookie,coach,football,players,jar,baseball,thesis,integrity,character


### E. 2015-

In [23]:
# Raw speeches from 2015 onwards (independent copy)
# NOTE(review): speech_df_raw did not receive the year correction applied to sorted_year;
# the speech re-dated to 2015 there may be missing from this group — verify.
fifth = speech_df_raw.loc[speech_df_raw['year'] >= 2015].copy()

In [24]:
# Run the project's wash helper on the 'transcript' column of this period
# (presumably it also produces the 'nouns' column used below — confirm in the helper module)
speech_df_x = wash(fifth, 'transcript').copy()

#### All text

In [28]:
# Six topics from the full transcript text of this period
topic_modeling(speech_df_x, column_name='transcript', num_topic=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,advice,write,letter,play,film,speech,start,sit,try,let
Topic #01,mother,privilege,feminist,makeup,twist,sign,shape,women,feminism,gender
Topic #02,purpose,create,sense,connect,generation,build,communities,start,freedom,global
Topic #03,question,resilience,listen,ask,mean,hope,joy,help,understand,cancer
Topic #04,company,let,voice,dream,success,choose,career,share,path,define
Topic #05,team,players,win,baseball,field,game,soccer,coach,play,women


#### Nouns only

In [26]:
# Six topics from the nouns-only text of this period
topic_modeling(speech_df_x, column_name='nouns', num_topic=6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic #00,advice,speech,team,letter,play,women,year,plan,money,service
Topic #01,purpose,sense,generation,communities,freedom,project,company,society,millennials,millions
Topic #02,mother,shape,gender,women,prize,sign,feminism,makeup,privilege,earth
Topic #03,resilience,question,company,days,share,joy,children,step,family,father
Topic #04,cancer,patients,listen,patient,doctor,hospital,medicine,poker,history,leukemia
Topic #05,film,voice,movie,character,career,dream,actor,director,point,movies
