In [1]:
import pandas as pd
import numpy as np

## Load speech file

In [2]:
# load data from pickle file
import pickle

pickle_in = open("speeches.pickle", "rb")
speech_df = pickle.load(pickle_in)

In [3]:
speech_df.head()

Unnamed: 0,type,speaker,date,speech
0,convention,Hillary Clinton,2016,Thank you all for the great convention that we...
1,convention,Robert Dole,1996,The folks in Hollywood would be happy to know ...
2,convention,George W. Bush,2000,"Thank you. Thank you for this honor. [,],Thank..."
3,convention,George W. Bush,2004,"When I said those words 4 years ago, none of u..."
4,convention,John McCain,2008,"Tonight, I have a privilege given few American..."


In [4]:
speech_text = speech_df['speech']

## Pre-processing

In [5]:
import nltk

In [6]:
from textblob import TextBlob
from nltk.util import ngrams

from collections import Counter
from operator import itemgetter

counter = Counter()

from nltk.corpus import stopwords
stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"']
stop = set(stop)

n = 3
for doc in speech_text:
    words = TextBlob(doc.lower()).words  # tokenize words
    words = [w for w in words if w not in stop]   
    bigrams = ngrams(words, n)
    counter += Counter(bigrams)

for phrase, count in counter.most_common(30):
    print('%20s %i' % (" ".join(phrase), count))

citizens united states 183
      ending june 30 177
government united states 175
    year ending june 168
people united states 161
    last fiscal year 145
  fiscal year ending 144
last session congress 141
 last annual message 138
president united states 132
united states america 116
 united states great 104
  part united states 100
states great britain 89
congress last session 82
    next fiscal year 76
report secretary war 76
 present fiscal year 75
report secretary treasury 69
    year ending 30th 68
 current fiscal year 67
united states government 66
report secretary navy 66
       ended june 30 65
     year ended june 64
constitution united states 62
    fiscal year 1947 62
interstate commerce commission 60
        world war ii 59
  bank united states 59


## VC and LDA

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [8]:
count_vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    token_pattern="\\b[a-z][a-z]+\\b"
)

In [9]:
count_vectorizer.fit(speech_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
counts = count_vectorizer.transform(speech_text)

In [11]:
from sklearn import decomposition

In [12]:
n_topics = 15

lda = decomposition.LatentDirichletAllocation(
    n_components=n_topics, 
    learning_method="online", 
    verbose=1, 
    max_iter=5, 
    n_jobs=-1
)

lda.fit(counts)

iteration: 1 of max_iter: 5
iteration: 2 of max_iter: 5
iteration: 3 of max_iter: 5
iteration: 4 of max_iter: 5
iteration: 5 of max_iter: 5


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=15, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=1)

In [13]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [14]:
print_top_words(lda, count_vectorizer.get_feature_names(), 15)

Topic #0: people world america government country great nation time peace new make party american states let
Topic #1: america new world people freedom time nation country great years government american let know party
Topic #2: america people trump donald new government donald trump years american world president nation country know great
Topic #3: people government america new great american country states time world years work president make shall
Topic #4: america people world country great years government nation new shall peace time make american americans
Topic #5: government states united congress people united states year country great public new time american war world
Topic #6: people world government america american country states new nation president know years great united peace
Topic #7: government people nation time country world america great states american new united shall party public
Topic #8: america world people government new country american time united years 

## TFIDF and NMF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [16]:
t_vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    token_pattern="\\b[a-z][a-z]+\\b"
)

t_vectorizer.fit(speech_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='\\b[a-z][a-z]+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
t_counts = t_vectorizer.transform(speech_text)

In [18]:
n_topics = 15

nmf = decomposition.NMF(
    n_components=n_topics,
    max_iter=5
)

nmf.fit(t_counts)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=5,
  n_components=15, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [19]:
print_top_words(nmf, t_vectorizer.get_feature_names(), 10)

Topic #0: states government united states united congress public year country subject general
Topic #1: america tonight americans people new ve jobs american work children
Topic #2: government economic federal program world congress people national year farm
Topic #3: people world government nation shall peace great freedom nations power
Topic #4: soviet programs year oil world billion energy percent united soviet union
Topic #5: applause america ve iraq tonight let people applause let congress iraqi
Topic #6: america world tonight americans united war let american president peace
Topic #7: people government president constitution union federal energy strategic congress states
Topic #8: congress vietnam tonight think years year commitments believe surtax kappel
Topic #9: interstate law business corporations men conditions work interstate commerce man industrial
Topic #10: ve party republican president democratic platform people audience ll republican party
Topic #11: iraq terrorists wo

## Max Features

In [20]:
t_vectorizer = TfidfVectorizer(
    stop_words="english",
    token_pattern="\\b[a-z][a-z]+\\b",
    max_df=0.9
)

t_vectorizer.fit(speech_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='\\b[a-z][a-z]+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [21]:
t_counts = t_vectorizer.transform(speech_text)

In [22]:
n_topics = 30

nmf = decomposition.NMF(
    n_components=n_topics,
    max_iter=5
)

nmf.fit(t_counts)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=5,
  n_components=30, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [23]:
print_top_words(nmf, t_vectorizer.get_feature_names(), 10)

Topic #0: congress year department general secretary report subject service attention law
Topic #1: america tonight americans jobs let children years world work know
Topic #2: program economic federal world congress programs year farm development legislation
Topic #3: world free freedom men peoples let know shall life america
Topic #4: spain public commerce powers object vessels effect tribes treaty force
Topic #5: business interstate law corporations work conditions public men industrial man
Topic #6: applause ve america let budget tonight congress year iraq laughter
Topic #7: constitution union shall public citizens state powers interests laws constitutional
Topic #8: party republican platform democratic nomination say convention republicans campaign president
Topic #9: ve ll don jobs oil soviet know going want didn
Topic #10: banks gold notes silver currency treasury public year circulation cent
Topic #11: mexico texas treaty congress mexican minister territory treasury act state
To

## Perp

In [24]:
from gensim import matutils

## Gensim

In [25]:
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [26]:
counts2 = count_vectorizer.transform(speech_text).transpose()

In [27]:
counts2.shape

(679282, 356)

In [28]:
corpus = matutils.Sparse2Corpus(counts2)

In [29]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [30]:
lda = models.LdaModel(corpus=corpus, num_topics=5, minimum_probability=0.03, id2word=id2word, passes=10)

2018-05-29 01:38:24,562 : INFO : using symmetric alpha at 0.2
2018-05-29 01:38:24,563 : INFO : using symmetric eta at 0.2
2018-05-29 01:38:24,652 : INFO : using serial LDA version on this node
2018-05-29 01:38:25,077 : INFO : running online (multi-pass) LDA training, 5 topics, 10 passes over the supplied corpus of 356 documents, updating model once every 356 documents, evaluating perplexity every 356 documents, iterating 50x with a convergence threshold of 0.001000
2018-05-29 01:38:37,902 : INFO : -15.085 per-word bound, 34760.1 perplexity estimate based on a held-out corpus of 356 documents with 1949992 words
2018-05-29 01:38:37,903 : INFO : PROGRESS: pass 0, at document #356/356
2018-05-29 01:38:40,674 : INFO : topic #0 (0.200): 0.003*"government" + 0.003*"people" + 0.002*"states" + 0.002*"united" + 0.002*"congress" + 0.002*"world" + 0.002*"great" + 0.001*"new" + 0.001*"war" + 0.001*"united states"
2018-05-29 01:38:40,684 : INFO : topic #1 (0.200): 0.003*"states" + 0.003*"government"

2018-05-29 01:39:54,531 : INFO : topic #2 (0.200): 0.004*"government" + 0.003*"states" + 0.002*"congress" + 0.002*"united" + 0.002*"united states" + 0.002*"year" + 0.002*"people" + 0.002*"country" + 0.002*"new" + 0.002*"great"
2018-05-29 01:39:54,545 : INFO : topic #3 (0.200): 0.004*"government" + 0.004*"states" + 0.003*"united" + 0.002*"united states" + 0.002*"congress" + 0.002*"public" + 0.002*"country" + 0.002*"year" + 0.002*"great" + 0.002*"people"
2018-05-29 01:39:54,557 : INFO : topic #4 (0.200): 0.004*"government" + 0.004*"states" + 0.003*"united" + 0.003*"united states" + 0.002*"congress" + 0.002*"people" + 0.002*"country" + 0.002*"year" + 0.002*"great" + 0.002*"public"
2018-05-29 01:39:54,562 : INFO : topic diff=0.138364, rho=0.377964
2018-05-29 01:40:06,959 : INFO : -11.696 per-word bound, 3318.3 perplexity estimate based on a held-out corpus of 356 documents with 1949992 words
2018-05-29 01:40:06,962 : INFO : PROGRESS: pass 6, at document #356/356
2018-05-29 01:40:09,131 : I

In [31]:
lda.print_topics()

2018-05-29 01:40:52,827 : INFO : topic #0 (0.200): 0.003*"people" + 0.002*"world" + 0.002*"government" + 0.002*"america" + 0.002*"great" + 0.001*"nation" + 0.001*"new" + 0.001*"peace" + 0.001*"war" + 0.001*"years"
2018-05-29 01:40:52,838 : INFO : topic #1 (0.200): 0.004*"america" + 0.003*"people" + 0.003*"new" + 0.002*"world" + 0.002*"american" + 0.002*"government" + 0.002*"years" + 0.002*"work" + 0.002*"time" + 0.002*"americans"
2018-05-29 01:40:52,849 : INFO : topic #2 (0.200): 0.004*"government" + 0.003*"states" + 0.002*"congress" + 0.002*"united" + 0.002*"united states" + 0.002*"year" + 0.002*"people" + 0.002*"country" + 0.002*"great" + 0.002*"new"
2018-05-29 01:40:52,861 : INFO : topic #3 (0.200): 0.004*"government" + 0.004*"states" + 0.003*"united" + 0.002*"united states" + 0.002*"congress" + 0.002*"public" + 0.002*"country" + 0.002*"year" + 0.002*"great" + 0.002*"people"
2018-05-29 01:40:52,872 : INFO : topic #4 (0.200): 0.004*"government" + 0.004*"states" + 0.003*"united" + 0.0

[(0,
  '0.003*"people" + 0.002*"world" + 0.002*"government" + 0.002*"america" + 0.002*"great" + 0.001*"nation" + 0.001*"new" + 0.001*"peace" + 0.001*"war" + 0.001*"years"'),
 (1,
  '0.004*"america" + 0.003*"people" + 0.003*"new" + 0.002*"world" + 0.002*"american" + 0.002*"government" + 0.002*"years" + 0.002*"work" + 0.002*"time" + 0.002*"americans"'),
 (2,
  '0.004*"government" + 0.003*"states" + 0.002*"congress" + 0.002*"united" + 0.002*"united states" + 0.002*"year" + 0.002*"people" + 0.002*"country" + 0.002*"great" + 0.002*"new"'),
 (3,
  '0.004*"government" + 0.004*"states" + 0.003*"united" + 0.002*"united states" + 0.002*"congress" + 0.002*"public" + 0.002*"country" + 0.002*"year" + 0.002*"great" + 0.002*"people"'),
 (4,
  '0.004*"government" + 0.004*"states" + 0.003*"united" + 0.003*"united states" + 0.003*"congress" + 0.002*"people" + 0.002*"country" + 0.002*"great" + 0.002*"year" + 0.002*"public"')]

In [32]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)# Trans 
lda_corpus = lda[corpus]
lda_corpus

<gensim.interfaces.TransformedCorpus at 0x7f7269a28470>

In [33]:
# Store the documents' topic vectors in a list so we can take a peak
lda_docs = [doc for doc in lda_corpus]

In [34]:
# Check out the document vectors in the topic space for the first 5 documents
lda_docs[:5]

[[(0, 0.90283674), (1, 0.09703268)],
 [(1, 0.9998166)],
 [(1, 0.9997775)],
 [(1, 0.34868836), (2, 0.65118355)],
 [(0, 0.8453238), (1, 0.15451932)]]

In [35]:
speech_text[5]

'I do so with humility, deeply moved by the trust you have placed in me. It is a great honor. It is an even greater responsibility.,Tonight I am asking you to join me to walk together to a better future. By my side, I have chosen a man with a big heart from a small town. He represents the best of America, a man who will always make us proud – my friend and America\'s next Vice President, Paul Ryan.,In the days ahead, you will get to know Paul and Janna better. But last night America got to see what I saw in Paul Ryan – a strong and caring leader who is down to earth and confident in the challenge this moment demands.,I love the way he lights up around his kids and how he\'s not embarrassed to show the world how much he loves his mom.,But Paul, I still like the playlist on my iPod better than yours.,Four years ago, I know that many Americans felt a fresh excitement about the possibilities of a new president. That president was not the choice of our party but Americans always come togeth

In [36]:
# we can quantify the 'fit' of our model, to compare with other corpora, etc.
lda.log_perplexity(corpus)

2018-05-29 01:41:06,966 : INFO : -11.683 per-word bound, 3288.6 perplexity estimate based on a held-out corpus of 356 documents with 1949992 words


-11.68323848639963

In [39]:
import pyLDAvis
import pyLDAvis.gensim

In [40]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, id2word)
vis

AttributeError: 'dict' object has no attribute 'token2id'

In [41]:
id2word

{616268: 'thank',
 273550: 'great',
 133403: 'convention',
 649930: 've',
 89935: 'chelsea',
 486999: 'proud',
 393028: 'mother',
 668172: 'woman',
 69550: 'bringing',
 373308: 'mark',
 234415: 'family',
 89259: 'charlotte',
 23487: 'aidan',
 671909: 'world',
 133965: 'conversation',
 582958: 'started',
 345071: 'law',
 354622: 'library',
 676800: 'years',
 21017: 'ago',
 265492: 'going',
 593273: 'strong',
 338404: 'know',
 344224: 'lasted',
 266148: 'good',
 622411: 'times',
 241126: 'filled',
 332437: 'joy',
 281796: 'hard',
 615729: 'tested',
 267250: 'gotten',
 669143: 'words',
 661337: 'way',
 634891: 'tuesday',
 407850: 'night',
 281448: 'happy',
 225884: 'explainer',
 90284: 'chief',
 331117: 'job',
 272886: 'grateful',
 531076: 'rest',
 255343: 'friends',
 355874: 'lifetime',
 669371: 'work',
 70766: 'brought',
 624424: 'tonight',
 332084: 'joined',
 76588: 'campaign',
 663444: 'week',
 518419: 'remarkable',
 284659: 'heard',
 370071: 'man',
 292763: 'hope',
 98926: 'clinton',