In [1]:
import pandas as pd
import numpy as np

## Load speech file

In [2]:
# load data from pickle file
import pickle

pickle_in = open("speeches.pickle", "rb")
speech_df = pickle.load(pickle_in)

In [3]:
speech_df.head()

Unnamed: 0,type,speaker,date,speech
0,convention,Hillary Clinton,2016,Thank you all for the great convention that we...
1,convention,Robert Dole,1996,The folks in Hollywood would be happy to know ...
2,convention,George W. Bush,2000,"Thank you. Thank you for this honor. [,],Thank..."
3,convention,George W. Bush,2004,"When I said those words 4 years ago, none of u..."
4,convention,John McCain,2008,"Tonight, I have a privilege given few American..."


In [4]:
speech_text = speech_df['speech']

## Pre-processing

In [5]:
import nltk

In [6]:
from textblob import TextBlob
from nltk.util import ngrams

from collections import Counter
from operator import itemgetter

counter = Counter()

from nltk.corpus import stopwords
stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"']
stop = set(stop)

n = 3
for doc in speech_text:
    words = TextBlob(doc.lower()).words  # tokenize words
    words = [w for w in words if w not in stop]   
    bigrams = ngrams(words, n)
    counter += Counter(bigrams)

for phrase, count in counter.most_common(30):
    print('%20s %i' % (" ".join(phrase), count))

citizens united states 183
      ending june 30 177
government united states 175
    year ending june 168
people united states 161
    last fiscal year 145
  fiscal year ending 144
last session congress 141
 last annual message 138
president united states 132
united states america 116
 united states great 104
  part united states 100
states great britain 89
congress last session 82
    next fiscal year 76
report secretary war 76
 present fiscal year 75
report secretary treasury 69
    year ending 30th 68
 current fiscal year 67
united states government 66
report secretary navy 66
       ended june 30 65
     year ended june 64
constitution united states 62
    fiscal year 1947 62
interstate commerce commission 60
        world war ii 59
  bank united states 59


## VC and LDA

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [8]:
count_vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    token_pattern="\\b[a-z][a-z]+\\b"
)

In [9]:
count_vectorizer.fit(speech_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
counts = count_vectorizer.transform(speech_text)

In [11]:
from sklearn import decomposition

In [12]:
n_topics = 15

lda = decomposition.LatentDirichletAllocation(
    n_components=n_topics, 
    learning_method="online", 
    verbose=1, 
    max_iter=5, 
    n_jobs=-1
)

lda.fit(counts)

iteration: 1 of max_iter: 5
iteration: 2 of max_iter: 5
iteration: 3 of max_iter: 5
iteration: 4 of max_iter: 5
iteration: 5 of max_iter: 5


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=15, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=1)

In [13]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [14]:
print_top_words(lda, count_vectorizer.get_feature_names(), 15)

Topic #0: people government great country america states american world new years nation time united national united states
Topic #1: america world people time american government country years nation new party united states make president
Topic #2: people government new great country american states world america shall party time men years power
Topic #3: america people american nation president new government country party world states audience members years opponent great
Topic #4: people america government nation great time world president years party country states know peace united
Topic #5: government states united congress people united states year country great public new time american war world
Topic #6: america government people world new great american states president nation united country years peace president obama
Topic #7: america people government new country years world nation american time states let make enriched ennobled president
Topic #8: america people world ne

## TFIDF and NMF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [16]:
t_vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    token_pattern="\\b[a-z][a-z]+\\b"
)

t_vectorizer.fit(speech_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='\\b[a-z][a-z]+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
t_counts = t_vectorizer.transform(speech_text)

In [18]:
n_topics = 15

nmf = decomposition.NMF(
    n_components=n_topics,
    max_iter=5
)

nmf.fit(t_counts)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=5,
  n_components=15, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [19]:
print_top_words(nmf, t_vectorizer.get_feature_names(), 10)

Topic #0: states government united states united congress year public general subject country
Topic #1: america tonight americans ve people new jobs american work children
Topic #2: government economic federal program world congress people national year new
Topic #3: people world nation government shall peace great freedom nations men
Topic #4: soviet world oil programs energy year billion soviet union percent nuclear
Topic #5: applause america ve iraq let tonight people applause let congress iraqi
Topic #6: world america tonight americans president let peace united war free
Topic #7: people government constitution president union federal energy congress strategic states
Topic #8: congress vietnam tonight think years year commitments believe surtax kappel
Topic #9: interstate law business men corporations conditions work interstate commerce man industrial
Topic #10: party ve republican democratic platform democratic party people president republican party audience
Topic #11: war fighti

## Max Features

In [60]:
t_vectorizer = TfidfVectorizer(
    stop_words="english",
    token_pattern="\\b[a-z][a-z]+\\b",
    max_df=0.9
)

t_vectorizer.fit(speech_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='\\b[a-z][a-z]+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [61]:
t_counts = t_vectorizer.transform(speech_text)

In [64]:
n_topics = 30

nmf = decomposition.NMF(
    n_components=n_topics,
    max_iter=5
)

nmf.fit(t_counts)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=5,
  n_components=30, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [65]:
print_top_words(nmf, t_vectorizer.get_feature_names(), 10)

Topic #0: congress year department general secretary report subject service attention law
Topic #1: america tonight americans let jobs world years help children work
Topic #2: economic program world federal congress programs year farm development security
Topic #3: world freedom free let men peoples shall man america unity
Topic #4: spain public commerce powers vessels effect colonies congress treaty duties
Topic #5: business interstate law corporations public conditions work industrial men tariff
Topic #6: applause ve america let budget tonight congress iraq year laughter
Topic #7: constitution union shall public citizens powers state laws interests rights
Topic #8: party republican platform democratic president nomination say campaign convention leadership
Topic #9: ve ll don jobs soviet oil know going want didn
Topic #10: banks gold notes silver public currency year cent treasury financial
Topic #11: mexico texas treaty congress mexican territory minister treasury act state
Topic #1