In [40]:
import pandas as pd
import numpy as np

## Read in speech

Read speech from file

In [4]:
import os

path = '/Users/kevcon/ds/metis/metisgh/Project-4/speeches'
speech_files = []

for filename in os.listdir(path):
    speech_files.append(filename)

In [5]:
speech_files

['convention.csv', 'inauguration.csv', 'farewell.csv', 'state_union.csv']

In [6]:
columns = ['type', 'speaker', 'date', 'speech']
df_speeches = pd.DataFrame(columns=columns)

for file in speech_files:
    df_temp = pd.read_csv(path + '/' + file)
    df_speeches = pd.concat([df_speeches, df_temp], ignore_index=True)

In [7]:
df_speeches.tail()

Unnamed: 0,type,speaker,date,speech
351,state_union,George Washington,1794,When we call to mind the gracious indulgence o...
352,state_union,George Washington,1793,Since the commencement of the term for which I...
353,state_union,George Washington,1792,It is some abatement of the satisfaction with ...
354,state_union,George Washington,1791,I meet you upon the present occasion with the ...
355,state_union,George Washington,1790,In meeting you again I feel much satisfaction ...


In [8]:
df_speeches.loc[df_speeches['speech'].isnull()]

Unnamed: 0,type,speaker,date,speech
104,inauguration,Abraham Lincoln,1865,
105,inauguration,Abraham Lincoln,1861,
127,state_union,George Washington,1790,


In [9]:
df_speeches['speaker'].isnull().sum()

0

In [10]:
df_speeches['speaker'].unique()

array(['Hillary Clinton', 'Robert Dole', 'George W. Bush', 'John McCain',
       'Mitt Romney', 'Donald J. Trump', 'Woodrow Wilson', 'Al Smith',
       'Franklin D. Roosevelt', 'Harry S. Truman', 'Adlai Stevenson',
       'Abraham Lincoln', 'James A. Garfield', 'Benjamin Harrison',
       'William McKinley', 'William Howard Taft', 'Charles E. Hughes',
       'Warren G. Harding', 'Calvin Coolidge', 'Herbert Hoover',
       'Wendell Willkie', 'Thomas Dewey', 'Dwight D. Eisenhower',
       'Richard Nixon', 'Barry Goldwater', 'Gerald R. Ford',
       'Ronald Reagan', 'George Bush', 'John F. Kennedy',
       'Lyndon B. Johnson', 'Hubert H. Humphrey', 'George McGovern',
       'Jimmy Carter', 'Walter F. Mondale', 'Michael S. Dukakis',
       'William J. Clinton', 'Albert Gore, Jr.', 'John F. Kerry',
       'Barack Obama', 'John Quincy Adams', 'James Monroe',
       'James Madison', 'Thomas Jefferson', 'by ', 'George Washington',
       'Franklin Delano Roosevelt', ' & ', 'Bill Clinton',
    

In [11]:
df_speeches.loc[(df_speeches['speaker'] == 'by ') | (df_speeches['speaker'] == ' & ')]

Unnamed: 0,type,speaker,date,speech
62,inauguration,by,1797,"\nWhen it was first perceived, in early times,..."
71,inauguration,&,2017,",, ,, ,, ,, ,, fellow Americans, and people o..."
345,state_union,by,1800,Gentlemen of the Senate and Gentlemen of the H...
346,state_union,by,1799,Gentlemen of the Senate and Gentlemen of the H...
347,state_union,by,1798,Gentlemen of the Senate and Gentlemen of the H...
348,state_union,by,1797,I was for some time apprehensive that it would...


In [12]:
df_speeches['date'].isnull().sum()

0

In [13]:
df_speeches['date'].unique()

array(['July 28, 2016', 'August 15, 1996', 'August 3, 2000',
       'September 2, 2004', 'September 4, 2008', 'August 30, 2012',
       'July 21, 2016', 'September 2, 1916', 'August 22, 1928',
       'July 2, 1932', 'June 27, 1936', 'July 19, 1940', 'July 20, 1944',
       'July 15, 1948', 'July 26, 1952', 'August 17, 1956',
       'June 27, 1864', 'July 12, 1880', 'September 11, 1888',
       'September 3, 1892', 'July 12, 1900', 'July 28, 1908',
       'June 10, 1916', 'June 12, 1920', 'August 14, 1924',
       'August 11, 1932', 'June 16, 1932', 'August 17, 1940',
       'June 28, 1944', 'June 24, 1948', 'July 11, 1952',
       'August 23, 1956', 'July 28, 1960', 'July 16, 1964',
       'August 8, 1968', 'August 23, 1972', 'August 19, 1976',
       'July 17, 1980', 'August 23, 1984', 'August 18, 1988',
       'August 20, 1992', 'July 15, 1960', 'August 27, 1964',
       'August 29, 1968', 'July 14, 1972', 'July 15, 1976',
       'August 14, 1980', 'July 19, 1984', 'July 21, 1988',
 

In [14]:
df_speeches['date'] = pd.to_datetime(df_speeches['date'])

In [15]:
df_speeches['date'] = df_speeches['date'].apply(lambda x: x.year)

## Vectorize

In [23]:
train_data = df_speeches['speech'].dropna()

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=1,
)

In [25]:
count_vectorizer.fit(train_data)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [26]:
count_vectorizer.get_feature_names()

['00',
 '00 000',
 '00 morning',
 '00 new',
 '00 october',
 '000',
 '000 000',
 '000 0000',
 '000 000m',
 '000 034',
 '000 0w',
 '000 10',
 '000 100',
 '000 103',
 '000 111',
 '000 12',
 '000 13',
 '000 135',
 '000 14',
 '000 15',
 '000 150',
 '000 153',
 '000 17',
 '000 1812',
 '000 182',
 '000 1820',
 '000 1827',
 '000 1836',
 '000 1840',
 '000 1849',
 '000 1850',
 '000 1851',
 '000 1858',
 '000 1860',
 '000 1861',
 '000 1869',
 '000 1870',
 '000 1876',
 '000 1884',
 '000 1885',
 '000 1888',
 '000 1890',
 '000 1891',
 '000 1894',
 '000 1895',
 '000 1896',
 '000 19',
 '000 1900',
 '000 1902',
 '000 1913',
 '000 1914',
 '000 1915',
 '000 1921',
 '000 1924',
 '000 1925',
 '000 1927',
 '000 1928',
 '000 1929',
 '000 1930',
 '000 1931',
 '000 1932',
 '000 1945',
 '000 198',
 '000 1981',
 '000 1st',
 '000 200',
 '000 22',
 '000 227',
 '000 230',
 '000 250',
 '000 289',
 '000 30',
 '000 30th',
 '000 31',
 '000 36',
 '000 37',
 '000 40',
 '000 400',
 '000 403',
 '000 42',
 '000 50',
 '000 51

In [35]:
counts = count_vectorizer.transform(train_data)

In [31]:
counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [32]:
pd.DataFrame(counts.toarray(), columns=count_vectorizer.get_feature_names())

Unnamed: 0,00,00 000,00 morning,00 new,00 october,000,000 000,000 0000,000 000m,000 034,...,zuloaga earnestly,zuloaga recognized,zuloaga restored,zuloaga usurper,zyje,zyje america,ôtil,ôtil 1st,ѕў,ѕў decrease
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,12,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,8,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
n_topics = 30

In [36]:
from sklearn import decomposition

lda = decomposition.LatentDirichletAllocation(
    n_components=n_topics, 
    learning_method="online", 
    verbose=1, 
    max_iter=5, 
    n_jobs=-1
)

lda.fit(counts)

iteration: 1 of max_iter: 5
iteration: 2 of max_iter: 5
iteration: 3 of max_iter: 5
iteration: 4 of max_iter: 5
iteration: 5 of max_iter: 5


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=30, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=1)

In [37]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [39]:
print_top_words(lda, count_vectorizer.get_feature_names(), 15)

Topic #0: america people american new world government nation country years know time great states united president
Topic #1: people america world new american government nation country time power states make years president great
Topic #2: people world country america new government american party states years president peace make know nation
Topic #3: people government states country america world new great shall time peace nation united war years
Topic #4: states government united united states congress people country year great public time war 000 american new
Topic #5: world people america new years nation government american year peace congress americans time make economic
Topic #6: government states united great people world country american new years united states congress nation time peace
Topic #7: government people country time american america states new nation world great united shall make years
Topic #8: new people america government years world great time nation work sta