# Speech Data Processing
This notebook processes presidential speech files. This includes cleaning, creating a speech ID and exporting the dataframe to a pickle file for modeling.

In [1]:
import pandas as pd
import numpy as np

## Load scraped speech data

Read the speeches from the scraped CSV files.

In [2]:
import os

path = 'data/speeches'

# os.listdir returns every directory entry in arbitrary order, so keep only
# the CSV files and sort them: the load order (and therefore the row numbers
# of the combined frame) is then the same on every machine and every run.
speech_files = sorted(
    filename for filename in os.listdir(path) if filename.endswith('.csv')
)

In [3]:
# Show the file names collected from `path`.
speech_files

['convention.csv', 'farewell.csv', 'inauguration.csv', 'state_union.csv']

In [4]:
columns = ['type', 'speaker', 'date', 'speech']

# Read every file first and concatenate once; growing a frame with
# pd.concat inside the loop re-copies all earlier rows on each iteration.
frames = [pd.read_csv(path + '/' + file) for file in speech_files]

if frames:
    df_speeches = pd.concat(frames, ignore_index=True)
    # Keep the expected columns first, followed by any extra columns a file
    # may carry, which is the layout the incremental concat produced.
    extra_columns = [name for name in df_speeches.columns if name not in columns]
    df_speeches = df_speeches.reindex(columns=columns + extra_columns)
else:
    # No input files: fall back to an empty frame with the expected columns.
    df_speeches = pd.DataFrame(columns=columns)

In [5]:
# Preview the first rows of the combined frame.
df_speeches.head()

Unnamed: 0,type,speaker,date,speech
0,convention,Hillary Clinton,"July 28, 2016",Thank you all for the great convention that we...
1,convention,Robert Dole,"August 15, 1996",The folks in Hollywood would be happy to know ...
2,convention,George W. Bush,"August 3, 2000","Thank you. Thank you for this honor. [,],Thank..."
3,convention,George W. Bush,"September 2, 2004","When I said those words 4 years ago, none of u..."
4,convention,John McCain,"September 4, 2008","Tonight, I have a privilege given few American..."


## Clean
Check for null values, clean and organize dataframe

In [6]:
# Rows whose speech text is missing; an empty result means none are.
df_speeches[df_speeches['speech'].isna()]

Unnamed: 0,type,speaker,date,speech


In [7]:
# Number of rows with no speaker recorded.
df_speeches['speaker'].isna().sum()

0

In [8]:
# List the distinct speaker names to spot inconsistent spellings.
df_speeches['speaker'].unique()

array(['Hillary Clinton', 'Robert Dole', 'George W. Bush', 'John McCain',
       'Mitt Romney', 'Donald J. Trump', 'Woodrow Wilson', 'Al Smith',
       'Franklin D. Roosevelt', 'Harry S. Truman', 'Adlai Stevenson',
       'Abraham Lincoln', 'James A. Garfield', 'Benjamin Harrison',
       'William McKinley', 'William Howard Taft', 'Charles E. Hughes',
       'Warren G. Harding', 'Calvin Coolidge', 'Herbert Hoover',
       'Wendell Willkie', 'Thomas Dewey', 'Dwight D. Eisenhower',
       'Richard Nixon', 'Barry Goldwater', 'Gerald R. Ford',
       'Ronald Reagan', 'George Bush', 'John F. Kennedy',
       'Lyndon B. Johnson', 'Hubert H. Humphrey', 'George McGovern',
       'Jimmy Carter', 'Walter F. Mondale', 'Michael S. Dukakis',
       'William J. Clinton', 'Albert Gore, Jr.', 'John F. Kerry',
       'Barack Obama', 'Andrew Jackson', 'George Washington',
       'John Quincy Adams', 'James Monroe', 'James Madison',
       'Thomas Jefferson', 'John Adams', 'Franklin Delano Roosevelt',
  

In [9]:
# Find the rows that use the short form of the name.
df_speeches.loc[df_speeches['speaker'].eq('Donald Trump')]

Unnamed: 0,type,speaker,date,speech
143,state_union,Donald Trump,2018,"Mr. Speaker, Mr. Vice President, Members of Co..."
144,state_union,Donald Trump,2017,"Mr. Speaker, Mr. Vice President, Members of Co..."


In [10]:
# Standardize the speaker name by value rather than by hard-coded row number.
# The lookup in the previous cell shows 'Donald Trump' at rows 143 and 144,
# so assigning to rows 75 and 76 would relabel two unrelated speeches and
# leave the intended rows unchanged.
is_short_name = df_speeches['speaker'] == 'Donald Trump'
df_speeches.loc[is_short_name, 'speaker'] = 'Donald J. Trump'
# NOTE(review): the speaker list also contains both 'Franklin D. Roosevelt'
# and 'Franklin Delano Roosevelt'; consider normalizing those the same way.

In [11]:
# Look for rows whose speaker is exactly 'by ' or ' & ' instead of a name.
df_speeches[df_speeches['speaker'].isin(['by ', ' & '])]

Unnamed: 0,type,speaker,date,speech


## Reformat dates

In [12]:
# Number of rows with no date recorded.
df_speeches['date'].isna().sum()

0

In [13]:
# List the distinct date values to see which formats are present.
df_speeches['date'].unique()

array(['July 28, 2016', 'August 15, 1996', 'August 3, 2000',
       'September 2, 2004', 'September 4, 2008', 'August 30, 2012',
       'July 21, 2016', 'September 2, 1916', 'August 22, 1928',
       'July 2, 1932', 'June 27, 1936', 'July 19, 1940', 'July 20, 1944',
       'July 15, 1948', 'July 26, 1952', 'August 17, 1956',
       'June 27, 1864', 'July 12, 1880', 'September 11, 1888',
       'September 3, 1892', 'July 12, 1900', 'July 28, 1908',
       'June 10, 1916', 'June 12, 1920', 'August 14, 1924',
       'August 11, 1932', 'June 16, 1932', 'August 17, 1940',
       'June 28, 1944', 'June 24, 1948', 'July 11, 1952',
       'August 23, 1956', 'July 28, 1960', 'July 16, 1964',
       'August 8, 1968', 'August 23, 1972', 'August 19, 1976',
       'July 17, 1980', 'August 23, 1984', 'August 18, 1988',
       'August 20, 1992', 'July 15, 1960', 'August 27, 1964',
       'August 29, 1968', 'July 14, 1972', 'July 15, 1976',
       'August 14, 1980', 'July 19, 1984', 'July 21, 1988',
 

In [14]:
# Convert every date value to text so the whole column is a single type.
df_speeches['date'] = df_speeches['date'].map(str)

In [15]:
# Parse each value on its own. The column mixes full dates such as
# "July 28, 2016" with bare years such as "2018"; pandas >= 2.0 infers a
# single format from the first element when given the whole column and
# raises on values that do not match it.
df_speeches['date'] = df_speeches['date'].map(pd.to_datetime)

In [16]:
# Keep only the calendar year of each parsed date.
# This expects timestamps, so it cannot be re-run once the column holds years.
df_speeches['date'] = [stamp.year for stamp in df_speeches['date']]

In [17]:
# Count the non-null entries in each column for every year.
df_speeches.groupby('date').count()

Unnamed: 0_level_0,type,speaker,speech
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1789,1,1,1
1790,2,2,2
1791,1,1,1
1792,1,1,1
1793,2,2,2
1794,1,1,1
1795,1,1,1
1796,2,2,2
1797,2,2,2
1798,1,1,1


In [18]:
# Order the speeches chronologically. A stable sort keeps speeches from the
# same year in their load order, so rows that end up with identical labels
# always appear in the same sequence. Rebinding the name instead of sorting
# in place keeps the cell safe to re-run.
df_speeches = df_speeches.sort_values(by='date', kind='stable')

## Set index to speech tag labels

In [19]:
# Label each row "<year> <type> <speaker>" and use that label as the index.
# The labels are not unique: a speaker with several speeches of one type in
# the same year gets the same label on each of them.
year_as_text = df_speeches['date'].map(str)
tags = (
    year_as_text
    .add(' ')
    .add(df_speeches['type'])
    .add(' ')
    .add(df_speeches['speaker'])
)
df_speeches = df_speeches.set_index(tags)

In [20]:
# Preview the first rows with the new tag index.
df_speeches.head()

Unnamed: 0,type,speaker,date,speech
1789 inauguration George Washington,inauguration,George Washington,1789,Fellow-Citizens of the Senate and of the House...
1790 state_union George Washington,state_union,George Washington,1790,In meeting you again I feel much satisfaction ...
1790 state_union George Washington,state_union,George Washington,1790,I embrace with great satisfaction the opportun...
1791 state_union George Washington,state_union,George Washington,1791,I meet you upon the present occasion with the ...
1792 state_union George Washington,state_union,George Washington,1792,It is some abatement of the satisfaction with ...


## Export data as pickle file

In [21]:
# Save the processed frame to a pickle file for the modeling step.
import pickle

# The context manager closes the file even if pickling raises, so a failed
# dump cannot leave an open handle behind.
with open("data/speeches.pickle", "wb") as pickle_out:
    pickle.dump(df_speeches, pickle_out)