# Transcript Data Processing
This notebook processes presidential address transcript files. This includes filtering, cleaning and exporting the dataframe for topic modeling.

In [1]:
import pandas as pd
import re

# Process Data

## Load Data

In [2]:
# load the scraped transcripts, using the first CSV column as the row index
speech_df = pd.read_csv('data/transcripts.csv', index_col=0)
# preview the first rows
speech_df.head()

Unnamed: 0,Title,Date,President,Transcript
0,"February 5, 2019: State of the Union Address","February 05, 2019",Donald Trump,"Transcript\nMadam Speaker, Mr. Vice President,..."
1,"January 19, 2019: Remarks about the US Souther...","January 19, 2019",Donald Trump,Transcript\nTHE PRESIDENT: Just a short time a...
2,"September 25, 2018: Address at the 73rd Sessio...","September 25, 2018",Donald Trump,"Transcript\nTHE PRESIDENT: Madam President, Mr..."
3,"July 24, 2018: Speech at the Veterans of Forei...","July 24, 2018",Donald Trump,"Transcript\nTHE PRESIDENT: Thank you, Lee. Tha..."
4,"March 19, 2018: Remarks on Combating the Opioi...","March 19, 2018",Donald Trump,Transcript\nTHE PRESIDENT: Thank you to our Fi...


In [3]:
# number of scraped speech transcripts (rows loaded, before any de-duplication or filtering)
len(speech_df)

989

## Check Data
Check for missing data and duplicates

In [4]:
# report the number of missing values in the Date, President and Transcript columns
for label, column in (
    ('empty dates:', 'Date'),
    ('empty presidents:', 'President'),
    ('empty transcripts:', 'Transcript'),
):
    print(label, speech_df[column].isnull().sum())

empty dates: 0
empty presidents: 0
empty transcripts: 0


In [5]:
# check for duplicates: show every fully duplicated row
# (keep=False flags all copies of a duplicated row, not only the repeats)
speech_df[speech_df.duplicated(keep=False)]

Unnamed: 0,Title,Date,President,Transcript
623,"June 12, 1895: Declaration of US Neutrality","June 12, 1895",Grover Cleveland,Transcript\nBy the President of the United Sta...
624,"June 12, 1895: Declaration of US Neutrality","June 12, 1895",Grover Cleveland,Transcript\nBy the President of the United Sta...


In [6]:
# drop fully duplicated rows, keeping the first occurrence of each
speech_df = speech_df.drop_duplicates()

In [7]:
# check speakers: list the distinct values in the President column
speech_df['President'].unique()

array(['Donald Trump', 'Barack Obama', 'George W. Bush', 'Bill Clinton',
       'George H. W. Bush', 'Ronald Reagan', 'Chester A. Arthur',
       'Jimmy Carter', 'Gerald Ford', 'Richard M. Nixon',
       'Lyndon B. Johnson', 'John F. Kennedy', 'Dwight D. Eisenhower',
       'Harry S. Truman', 'Franklin D. Roosevelt', 'Herbert Hoover',
       'Calvin Coolidge', 'Warren G. Harding', 'Woodrow Wilson',
       'William Taft', 'Theodore Roosevelt', 'William McKinley',
       'Grover Cleveland', 'Benjamin Harrison', 'James A. Garfield',
       'Rutherford B. Hayes', 'Ulysses S. Grant', 'Andrew Johnson',
       'Abraham Lincoln', 'James Buchanan', 'Franklin Pierce',
       'Millard Fillmore', 'Zachary Taylor', 'James K. Polk',
       'John Tyler', 'William Harrison', 'John Quincy Adams',
       'Martin Van Buren', 'Andrew Jackson', 'James Monroe',
       'James Madison', 'Thomas Jefferson', 'John Adams',
       'George Washington'], dtype=object)

## Filter/Format Data
Remove undesired transcript types (debates), convert dates to datetime and sort the data by date

In [8]:
# filter out debate transcripts (titles containing the word "Debate")
# na=False treats a missing title as "not a debate" so the ~ mask stays boolean;
# .copy() gives an independent frame so later column assignments do not act
# on a slice of the unfiltered data
speech_df = speech_df[~speech_df['Title'].str.contains('Debate', na=False)].copy()

In [9]:
# parse the Date strings into pandas datetimes and store them back in place
parsed_dates = pd.to_datetime(speech_df['Date'])
speech_df['Date'] = parsed_dates

In [10]:
# order the speeches chronologically, then renumber the rows from 0 so the
# earliest speech gets index 0 (the previous index is discarded)
speech_df = speech_df.sort_values(by=['Date']).reset_index(drop=True)

In [11]:
# preview the first rows after filtering, date conversion and sorting
speech_df.head()

Unnamed: 0,Title,Date,President,Transcript
0,"April 30, 1789: First Inaugural Address",1789-04-30,George Washington,Transcript\nFellow Citizens of the Senate and ...
1,"October 3, 1789: Thanksgiving Proclamation",1789-10-03,George Washington,Transcript\nWhereas it is the duty of all Nati...
2,"January 8, 1790: First Annual Message to Congress",1790-01-08,George Washington,Transcript\nFellow Citizens of the Senate and ...
3,"December 8, 1790: Second Annual Message to Con...",1790-12-08,George Washington,Transcript\nFellow citizens of the Senate and ...
4,"December 29, 1790: Talk to the Chiefs and Coun...",1790-12-29,George Washington,Transcript\nI the President of the United Stat...


## Clean Text
Remove newline characters, the leading "Transcript" label and parenthetical notes from the transcript text

In [12]:
# function to clean transcript text
def text_cleaner(drty_txt):
    """Clean one raw transcript string.

    Steps, in order:
      1. replace newline characters with spaces,
      2. drop a leading "Transcript" header word and the whitespace after it,
      3. delete every note enclosed in parentheses.

    Parameters
    ----------
    drty_txt : str
        Raw transcript text.

    Returns
    -------
    str
        The cleaned text. Whitespace left behind by removed notes is not
        collapsed.
    """
    # remove newline characters ("\n")
    cln_txt = drty_txt.replace('\n', ' ')
    # remove the leading "Transcript" header only when it is a standalone word;
    # slicing off a fixed len('Transcript ') characters would also swallow the
    # first real character of a text such as "Transcripts of ..."
    cln_txt = re.sub(r'\ATranscript\b\s*', '', cln_txt)
    # remove notes in parentheses throughout transcript
    cln_txt = re.sub(r'\([^)]*\)', '', cln_txt)
    # return cleansed text
    return cln_txt

In [13]:
# clean every transcript by passing it through text_cleaner
speech_df['Transcript'] = speech_df['Transcript'].apply(text_cleaner)

In [14]:
# preview the first rows after the transcripts have been cleaned
speech_df.head()

Unnamed: 0,Title,Date,President,Transcript
0,"April 30, 1789: First Inaugural Address",1789-04-30,George Washington,Fellow Citizens of the Senate and the House of...
1,"October 3, 1789: Thanksgiving Proclamation",1789-10-03,George Washington,Whereas it is the duty of all Nations to ackno...
2,"January 8, 1790: First Annual Message to Congress",1790-01-08,George Washington,Fellow Citizens of the Senate and House of Rep...
3,"December 8, 1790: Second Annual Message to Con...",1790-12-08,George Washington,Fellow citizens of the Senate and House of Rep...
4,"December 29, 1790: Talk to the Chiefs and Coun...",1790-12-29,George Washington,"I the President of the United States, by my ow..."


# Address Type
Label addresses by type

In [61]:
# create column for address type, defaulting every row to 'other'
# DataFrame.insert raises ValueError when the column already exists, so guard
# the call to keep this cell re-runnable; a re-run resets all labels to 'other'
if 'Type' in speech_df.columns:
    speech_df['Type'] = 'other'
else:
    speech_df.insert(3, 'Type', 'other')

In [None]:
# convention
# inaugural address
# annual message
# state of the union
# press conference
# fireside chats - roosevelt
# other

In [104]:
# (title pattern, label) pairs, applied in order: when a title matches more
# than one pattern, the last matching rule decides its label
# NOTE: there is currently no rule for convention speeches, so they keep
# whatever label they already have
type_rules = [
    # inaugural
    ('inaugural', 'inauguration'),
    # state of the union
    ('annual message|state of the union', 'state_union'),
    # remarks
    ('remarks', 'remarks'),
    # address to nation
    ('to the nation', 'national'),
    # fireside chats
    ('fireside chat', 'fireside'),
]
for title_pattern, type_label in type_rules:
    # case-insensitive match of the pattern against each title
    title_matches = speech_df['Title'].str.contains(title_pattern, case=False)
    speech_df.loc[title_matches, 'Type'] = type_label

In [88]:
# count titles containing "national convention" (case-insensitive)
speech_df['Title'].str.contains('national convention', case=False).sum()

19

In [31]:
# count titles containing "convention" (case-insensitive)
speech_df['Title'].str.contains('convention', case=False).sum()

23

In [33]:
# count titles containing "press conference" (case-insensitive)
speech_df['Title'].str.contains('press conference', case=False).sum()

29

In [34]:
# count titles containing "fireside chat" (case-insensitive)
speech_df['Title'].str.contains('fireside chat', case=False).sum()

30

In [43]:
# count titles containing "to the nation" (case-insensitive)
speech_df['Title'].str.contains('to the nation', case=False).sum()

35

In [44]:
# count titles containing "remarks" (case-insensitive)
speech_df['Title'].str.contains('remarks', case=False).sum()

94

In [93]:
# list the speeches whose title contains "convention" (case-insensitive)
speech_df[speech_df['Title'].str.contains('convention', case=False)]

Unnamed: 0,Title,Date,President,Type,Transcript
191,"March 14, 1854: Message Regarding Proposed US-...",1854-03-14,Franklin Pierce,other,To the Senate of the United States: In transmi...
391,"November 10, 1903: Message Regarding US-Cuban ...",1903-11-10,Theodore Roosevelt,other,To the Senate and House of Representatives: It...
521,"June 27, 1936: Democratic National Convention",1936-06-27,Franklin D. Roosevelt,other,"Senator Robinson, Members of the Democratic Co..."
535,"July 19, 1940: Democratic National Convention",1940-07-19,Franklin D. Roosevelt,other,Members of the Convention-my friends: It is ve...
558,"July 20, 1944: Democratic National Convention",1944-07-20,Franklin D. Roosevelt,other,I have already indicated to you why I accept t...
571,"July 15, 1948: Democratic National Convention",1948-07-15,Harry S. Truman,other,I am sorry that the microphones are in the way...
585,"August 23, 1956: Republican National Convention",1956-08-23,Dwight D. Eisenhower,other,"Chairman Martin, Delegates and Alternates to t..."
609,"December 7, 1961: Address in Miami at the Open...",1961-12-07,John F. Kennedy,other,"Mr. Meany, Reverend Clergy, Governor Bryant, g..."
645,"August 27, 1964: Acceptance Speech at the Demo...",1964-08-27,Lyndon B. Johnson,other,"Chairman McCormack, my fellow Americans: I a..."
734,"August 19, 1976: Remarks at the Republican Nat...",1976-08-19,Ronald Reagan,other,"Mr. President, Mrs. Ford, Mr. Vice President, ..."


In [90]:
# load a separate CSV of convention speeches
# NOTE(review): the path points into an 'OLD/' directory and, in the cells
# visible here, the frame is only displayed - confirm it is still needed
conv_spch = pd.read_csv('OLD/speeches/convention.csv')

In [91]:
# display the convention speeches loaded from the separate CSV
conv_spch

Unnamed: 0,type,speaker,date,speech
0,convention,Hillary Clinton,"July 28, 2016",Thank you all for the great convention that we...
1,convention,Robert Dole,"August 15, 1996",The folks in Hollywood would be happy to know ...
2,convention,George W. Bush,"August 3, 2000","Thank you. Thank you for this honor. [,],Thank..."
3,convention,George W. Bush,"September 2, 2004","When I said those words 4 years ago, none of u..."
4,convention,John McCain,"September 4, 2008","Tonight, I have a privilege given few American..."
5,convention,Mitt Romney,"August 30, 2012","I do so with humility, deeply moved by the tru..."
6,convention,Donald J. Trump,"July 21, 2016","U.S.A.! U.S.A.! U.S.A.!,Together, we will lead..."
7,convention,Woodrow Wilson,"September 2, 1916",I cannot accept the leadership and responsibil...
8,convention,Al Smith,"August 22, 1928",Here I confirmed my faith in the principles of...
9,convention,Franklin D. Roosevelt,"July 2, 1932",I appreciate your willingness after these six ...


# Export Clean Data

In [15]:
# total number of addresses remaining after duplicate removal and debate filtering
len(speech_df)

975

In [16]:
# write the processed frame to CSV for the topic-modeling step
speech_df.to_csv('data/cln_transcripts.csv')