In [12]:
# import necessary python packages
import pandas as pd
import numpy as np
import nltk

# expected to return a True Value
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rileylatham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data - Riley, Blake and Matt

This section of the project notebook includes our method of collecting, cleaning, and processing our data through sentiment analyis techniques. We will begin with providing the process in which we collected presidential debate transcripts from the 2000-2016 elections. We then pulled all of the positive and negative sentiment from each candidate and used them as predictors for our classification algorithim on predicting if the candidate won the election. This data will be paired with control data such as candidate rating, political officiliation, campaign financials, etc...

## Part 0: Import Speech Data

In [None]:
# import debate transcript, positve and negative word dictionaries.txt files
# read file and split each word into its own string

debate_path = input('Copy the path to the debate you\'re interested in.')
positive_path = input('Copy the path to the positive word dictionary.')
negative_path = input('Copy the path to the negative word dictionary.')

debate = open(debate_path, encoding="utf-8").read().split()
positive = open(positive_path, encoding="ISO-8859-1").read().split()
negative = open(negative_path, encoding="ISO-8859-1").read().split()

## Part 1: Separation Of Moderator, And The Two Candidates

Part one includes a while loop that iterates backwards through the debate transcript. Once it finds a speaking prompt from either the Moderator, Candidate One, or Candidate Two, it appends the text that exists after the current prompt and before the last prompt in the transcript. It also detaches and deletes this list of strings from the transcript so it is not duplicated. After this process is done, you obtain your separated candidate and moderator lists.

In [14]:
mod_list = []
can1_list = []
can2_list = []

i_new = -1
i_old = -1
while i_new >= -len(debate):
    if debate[i_new] == 'SCHIEFFER:': # Moderator
        mod_list += debate[i_new:]
        del debate[i_new:i_old]
        i_old = i_new

    elif debate[i_new] == 'OBAMA:': # Candidate One
        can1_list += debate[i_new:]
        del debate[i_new:i_old]
        i_old = i_new

    elif debate[i_new] == 'MCCAIN:': # Candidate Two
        can2_list += debate[i_new:]
        del debate[i_new:i_old]
        i_old = i_new

    else: pass
    i_new -= 1

## Part 2: Removal Of Stop Words

In [15]:
# import "stop words" from nltk package
stop_words = nltk.corpus.stopwords.words('english')

In [16]:
# iterates through candidate lists and removes "stop words" from the list
can1_list = [x for x in can1_list if x not in stop_words]
can2_list = [x for x in can2_list if x not in stop_words]

## Part 3: Sentiment Analysis

We will now perform our sentiment analysis and separate the positive and negative connotation words from our candidates talking points during the debate. You will see below the results of each candidate.

In [17]:
# iterates through candidate lists and separates positive/negative words 
# from the list
can1_pos = [x for x in can1_list if x in positive]
can1_neg = [x for x in can1_list if x in negative]
can2_pos = [x for x in can2_list if x in positive]
can2_neg = [x for x in can2_list if x in negative]

In [18]:
print('Candidate One Total Positive:', len(can1_pos))
print('Candidate Two Total Positive:', len(can2_pos))
print('Candidate One Total Negative:', len(can1_neg))
print('Candidate Two Total Negative:', len(can2_neg))
print('Candidate One Total Sentiment:', len(can1_pos)+len(can1_neg))
print('Candidate Two Total Sentiment:', len(can2_pos)+len(can2_neg))

Candidate One Total Positive: 2558
Candidate Two Total Positive: 2648
Candidate One Total Negative: 1231
Candidate Two Total Negative: 1263
Candidate One Total Sentiment: 3789
Candidate Two Total Sentiment: 3911


## Part 4: Election Data

The following data contains information on the presidential candidates from 2000 to 2016. This includes, their names, party, election result, campaign spending, favorability rating (based off of the popular vote), and the voting-eligible population turnout for each election. 

In [19]:
def create_data():
    election_data = pd.read_csv('election_data.csv') #Reading in the data
    for row in range(10,19):   #Dropping some extra rows
        election_data.drop(index = [row], inplace = True)
    election_data['Year'] = election_data['Year'].astype(int) #Making the year look nicer
    for i in range(len(election_data)):      #Changing the parties from the first letter to the full name
        if election_data.iloc[i,2] == 'R':
            election_data.iloc[i,2] = 'Republican'
        else:
            election_data.iloc[i,2] = 'Democrat'
    return election_data

In [20]:
create_data()

Unnamed: 0,Year,Candidate,Political Party,Election Result,Candidate Spending ($1mil),Final Favorability Rating (%),Voting-Eligible Population Turnout (%)
0,2000,Albert Gore,Democrat,L,120.3,48.4,54.2
1,2000,George W. Bush,Republican,W,186.5,47.9,54.2
2,2004,John Kerry,Democrat,L,332.7,48.3,60.1
3,2004,George W. Bush,Republican,W,355.0,50.7,60.1
4,2008,John McCain,Republican,L,239.7,45.6,61.6
5,2008,Barack Obama,Democrat,W,760.4,52.9,61.6
6,2012,Mitt Romney,Republican,L,458.7,47.2,58.6
7,2012,Barack Obama,Democrat,W,737.1,51.1,58.6
8,2016,Hillary Clinton,Democrat,L,450.6,48.2,60.1
9,2016,Donald Trump,Republican,W,239.0,46.1,60.1
