In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
from datetime import datetime
from scipy.stats import t 

In [6]:
electoralVotes = pd.read_csv("C:/Users/Lenovo/Documents/Election_Forecasting/election/electoralVotes.txt", sep=",", header=None)
electoralVotes.columns = ['state','votes']
electoralVotes.head()

Unnamed: 0,state,votes
0,Alabama,9
1,Alaska,3
2,Arizona,11
3,Arkansas,6
4,California,55


## Setup

In [7]:
fte = pd.read_csv('president_polls_historical.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
grade = ['A+', 'A', 'A-', 'A/B',
         'B+', 'B', 'B-', 'B/C',
         'C+', 'C', 'C-', 'C/D',
         'D+', 'D', np.nan]
weight = [4.3, 4.0, 3.7, 3.5,
          3.3, 3.0, 2.7, 2.5,
          2.3, 2.0, 1.7, 1.5,
          1.3, 1.0, 0]

grade_weights = dict(zip(grade,weight))

## Defining functions

### Step 1: Collect Polls

In [9]:
def pollsterInfo(polls, grades):
    # get number of polls by each pollster as well as each pollster's grade
    pollster_info = polls.groupby(['pollster']).agg({'poll_id':lambda x: x.nunique(),
                                                     'fte_grade': 'max'}).reset_index()
    pollster_info.columns = ['pollster','n_polls','fte_grade']
    
    # each poll is weighted by 1/sqrt(n) where n is the number of polls conducted by that pollster (https://fivethirtyeight.com/features/the-death-of-polling-is-greatly-exaggerated/)
    pollster_info['pollster_n_weight'] = 1/np.sqrt(pollster_info.n_polls)
    
    # each poll is weighted according to FTE's rating of the pollster
    pollster_info['pollster_rating_weight'] = [grades[grade] for grade in pollster_info.fte_grade]
    
    return pollster_info

In [10]:
def preProcessPolls(polls):
    processed_polls = polls.copy()
    
    # change dates from string to datetime
    processed_polls.loc[:,'start_date'] = pd.to_datetime(polls['start_date'])
    processed_polls.loc[:,'end_date'] = pd.to_datetime(polls['end_date'])
    
    # limit columns
    keep_cols = ['poll_id','pollster_id','pollster','sponsors','display_name','fte_grade','methodology','state','start_date','end_date','question_id','sample_size','population','internal','partisan','party','answer','candidate_id','candidate_name','pct']
    processed_polls = polls.loc[:,keep_cols]

    # create a ranked order from the population variable: likely voters (lv) > registered voters (rv) > voters (v) > adults (a)
    pop_cats = CategoricalDtype(categories = ['a','v','rv','lv'], ordered = True)
    processed_polls['population'] = processed_polls['population'].astype(pop_cats)

    # remove polls from F-rated pollsters
    processed_polls = processed_polls[processed_polls.fte_grade != 'F']
    
    # some polls conducted by multiple pollsters are unrated
    # if any pollster involved has a solo grade, we assign that grade to the collaboration
    # if multiple pollsters involved have solo grades, we take the average of their grades and assign it to their collaboration
    processed_polls.loc[processed_polls.pollster == 'Reconnect Research/Roanoke College','fte_grade'] = polls.loc[polls.pollster =='Roanoke College','fte_grade'].iloc[0]
    processed_polls.loc[processed_polls.pollster == 'Benenson Strategy Group/GS Strategy Group','fte_grade'] = 'B/C' # comes from https://github.com/fivethirtyeight/data/blob/master/pollster-ratings/2020/pollster-ratings.csv
    processed_polls.loc[processed_polls.pollster == 'YouGov Blue/Data for Progress','fte_grade'] = polls.loc[polls.pollster =='Data for Progress','fte_grade'].iloc[0]
    processed_polls.loc[processed_polls.pollster == 'Global Strategy Group/Data for Progress','fte_grade'] = 'B-' # average of B and B/C
    processed_polls.loc[processed_polls.pollster == 'Montana State University Bozeman/University of Denver','fte_grade'] = 'B/C' # comes from https://github.com/fivethirtyeight/data/blob/master/pollster-ratings/2020/pollster-ratings.csv
    
    # remove polls whose pollsters are unrated
    processed_polls = processed_polls.dropna(subset=['fte_grade'])
    
    # get pollster ratings and number of polls conducted
    pollster_info = pollsterInfo(processed_polls, grade_weights)
    
    return processed_polls, pollster_info

In [11]:
processed_polls, pollster_info = preProcessPolls(fte)
processed_polls.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsors,display_name,fte_grade,methodology,state,start_date,end_date,question_id,sample_size,population,internal,partisan,party,answer,candidate_id,candidate_name,pct
0,73830,940,Lake Research,National Women's Law Center,Lake Research Partners,A/B,Live Phone,,10/31/20,11/3/20,138491,2400.0,lv,False,DEM,DEM,Biden,13256,Joe Biden,51.0
1,73830,940,Lake Research,National Women's Law Center,Lake Research Partners,A/B,Live Phone,,10/31/20,11/3/20,138491,2400.0,lv,False,DEM,REP,Trump,13254,Donald Trump,48.0
4,72621,383,PPP,,Public Policy Polling,A-,IVR/Text,Iowa,11/1/20,11/2/20,136283,871.0,v,False,,DEM,Biden,13256,Joe Biden,49.0
5,72621,383,PPP,,Public Policy Polling,A-,IVR/Text,Iowa,11/1/20,11/2/20,136283,871.0,v,False,,REP,Trump,13254,Donald Trump,48.0
6,72647,461,Susquehanna,,Susquehanna Polling & Research,B+,Live Phone,Pennsylvania,11/1/20,11/2/20,136322,499.0,lv,False,,DEM,Biden,13256,Joe Biden,48.4


In [12]:
# a single poll might include different versions for likely voters, registered voters, etc.
# we want to pick the best version of each poll
# the order of preference: adults (a) < voters (v) < registered voters (rv) < likely voters (lv)
def filterPollVersions(polls):
    # get preferred version for each poll
    poll_pops = polls.groupby('poll_id',sort=False).population.max()
    poll_pops_df = pd.DataFrame({'poll_id':polls.poll_id.unique(),'population_to_use':poll_pops}).reset_index(drop=True)
    
    # only keep desired version for each poll
    desired_versions = polls.merge(poll_pops_df,on='poll_id')
    idx = desired_versions.population == desired_versions.population_to_use
    polls_filtered = polls[idx.values]
    
    return polls_filtered

In [13]:
fte_best_polls = filterPollVersions(processed_polls)
fte_best_polls

Unnamed: 0,poll_id,pollster_id,pollster,sponsors,display_name,fte_grade,methodology,state,start_date,end_date,question_id,sample_size,population,internal,partisan,party,answer,candidate_id,candidate_name,pct
0,73830,940,Lake Research,National Women's Law Center,Lake Research Partners,A/B,Live Phone,,10/31/20,11/3/20,138491,2400.0,lv,False,DEM,DEM,Biden,13256,Joe Biden,51.0
1,73830,940,Lake Research,National Women's Law Center,Lake Research Partners,A/B,Live Phone,,10/31/20,11/3/20,138491,2400.0,lv,False,DEM,REP,Trump,13254,Donald Trump,48.0
4,72621,383,PPP,,Public Policy Polling,A-,IVR/Text,Iowa,11/1/20,11/2/20,136283,871.0,v,False,,DEM,Biden,13256,Joe Biden,49.0
5,72621,383,PPP,,Public Policy Polling,A-,IVR/Text,Iowa,11/1/20,11/2/20,136283,871.0,v,False,,REP,Trump,13254,Donald Trump,48.0
6,72647,461,Susquehanna,,Susquehanna Polling & Research,B+,Live Phone,Pennsylvania,11/1/20,11/2/20,136322,499.0,lv,False,,DEM,Biden,13256,Joe Biden,48.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16700,57026,383,PPP,DEM (partisan),Public Policy Polling,A-,IVR,Ohio,11/27/18,11/28/18,92081,648.0,v,False,DEM,REP,Trump,13254,Donald Trump,46.0
16701,57026,383,PPP,DEM (partisan),Public Policy Polling,A-,IVR,Ohio,11/27/18,11/28/18,92082,648.0,v,False,DEM,DEM,Warren,13258,Elizabeth Warren,43.0
16702,57026,383,PPP,DEM (partisan),Public Policy Polling,A-,IVR,Ohio,11/27/18,11/28/18,92082,648.0,v,False,DEM,REP,Trump,13254,Donald Trump,49.0
16703,57025,399,Rasmussen (Pulse Opinion Research),,Rasmussen Reports/Pulse Opinion Research,B,IVR/Online,,11/12/18,11/13/18,92078,1000.0,lv,False,,DEM,Obama,13253,Michelle Obama,50.0


In [237]:
# if a poll included multiple questions featuring different matchups, we choose the one(s) with the most candidates

# for each poll, count the number of matchups included
multi_matchup_polls_cnt = fte_best_polls.groupby('poll_id').question_id.nunique().sort_values(ascending=False).reset_index()
# identify multi-matchup polls
multi_matchup_polls_cnt = multi_matchup_polls_cnt.loc[multi_matchup_polls_cnt.question_id > 1,]
multi_matchup_polls = fte_best_polls.loc[fte_best_polls.poll_id.isin(multi_matchup_polls_cnt.poll_id),["poll_id","pollster","population","question_id","end_date","party","answer","pct"]]
multi_matchup_polls.head(20)

Unnamed: 0,poll_id,pollster,population,question_id,end_date,party,answer,pct
58,72802,Change Research,lv,136601,11/2/20,DEM,Biden,47.0
59,72802,Change Research,lv,136601,11/2/20,REP,Trump,46.0
60,72802,Change Research,lv,136601,11/2/20,LIB,Jorgensen,4.0
61,72802,Change Research,lv,136601,11/2/20,GRE,Hawkins,1.0
62,72802,Change Research,lv,136605,11/2/20,DEM,Biden,50.0
63,72802,Change Research,lv,136605,11/2/20,REP,Trump,50.0
68,72863,TIPP,lv,136725,11/2/20,DEM,Biden,50.2
69,72863,TIPP,lv,136725,11/2/20,REP,Trump,46.2
70,72863,TIPP,lv,136725,11/2/20,LIB,Jorgensen,2.0
71,72863,TIPP,lv,136725,11/2/20,GRE,Hawkins,0.8


In [240]:
# for each poll question/matchup, count how many candidates were included
n_cands_by_poll_question = multi_matchup_polls.groupby(['poll_id','question_id']).answer.nunique().reset_index().sort_values(['poll_id','question_id'])
n_cands_by_poll_question.rename(columns={'answer':'n_candidates'},inplace=True)
n_cands_by_poll_question

Unnamed: 0,poll_id,question_id,n_candidates
0,57026,92079,2
1,57026,92080,2
2,57026,92081,2
3,57026,92082,2
4,57170,92351,2
...,...,...,...
2162,72802,136605,2
2163,72808,136723,2
2164,72808,136724,2
2165,72863,136725,4


In [244]:
# for each poll, identify the number of candidates featured in the largest matchup
max_n_cands_by_poll = n_cands_by_poll_question.groupby('poll_id').n_candidates.max().reset_index()

# get the IDs of the questions that correspond to the largest matchups in each poll
id_of_biggest_matchups =  max_n_cands_by_poll.merge(n_cands_by_poll_question, how = 'left', on = ['poll_id','n_candidates']).question_id

# only keep the version of each poll that has the most candidates included
fte_best_polls.loc[fte_best_polls['question_id'].isin(id_of_biggest_matchups),]

Unnamed: 0,poll_id,pollster_id,pollster,sponsors,display_name,fte_grade,methodology,state,start_date,end_date,question_id,sample_size,population,internal,partisan,party,answer,candidate_id,candidate_name,pct
58,72802,1365,Change Research,,Change Research,B-,Online/Text,Maine CD-2,10/29/20,11/2/20,136601,475.0,lv,False,,DEM,Biden,13256,Joe Biden,47.0
59,72802,1365,Change Research,,Change Research,B-,Online/Text,Maine CD-2,10/29/20,11/2/20,136601,475.0,lv,False,,REP,Trump,13254,Donald Trump,46.0
60,72802,1365,Change Research,,Change Research,B-,Online/Text,Maine CD-2,10/29/20,11/2/20,136601,475.0,lv,False,,LIB,Jorgensen,14611,Jo Jorgensen,4.0
61,72802,1365,Change Research,,Change Research,B-,Online/Text,Maine CD-2,10/29/20,11/2/20,136601,475.0,lv,False,,GRE,Hawkins,14612,Howie Hawkins,1.0
68,72863,770,TIPP,Investor's Business Daily,TIPP Insights,A+,Live Phone/Online,,10/29/20,11/2/20,136725,1212.0,lv,False,,DEM,Biden,13256,Joe Biden,50.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16698,57026,383,PPP,DEM (partisan),Public Policy Polling,A-,IVR,Ohio,11/27/18,11/28/18,92080,648.0,v,False,DEM,REP,Trump,13254,Donald Trump,44.0
16699,57026,383,PPP,DEM (partisan),Public Policy Polling,A-,IVR,Ohio,11/27/18,11/28/18,92081,648.0,v,False,DEM,DEM,Sanders,13257,Bernard Sanders,47.0
16700,57026,383,PPP,DEM (partisan),Public Policy Polling,A-,IVR,Ohio,11/27/18,11/28/18,92081,648.0,v,False,DEM,REP,Trump,13254,Donald Trump,46.0
16701,57026,383,PPP,DEM (partisan),Public Policy Polling,A-,IVR,Ohio,11/27/18,11/28/18,92082,648.0,v,False,DEM,DEM,Warren,13258,Elizabeth Warren,43.0


In [11]:
def getSampleSizeWeight(sample_size): #based on https://fivethirtyeight.com/features/polls-now-weighted-by-sample-size/
    weight = np.sqrt(sample_size/600)
    return weight

In [12]:
def recencyWeight(poll_date, current_date):
    poll_date_fmt = datetime.strptime(poll_date, '%m/%d/%y')
    current_date_fmt = datetime.strptime(current_date, '%m/%d/%y')
    delta = current_date_fmt - poll_date_fmt
    days_since_poll = delta.days
    weeks_since_poll = delta.days//7
    weight = 0.95**weeks_since_poll # y=0.95^x; adapted from https://fivethirtyeight.com/features/the-death-of-polling-is-greatly-exaggerated/
    return weight

In [177]:
recencyWeight('01/21/19', '11/03/20')

0.00847803669294384

In [28]:
sample_size_weights = [getSampleSizeWeight(sample) for sample in fte_best_polls.sample_size]
pollster_n_weights = [pollster_info.loc[pollster_info['pollster'] == pollster, 'pollster_n_weight'].iloc[0] for pollster in fte_best_polls.pollster]
pollster_rating_weight = [pollster_info.loc[pollster_info['pollster'] == pollster, 'pollster_rating_weight'].iloc[0] for pollster in fte_best_polls.pollster]
recency_weight = [recencyWeight(date, datetime.strptime('2020-11-03','%Y-%m-%d')) for date in fte_best_polls.end_date]

fte_best_polls.loc[:,'sample_size_weight'] = sample_size_weights
fte_best_polls.loc[:,'pollster_n_weight'] = pollster_n_weights
fte_best_polls.loc[:,'pollster_rating_weight'] = pollster_rating_weight
fte_best_polls.loc[:,'recency_weight'] = recency_weight
fte_best_polls.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsors,display_name,fte_grade,methodology,state,start_date,end_date,...,partisan,party,answer,candidate_id,candidate_name,pct,sample_size_weight,pollster_n_weight,pollster_rating_weight,recency_weight
0,73830,940,Lake Research,National Women's Law Center,Lake Research Partners,A/B,Live Phone,,2020-10-31,2020-11-03,...,DEM,DEM,Biden,13256,Joe Biden,51.0,2.0,1.0,3.5,1.0
1,73830,940,Lake Research,National Women's Law Center,Lake Research Partners,A/B,Live Phone,,2020-10-31,2020-11-03,...,DEM,REP,Trump,13254,Donald Trump,48.0,2.0,1.0,3.5,1.0
4,72621,383,PPP,,Public Policy Polling,A-,IVR/Text,Iowa,2020-11-01,2020-11-02,...,,DEM,Biden,13256,Joe Biden,49.0,1.204851,0.103142,3.7,1.0
5,72621,383,PPP,,Public Policy Polling,A-,IVR/Text,Iowa,2020-11-01,2020-11-02,...,,REP,Trump,13254,Donald Trump,48.0,1.204851,0.103142,3.7,1.0
6,72647,461,Susquehanna,,Susquehanna Polling & Research,B+,Live Phone,Pennsylvania,2020-11-01,2020-11-02,...,,DEM,Biden,13256,Joe Biden,48.4,0.911958,0.27735,3.3,1.0


In [32]:
def getWeights(polls, pollster_info):
    polls_with_weights = polls.copy()
    
    sample_size_weights = [getSampleSizeWeight(sample) for sample in polls.sample_size]
    pollster_n_weights = [pollster_info.loc[pollster_info['pollster'] == pollster, 'pollster_n_weight'].iloc[0] for pollster in polls.pollster]
    pollster_rating_weight = [pollster_info.loc[pollster_info['pollster'] == pollster, 'pollster_rating_weight'].iloc[0] for pollster in polls.pollster]
    recency_weight = [recencyWeight(date, '11/03/20') for date in polls.end_date] # change date to currentDate argument

    polls_with_weights.loc[:,'sample_size_weight'] = sample_size_weights
    polls_with_weights.loc[:,'pollster_n_weight'] = pollster_n_weights
    polls_with_weights.loc[:,'pollster_rating_weight'] = pollster_rating_weight
    polls_with_weights.loc[:,'recency_weight'] = recency_weight
    
    return polls_with_weights

In [35]:
polls_with_weights = getWeights(fte_best_polls, pollster_info)
polls_with_weights.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsors,display_name,fte_grade,methodology,state,start_date,end_date,...,partisan,party,answer,candidate_id,candidate_name,pct,sample_size_weight,pollster_n_weight,pollster_rating_weight,recency_weight
0,73830,940,Lake Research,National Women's Law Center,Lake Research Partners,A/B,Live Phone,,10/31/20,11/3/20,...,DEM,DEM,Biden,13256,Joe Biden,51.0,2.0,1.0,3.5,1.0
1,73830,940,Lake Research,National Women's Law Center,Lake Research Partners,A/B,Live Phone,,10/31/20,11/3/20,...,DEM,REP,Trump,13254,Donald Trump,48.0,2.0,1.0,3.5,1.0
4,72621,383,PPP,,Public Policy Polling,A-,IVR/Text,Iowa,11/1/20,11/2/20,...,,DEM,Biden,13256,Joe Biden,49.0,1.204851,0.103142,3.7,1.0
5,72621,383,PPP,,Public Policy Polling,A-,IVR/Text,Iowa,11/1/20,11/2/20,...,,REP,Trump,13254,Donald Trump,48.0,1.204851,0.103142,3.7,1.0
6,72647,461,Susquehanna,,Susquehanna Polling & Research,B+,Live Phone,Pennsylvania,11/1/20,11/2/20,...,,DEM,Biden,13256,Joe Biden,48.4,0.911958,0.27735,3.3,1.0


In [37]:
def calculateWeightedAverage(polls_with_weights):
    polls_with_weighted_avgs = polls_with_weights.copy()
    weighted_avg = polls_with_weighted_avgs.loc[:,'pct':'recency_weight'].product(axis=1)
    polls_with_weighted_avgs.loc[:,'weighted_pct'] = weighted_avg
    return polls_with_weighted_avgs

In [39]:
polls_with_weighted_avgs = calculateWeightedAverage(polls_with_weights)
polls_with_weighted_avgs.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsors,display_name,fte_grade,methodology,state,start_date,end_date,...,party,answer,candidate_id,candidate_name,pct,sample_size_weight,pollster_n_weight,pollster_rating_weight,recency_weight,weighted_pct
0,73830,940,Lake Research,National Women's Law Center,Lake Research Partners,A/B,Live Phone,,10/31/20,11/3/20,...,DEM,Biden,13256,Joe Biden,51.0,2.0,1.0,3.5,1.0,357.0
1,73830,940,Lake Research,National Women's Law Center,Lake Research Partners,A/B,Live Phone,,10/31/20,11/3/20,...,REP,Trump,13254,Donald Trump,48.0,2.0,1.0,3.5,1.0,336.0
4,72621,383,PPP,,Public Policy Polling,A-,IVR/Text,Iowa,11/1/20,11/2/20,...,DEM,Biden,13256,Joe Biden,49.0,1.204851,0.103142,3.7,1.0,22.530318
5,72621,383,PPP,,Public Policy Polling,A-,IVR/Text,Iowa,11/1/20,11/2/20,...,REP,Trump,13254,Donald Trump,48.0,1.204851,0.103142,3.7,1.0,22.070516
6,72647,461,Susquehanna,,Susquehanna Polling & Research,B+,Live Phone,Pennsylvania,11/1/20,11/2/20,...,DEM,Biden,13256,Joe Biden,48.4,0.911958,0.27735,3.3,1.0,40.398224


In [31]:
# deduped_polls = fte_best_polls.groupby('poll_id').first().reset_index()
# deduped_polls['standardized_weighted_pct'] = deduped_polls['weighted_pct']/sum(deduped_polls['weighted_pct'])
# deduped_polls
# fte_best_polls = fte_best_polls.merge(deduped_polls[['poll_id','standardized_weighted_pct']], on = 'poll_id')

Unnamed: 0,poll_id,pollster_id,pollster,sponsors,display_name,fte_grade,methodology,state,start_date,end_date,...,party,answer,candidate_id,candidate_name,pct,sample_size_weight,pollster_n_weight,pollster_rating_weight,recency_weight,weighted_pct
0,57025,399,Rasmussen (Pulse Opinion Research),,Rasmussen Reports/Pulse Opinion Research,B,IVR/Online,,2018-11-12,2018-11-13,...,DEM,Obama,13253,Michelle Obama,50.0,1.290994,0.133631,3.0,0.005076,0.131357
1,57026,383,PPP,DEM (partisan),Public Policy Polling,A-,IVR,Ohio,2018-11-27,2018-11-28,...,DEM,Brown,13255,Sherrod Brown,48.0,1.039230,0.103142,3.7,0.005921,0.112707
2,57170,1416,HarrisX,The Hill,HarrisX,B+,Online,,2018-12-16,2018-12-17,...,DEM,Biden,13256,Joe Biden,42.0,1.291640,0.147442,3.3,0.006560,0.173157
3,57275,383,PPP,,Public Policy Polling,A-,IVR/Online,North Carolina,2019-01-04,2019-01-07,...,DEM,Biden,13256,Joe Biden,49.0,1.118034,0.103142,3.7,0.007651,0.159967
4,57312,399,Rasmussen (Pulse Opinion Research),,Rasmussen Reports/Pulse Opinion Research,B,IVR/Online,,2019-01-10,2019-01-13,...,DEM,Ocasio-Cortez,13340,Alexandria Ocasio-Cortez,40.0,1.290994,0.133631,3.0,0.008054,0.166736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,72864,1240,Øptimus,,Øptimus,B/C,Live Phone,South Carolina,2020-10-31,2020-11-02,...,DEM,Biden,13256,Joe Biden,39.0,1.166905,0.316228,2.5,1.000000,35.978249
4077,73214,169,Gallup,Institute for Advanced Studies in Culture,Gallup,B+,Live Phone,,2020-07-28,2020-08-27,...,DEM,Biden,13256,Joe Biden,56.0,1.917029,1.000000,3.3,0.630249,223.276536
4078,73340,1416,HarrisX,The Hill,HarrisX,B+,Online,,2020-10-31,2020-11-02,...,DEM,Biden,13256,Joe Biden,52.0,1.956613,0.147442,3.3,1.000000,49.504336
4079,73703,1655,USC Schwarzenegger Institute,USC Price,USC Schwarzenegger Institute,B/C,Online,California,2020-10-27,2020-10-31,...,DEM,Biden,13256,Joe Biden,65.0,1.387444,1.000000,2.5,1.000000,225.459600


In [44]:
def standardizeWeightedAverage(polls_with_weighted_averages):
    forecast = polls_with_weighted_averages.groupby(['candidate_name'])['weighted_pct'].mean()
    standardized_forecast = forecast/sum(forecast)
    return standardized_forecast.sort_values(ascending = False)

In [47]:
standardized_weighted_avgs = standardizeWeightedAverage(polls_with_weighted_avgs)
standardized_weighted_avgs.head()

candidate_name
Andrew Cuomo              0.174666
Joe Biden                 0.132000
Donald Trump              0.094278
Barack Obama              0.065013
Hillary Rodham Clinton    0.051436
Name: weighted_pct, dtype: float64

In [48]:
def _part1_pollingAverage(state, polls, pollster_info):
    
    #filter polls to state of interest
    state_polls = polls[polls.state == state] 
    
    best_state_polls = filterPollVersions(state_polls) #if there are multiple versions of a poll (rv/lv/v/a), only use the best one
    #multi_cand_polls = filterMultiCandidates(best_state_polls) #if there are multiple versions of a poll (head-to-head or multiple candidates),
                                                               #use the one with the most candidates
        
    #raw_averages = pd.DataFrame(multi_cand_polls.groupby(['poll_id','candidate_name'])['pct'].mean()).reset_index() #raw candidate averages for each poll
    
    
    polls_with_weights = getWeights(best_state_polls, pollster_info)
    
    #standardizeWeights()
    
    polls_with_weighted_avg = calculateWeightedAverage(polls_with_weights)
    
    standardized_forecast = standardizeWeightedAverage(polls_with_weighted_avg)
                                                                                          
    return standardized_forecast

In [49]:
test = _part1_pollingAverage('California', processed_polls, pollster_info)
test

candidate_name
Joe Biden            0.255851
Amy Klobuchar        0.096470
Donald Trump         0.084718
Michael Bloomberg    0.074904
Andrew Yang          0.060801
Tom Steyer           0.060054
Pete Buttigieg       0.054454
Bernard Sanders      0.052998
Elizabeth Warren     0.050585
Tulsi Gabbard        0.049329
Kamala Harris        0.026215
Jo Jorgensen         0.024705
Michelle Obama       0.022590
Beto O'Rourke        0.020547
Oprah Winfrey        0.020119
Mike Pence           0.018418
Howie Hawkins        0.016742
Nimrata R. Haley     0.010500
Name: weighted_pct, dtype: float64

In [108]:
fte[fte.poll_id.isin([58835,58836,58840])]
test[test.index == 'Joe Biden'] + test[~test.index.isin(['Donald Trump','Joe Biden'])].sum()/2
test[test.index == 'Donald Trump'] + test[~test.index.isin(['Donald Trump','Joe Biden'])].sum()/2

candidate_name
Donald Trump    0.414433
Name: weighted_pct, dtype: float64

In [68]:
def unweightedAverage(state, polls):
    
    #filter polls to state of interest
    state_polls = polls[polls.state == state] 
    
    # get polling average
    forecast = state_polls.groupby(['candidate_name'])['pct'].mean()
    standardized_forecast = forecast/sum(forecast)
                                                                                          
    return standardized_forecast

unweightedAverage('Pennsylvania', fte)

candidate_name
Amy Klobuchar        0.109387
Bernard Sanders      0.110150
Beto O'Rourke        0.108422
Donald Trump         0.107795
Elizabeth Warren     0.108236
Howie Hawkins        0.001248
Jo Jorgensen         0.004586
Joe Biden            0.120830
Kamala Harris        0.115459
Kanye West           0.001206
Michael Bloomberg    0.109656
Pete Buttigieg       0.103026
Name: pct, dtype: float64

In [58]:
def standardizeWeights(col):
    weights_sum = sum(col)
    weights_stand = col/weights_sum
    return weights_stand
deduped_polls.loc[:,'sample_size_weight':'recency_weight'] = deduped_polls.loc[:,'sample_size_weight':'recency_weight'].apply(standardizeWeights)
deduped_polls

Unnamed: 0,poll_id,pollster_id,pollster,sponsors,display_name,fte_grade,methodology,state,start_date,end_date,...,partisan,party,answer,candidate_id,candidate_name,pct,sample_size_weight,pollster_n_weight,pollster_rating_weight,recency_weight
0,57025,399,Rasmussen (Pulse Opinion Research),,Rasmussen Reports/Pulse Opinion Research,B,IVR/Online,,2018-11-12,2018-11-13,...,,DEM,Obama,13253,Michelle Obama,50.0,0.000164,0.000233,0.000274,0.000002
1,57026,383,PPP,DEM (partisan),Public Policy Polling,A-,IVR,Ohio,2018-11-27,2018-11-28,...,DEM,DEM,Brown,13255,Sherrod Brown,48.0,0.000132,0.000180,0.000338,0.000002
2,57170,1416,HarrisX,The Hill,HarrisX,B+,Online,,2018-12-16,2018-12-17,...,,DEM,Biden,13256,Joe Biden,42.0,0.000165,0.000257,0.000301,0.000002
3,57275,383,PPP,,Public Policy Polling,A-,IVR/Online,North Carolina,2019-01-04,2019-01-07,...,,DEM,Biden,13256,Joe Biden,49.0,0.000142,0.000180,0.000338,0.000003
4,57312,399,Rasmussen (Pulse Opinion Research),,Rasmussen Reports/Pulse Opinion Research,B,IVR/Online,,2019-01-10,2019-01-13,...,,DEM,Ocasio-Cortez,13340,Alexandria Ocasio-Cortez,40.0,0.000164,0.000233,0.000274,0.000003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,72864,1240,Øptimus,,Øptimus,B/C,Live Phone,South Carolina,2020-10-31,2020-11-02,...,,DEM,Biden,13256,Joe Biden,39.0,0.000149,0.000551,0.000228,0.000366
4077,73214,169,Gallup,Institute for Advanced Studies in Culture,Gallup,B+,Live Phone,,2020-07-28,2020-08-27,...,,DEM,Biden,13256,Joe Biden,56.0,0.000244,0.001744,0.000301,0.000231
4078,73340,1416,HarrisX,The Hill,HarrisX,B+,Online,,2020-10-31,2020-11-02,...,,DEM,Biden,13256,Joe Biden,52.0,0.000249,0.000257,0.000301,0.000366
4079,73703,1655,USC Schwarzenegger Institute,USC Price,USC Schwarzenegger Institute,B/C,Online,California,2020-10-27,2020-10-31,...,,DEM,Biden,13256,Joe Biden,65.0,0.000177,0.001744,0.000228,0.000366


In [None]:
# polling average pseudocode, states only
def pollingAverage(state, polls, pollsterRatings, pollsterWeights, currentDate):
    state_polls = polls[polls.state = state] #filter polls to state of interest
    best_state_polls = filterPollVersions(state_polls) #if there are multiple versions of a poll (rv/lv/v/a), only use the best one
    multi_cand_polls = filterMultiCandidates(best_state_polls) #if there are multiple versions of a poll (head-to-head or multiple candidates),
                                                               #use the one with the most candidates
        
    raw_averages = pd.DataFrame(multi_cand_polls.groupby(['poll_id','candidate_name'])['pct'].mean()).reset_index() #raw candidate averages for each poll
    
    #pollster ratings and number of polls conducted
    pollster_info = pollsterInfo(polls, grades)
    
    getWeights()
    
    standardizeWeights()
    
    calculateWeightedAverage()
                                                                                          
    return weighted_average

In [138]:
pd.DataFrame(filterPollVersions(fte).groupby(['poll_id','candidate_name'])['pct'].mean()).reset_index().head(25)

poll_id  candidate_name  
57025    Donald Trump        43.00
         Michelle Obama      50.00
57026    Bernard Sanders     47.00
         Donald Trump        45.25
         Elizabeth Warren    43.00
                             ...  
74945    Joe Biden           48.20
74946    Donald Trump        48.00
         Joe Biden           44.20
74947    Donald Trump        53.00
         Joe Biden           42.00
Name: pct, Length: 10850, dtype: float64