In [89]:
import datetime

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
np.set_printoptions(precision=4, suppress=True)
pd.set_option('display.max_columns', 500)

In [90]:
today = datetime.datetime(2020, 8, 8)

In [91]:
poll2020 = pd.read_csv('../data/raw/president_polls2.csv', parse_dates=['start_date'], 
                  usecols=['question_id', 'poll_id', 'state', 'pollster_id', 'pollster', 'fte_grade', 'sample_size', 
                           'population', 'methodology', 'start_date', 'internal', 'partisan', 'race_id', 'answer',
                           'candidate_party', 'pct'])

In [92]:
poll2020.head()

Unnamed: 0,question_id,poll_id,state,pollster_id,pollster,fte_grade,sample_size,population,methodology,start_date,internal,partisan,race_id,answer,candidate_party,pct
0,126918,67815,,744,Ipsos,B-,1115,a,Online,2020-07-27,False,,6210,Biden,DEM,57.0
1,126918,67815,,744,Ipsos,B-,1115,a,Online,2020-07-27,False,,6210,Trump,REP,43.0
2,126919,67815,,744,Ipsos,B-,947,rv,Online,2020-07-27,False,,6210,Biden,DEM,57.0
3,126919,67815,,744,Ipsos,B-,947,rv,Online,2020-07-27,False,,6210,Trump,REP,43.0
4,126934,67820,,568,YouGov,B,1260,rv,Online,2020-07-26,False,,6210,Biden,DEM,49.0


In [93]:
poll2020['state'] = poll2020['state'].str.replace('Maine CD-2', 'Maine')
poll2020['state'] = poll2020['state'].str.replace('Maine CD-1', 'Maine')
poll2020['state'] = poll2020['state'].str.replace('Nebraska CD-2', 'Nebraska')
poll2020['state'] = poll2020['state'].str.replace('Nebraska CD-1', 'Nebraska')

In [94]:
poll2020['state'] = poll2020['state'].fillna('USA')

In [95]:
poll2020 = poll2020[(poll2020.answer == 'Biden') | (poll2020.answer == 'Trump')  ]

In [96]:
poll2020['month'] = poll2020.start_date.dt.month

In [97]:
len(poll2020.state.unique())

42

In [98]:
poll2020.head()

Unnamed: 0,question_id,poll_id,state,pollster_id,pollster,fte_grade,sample_size,population,methodology,start_date,internal,partisan,race_id,answer,candidate_party,pct,month
0,126918,67815,USA,744,Ipsos,B-,1115,a,Online,2020-07-27,False,,6210,Biden,DEM,57.0,7
1,126918,67815,USA,744,Ipsos,B-,1115,a,Online,2020-07-27,False,,6210,Trump,REP,43.0,7
2,126919,67815,USA,744,Ipsos,B-,947,rv,Online,2020-07-27,False,,6210,Biden,DEM,57.0,7
3,126919,67815,USA,744,Ipsos,B-,947,rv,Online,2020-07-27,False,,6210,Trump,REP,43.0,7
4,126934,67820,USA,568,YouGov,B,1260,rv,Online,2020-07-26,False,,6210,Biden,DEM,49.0,7


In [99]:
len(poll2020.pivot_table(index=['question_id', 'start_date', 'state', 'sample_size', 'pollster'], columns = 'answer', values = 'pct', aggfunc=np.mean).reset_index()['state'].dropna().unique())

42

In [100]:
poll2020 = poll2020.pivot_table(index=['start_date', 'state', 'sample_size', 'pollster'], columns = 'answer', values = 'pct', aggfunc=np.mean)

In [101]:
len(poll2020)

1205

In [102]:
poll2020 = poll2020.dropna()

In [103]:
poll2020 = poll2020.reset_index()

In [104]:
poll2020.columns

Index(['start_date', 'state', 'sample_size', 'pollster', 'Biden', 'Trump'], dtype='object', name='answer')

In [105]:
poll2020['two_parry_sum'] = poll2020['Biden'] + poll2020['Trump']

In [106]:
poll2020['dem_perc'] = poll2020['Biden'] / poll2020['two_parry_sum']
poll2020['gop_perc'] = poll2020['Trump'] / poll2020['two_parry_sum']

In [107]:
poll2020['dem_spread'] = poll2020['dem_perc'] - poll2020['gop_perc']

In [108]:
poll2020['moe'] = 1.96*np.sqrt(poll2020['dem_perc']*poll2020['gop_perc'] / poll2020['sample_size'])

In [109]:
poll2020.head()

answer,start_date,state,sample_size,pollster,Biden,Trump,two_parry_sum,dem_perc,gop_perc,dem_spread,moe
0,2018-11-27,Ohio,648,Public Policy Polling,48.0,45.25,93.25,0.514745,0.485255,0.029491,0.038481
1,2018-12-16,USA,1001,Harris Insights & Analytics,42.0,36.666667,78.666667,0.533898,0.466102,0.067797,0.030904
2,2019-01-04,North Carolina,750,Public Policy Polling,49.0,45.333333,94.333333,0.519435,0.480565,0.038869,0.035757
3,2019-01-19,USA,760,Public Policy Polling,53.0,41.666667,94.666667,0.559859,0.440141,0.119718,0.035293
4,2019-01-24,Michigan,600,Glengariff Group,53.3,41.6,94.9,0.561644,0.438356,0.123288,0.039703


In [110]:
pollsters = pd.DataFrame(poll2020['pollster'].unique())

In [111]:
weights = pd.read_csv('../data/raw/pollster_weights.csv')

In [112]:
weights.head()

Unnamed: 0,Pollster,Weight,PIE
0,1892 Polling,1.0,0.6
1,20/20 Insight,0.5,12.1
2,A&A Research,0.0,10.0
3,Abacus Associates,0.5,8.1
4,ABC News/The Washington Post,0.719,2.8


In [113]:
poll_weight_mean = weights['Weight'].mean()
poll_weight_pie = weights['PIE'].mean()

In [114]:
weights.describe()

Unnamed: 0,Weight,PIE
count,453.0,453.0
mean,0.77213,6.520751
std,0.286724,4.436076
min,0.0,0.1
25%,0.667,3.9
50%,0.857,5.5
75%,1.0,7.9
max,1.0,37.6


In [115]:
# poll2020

In [116]:
# poll2020.pollster.replace(pollster_map, inplace=True)

In [117]:
poll2020 = poll2020.merge(weights, left_on = 'pollster', right_on = 'Pollster', how= 'left')

In [118]:
poll2020.head()

Unnamed: 0,start_date,state,sample_size,pollster,Biden,Trump,two_parry_sum,dem_perc,gop_perc,dem_spread,moe,Pollster,Weight,PIE
0,2018-11-27,Ohio,648,Public Policy Polling,48.0,45.25,93.25,0.514745,0.485255,0.029491,0.038481,Public Policy Polling,0.805,5.0
1,2018-12-16,USA,1001,Harris Insights & Analytics,42.0,36.666667,78.666667,0.533898,0.466102,0.067797,0.030904,Harris Insights & Analytics,0.834,5.1
2,2019-01-04,North Carolina,750,Public Policy Polling,49.0,45.333333,94.333333,0.519435,0.480565,0.038869,0.035757,Public Policy Polling,0.805,5.0
3,2019-01-19,USA,760,Public Policy Polling,53.0,41.666667,94.666667,0.559859,0.440141,0.119718,0.035293,Public Policy Polling,0.805,5.0
4,2019-01-24,Michigan,600,Glengariff Group,53.3,41.6,94.9,0.561644,0.438356,0.123288,0.039703,Glengariff Group,0.8,5.6


In [119]:
poll2020  = poll2020[['start_date', 'state', 'pollster', 'sample_size', 'dem_perc', 'gop_perc', 'dem_spread',
                      'moe', 'Weight', 'PIE' ]]
poll2020.columns = poll2020.columns.str.lower()

In [120]:
poll2020['weight'] = poll2020['weight'].fillna(poll_weight_mean)
poll2020['pie'] = poll2020['pie'].fillna(poll_weight_pie)

In [121]:
poll2020.describe()


Unnamed: 0,sample_size,dem_perc,gop_perc,dem_spread,moe,weight,pie
count,1137.0,1137.0,1137.0,1137.0,1137.0,1137.0,1137.0
mean,2481.71416,0.528295,0.471705,0.056591,0.030066,0.774741,5.429393
std,4636.033936,0.041875,0.041875,0.083751,0.01049,0.118242,2.128671
min,149.0,0.319588,0.294872,-0.360825,0.005322,0.0,0.6
25%,745.0,0.510638,0.451613,0.021277,0.026844,0.75,4.6
50%,1000.0,0.531674,0.468326,0.063348,0.030905,0.77213,5.1
75%,1303.0,0.548387,0.489362,0.096774,0.035671,0.813,6.520751
max,33549.0,0.705128,0.680412,0.410256,0.080277,1.0,20.3


In [122]:

def exp_decay(days):
    # defensive coding, accepts timedeltas
    days = getattr(days, "days", days)
    return .5 ** (days/30.)

In [123]:
def average_error(nobs, p=50.0):
    return p*nobs**-.5

In [124]:

def effective_sample(total_error, p=50.0):
    return p**2 * (total_error**-2.)

In [125]:
poll2020 = poll2020.sort_values(by = ['state', 'pollster'])

In [137]:

poll2020.head(20)

Unnamed: 0,start_date,state,pollster,sample_size,dem_perc,gop_perc,dem_spread,moe,weight,pie,ESS,MESS
964,2020-07-02,Alabama,Auburn University at Montgomery,558,0.424686,0.575314,-0.150628,0.041013,0.77213,6.520751,33.509813,33.509813
666,2020-05-14,Alabama,FM3 Research,601,0.423913,0.576087,-0.152174,0.039509,0.786,5.9,39.659666,39.659666
312,2020-02-04,Alabama,Mason-Dixon Polling & Strategy,625,0.394191,0.605809,-0.211618,0.038312,0.868,5.1,49.593335,49.593335
271,2020-01-07,Alabama,WPA Intelligence (WPAi),500,0.391753,0.608247,-0.216495,0.042788,0.676,7.5,26.373806,26.373806
916,2020-06-23,Alaska,Alaska Survey Research,663,0.493776,0.506224,-0.012448,0.038057,0.556,7.1,30.579224,30.579224
980,2020-07-07,Alaska,Public Policy Polling,1081,0.483871,0.516129,-0.032258,0.029791,0.805,5.0,58.795653,58.795653
1112,2020-07-23,Alaska,Public Policy Polling,885,0.468085,0.531915,-0.06383,0.032875,0.805,5.0,66.581053,7.7854
92,2019-07-22,Alaska,Zogby Interactive/JZ Analytics,321,0.464037,0.535963,-0.071926,0.054557,0.775,5.4,37.264476,37.264476
138,2019-09-09,Arizona,Bendixen & Amandi International,520,0.494118,0.505882,-0.011765,0.042973,1.0,3.0,92.717718,92.717718
754,2020-05-29,Arizona,Change Research,329,0.494382,0.505618,-0.011236,0.054026,0.769,6.6,28.556494,28.556494


In [127]:
state_pollsters = poll2020.groupby(['state', 'pollster'])

In [138]:
Marist_az = state_pollsters.get_group(("Alaska", "Public Policy Polling"))

In [143]:
Marist_az

Unnamed: 0,start_date,state,pollster,sample_size,dem_perc,gop_perc,dem_spread,moe,weight,pie,cumulative,average_error,total_error,ESS,MESS
1112,2020-07-23,Alaska,Public Policy Polling,885,0.468085,0.531915,-0.06383,0.032875,0.805,5.0,885,1.680732,6.680732,56.013403,56.013403
980,2020-07-07,Alaska,Public Policy Polling,1081,0.483871,0.516129,-0.032258,0.029791,0.805,5.0,1966,1.12766,6.12766,66.581053,10.56765


In [140]:
Marist_az.sort_values("start_date", ascending=False, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Marist_az.sort_values("start_date", ascending=False, inplace=True)


In [141]:
Marist_az["cumulative"] = Marist_az["sample_size"].cumsum()
Marist_az["average_error"] = average_error(Marist_az["cumulative"])
Marist_az["total_error"] = Marist_az["pie"] + Marist_az["average_error"]
# Marist_az[var_idx + ["cumulative"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Marist_az["cumulative"] = Marist_az["sample_size"].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Marist_az["average_error"] = average_error(Marist_az["cumulative"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Marist_az["total_error"] = Marist_az["pie"] + Marist_az["average_error"]


In [142]:

Marist_az["ESS"] = effective_sample(Marist_az["total_error"])
Marist_az["MESS"] = Marist_az["ESS"].diff()
# fill in first one
Marist_az["MESS"].fillna(Marist_az["ESS"].head(1).item(), inplace=True);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Marist_az["ESS"] = effective_sample(Marist_az["total_error"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Marist_az["MESS"] = Marist_az["ESS"].diff()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [133]:
def calculate_mess(group):
    cumulative = group["sample_size"].cumsum()
    ae = average_error(cumulative)
    total_error = ae + group["pie"]
    ess = effective_sample(total_error)
    mess = ess.diff()
    mess.fillna(ess.head(1).item(), inplace=True)
    #from IPython.core.debugger import Pdb; Pdb().set_trace()
    return pd.concat((ess, mess), axis=1)

In [134]:
df = state_pollsters.apply(calculate_mess)
df.rename(columns={0 : "ESS", 1 : "MESS"}, inplace=True);

In [135]:
poll2020 = poll2020.join(df)


In [144]:
df.head()

Unnamed: 0,ESS,MESS
964,33.509813,33.509813
666,39.659666,39.659666
312,49.593335,49.593335
271,26.373806,26.373806
916,30.579224,30.579224


In [145]:
td = today - poll2020["start_date"].head(1).item()

In [146]:
poll2020["start_date"].head(1).item()

Timestamp('2020-07-02 00:00:00')

In [147]:
td

Timedelta('37 days 00:00:00')

In [148]:
poll2020["time_weight"] = (today - poll2020["start_date"]).apply(exp_decay)

In [149]:
def weighted_mean(group):
    weights1 = group["time_weight"]
    weights2 = group["MESS"]/100
    return (weights1*weights2).sum()
#     return np.sum(weights1*weights2*group["dem_spread"]/(weights1*weights2).sum())

In [150]:

state_pollsters = poll2020.groupby(["state", "pollster"])
state_polls = state_pollsters.apply(weighted_mean)

In [152]:
state_polls.head(20)

state    pollster                                    
Alabama  Auburn University at Montgomery                 0.142528
         FM3 Research                                    0.054375
         Mason-Dixon Polling & Strategy                  0.006746
         WPA Intelligence (WPAi)                         0.001879
Alaska   Alaska Survey Research                          0.105645
         Public Policy Polling                           0.334496
         Zogby Interactive/JZ Analytics                  0.000053
Arizona  Bendixen & Amandi International                 0.000413
         Change Research                                 0.104673
         Civiqs                                          0.175388
         Climate Nexus                                   0.005313
         Data Orbital                                    0.502655
         Emerson College                                 0.000645
         Fabrizio, Lee & Associates                      0.001082
         Fox News/Beac

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C568054D00>

In [62]:
poll2020.head()

Unnamed: 0,start_date,state,pollster,sample_size,dem_perc,gop_perc,dem_spread,moe,weight,pie,ESS,MESS,time_weight
964,2020-07-02,Alabama,Auburn University at Montgomery,558,0.424686,0.575314,-0.150628,0.041013,0.77213,6.520751,33.509813,33.509813,0.425334
666,2020-05-14,Alabama,FM3 Research,601,0.423913,0.576087,-0.152174,0.039509,0.786,5.9,39.659666,39.659666,0.137103
312,2020-02-04,Alabama,Mason-Dixon Polling & Strategy,625,0.394191,0.605809,-0.211618,0.038312,0.868,5.1,49.593335,49.593335,0.013602
271,2020-01-07,Alabama,WPA Intelligence (WPAi),500,0.391753,0.608247,-0.216495,0.042788,0.676,7.5,26.373806,26.373806,0.007123
916,2020-06-23,Alaska,Alaska Survey Research,663,0.493776,0.506224,-0.012448,0.038057,0.556,7.1,30.579224,30.579224,0.345478


## 2016 polling

In [269]:
poll2016 = pd.read_csv('../data/raw/all_polls_2016_2.csv', parse_dates=['start.date'], 
                       usecols=['state', 'pollster', 'start.date', 'number.of.observations', 'trump' , 'clinton'])

In [270]:
poll2016.state = poll2016.state.str.replace('--', 'USA')

In [271]:
poll2016.groupby(['start.date', 'pollster'])['number.of.observations'].mean()

start.date  pollster                               
2015-05-19  Quinnipiac University                      1046.00
2015-06-18  Public Policy Polling                      1108.00
2015-06-20  YouGov                                      420.75
2015-06-21  Fox News/Opinion Dynamics Corp.            1005.00
2015-06-25  Public Policy Polling                      1072.00
                                                        ...   
2016-11-04  Selzer & Co.                                799.00
            Siena College/The New York Times Upshot     800.00
            YouGov                                     1834.50
2016-11-05  Opinion Savvy/InsiderAdvantage              853.00
2016-11-06  Landmark Communications                    1200.00
Name: number.of.observations, Length: 1084, dtype: float64

In [272]:
poll2016.head()

Unnamed: 0,state,pollster,start.date,number.of.observations,trump,clinton
0,USA,Insights West,2016-11-04,940.0,41,45
1,USA,Insights West,2016-11-04,,6,89
2,USA,Insights West,2016-11-04,,82,7
3,USA,Insights West,2016-11-04,,38,43
4,USA,IBD/TIPP,2016-11-04,1107.0,43,41


In [273]:
poll2016['number.of.observations'] = poll2016['number.of.observations'].fillna(poll2016['number.of.observations'].median())

In [274]:
poll2016['two_parry_sum'] = poll2016['clinton'] + poll2016['trump']

poll2016['dem_perc'] = poll2016['clinton'] / poll2016['two_parry_sum']
poll2016['gop_perc'] = poll2016['trump'] / poll2016['two_parry_sum']

poll2016['dem_spread'] = poll2016['dem_perc'] - poll2016['gop_perc']
poll2016['moe'] = 1.96*np.sqrt(poll2016['dem_perc']*poll2016['gop_perc'] / poll2016['number.of.observations'])

In [275]:
poll2016 = poll2016[['start.date', 'state', 'pollster', 'number.of.observations', 'dem_perc', 'gop_perc', 'dem_spread', 
                     'moe']]


In [276]:
poll2016.columns = ['start_date', 'state', 'pollster', 'sample_size', 'dem_perc', 'gop_perc', 'dem_spread', 'moe']

In [277]:
poll2016.isna().sum()/len(poll2016)

start_date     0.0
state          0.0
pollster       0.0
sample_size    0.0
dem_perc       0.0
gop_perc       0.0
dem_spread     0.0
moe            0.0
dtype: float64

In [278]:
state_groups_2016 = poll2016.groupby("state")

In [279]:
poll2016 = poll2016.merge(weights, left_on = 'pollster', right_on = 'Pollster', how= 'left').drop(
    'Pollster', axis=1)

In [280]:
poll2016.columns = poll2016.columns.str.lower()
poll2016['weight'] = poll2016['weight'].fillna(poll_weight_mean)
poll2016['pie'] = poll2016['pie'].fillna(poll_weight_pie)

In [281]:
state_pollsters = poll2016.groupby(['state', 'pollster'])

df = state_pollsters.apply(calculate_mess)
df.rename(columns={0 : "ESS", 1 : "MESS"}, inplace=True);

In [282]:
poll2016 = poll2016.join(df)

In [283]:

date2016 = datetime.datetime(2016, 11, 8)

In [316]:
poll2016 = poll2016[(date2016 - poll2016['start_date']) <= datetime.timedelta(90)]

In [285]:
poll2016["time_weight"] =(date2016 - poll2016['start_date']).apply(exp_decay)

In [286]:
poll2020.columns


Index(['start_date', 'state', 'pollster', 'sample_size', 'dem_perc',
       'gop_perc', 'dem_spread', 'moe', 'weight', 'pie', 'ESS', 'MESS',
       'time_weight'],
      dtype='object')

In [287]:

poll2016.columns

Index(['start_date', 'state', 'pollster', 'sample_size', 'dem_perc',
       'gop_perc', 'dem_spread', 'moe', 'weight', 'pie', 'ESS', 'MESS',
       'time_weight'],
      dtype='object')

In [288]:
state_pollsters = poll2016.groupby(["state", "pollster"])
state_polls = state_pollsters.apply(weighted_mean)

# 2012 polling

In [291]:
poll2012 = pd.read_csv('../data/raw/all_polls_2012_2.csv', parse_dates=['start.date'], 
                       usecols=['state', 'pollster', 'start.date', 'number.of.observations', 'romney' , 'obama'])

poll2012.state = poll2012.state.str.replace('--', 'USA')

poll2012.groupby(['start.date', 'pollster'])['number.of.observations'].mean()



poll2012.head()

poll2012['number.of.observations'] = poll2012['number.of.observations'].fillna(poll2012['number.of.observations'].median())

poll2012['two_parry_sum'] = poll2012['obama'] + poll2012['romney']

poll2012['dem_perc'] = poll2012['obama'] / poll2012['two_parry_sum']
poll2012['gop_perc'] = poll2012['romney'] / poll2012['two_parry_sum']

poll2012['dem_spread'] = poll2012['dem_perc'] - poll2012['gop_perc']
poll2012['moe'] = 1.96*np.sqrt(poll2012['dem_perc']*poll2012['gop_perc'] / poll2012['number.of.observations'])

poll2012 = poll2012[['start.date', 'state', 'pollster', 'number.of.observations', 'dem_perc', 'gop_perc', 'dem_spread', 
                     'moe']]


poll2012.columns = ['start_date', 'state', 'pollster', 'sample_size', 'dem_perc', 'gop_perc', 'dem_spread', 'moe']

poll2012.isna().sum()/len(poll2012)



state_groups_2012 = poll2012.groupby("state")

poll2012 = poll2012.merge(weights, left_on = 'pollster', right_on = 'Pollster', how= 'left').drop(
    'Pollster', axis=1)

poll2012.columns = poll2012.columns.str.lower()
poll2012['weight'] = poll2012['weight'].fillna(poll_weight_mean)
poll2012['pie'] = poll2012['pie'].fillna(poll_weight_pie)

state_pollsters = poll2012.groupby(['state', 'pollster'])

df = state_pollsters.apply(calculate_mess)
df.rename(columns={0 : "ESS", 1 : "MESS"}, inplace=True);

poll2012 = poll2012.join(df)


date2012 = datetime.datetime(2012, 11, 6)

poll2012 = poll2012[(date2012 - poll2012['start_date']) <= datetime.timedelta(90)]

poll2012["time_weight"] =(date2012 - poll2012['start_date']).apply(exp_decay)

poll2020.columns



poll2012.columns

state_pollsters = poll2012.groupby(["state", "pollster"])
state_polls = state_pollsters.apply(weighted_mean)

In [293]:
poll2012.tail()

Unnamed: 0,start_date,state,pollster,sample_size,dem_perc,gop_perc,dem_spread,moe,weight,pie,ESS,MESS,time_weight
1170,2012-11-03,SD,Nielson Brothers Polling,671.0,0.456522,0.543478,-0.086957,0.037689,0.889,12.2,14.344103,0.36285,0.933033
1171,2012-11-04,MO,YouGov,779.0,0.442105,0.557895,-0.115789,0.034876,0.885,4.9,69.854524,6.024467,0.954842
1172,2012-11-04,TN,YouGov,697.0,0.442105,0.557895,-0.115789,0.03687,0.885,4.9,68.834721,5.874141,0.954842
1173,2012-11-04,TX,YouGov,1563.0,0.4,0.6,-0.2,0.024287,0.885,4.9,76.575827,6.269463,0.954842
1174,2012-11-03,UT,Dan Jones & Associates,870.0,0.273684,0.726316,-0.452632,0.029627,0.947,6.9,39.129249,3.103233,0.933033


# 2008 polls

In [294]:
poll2008 = pd.read_csv('../data/raw/all_polls_2008_2.csv', parse_dates=['start.date'], 
                       usecols=['state', 'pollster', 'start.date', 'number.of.observations', 'mccain' , 'obama'])

poll2008.state = poll2008.state.str.replace('--', 'USA')

poll2008.groupby(['start.date', 'pollster'])['number.of.observations'].mean()



poll2008.head()

poll2008['number.of.observations'] = poll2008['number.of.observations'].fillna(poll2008['number.of.observations'].median())

poll2008['two_parry_sum'] = poll2008['obama'] + poll2008['mccain']

poll2008['dem_perc'] = poll2008['obama'] / poll2008['two_parry_sum']
poll2008['gop_perc'] = poll2008['mccain'] / poll2008['two_parry_sum']

poll2008['dem_spread'] = poll2008['dem_perc'] - poll2008['gop_perc']
poll2008['moe'] = 1.96*np.sqrt(poll2008['dem_perc']*poll2008['gop_perc'] / poll2008['number.of.observations'])

poll2008 = poll2008[['start.date', 'state', 'pollster', 'number.of.observations', 'dem_perc', 'gop_perc', 'dem_spread', 
                     'moe']]


poll2008.columns = ['start_date', 'state', 'pollster', 'sample_size', 'dem_perc', 'gop_perc', 'dem_spread', 'moe']

poll2008.isna().sum()/len(poll2008)



state_groups_2008 = poll2008.groupby("state")

poll2008 = poll2008.merge(weights, left_on = 'pollster', right_on = 'Pollster', how= 'left').drop(
    'Pollster', axis=1)

poll2008.columns = poll2008.columns.str.lower()
poll2008['weight'] = poll2008['weight'].fillna(poll_weight_mean)
poll2008['pie'] = poll2008['pie'].fillna(poll_weight_pie)

state_pollsters = poll2008.groupby(['state', 'pollster'])

df = state_pollsters.apply(calculate_mess)
df.rename(columns={0 : "ESS", 1 : "MESS"}, inplace=True);

poll2008 = poll2008.join(df)


date2008 = datetime.datetime(2008, 11, 4)

poll2008 = poll2008[(date2008 - poll2008['start_date']) <= datetime.timedelta(90)]

poll2008["time_weight"] =(date2008 - poll2008['start_date']).apply(exp_decay)





poll2008.columns

state_pollsters = poll2008.groupby(["state", "pollster"])
state_polls = state_pollsters.apply(weighted_mean)

## Clustering States by Demographic

In [305]:
pvi = pd.read_csv("../data/raw/partisan_voting.csv", )
pvi.set_index("State", inplace=True);
pvi.PVI = pvi.PVI.replace({"EVEN" : "0"})
pvi.PVI = pvi.PVI.str.replace("R\+", "-")
pvi.PVI = pvi.PVI.str.replace("D\+", "")
pvi.PVI = pvi.PVI.astype(float)
pvi.PVI

State
Alabama                -14.0
Alaska                  -9.0
Arizona                 -5.0
Arkansas               -15.0
California              12.0
Colorado                 1.0
Connecticut              6.0
Delaware                 6.0
District of Columbia    39.0
Florida                 -2.0
Georgia                 -5.0
Hawaii                  18.0
Idaho                  -19.0
Illinois                 7.0
Indiana                 -9.0
Iowa                    -3.0
Kansas                 -13.0
Kentucky               -15.0
Louisiana              -11.0
Maine                    3.0
Maryland                12.0
Massachusetts           12.0
Michigan                 1.0
Minnesota                1.0
Mississippi             -9.0
Missouri                -9.0
Montana                -11.0
Nebraska               -14.0
Nevada                   1.0
New Hampshire            1.0
New Jersey               7.0
New Mexico               3.0
New York                12.0
North Carolina          -3.0
North Da

In [314]:
party_affil = pd.read_csv("../data/raw/gallup.csv")
party_affil.set_index("State", inplace=True);
party_affil.rename(columns={"Democratic advantage" : "dem_adv"}, inplace=True);
party_affil["no_party"] = 100 - party_affil.Democrat - party_affil.Republican

In [315]:
party_affil.head()

Unnamed: 0_level_0,Democrat,Republican,dem_adv,N,no_party
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,35,50,-15,3057,15
Alaska,31,52,-21,553,17
Arizona,40,42,-2,3669,18
Arkansas,36,45,-9,1883,19
California,51,30,21,18168,19
