In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [27]:
states_dict = {
        'AK': ['Alaska', 3],
        'AL': ['Alabama', 9],
        'AR': ['Arkansas', 6],
        'AZ': ['Arizona', 11],
        'CA': ['California', 55],
        'CO': ['Colorado', 9],
        'CT': ['Connecticut', 7],
        'DC': ['District of Columbia', 3],
        'DE': ['Delaware', 3],
        'FL': ['Florida', 29],
        'GA': ['Georgia', 16],
        'HI': ['Hawaii', 4],
        'IA': ['Iowa', 6],
        'ID': ['Idaho', 4],
        'IL': ['Illinois', 20],
        'IN': ['Indiana', 11],
        'KS': ['Kansas', 6],
        'KY': ['Kentucky', 8],
        'LA': ['Louisiana', 8],
        'MA': ['Massachusetts', 11],
        'MD': ['Maryland', 10],
        'ME': ['Maine', 4],
        'MI': ['Michigan', 16],
        'MN': ['Minnesota', 10],
        'MO': ['Missouri', 10],
        'MS': ['Mississippi', 6],
        'MT': ['Montana', 3],
        'NC': ['North Carolina', 15],
        'ND': ['North Dakota', 3],
        'NE': ['Nebraska', 5],
        'NH': ['New Hampshire', 4],
        'NJ': ['New Jersey', 14],
        'NM': ['New Mexico', 5],
        'NV': ['Nevada', 6],
        'NY': ['New York', 29],
        'OH': ['Ohio', 18],
        'OK': ['Oklahoma', 7],
        'OR': ['Oregon', 7],
        'PA': ['Pennsylvania', 20],
        'RI': ['Rhode Island', 4],
        'SC': ['South Carolina', 9],
        'SD': ['South Dakota', 3],
        'TN': ['Tennessee', 11],
        'TX': ['Texas' ,38],
        'UT': ['Utah', 6],
        'VA': ['Virginia', 13],
        'VT': ['Vermont', 3],
        'WA': ['Washington', 12],
        'WI': ['Wisconsin', 10],
        'WV': ['West Virginia', 5],
        'WY': ['Wyoming', 3]
}

## Reading pollster weight

In [28]:
weights = pd.read_csv('../data/raw/pollster_weights.csv')
poll_weight_mean = weights['Weight'].mean()
poll_weight_pie = weights['PIE'].mean()
weights.columns = weights.columns.str.lower()

## Defining some functions

In [48]:
def exp_decay(days):
    # defensive coding, accepts timedeltas
    days = getattr(days, "days", days)
    return .5 ** (days/30.)

def average_error(nobs, p=50.0):
    return p*nobs**-.5

def effective_sample(total_error, p=50.0):
    return p**2 * (total_error**-2.)

def calculate_mess(group):
    cumulative = group["sample_size"].cumsum()
    ae = average_error(cumulative)
    total_error = ae + group["pie"]
    ess = effective_sample(total_error)
    mess = ess.diff()
    mess.fillna(ess.head(1).item(), inplace=True)
    #from IPython.core.debugger import Pdb; Pdb().set_trace()
    return pd.concat((ess, mess), axis=1)

## 2020 Election polling data

In [83]:
today = datetime.datetime(2020, 8, 8)

In [84]:
poll2020 = pd.read_csv('../data/raw/president_polls2.csv', parse_dates=['start_date'], 
                  usecols=['state', 'pollster', 'sample_size', 'start_date',  'answer', 'pct'])

In [85]:
poll2020['state'] = poll2020['state'].fillna('USA')
poll2020['state'] = poll2020['state'].str.replace('Maine CD-2', 'Maine')
poll2020['state'] = poll2020['state'].str.replace('Maine CD-1', 'Maine')
poll2020['state'] = poll2020['state'].str.replace('Nebraska CD-2', 'Nebraska')
poll2020['state'] = poll2020['state'].str.replace('Nebraska CD-1', 'Nebraska')

In [86]:
poll2020 = poll2020[(poll2020.answer == 'Biden') | (poll2020.answer == 'Trump')  ]

In [87]:
poll2020 = poll2020.pivot_table(index=['start_date', 'state', 'sample_size', 'pollster'], columns = 'answer',
                                values = 'pct', aggfunc=np.mean)
poll2020 = poll2020.dropna()
poll2020 = poll2020.reset_index()

poll2020 = poll2020[(today - poll2020['start_date']) <= datetime.timedelta(120)]

In [88]:
poll2020 = poll2020.merge(weights, on = 'pollster', how= 'left')
poll2020['weight'] = poll2020['weight'].fillna(poll_weight_mean)
poll2020['pie'] = poll2020['pie'].fillna(poll_weight_pie)

In [89]:
poll2020['time_weight'] = (today - poll2020["start_date"]).apply(exp_decay)

In [90]:
poll2020.tail()

Unnamed: 0,start_date,state,sample_size,pollster,Biden,Trump,weight,pie,time_weight
638,2020-07-24,Wisconsin,392,Change Research,48.0,43.0,0.769,6.6,0.707107
639,2020-07-25,USA,1160,IBD/TIPP,48.0,41.0,0.778,2.2,0.723635
640,2020-07-26,USA,1260,YouGov,49.0,40.0,0.885,4.9,0.740549
641,2020-07-27,USA,947,Ipsos,57.0,43.0,0.705,4.6,0.757858
642,2020-07-27,USA,1115,Ipsos,57.0,43.0,0.705,4.6,0.757858


In [91]:
state_pollsters = poll2020.groupby(['state', 'pollster'])

df = state_pollsters.apply(calculate_mess)
df.rename(columns={0 : "ESS", 1 : "MESS"}, inplace=True);
poll2020 = poll2020.join(df)

In [92]:
poll2020['Biden'] = poll2020['Biden']*poll2020['weight']*poll2020['time_weight']
poll2020['Trump'] = poll2020['Trump']*poll2020['weight']*poll2020['time_weight']

In [93]:
poll2020 = poll2020.groupby('state')['Biden', 'Trump'].sum()

  poll2020 = poll2020.groupby('state')['Biden', 'Trump'].sum()


In [94]:
poll2020['two_parry_sum'] = poll2020['Biden'] + poll2020['Trump']
poll2020['dem_perc'] = poll2020['Biden'] / poll2020['two_parry_sum']
poll2020['gop_perc'] = poll2020['Trump'] / poll2020['two_parry_sum']
poll2020['dem_spread'] = poll2020['dem_perc'] - poll2020['gop_perc']
# poll2020['moe'] = 1.96*np.sqrt(poll2020['dem_perc']*poll2020['gop_perc'] / poll2020['sample_size'])

In [95]:
poll2020.head(42)

Unnamed: 0_level_0,Biden,Trump,two_parry_sum,dem_perc,gop_perc,dem_spread
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,17.536324,23.774153,41.310476,0.424501,0.575499,-0.150999
Alaska,50.911534,55.632328,106.543862,0.477846,0.522154,-0.044308
Arizona,321.914446,296.131778,618.046224,0.520858,0.479142,0.041716
Arkansas,10.45125,10.91575,21.367,0.48913,0.51087,-0.021739
California,22.422099,12.217586,34.639685,0.647295,0.352705,0.29459
Colorado,89.26057,66.012906,155.273475,0.57486,0.42514,0.149721
Connecticut,4.616956,2.720706,7.337662,0.629213,0.370787,0.258427
Florida,346.075508,305.669205,651.744713,0.530999,0.469001,0.061997
Georgia,205.675324,207.214328,412.889652,0.498136,0.501864,-0.003727
Indiana,5.017278,6.447947,11.465224,0.437608,0.562392,-0.124783


## 2016 

In [82]:
date2016 = datetime.datetime(2016, 11, 8) 

poll2016 = pd.read_csv('../data/raw/all_polls_2016_2.csv', parse_dates=['start.date'], 
                       usecols=['state', 'pollster', 'start.date', 'number.of.observations', 'trump' , 'clinton'])

poll2016['state'] = poll2016.state.str.replace('--', 'USA')
poll2016['state'] = poll2016['state'].str.replace('Maine CD-2', 'Maine')
poll2016['state'] = poll2016['state'].str.replace('Maine CD-1', 'Maine')
poll2016['state'] = poll2016['state'].str.replace('Nebraska CD-2', 'Nebraska')
poll2016['state'] = poll2016['state'].str.replace('Nebraska CD-1', 'Nebraska')

poll2016['number.of.observations'] = poll2016['number.of.observations'].fillna(poll2016['number.of.observations'].median())

poll2016.columns = ['state', 'pollster', 'start_date', 'sample_size', 'trump', 'clinton']

poll2016 = poll2016[(date2016 - poll2016['start_date']) <= datetime.timedelta(120)]
poll2016 = poll2016.merge(weights, on = 'pollster', how= 'left')
poll2016['weight'] = poll2016['weight'].fillna(poll_weight_mean)
poll2016['pie'] = poll2016['pie'].fillna(poll_weight_pie)

poll2016['time_weight'] = (date2016 - poll2016["start_date"]).apply(exp_decay)



state_pollsters = poll2016.groupby(['state', 'pollster'])

df = state_pollsters.apply(calculate_mess)
df.rename(columns={0 : "ESS", 1 : "MESS"}, inplace=True);
poll2016 = poll2016.join(df)

poll2016['clinton'] = poll2016['clinton']*poll2016['weight']*poll2016['time_weight']
poll2016['trump'] = poll2016['trump']*poll2016['weight']*poll2016['time_weight']

poll2016 = poll2016.groupby('state')['clinton', 'trump'].sum()

poll2016['two_parry_sum'] = poll2016['clinton'] + poll2016['trump']
poll2016['dem_perc'] = poll2016['clinton'] / poll2016['two_parry_sum']
poll2016['gop_perc'] = poll2016['trump'] / poll2016['two_parry_sum']
poll2016['dem_spread'] = poll2016['dem_perc'] - poll2016['gop_perc']
# poll2016['moe'] = 1.96*np.sqrt(poll2016['dem_perc']*poll2016['gop_perc'] / poll2016['sample_size'])

poll2016.head(42)

  poll2016 = poll2016.groupby('state')['clinton', 'trump'].sum()


Unnamed: 0_level_0,clinton,trump,two_parry_sum,dem_perc,gop_perc,dem_spread
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AK,198.520547,261.021949,459.542496,0.431996,0.568004,-0.136008
AL,189.663428,289.175314,478.838742,0.39609,0.60391,-0.207819
AR,207.209451,317.645478,524.854929,0.394794,0.605206,-0.210412
AZ,741.478795,794.370255,1535.84905,0.482781,0.517219,-0.034438
CA,640.036533,368.800864,1008.837396,0.63443,0.36557,0.26886
CO,909.878745,816.414632,1726.293376,0.527071,0.472929,0.054142
CT,271.41214,203.456355,474.868495,0.571552,0.428448,0.143104
DE,241.912075,181.182897,423.094972,0.571768,0.428232,0.143536
FL,1435.487258,1381.793027,2817.280284,0.509529,0.490471,0.019059
GA,728.020226,775.723525,1503.743751,0.484138,0.515862,-0.031723


## 2012

In [104]:
date2012 = datetime.datetime(2012, 11, 6)

poll2012 = pd.read_csv('../data/raw/all_polls_2012_2.csv', parse_dates=['start.date'], 
                       usecols=['state', 'pollster', 'start.date', 'number.of.observations', 'romney' , 'obama'])

poll2012['state'] = poll2012.state.str.replace('--', 'USA')
poll2012['state'] = poll2012['state'].str.replace('Maine CD-2', 'Maine')
poll2012['state'] = poll2012['state'].str.replace('Maine CD-1', 'Maine')
poll2012['state'] = poll2012['state'].str.replace('Nebraska CD-2', 'Nebraska')
poll2012['state'] = poll2012['state'].str.replace('Nebraska CD-1', 'Nebraska')

poll2012['number.of.observations'] = poll2012['number.of.observations'].fillna(poll2012['number.of.observations'].median())

poll2012.columns = ['state', 'pollster',  'sample_size', 'start_date','romney', 'obama']

poll2012 = poll2012[(date2012 - poll2012['start_date']) <= datetime.timedelta(120)]
poll2012 = poll2012.merge(weights, on = 'pollster', how= 'left')
poll2012['weight'] = poll2012['weight'].fillna(poll_weight_mean)
poll2012['pie'] = poll2012['pie'].fillna(poll_weight_pie)

poll2012['time_weight'] = (date2012 - poll2012["start_date"]).apply(exp_decay)



state_pollsters = poll2012.groupby(['state', 'pollster'])

df = state_pollsters.apply(calculate_mess)
df.rename(columns={0 : "ESS", 1 : "MESS"}, inplace=True);
poll2012 = poll2012.join(df)

poll2012['obama'] = poll2012['obama']*poll2012['weight']*poll2012['time_weight']
poll2012['romney'] = poll2012['romney']*poll2012['weight']*poll2012['time_weight']

poll2012 = poll2012.groupby('state')['obama', 'romney'].sum()

poll2012['two_parry_sum'] = poll2012['obama'] + poll2012['romney']
poll2012['dem_perc'] = poll2012['obama'] / poll2012['two_parry_sum']
poll2012['gop_perc'] = poll2012['romney'] / poll2012['two_parry_sum']
poll2012['dem_spread'] = poll2012['dem_perc'] - poll2012['gop_perc']
# poll2012['moe'] = 1.96*np.sqrt(poll2012['dem_perc']*poll2012['gop_perc'] / poll2012['sample_size'])

poll2012.head(42)

  poll2012 = poll2012.groupby('state')['obama', 'romney'].sum()


Unnamed: 0_level_0,obama,romney,two_parry_sum,dem_perc,gop_perc,dem_spread
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL,8.608814,5.239584,13.848399,0.621647,0.378353,0.243294
AR,62.665239,35.277893,97.943132,0.639812,0.360188,0.279625
AZ,256.695258,221.366721,478.06198,0.53695,0.46305,0.073899
CA,284.447031,413.145214,697.592245,0.407755,0.592245,-0.184489
CO,945.2113,966.872571,1912.083871,0.494336,0.505664,-0.011329
CT,349.555693,453.938048,803.493742,0.435045,0.564955,-0.129911
FL,1304.765851,1293.075321,2597.841172,0.50225,0.49775,0.0045
GA,168.957336,140.030699,308.988035,0.546809,0.453191,0.093617
HI,36.805141,69.152979,105.95812,0.347356,0.652644,-0.305289
IA,662.153564,699.715316,1361.86888,0.486209,0.513791,-0.027581
