In [1]:
import collections
import datetime

import numpy as np
import pandas as pd
import requests

# Base URL that get_all_results() queries (one GET per result page).
API_ENDPOINT = "http://elections.huffingtonpost.com/pollster/api/polls"

# Seed NumPy's global random state, which run_simulation() draws from via
# np.random.normal.
np.random.seed(2016)

In [16]:
def get_all_results(state='US', party='gop', start_date='2015-06-01'):
    """Yield one flat record per candidate response from the polls API.

    Pages through API_ENDPOINT, and for every poll that contains a question
    whose topic is '2016-president' yields a dict for each response of that
    question's first subpopulation that has a non-empty ``first_name``.

    Parameters:
        state: value sent as the ``state`` query parameter.
        party: currently unused. It only fed the per-party primary topic
            ('2016-president-{party}-primary'), which is disabled; the
            parameter is kept so existing callers keep working.
        start_date: value sent as the ``after`` query parameter.

    Yields:
        dict with keys 'poll', 'date', 'filter', 'obs', 'candidate', 'mean'.

    Polls without a matching question are skipped. Polls whose structure
    raises RuntimeError, TypeError, NameError or IndexError while being read
    are skipped as well (best effort, as before).
    """
    topic = '2016-president'
    params = {'state': state,
              'after': start_date,
              'topic': topic}
    page = 1
    while True:
        params['page'] = page
        page_results = requests.get(API_ENDPOINT, params=params).json()
        for poll in page_results:
            try:
                # Use a default with next(): a bare next() raises
                # StopIteration when no question matches, and a StopIteration
                # escaping a generator body either ends the whole stream
                # silently or surfaces as RuntimeError (PEP 479) -- in both
                # cases every remaining poll and page would be lost.
                question = next((q for q in poll['questions']
                                 if q['topic'] == topic), None)
                if question is None:
                    continue
                subpop = question['subpopulations'][0]
                for response in subpop['responses']:
                    if response['first_name']:
                        yield {'poll': poll['id'],
                               'date': poll['end_date'],
                               'filter': subpop['name'].lower(),
                               'obs': subpop['observations'],
                               'candidate': '{} {}'.format(response['first_name'],
                                                           response['last_name']),
                               'mean': response['value']}
            except (RuntimeError, TypeError, NameError, IndexError):
                # Malformed poll entry: skip it and keep going.
                continue

        # A short page is treated as the last one.
        # NOTE(review): assumes the API serves 10 polls per page -- confirm.
        if len(page_results) < 10:
            break
        page += 1

In [17]:
def get_polls(state='US', party='gop', start_date='2015-12-01'):
    """Collect poll rows into a DataFrame with a parsed 'date' column.

    All three arguments are forwarded unchanged to get_all_results().

    Returns the DataFrame on success. If converting the 'date' column raises
    RuntimeError, TypeError, NameError, IndexError or KeyError (for example
    when no rows came back, so there is no 'date' column), returns the
    sentinel value True instead; callers check for that.
    """
    rows = get_all_results(state=state, party=party, start_date=start_date)
    frame = pd.DataFrame(rows)
    try:
        frame['date'] = pd.to_datetime(frame['date'])
    except (RuntimeError, TypeError, NameError, IndexError, KeyError):
        return True
    return frame

In [18]:
def get_distribution_for_date(polls, target_date=None, window=60):
    """Pool recent polls into a per-candidate mean and standard deviation.

    Parameters:
        polls: DataFrame with 'date', 'candidate', 'obs' and 'mean' columns
            ('mean' is read as a percentage and divided by 100).
        target_date: reference date; defaults to datetime.datetime.today().
        window: number of days before target_date to include.

    Each poll dated in (target_date - window days, target_date] is weighted
    by 1 / (age_in_days + 1) ** 2. Per candidate, the weighted sample size
    'n' and weighted vote count are summed, giving mean = votes / n and
    std = sqrt(mean * (1 - mean) / n).

    Returns a DataFrame indexed by candidate with columns 'mean' and 'std',
    restricted to candidates with mean > 0. If RuntimeError, TypeError,
    NameError or IndexError is raised along the way (for example when
    `polls` is not a DataFrame), returns the sentinel value True.
    """
    try:
        as_of = datetime.datetime.today() if target_date is None else target_date

        # Keep only polls that ended inside the look-back window.
        not_after = polls['date'] <= as_of
        not_stale = polls['date'] > as_of - datetime.timedelta(window)
        recent = polls[not_after & not_stale]

        # Inverse-square decay on poll age, so today's poll has weight 1.
        age_in_days = (as_of - recent['date']) / np.timedelta64(1, 'D')
        recency = 1 / np.square(age_in_days + 1)

        per_poll = recent[['candidate']].copy()
        per_poll['n'] = recency * recent['obs']
        per_poll['votes'] = recent['mean'] / 100 * recent['obs'] * recency

        totals = per_poll.groupby('candidate').sum()
        share = totals['votes'] / totals['n']
        totals['mean'] = share
        totals['std'] = np.sqrt((share * (1 - share)) / totals['n'])

        summary = totals[['mean', 'std']]
        return summary.query('mean > 0').copy()
    except (RuntimeError, TypeError, NameError, IndexError):
        return True

In [19]:
def run_simulation(dists, trials=10000):
    """Estimate each candidate's share of simulated wins.

    Parameters:
        dists: DataFrame indexed by candidate with 'mean' and 'std' columns.
        trials: number of simulated outcomes to draw.

    For every trial one value per candidate is drawn from
    Normal(mean, std) using NumPy's global random state; the candidate with
    the largest draw wins that trial.

    Returns a Series mapping candidate -> fraction of trials won. Candidates
    that never win do not appear in the result.
    """
    draws = []
    for _ in range(trials):
        draws.append(np.random.normal(dists['mean'], dists['std']))
    runs = pd.DataFrame(draws, columns=dists.index)

    winners = runs.T.idxmax()
    tally = pd.Series(collections.Counter(winners))
    return tally / tally.sum()

In [20]:
def predict(state='us', party='gop', window=30, trials=10000, target_date=None):
    """Print the pooled poll averages and simulated win shares for a state.

    Parameters:
        state, party: forwarded to get_polls().
        window, target_date: forwarded to get_distribution_for_date().
        trials: forwarded to run_simulation().

    Prints a 'Superpoll Results:' table (mean/std per candidate, formatted as
    percentages, highest mean first) followed by a 'Simulation Results:'
    section with the output of run_simulation(). Returns None.

    When no usable distribution is available (the helpers return the
    sentinel True, or an empty frame) a short message is printed instead of
    silently printing nothing.
    """
    polls = get_polls(state=state, party=party)
    dists = get_distribution_for_date(polls, window=window, target_date=target_date)
    print('Superpoll Results:')

    # The helpers signal "no data" with True rather than a DataFrame.
    if not isinstance(dists, pd.DataFrame) or dists.empty:
        print('No poll data available for state {} within the {}-day window.'.format(state, window))
        return

    try:
        ranked = dists.sort_values('mean', ascending=False)
        # Format column by column: DataFrame.applymap is deprecated in recent
        # pandas, while Series.map works across versions.
        print(ranked.apply(lambda column: column.map('{:.1%}'.format)))
        # print('') emits a blank line on both Python 2 and 3; a bare print()
        # shows up as "()" on Python 2.
        print('')
        print('Simulation Results:')
        print(run_simulation(dists, trials=trials))
    except (RuntimeError, TypeError, NameError, IndexError, AttributeError) as err:
        # Still best effort, but report the problem instead of hiding it.
        print('Could not summarise the polls: {}'.format(err))

In [21]:
# Reference date reused by the predict(..., target_date=target_date) cells.
target_date = datetime.datetime(2016, 9, 15)
# NOTE(review): target_date is not passed to this call, so predict() uses its
# default target_date=None and the distribution is computed relative to
# datetime.datetime.today(). Confirm that is intended.
predict(state='US', party='gop')

Superpoll Results:
                  mean   std
candidate                   
Hillary Clinton  46.0%  1.4%
Donald Trump     42.9%  1.4%
Gary Johnson      7.7%  1.4%
()
Simulation Results:
Donald Trump       0.0588
Hillary Clinton    0.9412
dtype: float64


In [25]:
# Two-letter codes of the 50 states to evaluate.
states = ['AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL','IN','IA','KS','KY','LA','ME','MD','MA','MI',
          'MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT',
          'VT','VA','WA','WV','WI','WY']

candidates = ['Hillary Clinton', 'Donald Trump']

# One row per state; each cell holds that candidate's share of simulated wins
# formatted as a percentage. States without usable polls keep NaN.
test1 = pd.DataFrame(columns=['Donald Trump', 'Hillary Clinton'], index=states)
for state_code in states:
    polls = get_polls(state=state_code)
    dists = get_distribution_for_date(polls, window=30)

    # The helpers return True instead of a DataFrame when there is no data,
    # and an empty frame cannot be simulated either.
    if not isinstance(dists, pd.DataFrame) or dists.empty:
        continue

    simul_results = run_simulation(dists)
    for candidate in candidates:
        # run_simulation() only lists candidates that won at least one trial,
        # so a missing entry means a 0% win share rather than an error.
        win_share = simul_results.get(candidate, 0.0)
        # Single .loc[row, col] assignment: the chained form .loc[row][col]
        # may write to a temporary copy and leave test1 unchanged.
        test1.loc[state_code, candidate] = '{:.1%}'.format(win_share)

In [12]:
# Collect every row get_all_results() yields for state 'CA' after 2016-6-1
# into a DataFrame. polls1 is not referenced by any other visible cell.
polls1 = pd.DataFrame(get_all_results(state='CA', start_date='2016-6-1'))

In [26]:
# Display the per-state table of formatted win shares filled in by the loop above.
test1

Unnamed: 0,Donald Trump,Hillary Clinton
AL,74.4%,25.6%
AK,57.6%,42.4%
AZ,60.8%,39.2%
AR,65.8%,34.2%
CA,0.1%,99.9%
CO,35.7%,64.3%
CT,31.9%,68.1%
DE,41.1%,58.9%
FL,49.4%,50.6%
GA,83.4%,16.6%


In [27]:
# Write the per-state table to a CSV file in the current working directory.
test1.to_csv('poll_results_pctwin_0922.csv')

In [None]:
# Fetch a single raw result page (state 'pa', polls after 2016-04-01) so the
# JSON structure returned by the API can be inspected directly.
topic = '2016-president'
page = 1
params = {
    'state': 'pa',
    'after': '2016-04-01',
    'topic': topic,
    'page': page,
}
response = requests.get(API_ENDPOINT, params=params)
page_results = response.json()
page_results

In [26]:
# Collect every row get_all_results() yields for state 'tx' after 2016-4-1
# and display the frame. This rebinds the module-level name `polls`, which
# the per-state loop also assigns.
polls = pd.DataFrame(get_all_results(state='tx', party='gop', start_date='2016-4-1'))
polls

In [8]:
# Requires `target_date` from the cell that assigns it.
# NOTE(review): `party` is forwarded to get_all_results(), which does not use
# it (the topic is fixed to '2016-president'), so party='dem' does not select
# a primary question with the current code.
predict(state='ia', party='dem', target_date=target_date)

Superpoll Results:
                  mean   std
candidate                   
Hillary Clinton  47.4%  2.4%
Bernie Sanders   46.0%  2.4%
Martin O'Malley   3.6%  0.9%

Simulation Results:
Hillary Clinton    66.0%
Bernie Sanders     34.0%
dtype: object


In [9]:
# Same call with a 4-day look-back window. Requires `target_date` from the
# cell that assigns it.
# NOTE(review): `party` is forwarded to get_all_results(), which does not use
# it (the topic is fixed to '2016-president').
predict(state='ia', party='gop', target_date=target_date,  window=4)

Superpoll Results:
                 mean   std
candidate                  
Donald Trump    27.5%  2.1%
Ted Cruz        23.1%  2.0%
Marco Rubio     18.1%  1.9%
Ben Carson       7.5%  1.3%
Rand Paul        5.1%  1.1%
Jeb Bush         4.1%  0.9%
Mike Huckabee    3.5%  0.9%
John Kasich      2.8%  0.8%
Carly Fiorina    2.5%  0.7%
Chris Christie   2.0%  0.7%
Rick Santorum    1.3%  0.5%

Simulation Results:
Donald Trump    93.6%
Ted Cruz         6.4%
Marco Rubio      0.0%
dtype: object
