In [39]:
import pandas as pd
import numpy as np
from datascience import *
import csv
from functools import reduce
import math
import random

In [40]:
filepath = r"/1976-2022-house.csv";
df = pd.read_csv(filepath).fillna('NO PARTY')
tbl = Table.from_df(df)
tbl_reduced = tbl.where('year', are.above(2003)).where('totalvotes', are.above(1)).where('stage', 'GEN').where('writein', False).select('year', 'state', 'district', 'stage', 'candidate', 'party','candidatevotes', 'totalvotes')
tbl_reduced = tbl_reduced.with_column('pct', tbl_reduced.column('candidatevotes') / tbl_reduced.column('totalvotes'))

In [41]:
# Train for Alpha Value - Data from 2022

dem_vote_share = [0.466,0.52,0.529,0.448,0.484,0.449,0.473,0.529,0.503]
gop_vote_share = [0.492,0.441,0.424,0.514,0.471,0.507,0.483,0.443,0.472]
ind_vote_share = [0.042,0.04,0.047,0.038,0.045,0.045,0.043,0.028,0.025]
state_list = reduce(lambda re, x: re+[x] if x not in re else re, tbl_reduced.column('state').tolist(), [])

def calc_ewma(lst, alpha = 0.6):
    if len(lst) == 1:
      return lst[0]
    return alpha * lst[0] + (1 - alpha) * calc_ewma(lst[1:], alpha)

def district_ewma(state, district, alpha = 0.6):
  tbl_district = tbl_reduced.where('state', state).where('district', district)
  parties = reduce(lambda re, x: re+[x] if x not in re else re, tbl_district.column('party').tolist(), [])
  state_lst = []
  district_lst = []
  proj_lst = []
  for party in parties:
    tbl_party = tbl_district.where('party', party)
    pct_by_yr = []
    state_lst.append(state)
    district_lst.append(district)
    for year in range(2004, 2022, 2):
      if tbl_party.where('year', year).num_rows == 0:
        pct_by_yr.append(0)
      else:
        pct_by_yr.append(tbl_party.where('year', year).column('pct').item(0))
    if party == 'DEMOCRAT':
      pct_diff = np.array(pct_by_yr) - np.array(dem_vote_share)
    elif party == 'REPUBLICAN':
      pct_diff = np.array(pct_by_yr) - np.array(gop_vote_share)
    else:
      pct_diff = np.array(pct_by_yr) - np.array(ind_vote_share)
    one_proj = calc_ewma(pct_diff[::-1], alpha)
    proj_lst.append(one_proj)
  return Table().with_columns('State', state_lst, 'District', district_lst, 'Party', parties, 'Proj Diff', proj_lst)

def state_ewma(state, alpha = 0.6):
  tbl_state = tbl_reduced.where('state', state)
  district_ct = max(tbl_state.column('district').tolist())
  state_proj = Table().with_columns('State', [], 'District', [], 'Party', [], 'Proj Diff', [])
  if district_ct == 0:
    state_proj.append(district_ewma(state, 0, alpha))
  else:
    for district in range(1, district_ct + 1):
      one_district = district_ewma(state, district, alpha)
      state_proj = state_proj.append(one_district)
  return state_proj

def ewma(dem_pct, gop_pct, ind_pct, alpha = 0.6):
  overall_proj = Table().with_columns('State', [], 'District', [], 'Party', [], 'Proj Diff', [])
  for state in state_list:
    state_proj = state_ewma(state, alpha)
    overall_proj = overall_proj.append(state_proj)
  parties = overall_proj.column('Party').tolist()
  diffs = overall_proj.column('Proj Diff').tolist()
  pcts = []
  for i in range(overall_proj.num_rows):
    if parties[i] == 'DEMOCRAT':
      pcts.append(diffs[i] + dem_pct)
    elif parties[i] == 'REPUBLICAN':
      pcts.append(diffs[i] + gop_pct)
    else:
      pcts.append(diffs[i] + ind_pct)
  return overall_proj.with_column('Proj Pct', pcts).where('Proj Pct', are.above(0))

results22 = tbl_reduced.where('year', 2022)

alphas = []
diffs = []
for i in np.arange(0.5,0.9,0.01):
  alphas.append(i)
  test = ewma(0.473, 0.5, 0.027, alpha = i)
  test = test.join(['State', 'District', 'Party'], results22, ['state', 'district', 'party'])
  diffsq = np.mean((test.column('Proj Pct') - test.column('pct')) ** 2)
  diffs.append(diffsq)
Table().with_columns('Alpha', alphas, 'Diff', diffs).show()

Alpha,Diff
0.5,0.0144079
0.51,0.0144012
0.52,0.0143962
0.53,0.0143664
0.54,0.0143647
0.55,0.0143515
0.56,0.01434
0.57,0.0143431
0.58,0.0143479
0.59,0.0143542


In [42]:
# Train for Probabilities - Data from 2020

dem_vote_share = [0.466,0.52,0.529,0.448,0.484,0.449,0.473,0.529]
gop_vote_share = [0.492,0.441,0.424,0.514,0.471,0.507,0.483,0.443]
ind_vote_share = [0.042,0.04,0.047,0.038,0.045,0.045,0.043,0.028]
dem_generic20 = 0.49869617
gop_generic20 = 0.42613349
est_ind_vote20 = calc_ewma(ind_vote_share[::-1], alpha = 0.73)
est_dem_vote20 = (dem_generic20 / (dem_generic20 + gop_generic20)) * (1 - est_ind_vote20)
est_gop_vote20 = (gop_generic20 / (dem_generic20 + gop_generic20)) * (1 - est_ind_vote20)
results20 = tbl_reduced.where('year', 2020)

def district_ewma(state, district, alpha = 0.73):
  tbl_district = tbl_reduced.where('state', state).where('district', district)
  parties = reduce(lambda re, x: re+[x] if x not in re else re, tbl_district.column('party').tolist(), [])
  state_lst = []
  district_lst = []
  proj_lst = []
  for party in parties:
    tbl_party = tbl_district.where('party', party)
    pct_by_yr = []
    state_lst.append(state)
    district_lst.append(district)
    for year in range(2004, 2020, 2):
      if tbl_party.where('year', year).num_rows == 0:
        pct_by_yr.append(0)
      else:
        pct_by_yr.append(np.sum(tbl_party.where('year', year).column('pct')))
    if party == 'DEMOCRAT':
      pct_diff = np.array(pct_by_yr) - np.array(dem_vote_share)
      proj_diff = calc_ewma(pct_diff[::-1], alpha)
      proj_vote = proj_diff + est_dem_vote20
    elif party == 'REPUBLICAN':
      pct_diff = np.array(pct_by_yr) - np.array(gop_vote_share)
      proj_diff = calc_ewma(pct_diff[::-1], alpha)
      proj_vote = proj_diff + est_gop_vote20
    else:
      pct_diff = np.array(pct_by_yr) - np.array(ind_vote_share)
      proj_diff = calc_ewma(pct_diff[::-1], alpha)
      proj_vote = proj_diff + est_ind_vote20
    proj_lst.append(proj_vote)
  proj_vote_tbl = Table().with_columns('State', state_lst, 'District', district_lst, 'Party', parties, 'Proj Vote', proj_lst).sort('Proj Vote', descending = True)
  return proj_vote_tbl

def test_prob(district_tbl):
  sorted_tbl = district_tbl.sort('Proj Vote', descending = True)
  proj_margins = []
  proj_vote = sorted_tbl.column('Proj Vote').tolist()
  for i in range(sorted_tbl.num_rows):
    if i == 0:
      proj_margins.append(proj_vote[0] - proj_vote[1])
    else:
      proj_margins.append(proj_vote[i] - proj_vote[0])
  tbl_w_margins = district_tbl.with_column('Proj Margin', proj_margins).where('Proj Vote', are.above(0))
  joined_tbl = tbl_w_margins.join(['State', 'District', 'Party'], results20, ['state', 'district', 'party'])
  if not joined_tbl:
    return Table().with_columns('State', [], 'District', [], 'Party', [], 'Proj Vote', [], 'Proj Margin', [], 'pct', [], 'Win?', [])
  joined_tbl = joined_tbl.select('State', 'District', 'Party', 'Proj Vote', 'Proj Margin', 'pct').sort('pct', descending = True)
  win = []
  for i in range(joined_tbl.num_rows):
    if i == 0:
      win.append(1)
    else:
      win.append(0)
  return joined_tbl.with_column('Win?', win)

def state_ewma(state, alpha = 0.73):
  tbl_state = tbl_reduced.where('state', state).where('year', 2020)
  district_ct = max(tbl_state.column('district').tolist())
  state_proj = Table().with_columns('State', [], 'District', [], 'Party', [], 'Proj Vote', [], 'Proj Margin', [], 'pct', [], 'Win?', [])
  if district_ct == 0:
    state_proj.append(test_prob(district_ewma(state, 0, alpha)))
  else:
    for district in range(1, district_ct + 1):
      one_district = test_prob(district_ewma(state, district, alpha))
      state_proj = state_proj.append(one_district)
  return state_proj

def ewma(alpha = 0.73):
  overall_proj = Table().with_columns('State', [], 'District', [], 'Party', [], 'Proj Vote', [], 'Proj Margin', [], 'pct', [], 'Win?', [])
  for state in state_list:
    state_proj = state_ewma(state, alpha)
    overall_proj = overall_proj.append(state_proj)
  return overall_proj

ewma()

State,District,Party,Proj Vote,Proj Margin,pct,Win?
ALABAMA,1,REPUBLICAN,0.696829,0.402207,0.643698,1
ALABAMA,1,DEMOCRAT,0.294622,-0.402207,0.355387,0
ALABAMA,2,REPUBLICAN,0.582997,0.188381,0.652227,1
ALABAMA,2,DEMOCRAT,0.394616,-0.188381,0.346827,0
ALABAMA,3,REPUBLICAN,0.63554,0.271794,0.674615,1
ALABAMA,3,DEMOCRAT,0.363746,-0.271794,0.324593,0
ALABAMA,4,REPUBLICAN,0.835534,0.675348,0.822419,1
ALABAMA,4,DEMOCRAT,0.160186,-0.675348,0.17683,0
ALABAMA,5,REPUBLICAN,0.619618,0.253639,0.958109,1
ALABAMA,6,REPUBLICAN,0.698799,0.398192,0.9713,1


In [72]:
# 2024 Test

dem_vote_share = [0.466,0.52,0.529,0.448,0.484,0.449,0.473,0.529,0.503,0.473]
gop_vote_share = [0.492,0.441,0.424,0.514,0.471,0.507,0.483,0.443,0.472,0.5]
ind_vote_share = [0.042,0.04,0.047,0.038,0.045,0.045,0.043,0.028,0.025,0.027]
dem_generic24 = 0.463
gop_generic24 = 0.452
est_ind_vote24 = calc_ewma(ind_vote_share[::-1], alpha = 0.73)
est_dem_vote24 = (dem_generic24 / (dem_generic24 + gop_generic24)) * (1 - est_ind_vote24)
est_gop_vote24 = (gop_generic24 / (dem_generic24 + gop_generic24)) * (1 - est_ind_vote24)
state_list = reduce(lambda re, x: re+[x] if x not in re else re, tbl_reduced.where('year', 2022).column('state').tolist(), [])

def calc_ewma(lst, alpha = 0.73):
    if len(lst) == 1:
      return lst[0]
    return alpha * lst[0] + (1 - alpha) * calc_ewma(lst[1:], alpha)

def district_ewma(state, district, alpha = 0.73):
  tbl_district = tbl_reduced.where('state', state).where('district', district)
  parties = reduce(lambda re, x: re+[x] if x not in re else re, tbl_district.column('party').tolist(), [])
  state_lst = []
  district_lst = []
  proj_lst = []
  for party in parties:
    tbl_party = tbl_district.where('party', party)
    pct_by_yr = []
    state_lst.append(state)
    district_lst.append(district)
    for year in range(2004, 2024, 2):
      if tbl_party.where('year', year).num_rows == 0:
        pct_by_yr.append(0)
      else:
        pct_by_yr.append(np.sum(tbl_party.where('year', year).column('pct')))
    if party == 'DEMOCRAT':
      pct_diff = np.array(pct_by_yr) - np.array(dem_vote_share)
      proj_diff = calc_ewma(pct_diff[::-1], alpha)
      proj_vote = proj_diff + est_dem_vote24
    elif party == 'REPUBLICAN':
      pct_diff = np.array(pct_by_yr) - np.array(gop_vote_share)
      proj_diff = calc_ewma(pct_diff[::-1], alpha)
      proj_vote = proj_diff + est_gop_vote24
    else:
      pct_diff = np.array(pct_by_yr) - np.array(ind_vote_share)
      proj_diff = calc_ewma(pct_diff[::-1], alpha)
      proj_vote = proj_diff + est_ind_vote24
    proj_lst.append(proj_vote)
  proj_vote_tbl = Table().with_columns('State', state_lst, 'District', district_lst, 'Party', parties, 'Proj Vote', proj_lst).sort('Proj Vote', descending = True)
  proj_margins = []
  proj_vote_lst = proj_vote_tbl.column('Proj Vote').tolist()
  for i in range(proj_vote_tbl.num_rows):
    if i == 0:
      proj_margins.append(proj_vote_lst[0] - proj_vote_lst[1])
    else:
      proj_margins.append(proj_vote_lst[i] - proj_vote_lst[0])
  tbl_w_margins = proj_vote_tbl.with_column('Proj Margin', proj_margins).where('Proj Vote', are.above(0.005))
  probabilities = 1 / (1 + 2.718281828459045 ** (-27.5411 * tbl_w_margins.column('Proj Margin')))
  return tbl_w_margins.with_column('Chance', probabilities)

def state_ewma(state, alpha = 0.73):
  tbl_state = tbl_reduced.where('state', state).where('year', 2022)
  district_ct = max(tbl_state.column('district').tolist())
  state_proj = Table().with_columns('State', [], 'District', [], 'Party', [], 'Proj Vote', [], 'Proj Margin', [], 'Chance', [])
  if district_ct == 0:
    state_proj.append(district_ewma(state, 0, alpha))
  else:
    for district in range(1, district_ct + 1):
      one_district = district_ewma(state, district, alpha)
      state_proj = state_proj.append(one_district)
  return state_proj

def ewma(alpha = 0.73):
  overall_proj = Table().with_columns('State', [], 'District', [], 'Party', [], 'Proj Vote', [], 'Proj Margin', [], 'Chance', [])
  for state in state_list:
    state_proj = state_ewma(state, alpha)
    overall_proj = overall_proj.append(state_proj)
  return overall_proj

total_tbl = ewma()
total_tbl.show()

State,District,Party,Proj Vote,Proj Margin,Chance
ALABAMA,1,REPUBLICAN,0.778152,0.663674,1.0
ALABAMA,1,LIBERTARIAN,0.114477,-0.663674,1.15299e-08
ALABAMA,1,DEMOCRAT,0.101406,-0.676746,8.04404e-09
ALABAMA,2,REPUBLICAN,0.665523,0.345658,0.999927
ALABAMA,2,DEMOCRAT,0.319865,-0.345658,7.3379e-05
ALABAMA,2,LIBERTARIAN,0.0124601,-0.653063,1.54437e-08
ALABAMA,3,REPUBLICAN,0.689303,0.405441,0.999986
ALABAMA,3,DEMOCRAT,0.283862,-0.405441,1.4143e-05
ALABAMA,3,INDEPENDENT,0.0146881,-0.674615,8.53035e-09
ALABAMA,3,LIBERTARIAN,0.0116321,-0.677671,7.84176e-09


In [73]:
def one_sim(tbl):
  dem_wins = 0
  gop_wins = 0
  ind_wins = 0
  for state in reduce(lambda re, x: re+[x] if x not in re else re, tbl.column('State').tolist(), []):
    state_tbl = tbl.where('State', state)
    district_ct = max(state_tbl.column('District').tolist())
    if district_ct == 0:
      starting_value = 0
    else:
      starting_value = 1
    for district in range(starting_value, district_ct + 1):
      district_tbl = state_tbl.where('District', district)
      probs = district_tbl.column('Chance').tolist()
      parties = district_tbl.column('Party').tolist()
      rand_num = random.uniform(0, 1)
      total_vote = 0
      wins = []
      for i in range(district_tbl.num_rows):
        if sum(wins) == 1:
          wins.append(0)
        else:
          total_vote += probs[i]
          if rand_num < total_vote:
            wins.append(1)
            if parties[i] == 'DEMOCRAT':
              dem_wins += 1
            elif parties[i] == 'REPUBLICAN':
              gop_wins += 1
            else:
              ind_wins += 1
          else:
            wins.append(0)
  win_tbl = Table().with_columns('Party', ['DEMOCRAT', 'REPUBLICAN', 'OTHER'], 'Wins', [dem_wins, gop_wins, ind_wins])
  return win_tbl

def multiple_sims(tbl, n):
  first_wins = one_sim(tbl).column('Wins')
  if first_wins.item(0) > 218:
    win_lst = make_array(1,0,0)
  else:
    win_lst = make_array(0,1,0)
  for i in range(n - 1):
    onesim = one_sim(tbl)
    first_wins += onesim.column('Wins')
    if onesim.column('Wins').item(0) > 218:
      win_lst += make_array(1,0,0)
    else:
      win_lst += make_array(0,1,0)
  avg_seats = first_wins / n
  return Table().with_columns('Party', ['DEMOCRAT', 'REPUBLICAN', 'OTHER'], 'Avg Seats', avg_seats, 'Majority %', (win_lst / n))
multiple_sims(total_tbl, 1000)

Party,Avg Seats,Majority %
DEMOCRAT,225.072,0.965
REPUBLICAN,208.828,0.035
OTHER,1.1,0.0
