In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from scipy import stats

In [16]:
factor_types = {}

incumb_factors = ['year', 'is_midterm_year', 'rep_pres', 'RepSeatShare_prior',
         'RepVoteShare_prior']
econ_factors = ['absolute_sentiment', 'sentiment_change', 'absolute_gdp', 'gdp_change',
         'rdi', 'rdi_change']
poll_factors = ['annual_poll_approval', 'two_month_poll_approval',
         'annual_poll_disapproval', 'two_month_poll_disapproval', 'annual_dem',
         'two_month_dem', 'annual_rep', 'two_month_rep']

factor_types["Incumbent Factors"] = incumb_factors
factor_types["Economic Factors"] = econ_factors
factor_types["Polling Factors"] = poll_factors

factor_types

{'Incumbent Factors': ['year',
  'is_midterm_year',
  'rep_pres',
  'RepSeatShare_prior',
  'RepVoteShare_prior'],
 'Economic Factors': ['absolute_sentiment',
  'sentiment_change',
  'absolute_gdp',
  'gdp_change',
  'rdi',
  'rdi_change'],
 'Polling Factors': ['annual_poll_approval',
  'two_month_poll_approval',
  'annual_poll_disapproval',
  'two_month_poll_disapproval',
  'annual_dem',
  'two_month_dem',
  'annual_rep',
  'two_month_rep']}

In [17]:
data_2022 = pd.read_csv("data_2022.csv",index_col=0)
data_2022.head()

Unnamed: 0,year,is_midterm_year,rep_pres,RepSeatShare_prior,RepVoteShare_prior,absolute_sentiment,sentiment_change,absolute_gdp,gdp_change,rdi,rdi_change,annual_poll_approval,two_month_poll_approval,annual_poll_disapproval,two_month_poll_disapproval,annual_dem,two_month_dem,annual_rep,two_month_rep
36,2022,1,False,0.489655,0.487428,58.6,-19.433333,2.6,-0.628571,45389.0,-0.118866,41.630623,42.266799,53.563929,53.052741,44.081883,45.510514,43.552552,46.566844


In [18]:
train_df_nona = pd.read_csv("train_df_nona.csv",index_col=0)
train_df_nona.head()

Unnamed: 0,year,is_midterm_year,rep_pres,RepSeatShare_prior,RepVoteShare_prior,absolute_sentiment,sentiment_change,absolute_gdp,gdp_change,rdi,rdi_change,annual_poll_approval,two_month_poll_approval,annual_poll_disapproval,two_month_poll_disapproval,annual_dem,two_month_dem,annual_rep,two_month_rep
16,1980,0,False,0.365517,0.457484,76.7,4.6,7.7,-0.007887,0.006848,0.006848,40.625,31.0,47.5625,56.0,46.222222,45.6,36.444444,35.4
17,1982,1,True,0.441379,0.48767,72.1,0.7,0.2,-0.027861,0.006116,0.006116,43.5,42.0,45.055556,46.666667,50.25,50.333333,36.035714,38.333333
18,1984,0,True,0.37931,0.439036,95.7,25.3,3.3,0.084472,0.049447,0.049447,53.75,60.0,35.625,30.666667,47.0,45.230769,39.5,40.076923
19,1986,1,True,0.42069,0.473162,91.4,-4.6,2.2,0.037906,0.026141,0.026141,64.066667,64.0,24.933333,28.0,47.631579,47.4,41.473684,41.2
20,1988,0,True,0.409195,0.449127,93.0,2.6,5.4,0.050047,0.035426,0.035426,51.714286,56.0,36.714286,35.0,46.888889,42.6,37.333333,37.0


In [19]:
train_y = pd.read_csv("train_y.csv",index_col=0)
train_y.head()

Unnamed: 0,RepSeatShare,RepVoteShare
0,0.408776,0.464762
1,0.47806,0.502342
2,0.531178,0.486872
3,0.48037,0.475504
4,0.482679,0.487135


In [20]:
all_types_data = pickle.load(open('all_types_data.p', 'rb'))

In [21]:
all_infos = []
for key in all_types_data.keys():

    model_info = {}

    model_info["types"] = key

    model_data = pd.DataFrame(all_types_data[key])

    model = model_data.iloc[5]["model"]

    cols = []
    for k in key:
        cols.extend(factor_types[k])

    predictions = [
    {"seat_share": model.estimators_[i].predict(data_2022[cols])[0][0], 
    "vote_share": model.estimators_[i].predict(data_2022[cols])[0][1]} 
    for i in range(len(model.estimators_))
    ]

    model_info["seat_share"] = model.predict(data_2022[cols])[0][0]
    model_info["seat_amt"] = round(model.predict(data_2022[cols])[0][0]*435)

    model_info["seat_lwr"] = stats.norm.ppf(0.025, model.predict(data_2022[cols])[0][0], model_data.iloc[5]["Seat Share Residual Std"][0])
    model_info["seat_upr"] = stats.norm.ppf(0.975, model.predict(data_2022[cols])[0][0], model_data.iloc[5]["Seat Share Residual Std"][0])

    model_info["seat_lwr_rf"] = list(pd.DataFrame(predictions)["vote_share"].quantile([0.025,0.975]))[0]
    model_info["seat_upr_rf"] = list(pd.DataFrame(predictions)["vote_share"].quantile([0.025,0.975]))[1]

    model_info["vote_share"] = model.predict(data_2022[cols])[0][1]

    model_info["vote_lwr"] = stats.norm.ppf(0.025, model.predict(data_2022[cols])[0][0], model_data.iloc[5]["Vote Share Residual Std"][0])
    model_info["vote_upr"] = stats.norm.ppf(0.975, model.predict(data_2022[cols])[0][0], model_data.iloc[5]["Vote Share Residual Std"][0])

    model_info["vote_lwr_rf"] = list(pd.DataFrame(predictions)["vote_share"].quantile([0.025,0.975]))[0]
    model_info["vote_upr_rf"] = list(pd.DataFrame(predictions)["vote_share"].quantile([0.025,0.975]))[1]

    model_info["seat_std"] = model_data.iloc[5]["Seat Share Residual Std"][0]
    model_info["vote_std"] = model_data.iloc[5]["Vote Share Residual Std"][0]

    all_infos.append(model_info)



In [22]:
pd.DataFrame(all_infos)

Unnamed: 0,types,seat_share,seat_amt,seat_lwr,seat_upr,seat_lwr_rf,seat_upr_rf,vote_share,vote_lwr,vote_upr,vote_lwr_rf,vote_upr_rf,seat_std,vote_std
0,"(Incumbent Factors,)",0.527811,230,0.422042,0.63358,0.45717,0.535956,0.505696,0.47673,0.578892,0.45717,0.535956,0.053965,0.026062
1,"(Economic Factors,)",0.448445,195,0.347813,0.549076,0.439036,0.534744,0.473486,0.400752,0.496138,0.439036,0.534744,0.051343,0.024334
2,"(Polling Factors,)",0.535939,233,0.423004,0.648873,0.444394,0.535956,0.51416,0.485542,0.586336,0.444394,0.535956,0.057621,0.025713
3,"(Incumbent Factors, Economic Factors)",0.497445,216,0.40271,0.592179,0.439036,0.535956,0.491451,0.449529,0.54536,0.439036,0.535956,0.048335,0.024447
4,"(Incumbent Factors, Polling Factors)",0.530924,231,0.428185,0.633663,0.444394,0.535956,0.510943,0.481632,0.580216,0.444394,0.535956,0.052419,0.02515
5,"(Economic Factors, Polling Factors)",0.510458,222,0.404343,0.616573,0.439036,0.535956,0.500649,0.461319,0.559596,0.439036,0.535956,0.054141,0.025071
6,"(Incumbent Factors, Economic Factors, Polling ...",0.515673,224,0.4179,0.613446,0.444394,0.535956,0.503504,0.468056,0.56329,0.444394,0.535956,0.049885,0.024295


In [23]:
pd.DataFrame(all_infos)

Unnamed: 0,types,seat_share,seat_amt,seat_lwr,seat_upr,seat_lwr_rf,seat_upr_rf,vote_share,vote_lwr,vote_upr,vote_lwr_rf,vote_upr_rf,seat_std,vote_std
0,"(Incumbent Factors,)",0.527811,230,0.422042,0.63358,0.45717,0.535956,0.505696,0.47673,0.578892,0.45717,0.535956,0.053965,0.026062
1,"(Economic Factors,)",0.448445,195,0.347813,0.549076,0.439036,0.534744,0.473486,0.400752,0.496138,0.439036,0.534744,0.051343,0.024334
2,"(Polling Factors,)",0.535939,233,0.423004,0.648873,0.444394,0.535956,0.51416,0.485542,0.586336,0.444394,0.535956,0.057621,0.025713
3,"(Incumbent Factors, Economic Factors)",0.497445,216,0.40271,0.592179,0.439036,0.535956,0.491451,0.449529,0.54536,0.439036,0.535956,0.048335,0.024447
4,"(Incumbent Factors, Polling Factors)",0.530924,231,0.428185,0.633663,0.444394,0.535956,0.510943,0.481632,0.580216,0.444394,0.535956,0.052419,0.02515
5,"(Economic Factors, Polling Factors)",0.510458,222,0.404343,0.616573,0.439036,0.535956,0.500649,0.461319,0.559596,0.439036,0.535956,0.054141,0.025071
6,"(Incumbent Factors, Economic Factors, Polling ...",0.515673,224,0.4179,0.613446,0.444394,0.535956,0.503504,0.468056,0.56329,0.444394,0.535956,0.049885,0.024295


In [24]:
pd.DataFrame(all_infos)[["types","seat_std", "vote_std"]]

Unnamed: 0,types,seat_std,vote_std
0,"(Incumbent Factors,)",0.053965,0.026062
1,"(Economic Factors,)",0.051343,0.024334
2,"(Polling Factors,)",0.057621,0.025713
3,"(Incumbent Factors, Economic Factors)",0.048335,0.024447
4,"(Incumbent Factors, Polling Factors)",0.052419,0.02515
5,"(Economic Factors, Polling Factors)",0.054141,0.025071
6,"(Incumbent Factors, Economic Factors, Polling ...",0.049885,0.024295


In [25]:
all_types_data.keys()

dict_keys([('Incumbent Factors',), ('Economic Factors',), ('Polling Factors',), ('Incumbent Factors', 'Economic Factors'), ('Incumbent Factors', 'Polling Factors'), ('Economic Factors', 'Polling Factors'), ('Incumbent Factors', 'Economic Factors', 'Polling Factors')])

In [26]:
pd.DataFrame(all_types_data[('Incumbent Factors', 'Economic Factors', 'Polling Factors')]).iloc[5]

name                                            Random Forest Regressor (1000)
R^2                                                                   0.908466
Adjusted R^2                                                                 0
CV Scores                    [-20.182326716204507, -4.3650305274500925, 0.1...
residuals                    [[[0.03319874579644705, 0.005341899740252343]]...
Seat Share Residual Mean                               [0.0042306905499130566]
Seat Share Residual Std                                  [0.04988517731370993]
Vote Share Residual Mean                               [0.0014014067843851116]
Vote Share Residual Std                                 [0.024294980911360727]
2020 Predicted Seat Share                                             0.480163
2020 Predicted Vote Share                                             0.487929
2020 Seat Share Error                                                -0.009493
2020 Vote Share Error                               

In [27]:
from scipy import stats

In [28]:
stats.norm.()

SyntaxError: invalid syntax (<ipython-input-28-89743f6dfa3e>, line 1)