In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Markdown as md
# Seed NumPy's global random generator so that any random draws made later in
# the notebook are reproducible across "Restart & Run All".
np.random.seed(42)

In [2]:
# Let pandas render up to 999 rows before it truncates a displayed frame.
pd.set_option("display.max_rows", 999)

In [3]:
# Read the Stata (.dta) file into a DataFrame. The path is relative, so the
# notebook must be run from the directory that contains ARM_Data/.
raw_df = pd.read_stata("./ARM_Data/nes/nes5200_processed_voters_realideo.dta")

In [4]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each variable occupies one row.
raw_df.describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
year,41498,,,,1977.68,14.7177,1948.0,1966.0,1978.0,1990.0,2002.0
resid,41498,,,,1325.85,1055.72,1.0,511.0,1125.0,1806.0,6009.0
weight1,41498,,,,1.0565,0.375776,0.2417,1.0,1.0,1.0,4.0
weight2,41498,,,,1.04031,0.39871,0.0,1.0,1.0,1.0,4.0
weight3,41498,,,,1.03988,0.399297,0.0,1.0,1.0,1.0,4.0
age,39532,83.0,35,951.0,,,,,,,
gender,41395,2.0,2. female,22866.0,,,,,,,
race,41185,6.0,1. white,34563.0,,,,,,,
educ1,41059,4.0,"2. high school (12 grades or fewer, incl",19021.0,,,,,,,
urban,38794,3.0,"3. rural, small towns, outlying and adja",14328.0,,,,,,,


In [5]:
# Keep only respondents with a recorded `vote` value, and only the columns
# that the cells below work with.
clean_df = raw_df.loc[
    raw_df["vote"].notna(),
    ["gender", "race", "educ1", "partyid7", "ideo_feel", "vote"],
]

In [6]:
# Preview the filtered frame; pandas truncates the middle rows of a frame this long.
clean_df

Unnamed: 0,gender,race,educ1,partyid7,ideo_feel,vote
0,1. male,1. white,1. grade school of less (0-8 grades),,,"2. yes, voted"
1,2. female,1. white,"2. high school (12 grades or fewer, incl",,,"2. yes, voted"
2,2. female,1. white,"2. high school (12 grades or fewer, incl",,,"2. yes, voted"
3,2. female,1. white,"3. some college(13 grades or more,but no",,,"2. yes, voted"
4,1. male,1. white,"3. some college(13 grades or more,but no",,,"2. yes, voted"
...,...,...,...,...,...,...
41491,2. female,1. white,4. college or advanced degree (no cases,6. weak republican,49. neutral,"2. yes, voted"
41493,1. male,1. white,4. college or advanced degree (no cases,2. weak democrat,29,"2. yes, voted"
41494,2. female,2. black,"3. some college(13 grades or more,but no",1. strong democrat,,"2. yes, voted"
41495,2. female,1. white,"2. high school (12 grades or fewer, incl",5. independent-republican,74,"2. yes, voted"


In [7]:
# Per-column summary (count / unique / top / freq) of the selected columns.
clean_df.describe()

Unnamed: 0,gender,race,educ1,partyid7,ideo_feel,vote
count,37986,37806,37697,36817,25635,38088
unique,2,6,4,7,91,2
top,2. female,1. white,"2. high school (12 grades or fewer, incl",2. weak democrat,49. neutral,"2. yes, voted"
freq,20999,31840,17447,8083,7714,27319


`ideo_feel` has too many unique values; some transformation is needed.

In [8]:
# Frequency of each distinct `ideo_feel` answer, most common first.
clean_df["ideo_feel"].value_counts()

49. neutral              7714
54.0                     2446
44.0                     1813
59.0                     1799
39.0                     1189
64.0                     1045
34.0                      686
71.0                      589
69.0                      581
66.0                      552
61.0                      539
76.0                      534
56.0                      519
84.0                      428
41.0                      392
36.0                      367
26.0                      322
31.0                      316
97. most conservative     310
74.0                      283
29.0                      279
21.0                      267
91.0                      233
72.0                      171
79.0                      161
14.0                      158
24.0                      126
51.0                      124
46.0                      123
25.0                      117
90.0                      108
82.0                      107
77.0                      102
00. most l

In [9]:
# Build the modelling frame: a standardised ideology score and a 0/1 turnout flag.
clean_df = clean_df.assign(
    # `ideo_feel` labels look like "49. neutral" or "54.0": keep the integer
    # code in front of the first "." and treat missing answers as the
    # neutral code 49.
    ideo_c=lambda x: x.ideo_feel.str.split('.').str[0].fillna(49).astype('int'),
    # Centre on the mean and divide by TWO standard deviations.
    # The previous expression `(mean - x)/2*std` parsed as `((mean - x)/2) * std`,
    # i.e. it multiplied by the standard deviation instead of dividing by it,
    # and it reversed the sign of the scale. With `x - mean`, larger values
    # keep the direction of the raw codes (97 = "most conservative").
    ideo_q=lambda x: (x.ideo_c - x.ideo_c.mean()) / (2 * x.ideo_c.std()),
    # 1 when the `vote` label contains "yes", 0 otherwise.
    voted=lambda x: x.vote.str.contains("yes").astype('int')
)[['gender', 'race', 'educ1', 'partyid7', 'ideo_q', 'voted']]

In [10]:
# Full logistic regression: turnout (`voted`) on all candidate predictors,
# including the ideology score `ideo_q`.
mdl_full = smf.logit(
    formula="voted ~ gender + race + educ1 + partyid7 + ideo_q",
    data=clean_df,
).fit()

Optimization terminated successfully.
         Current function value: 0.562363
         Iterations 6


In [11]:
# Show the fit statistics and coefficient table of the full model.
mdl_full.summary()

0,1,2,3
Dep. Variable:,voted,No. Observations:,36397.0
Model:,Logit,Df Residuals:,36380.0
Method:,MLE,Df Model:,16.0
Date:,"Thu, 19 Mar 2020",Pseudo R-squ.:,0.04413
Time:,23:21:32,Log-Likelihood:,-20468.0
converged:,True,LL-Null:,-21413.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.1781,0.041,28.857,0.000,1.098,1.258
gender[T.2. female],-0.1069,0.025,-4.359,0.000,-0.155,-0.059
race[T.2. black],-0.4823,0.039,-12.475,0.000,-0.558,-0.406
race[T.3. asian],-0.5257,0.135,-3.882,0.000,-0.791,-0.260
race[T.4. native american],-0.5401,0.089,-6.037,0.000,-0.715,-0.365
race[T.5. hispanic],-0.6064,0.066,-9.238,0.000,-0.735,-0.478
race[T.7. other],-0.4649,0.208,-2.236,0.025,-0.872,-0.057
"educ1[T.2. high school (12 grades or fewer, incl]",0.1962,0.034,5.719,0.000,0.129,0.263
"educ1[T.3. some college(13 grades or more,but no]",0.5334,0.041,13.044,0.000,0.453,0.614


In [12]:
# Reduced model: the same predictors as the full model, without `ideo_q`.
mdl_cut = smf.logit(
    formula="voted ~ educ1 + race + gender + partyid7",
    data=clean_df,
).fit()
mdl_cut.summary()

Optimization terminated successfully.
         Current function value: 0.562365
         Iterations 6


0,1,2,3
Dep. Variable:,voted,No. Observations:,36397.0
Model:,Logit,Df Residuals:,36381.0
Method:,MLE,Df Model:,15.0
Date:,"Thu, 19 Mar 2020",Pseudo R-squ.:,0.04412
Time:,23:21:32,Log-Likelihood:,-20468.0
converged:,True,LL-Null:,-21413.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.1777,0.041,28.858,0.000,1.098,1.258
"educ1[T.2. high school (12 grades or fewer, incl]",0.1963,0.034,5.721,0.000,0.129,0.264
"educ1[T.3. some college(13 grades or more,but no]",0.5334,0.041,13.045,0.000,0.453,0.614
educ1[T.4. college or advanced degree (no cases],0.9685,0.045,21.508,0.000,0.880,1.057
race[T.2. black],-0.4827,0.039,-12.492,0.000,-0.558,-0.407
race[T.3. asian],-0.5255,0.135,-3.881,0.000,-0.791,-0.260
race[T.4. native american],-0.5399,0.089,-6.036,0.000,-0.715,-0.365
race[T.5. hispanic],-0.6064,0.066,-9.239,0.000,-0.735,-0.478
race[T.7. other],-0.4650,0.208,-2.237,0.025,-0.872,-0.058


In [13]:
# Interaction model: `race*gender` expands to race + gender + race:gender.
mdl_int = smf.logit(
    formula="voted ~ educ1 + race*gender + partyid7",
    data=clean_df,
).fit()
mdl_int.summary()

Optimization terminated successfully.
         Current function value: 0.562247
         Iterations 6


0,1,2,3
Dep. Variable:,voted,No. Observations:,36397.0
Model:,Logit,Df Residuals:,36376.0
Method:,MLE,Df Model:,20.0
Date:,"Thu, 19 Mar 2020",Pseudo R-squ.:,0.04432
Time:,23:21:32,Log-Likelihood:,-20464.0
converged:,True,LL-Null:,-21413.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.1914,0.041,28.799,0.000,1.110,1.272
"educ1[T.2. high school (12 grades or fewer, incl]",0.1974,0.034,5.752,0.000,0.130,0.265
"educ1[T.3. some college(13 grades or more,but no]",0.5343,0.041,13.062,0.000,0.454,0.614
educ1[T.4. college or advanced degree (no cases],0.9692,0.045,21.510,0.000,0.881,1.058
race[T.2. black],-0.5380,0.061,-8.874,0.000,-0.657,-0.419
race[T.3. asian],-0.7618,0.188,-4.042,0.000,-1.131,-0.392
race[T.4. native american],-0.5068,0.140,-3.616,0.000,-0.782,-0.232
race[T.5. hispanic],-0.7610,0.098,-7.733,0.000,-0.954,-0.568
race[T.7. other],-0.5870,0.296,-1.985,0.047,-1.167,-0.007


In [14]:
# Interaction-only variant: the formula lists `race:gender` without separate
# `race` and `gender` terms.
mdl_int_cut = smf.logit(
    formula="voted ~ educ1 + race:gender + partyid7",
    data=clean_df,
).fit()
mdl_int_cut.summary()

Optimization terminated successfully.
         Current function value: 0.562247
         Iterations 6


0,1,2,3
Dep. Variable:,voted,No. Observations:,36397.0
Model:,Logit,Df Residuals:,36376.0
Method:,MLE,Df Model:,20.0
Date:,"Thu, 19 Mar 2020",Pseudo R-squ.:,0.04432
Time:,23:21:32,Log-Likelihood:,-20464.0
converged:,True,LL-Null:,-21413.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.1914,0.041,28.799,0.000,1.110,1.272
"educ1[T.2. high school (12 grades or fewer, incl]",0.1974,0.034,5.752,0.000,0.130,0.265
"educ1[T.3. some college(13 grades or more,but no]",0.5343,0.041,13.062,0.000,0.454,0.614
educ1[T.4. college or advanced degree (no cases],0.9692,0.045,21.510,0.000,0.881,1.058
partyid7[T.2. weak democrat],-0.6002,0.038,-15.864,0.000,-0.674,-0.526
partyid7[T.3. independent-democrat],-0.6924,0.045,-15.221,0.000,-0.782,-0.603
partyid7[T.4. independent-independent],-0.9834,0.046,-21.543,0.000,-1.073,-0.894
partyid7[T.5. independent-republican],-0.4707,0.049,-9.542,0.000,-0.567,-0.374
partyid7[T.6. weak republican],-0.4402,0.044,-10.117,0.000,-0.525,-0.355


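* We dropped the ideology variable (`ideo_q`) because it was shown to be insignificant: removing it leaves the fit essentially unchanged (pseudo R-squared 0.04413 with it vs. 0.04412 without).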
* Of all the interactions between race and gender, only the `hispanic` and `female` combination was an important factor when the separate `race` and `gender` terms were also present. Removing the separate terms and keeping only the combinations (`race:gender`) showed that some of the other combinations became significant, yet it did not improve the model's performance: the log-likelihood (-20464) and pseudo R-squared (0.04432) are identical to those of the `race*gender` model.