In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Markdown as md
# Seed NumPy's global random generator so any random draws are repeatable.
np.random.seed(42)
# Apply seaborn's default plot styling to subsequent matplotlib figures.
sns.set()

In [2]:
# Read the Stata (.dta) file from the local ARM_Data folder into a DataFrame.
raw_df = pd.read_stata("./ARM_Data/nes/nes5200_processed_voters_realideo.dta")

In [3]:
# Build the modelling frame: the predictors, the survey year and a 0/1 `vote` indicator.
# `vote` is 1 for the answer "2. yes, voted" and 0 for any other recorded answer.
# A plain `==` comparison evaluates to False for missing answers, which would code
# respondents with no recorded answer as 0. The `.where(...)` keeps those rows as NaN
# so the formula API drops them when fitting, as it does for the other columns.
model_df = raw_df[['female', 'black', 'income', 'vote', 'year']].assign(
    vote=lambda x: (x.vote == "2. yes, voted").astype('int').where(x.vote.notna())
)
# Summary statistics for every column (numeric and categorical alike).
model_df.describe(include='all')

Unnamed: 0,female,black,income,vote,year
count,41395.0,41185.0,37020,41498.0,41498.0
unique,,,5,,
top,,,3. 34 to 67 percentile,,
freq,,,12034,,
mean,0.552386,0.105597,,0.658321,1977.682983
std,0.497254,0.307324,,0.474278,14.71768
min,0.0,0.0,,0.0,1948.0
25%,0.0,0.0,,0.0,1966.0
50%,1.0,0.0,,1.0,1978.0
75%,1.0,0.0,,1.0,1990.0


In [4]:
# Logit of `vote` on female, black and income, fitted on the year == 1960 rows only.
mdl60 = smf.logit(
    formula="vote ~ female + black + income",
    data=model_df,
    subset=model_df["year"] == 1960,
).fit()
mdl60.summary()

Optimization terminated successfully.
         Current function value: 0.372509
         Iterations 6


0,1,2,3
Dep. Variable:,vote,No. Observations:,1028.0
Model:,Logit,Df Residuals:,1021.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 24 Mar 2020",Pseudo R-squ.:,0.05106
Time:,17:00:12,Log-Likelihood:,-382.94
converged:,True,LL-Null:,-403.54
Covariance Type:,nonrobust,LLR p-value:,2.631e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.5655,0.254,6.167,0.000,1.068,2.063
income[T.2. 17 to 33 percentile],0.1601,0.290,0.553,0.581,-0.408,0.728
income[T.3. 34 to 67 percentile],0.5306,0.273,1.946,0.052,-0.004,1.065
income[T.4. 68 to 95 percentile],1.1538,0.289,3.990,0.000,0.587,1.721
income[T.5. 96 to 100 percentile],0.9289,0.518,1.795,0.073,-0.086,1.943
female,-0.3314,0.192,-1.723,0.085,-0.708,0.046
black,-0.9596,0.299,-3.212,0.001,-1.545,-0.374


In [5]:
# Logit of `vote` on female, black and income, fitted on the year == 1964 rows only.
mdl64 = smf.logit(
    formula="vote ~ female + black + income",
    data=model_df,
    subset=model_df["year"] == 1964,
).fit()
mdl64.summary()

Optimization terminated successfully.
         Current function value: 0.455748
         Iterations 6


0,1,2,3
Dep. Variable:,vote,No. Observations:,1301.0
Model:,Logit,Df Residuals:,1294.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 24 Mar 2020",Pseudo R-squ.:,0.01294
Time:,17:00:12,Log-Likelihood:,-592.93
converged:,True,LL-Null:,-600.7
Covariance Type:,nonrobust,LLR p-value:,0.01638

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.1096,0.186,5.969,0.000,0.745,1.474
income[T.2. 17 to 33 percentile],0.2850,0.229,1.243,0.214,-0.165,0.735
income[T.3. 34 to 67 percentile],0.4172,0.213,1.961,0.050,0.000,0.834
income[T.4. 68 to 95 percentile],0.7318,0.217,3.372,0.001,0.307,1.157
income[T.5. 96 to 100 percentile],0.6594,0.330,2.001,0.045,0.013,1.305
female,0.1179,0.149,0.790,0.429,-0.175,0.410
black,-0.2923,0.237,-1.231,0.218,-0.758,0.173


In [6]:
# Logit of `vote` on female, black and income, fitted on the year == 1968 rows only.
mdl68 = smf.logit(
    formula="vote ~ female + black + income",
    data=model_df,
    subset=model_df["year"] == 1968,
).fit()
mdl68.summary()

Optimization terminated successfully.
         Current function value: 0.487251
         Iterations 6


0,1,2,3
Dep. Variable:,vote,No. Observations:,1127.0
Model:,Logit,Df Residuals:,1120.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 24 Mar 2020",Pseudo R-squ.:,0.02532
Time:,17:00:12,Log-Likelihood:,-549.13
converged:,True,LL-Null:,-563.4
Covariance Type:,nonrobust,LLR p-value:,7.457e-05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.9443,0.213,4.444,0.000,0.528,1.361
income[T.2. 17 to 33 percentile],0.0123,0.233,0.053,0.958,-0.445,0.469
income[T.3. 34 to 67 percentile],0.5862,0.228,2.568,0.010,0.139,1.033
income[T.4. 68 to 95 percentile],0.8999,0.244,3.682,0.000,0.421,1.379
income[T.5. 96 to 100 percentile],0.5312,0.370,1.437,0.151,-0.193,1.256
female,0.0957,0.154,0.621,0.535,-0.206,0.398
black,-0.4051,0.227,-1.785,0.074,-0.850,0.040


In [7]:
# Logit of `vote` on female, black and income, fitted on the year == 1972 rows only.
mdl72 = smf.logit(
    formula="vote ~ female + black + income",
    data=model_df,
    subset=model_df["year"] == 1972,
).fit()
mdl72.summary()

Optimization terminated successfully.
         Current function value: 0.579994
         Iterations 5


0,1,2,3
Dep. Variable:,vote,No. Observations:,2180.0
Model:,Logit,Df Residuals:,2173.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 24 Mar 2020",Pseudo R-squ.:,0.01722
Time:,17:00:12,Log-Likelihood:,-1264.4
converged:,True,LL-Null:,-1286.5
Covariance Type:,nonrobust,LLR p-value:,6.391e-08

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.7226,0.130,5.558,0.000,0.468,0.977
income[T.2. 17 to 33 percentile],0.0183,0.173,0.106,0.916,-0.321,0.357
income[T.3. 34 to 67 percentile],0.3373,0.135,2.493,0.013,0.072,0.603
income[T.4. 68 to 95 percentile],0.6521,0.147,4.444,0.000,0.364,0.940
income[T.5. 96 to 100 percentile],0.6550,0.244,2.683,0.007,0.177,1.133
female,-0.0941,0.098,-0.957,0.339,-0.287,0.099
black,-0.4365,0.151,-2.890,0.004,-0.732,-0.140


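The issue described in the book cannot be reproduced in the same way on the given dataset. My guess is that this is due to the different nature of the income variable: here it is a categorical variable with five levels, which enters the models as four dummy coefficients.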