In [1]:
import statsmodels
import scipy as sc
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from statsmodels.graphics.regressionplots import plot_leverage_resid2
import matplotlib.pyplot as plt

In [40]:
data = pd.read_csv('botswana.tsv', sep='\t')
data.head()

Unnamed: 0,ceb,age,educ,religion,idlnchld,knowmeth,usemeth,evermarr,agefm,heduc,urban,electric,radio,tv,bicycle
0,0,18,10,catholic,4.0,1.0,1.0,0,,,1,1.0,1.0,1.0,1.0
1,2,43,11,protestant,2.0,1.0,1.0,1,20.0,14.0,1,1.0,1.0,1.0,1.0
2,0,49,4,spirit,4.0,1.0,0.0,1,22.0,1.0,1,1.0,1.0,0.0,0.0
3,0,24,12,other,2.0,1.0,0.0,0,,,1,1.0,1.0,1.0,1.0
4,3,32,13,other,3.0,1.0,1.0,1,24.0,12.0,1,1.0,1.0,1.0,1.0


In [41]:
set(data['religion'].values)

{'catholic', 'other', 'protestant', 'spirit'}

In [42]:
data.dropna().shape

(1834, 15)

In [43]:
data['nevermarr'] = (data['agefm'].fillna(1) == 1).astype('int')

In [44]:
data = data.drop('evermarr', axis=1)

In [45]:
data['agefm'].fillna(0, inplace=True)

In [46]:
data['heduc'] = data[data['nevermarr'] == 1]['heduc'].fillna(1)

In [47]:
data['heduc'].shape[0] - data['heduc'].dropna().shape[0]

2079

In [49]:
def dropna(newcol,oldcol,cons):
    data[newcol]=0
    ind=data[data[oldcol].isnull()].index
    data.loc[ind,newcol]=1
    data[oldcol]=data[oldcol].fillna(cons)

dropna('idlnchld_noans','idlnchld',-1)
dropna('heduc_noans','heduc',-2)
dropna('usemeth_noans','usemeth',-1)

In [51]:
data.dropna(inplace=True)
data.shape[0]*data.shape[1]

78264

In [53]:
data.columns

Index(['ceb', 'age', 'educ', 'religion', 'idlnchld', 'knowmeth', 'usemeth',
       'agefm', 'heduc', 'urban', 'electric', 'radio', 'tv', 'bicycle',
       'nevermarr', 'idlnchld_noans', 'heduc_noans', 'usemeth_noans'],
      dtype='object')

In [56]:
m1 = smf.ols('ceb~age+educ+religion+idlnchld+knowmeth+usemeth+agefm+heduc+urban+electric+radio+tv+bicycle+nevermarr+idlnchld_noans+heduc_noans+usemeth_noans',
            data=data)
fitted = m1.fit()
print(fitted.summary())

                            OLS Regression Results                            
Dep. Variable:                    ceb   R-squared:                       0.639
Model:                            OLS   Adj. R-squared:                  0.638
Method:                 Least Squares   F-statistic:                     451.2
Date:                Sun, 23 Aug 2020   Prob (F-statistic):               0.00
Time:                        17:19:50   Log-Likelihood:                -7762.6
No. Observations:                4348   AIC:                         1.556e+04
Df Residuals:                    4330   BIC:                         1.568e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -1

In [59]:
sms.het_breuschpagan(fitted.resid, fitted.model.exog)[1]

1.9982296788640461e-227

In [60]:
m2= smf.ols('ceb~age+educ+idlnchld+knowmeth+usemeth+agefm+heduc+urban+electric+bicycle+nevermarr+idlnchld_noans+heduc_noans+usemeth_noans',data=data)
fitted2 = m2.fit()
print ('Breusch-Pagan test: p=%f' % sms.het_breuschpagan(fitted2.resid, fitted2.model.exog)[1])
print (fitted2.summary())

Breusch-Pagan test: p=0.000000
                            OLS Regression Results                            
Dep. Variable:                    ceb   R-squared:                       0.638
Model:                            OLS   Adj. R-squared:                  0.637
Method:                 Least Squares   F-statistic:                     637.9
Date:                Sun, 23 Aug 2020   Prob (F-statistic):               0.00
Time:                        17:23:39   Log-Likelihood:                -7767.1
No. Observations:                4348   AIC:                         1.556e+04
Df Residuals:                    4335   BIC:                         1.564e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept    

In [61]:
m2= smf.ols('ceb~age+educ+idlnchld+knowmeth+usemeth+agefm+heduc+urban+electric+bicycle+nevermarr+idlnchld_noans+heduc_noans+usemeth_noans',data=data)
fitted2 = m2.fit(cov_type='HC1')
m1.fit().compare_f_test(m2.fit())

(1.8292908617227528, 0.1036097169930321, 5.0)

In [62]:
m3= smf.ols('ceb~age+educ+idlnchld+knowmeth+agefm+heduc+urban+electric+bicycle+nevermarr+idlnchld_noans+heduc_noans',data=data)
fitted3 = m3.fit(cov_type='HC1')
print (fitted.summary())

                            OLS Regression Results                            
Dep. Variable:                    ceb   R-squared:                       0.639
Model:                            OLS   Adj. R-squared:                  0.638
Method:                 Least Squares   F-statistic:                     451.2
Date:                Sun, 23 Aug 2020   Prob (F-statistic):               0.00
Time:                        17:25:54   Log-Likelihood:                -7762.6
No. Observations:                4348   AIC:                         1.556e+04
Df Residuals:                    4330   BIC:                         1.568e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -1

In [63]:
m2.fit().compare_f_test(m3.fit())

(85.80023138395171, 2.8579012404914337e-37, 2.0)