# 15. Instrumental Variables Estimation and Two Stage Least Squares

## 15.1. Instrumental Variables in Simple Regression Models

### Example 15.1: Return to Education for Married Women

In [1]:
import linearmodels.iv as iv
import wooldridge as woo
import numpy as np
import pandas as pd
import linearmodels.iv as iv
import statsmodels.formula.api as smf
import scipy.stats as stats
from supplementaryFunctions import *

# Load the MROZ data set and drop rows where lwage is missing.
# The result is an independent frame, so later cells can add columns to it.
mroz = woo.dataWoo("mroz").dropna(subset=["lwage"]).copy()
mroz.head()

Unnamed: 0,inlf,hours,kidslt6,kidsge6,age,educ,wage,repwage,hushrs,husage,...,faminc,mtr,motheduc,fatheduc,unem,city,exper,nwifeinc,lwage,expersq
0,1,1610,1,0,32,12,3.354,2.65,2708,34,...,16310.0,0.7215,12,7,5.0,0,14,10.91006,1.210154,196
1,1,1656,0,2,30,12,1.3889,2.65,2310,30,...,21800.0,0.6615,7,7,11.0,1,5,19.499981,0.328512,25
2,1,1980,1,3,35,12,4.5455,4.04,3072,40,...,21040.0,0.6915,12,7,5.0,0,15,12.03991,1.514138,225
3,1,456,0,3,34,12,1.0965,3.25,1920,53,...,7300.0,0.7815,7,7,5.0,0,6,6.799996,0.092123,36
4,1,1568,1,2,31,14,4.5918,3.6,2000,32,...,27300.0,0.6215,12,14,9.5,1,7,20.100058,1.524272,49


In [2]:
# Covariance matrix across all columns of mroz; reused below for the
# hand-computed OLS and IV slopes.
cov_mat = mroz.cov()
# Preview the first rows, rounded to two decimals for readability.
cov_mat.head().round(2)

Unnamed: 0,inlf,hours,kidslt6,kidsge6,age,educ,wage,repwage,hushrs,husage,...,faminc,mtr,motheduc,fatheduc,unem,city,exper,nwifeinc,lwage,expersq
inlf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hours,0.0,602601.92,-51.17,-179.92,328.84,-115.1,-250.87,508.7,-7345.69,282.96,...,1365702.32,-10.84,-47.55,-159.55,-186.38,-8.57,1871.27,-669.71,-9.34,54381.73
kidslt6,0.0,-51.17,0.15,0.05,-1.02,0.12,0.04,-0.03,-4.33,-1.1,...,-329.14,0.0,0.08,0.09,0.02,0.0,-0.59,-0.11,-0.01,-18.05
kidsge6,0.0,-179.92,0.05,1.73,-4.04,-0.28,-0.35,-0.43,88.42,-3.71,...,-747.92,0.02,0.2,-0.22,-0.07,-0.06,-4.11,0.46,-0.11,-129.98
age,0.0,328.84,-1.02,-4.04,59.62,-0.92,0.78,0.08,-546.89,54.9,...,10266.16,-0.07,-5.74,-2.98,2.17,0.37,30.08,7.93,0.31,1056.94


In [3]:
# Variable roles: x is the regressor, y the outcome, z the instrument.
x_name = "educ"
y_name = "lwage"
z_name = "fatheduc"

# Pick the individual (co)variances out of the covariance matrix
# (row label first, column label second).
cov_yz = cov_mat.loc[z_name, y_name]
cov_xy = cov_mat.loc[y_name, x_name]
cov_xz = cov_mat.loc[z_name, x_name]
var_x = cov_mat.loc[x_name, x_name]

# Sample means of the regressor and the outcome.
x_bar, y_bar = mroz[[x_name, y_name]].mean()

In [4]:
# OLS slope: cov(x, y) / var(x).
# IV slope:  cov(z, y) / cov(z, x), with z as the instrument for x.
beta_xy, beta_iv = cov_xy / var_x, cov_yz / cov_xz
beta_xy, beta_iv

(0.10864865517467549, 0.05917347999936601)

In [5]:
# OLS benchmark: log wage regressed on education.
formula = "np.log(wage) ~ educ"
reg_ols = smf.ols(formula, data=mroz)
results_ols = reg_ols.fit()
OLS_summary(results_ols)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.1179$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,-0.1852,-0.9998,0.318,0.1852
educ,0.1086,7.5451,0.0,0.0144


In [6]:
# IV estimate of the same equation: the bracketed term names educ as the
# endogenous regressor and fatheduc as its instrument.
formula = "np.log(wage) ~ 1 + [educ ~ fatheduc]"
reg_iv = iv.IV2SLS.from_formula(formula, mroz)
results_iv = reg_iv.fit(cov_type="unadjusted", debiased=True)
LM_summary(results_iv)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.0934$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.4411,0.9888,0.3233,0.4461
educ,0.0592,1.6839,0.0929,0.0351


### Example 15.4: Using College Proximity as an IV for Education

In [7]:
card = woo.dataWoo("card")

# Reduced form for education: educ regressed on the instrument nearc4
# together with the control variables.
formula = (
    "educ ~ nearc4 + exper + I(exper**2) + black + smsa + "
    "south + smsa66 + reg662 + reg663 + reg664 + reg665 + reg666 + "
    "reg667 + reg668 + reg669"
)
reg_redf = smf.ols(formula, data=card)
results_redf = reg_redf.fit()
OLS_summary(results_redf)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.4771$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,16.6383,69.1446,0.0,0.2406
nearc4,0.3199,3.6408,0.0003,0.0879
exper,-0.4125,-12.2415,0.0,0.0337
I(exper ** 2),0.0009,0.5263,0.5987,0.0017
black,-0.9355,-9.9806,0.0,0.0937
smsa,0.4022,3.8372,0.0001,0.1048
south,-0.0516,-0.3811,0.7032,0.1354
smsa66,0.0255,0.2409,0.8096,0.1058
reg662,-0.0786,-0.4203,0.6743,0.1871
reg663,-0.0279,-0.1524,0.8789,0.1834


In [8]:
# OLS wage equation: educ is treated as an ordinary regressor here.
formula = (
    "np.log(wage) ~ educ + exper + I(exper**2) + black + smsa + "
    "south + smsa66 + reg662 + reg663 + reg664 + reg665 + reg666 + "
    "reg667 + reg668 + reg669"
)
reg_ols = smf.ols(formula, data=card)
results_ols = reg_ols.fit()
OLS_summary(results_ols)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.2998$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,4.6208,62.2476,0.0,0.0742
educ,0.0747,21.351,0.0,0.0035
exper,0.0848,12.8063,0.0,0.0066
I(exper ** 2),-0.0023,-7.2232,0.0,0.0003
black,-0.199,-10.9058,0.0,0.0182
smsa,0.1364,6.7851,0.0,0.0201
south,-0.148,-5.695,0.0,0.026
smsa66,0.0262,1.3493,0.1773,0.0194
reg662,0.0964,2.6845,0.0073,0.0359
reg663,0.1445,4.1151,0.0,0.0351


In [9]:
# IV wage equation: same controls as the OLS fit, but educ is now
# instrumented by nearc4 (the bracketed term).
formula = (
    "np.log(wage) ~ 1 + exper + I(exper**2) + black + smsa + "
    "south + smsa66 + reg662 + reg663 + reg664 + reg665 + reg666 + "
    "reg667 + reg668 + reg669 + [educ ~ nearc4]"
)
reg_iv = iv.IV2SLS.from_formula(formula, card)
results_iv = reg_iv.fit(cov_type="unadjusted", debiased=True)
LM_summary(results_iv)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.2382$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,3.6662,3.9641,0.0001,0.9248
I(exper**2),-0.0023,-7.0014,0.0,0.0003
black,-0.1468,-2.7231,0.0065,0.0539
exper,0.1083,4.5764,0.0,0.0237
reg662,0.1008,2.6739,0.0075,0.0377
reg663,0.1483,4.0272,0.0001,0.0368
reg664,0.0499,1.1408,0.2541,0.0437
reg665,0.1463,3.1079,0.0019,0.0471
reg666,0.1629,3.1382,0.0017,0.0519
reg667,0.1346,2.724,0.0065,0.0494


## 15.3 Two Stage Least Squares

### Example 15.5: Return to Education for Married Women

In [10]:
# First stage (reduced form): educ on the exogenous regressors
# (exper, exper^2) and the two instruments (motheduc, fatheduc).
formula = "educ ~ exper + I(exper**2) + motheduc + fatheduc"
reg_redf = smf.ols(formula, data=mroz)
results_redf = reg_redf.fit()
OLS_summary(results_redf)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.2115$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,9.1026,21.3396,0.0,0.4266
exper,0.0452,1.1236,0.2618,0.0403
I(exper ** 2),-0.001,-0.8386,0.4022,0.0012
motheduc,0.1576,4.3906,0.0,0.0359
fatheduc,0.1895,5.6152,0.0,0.0338


In [11]:
# Manual second stage: replace educ by its first-stage fitted values and
# run plain OLS. The coefficients match the IV2SLS fit of the same model
# shown later in the notebook, but the standard errors reported here differ.
mroz["educ_fitted"] = results_redf.fittedvalues
formula = "np.log(wage) ~ educ_fitted + exper + I(exper**2)"
reg_secstg = smf.ols(formula, data=mroz)
results_secstg = reg_secstg.fit()
OLS_summary(results_secstg)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.0498$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.0481,0.1146,0.9088,0.4198
educ_fitted,0.0614,1.8626,0.0632,0.033
exper,0.0442,3.1361,0.0018,0.0141
I(exper ** 2),-0.0009,-2.1344,0.0334,0.0004


In [12]:

# 2SLS in one step: educ is instrumented by motheduc and fatheduc, and
# linearmodels runs both stages internally.
formula = "np.log(wage) ~ 1 + exper + I(exper**2) + [educ ~ motheduc + fatheduc]"
reg_iv = iv.IV2SLS.from_formula(formula, mroz)
results_iv = reg_iv.fit(cov_type="unadjusted", debiased=True)
LM_summary(results_iv)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.1357$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.0481,0.1202,0.9044,0.4003
I(exper**2),-0.0009,-2.238,0.0257,0.0004
exper,0.0442,3.2883,0.0011,0.0134
educ,0.0614,1.953,0.0515,0.0314


## 15.4 Testing for Exogeneity of the Regressors

### Example 15.7: Return to Education for Married Women

In [13]:
# Exogeneity check for educ: take the residual from the first-stage
# regression of educ and add it as an extra regressor to the wage equation.
# The t statistic on `resid` is the quantity of interest.

# First stage (reduced form): educ on exogenous regressors and instruments.
formula = "educ ~ exper + I(exper**2) + motheduc + fatheduc"
reg_redf = smf.ols(formula, data=mroz)
results_redf = reg_redf.fit()
mroz["resid"] = results_redf.resid

# Wage equation augmented with the first-stage residual.
formula = "np.log(wage) ~ resid + educ + exper + I(exper**2)"
reg_secstg = smf.ols(formula, data=mroz)
results_secstg = reg_secstg.fit()
OLS_summary(results_secstg)


Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.1624$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.0481,0.1219,0.903,0.3946
resid,0.0582,1.6711,0.0954,0.0348
educ,0.0614,1.9815,0.0482,0.031
exper,0.0442,3.3363,0.0009,0.0132
I(exper ** 2),-0.0009,-2.2706,0.0237,0.0004


## 15.5 Testing Overidentifying Restrictions

1. Estimate the model by 2SLS and obtain residuals $\hat{u}$

2. Regress $\hat{u}$ on all exogenous variables and calculate $R_1^2$

3. The test statistic $nR_1^2$ is asymptotically distributed as $\chi_q^2$, where $q$ is the number of overidentifying restrictions, i.e., number of instruments minus number of endogenous regressors

### Example 15.8: Return to Education for Married Women

In [14]:
# 2SLS fit whose residuals feed the overidentification test below:
# educ is instrumented by motheduc and fatheduc.
formula = "np.log(wage) ~ 1 + exper + I(exper**2) + [educ ~ motheduc + fatheduc]"
reg_iv = iv.IV2SLS.from_formula(formula, mroz)
results_iv = reg_iv.fit(cov_type="unadjusted", debiased=True)
LM_summary(results_iv)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.1357$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.0481,0.1202,0.9044,0.4003
I(exper**2),-0.0009,-2.238,0.0257,0.0004
exper,0.0442,3.2883,0.0011,0.0134
educ,0.0614,1.953,0.0515,0.0314


In [17]:
# Auxiliary regression for the overidentification test: store the 2SLS
# residuals, then regress them on the exogenous regressors and both
# instruments.
mroz["resid_iv"] = results_iv.resids
reg_aux = smf.ols(
    "resid_iv ~ exper + I(exper**2) + motheduc + fatheduc",
    data=mroz,
)
results_aux = reg_aux.fit()
OLS_summary(results_aux)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.0009$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.011,0.0776,0.9382,0.1413
exper,-0.0,-0.0014,0.9989,0.0133
I(exper ** 2),0.0,0.0018,0.9985,0.0004
motheduc,-0.0066,-0.5558,0.5786,0.0119
fatheduc,0.0058,0.5173,0.6052,0.0112


#### The LM statistic indicates that the instrumental variables are exogenous (we fail to reject the overidentifying restrictions), though this does not indicate that we have necessarily made the correct choice of endogenous variable

In [16]:
# Overidentification (LM) test: n * R^2 from the auxiliary regression is
# compared with a chi-squared distribution with q degrees of freedom.
r2 = results_aux.rsquared
n = results_aux.nobs
test_stat = n * r2

# q = number of instruments (motheduc, fatheduc) minus the number of
# endogenous regressors (educ).
q = 2 - 1

# Upper-tail probability of the statistic. The survival function is used
# instead of 1 - cdf because it does not lose precision for small p-values.
pval = stats.chi2.sf(test_stat, q)

test_stat, pval

(0.3780714069673037, 0.5386371981604612)

In [40]:
### Example 15.10: Job Training and Worker Productivity
jtrain = woo.dataWoo("jtrain")

# Keep only the 1987 and 1988 rows and index them by (fcode, year).
jtrain_87_88 = jtrain[jtrain["year"].isin([1987, 1988])].set_index(["fcode", "year"])
jtrain_87_88.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,employ,sales,avgsal,scrap,rework,tothrs,union,grant,d89,d88,...,grant_1,clscrap,cgrant,clemploy,clsales,lavgsal,clavgsal,cgrant_1,chrsemp,clhrsemp
fcode,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
410032.0,1987,100.0,47000000.0,35000.0,,,12.0,0,0,0,0,...,0,,0,,,10.463103,,,,
410032.0,1988,131.0,43000000.0,37000.0,,,8.0,0,0,0,1,...,0,,0,0.270027,-0.088949,10.518673,0.05557,0.0,-8.946565,-1.165385
410440.0,1987,12.0,1560000.0,10500.0,,,12.0,0,0,0,0,...,0,,0,,,9.25913,,,,
410440.0,1988,13.0,1970000.0,11000.0,,,12.0,0,0,0,1,...,0,,0,0.080043,0.233347,9.305651,0.04652,0.0,0.0,0.0
410495.0,1987,20.0,750000.0,17680.0,,,50.0,0,0,0,0,...,0,,0,,,9.78019,,,,


In [41]:
# calculate differenced values by entity
mean_keys = ["lscrap", "hrsemp", "grant"]
mean_diffs = jtrain_87_88.sort_values(["fcode", "year"]).groupby("fcode").diff()[mean_keys]
mean_diffs


Unnamed: 0_level_0,Unnamed: 1_level_0,lscrap,hrsemp,grant
fcode,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
410032.0,1987,,,
410032.0,1988,,-8.946565,0.0
410440.0,1987,,,
410440.0,1988,,0.000000,0.0
410495.0,1987,,,
...,...,...,...,...
419482.0,1988,-0.041594,0.000000,0.0
419483.0,1987,,,
419483.0,1988,0.223144,0.000000,0.0
419486.0,1987,,,


In [46]:
# Attach each differenced series to the panel as "<name>_diff1";
# the assignment aligns on the (fcode, year) index.
for key in mean_keys:
    jtrain_87_88[key + "_diff1"] = mean_diffs[key]

# IV regression on the differenced data: the change in hrsemp is
# instrumented by the change in grant. Rows missing either the outcome or
# the endogenous regressor are dropped before fitting.
formula = "lscrap_diff1 ~ 1 + [hrsemp_diff1 ~ grant_diff1]"
reg_iv = iv.IV2SLS.from_formula(
    formula,
    jtrain_87_88.dropna(subset=["lscrap_diff1", "hrsemp_diff1"]),
)
results_iv = reg_iv.fit(cov_type="unadjusted", debiased=True)
LM_summary(results_iv)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.0159$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,-0.0327,-0.2573,0.7982,0.127
hrsemp_diff1,-0.0142,-1.7882,0.0808,0.0079
