# 两阶段最小二乘练习

In [1]:
import numpy as np
import wooldridge as woo
from linearmodels.iv import IV2SLS

In [2]:
# Load the Mroz data set and keep only the rows where the log wage (lwage)
# is observed; the resulting frame is displayed as the cell output.
mroz = woo.dataWoo('mroz').dropna(subset=['lwage'])
mroz

Unnamed: 0,inlf,hours,kidslt6,kidsge6,age,educ,wage,repwage,hushrs,husage,...,faminc,mtr,motheduc,fatheduc,unem,city,exper,nwifeinc,lwage,expersq
0,1,1610,1,0,32,12,3.3540,2.65,2708,34,...,16310.0,0.7215,12,7,5.0,0,14,10.910060,1.210154,196
1,1,1656,0,2,30,12,1.3889,2.65,2310,30,...,21800.0,0.6615,7,7,11.0,1,5,19.499981,0.328512,25
2,1,1980,1,3,35,12,4.5455,4.04,3072,40,...,21040.0,0.6915,12,7,5.0,0,15,12.039910,1.514138,225
3,1,456,0,3,34,12,1.0965,3.25,1920,53,...,7300.0,0.7815,7,7,5.0,0,6,6.799996,0.092123,36
4,1,1568,1,2,31,14,4.5918,3.60,2000,32,...,27300.0,0.6215,12,14,9.5,1,7,20.100058,1.524272,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,1,680,0,5,36,10,2.3118,0.00,3430,43,...,19772.0,0.7215,7,7,7.5,0,2,18.199976,0.838026,4
424,1,2450,0,1,40,12,5.3061,6.50,2008,40,...,35641.0,0.6215,7,7,5.0,1,21,22.641056,1.668857,441
425,1,2144,0,2,43,13,5.8675,0.00,2140,43,...,34220.0,0.5815,7,7,7.5,1,22,21.640079,1.769429,484
426,1,1760,0,1,33,12,3.4091,3.21,3380,34,...,30000.0,0.5815,12,16,11.0,1,14,23.999985,1.226448,196


### 求解模型：$ln(wage) = \beta_0 + \beta_1 \cdot educ + \beta_2 \cdot exper  + \beta_3 \cdot expersq  + \epsilon$
- 其中，$educ$为内生变量，其工具变量为 $constant$, $exper$, $expersq$, $motheduc$, $fatheduc$

#### 方法一：

##### 第1阶段：用工具变量对内生变量回归，得到内生变量的拟合值
- 模型：$educ = \beta_0 + \beta_1 \cdot exper  + \beta_2 \cdot expersq  + \beta_3 \cdot motheduc + \beta_4 \cdot fatheduc + \epsilon$
- 结果：$\hat{educ} = 9.1026 + 0.0452 \cdot exper -0.0010 \cdot expersq +  0.1576 \cdot motheduc + 0.1895 \cdot fatheduc$



In [3]:
# First stage: regress the endogenous regressor (educ) on a constant and all
# instruments (exper, expersq, motheduc, fatheduc), then keep the fitted values
# educ_hat for use in the second stage.
y = mroz['educ']
X_1st = np.column_stack([
    np.ones(y.shape),   # constant
    mroz['exper'],
    mroz['expersq'],
    mroz['motheduc'],
    mroz['fatheduc'],
])

# OLS through the normal equations (X'X) b = X'y. Solving the linear system
# gives the same estimator as (X'X)^{-1} X'y without forming the explicit
# inverse of X'X.
beta_1st = np.linalg.solve(X_1st.T @ X_1st, X_1st.T @ y)
educ_hat = X_1st @ beta_1st

# Report the coefficients in the same order as the columns of X_1st.
for label, coef in zip(['cons', 'exper', 'expersq', 'motheduc', 'fatheduc'], beta_1st):
    print(f'{label}: {coef:.4f}')

cons: 9.1026
exper: 0.0452
expersq: -0.0010
motheduc: 0.1576
fatheduc: 0.1895


##### 第2阶段：将内生变量的拟合值与其他外生变量合并为解释变量矩阵 $X$，然后用 $y$ 对其回归
- 模型：$ln(wage) = \beta_0 + \beta_1 \cdot \hat{educ} + \beta_2 \cdot exper  + \beta_3 \cdot expersq  + \epsilon$
- 结果：$\hat{ln(wage)} = 0.0481 + 0.0614 \cdot \hat{educ} + 0.0442 \cdot exper - 0.0009 \cdot expersq $

In [4]:
# Second stage: regress ln(wage) on a constant, the first-stage fitted values
# of educ (educ_hat, computed in the previous cell) and the exogenous
# regressors exper and expersq.
ln_wage = mroz['lwage']

cons = np.ones(ln_wage.shape)
exper = mroz['exper']
expersq = mroz['expersq']

# Column order: constant, educ_hat, exper, expersq.
X_2ed = np.column_stack([cons, educ_hat, exper, expersq])

# OLS through the normal equations (X'X) b = X'y; solving the system avoids
# forming the explicit inverse of X'X.
beta_2ed = np.linalg.solve(X_2ed.T @ X_2ed, X_2ed.T @ ln_wage)
ln_wage_hat = X_2ed @ beta_2ed

# The coefficient labelled 'educ' is the one on educ_hat.
for label, coef in zip(['cons', 'educ', 'exper', 'expersq'], beta_2ed):
    print(f'{label}: {coef:.4f}')

cons: 0.0481
educ: 0.0614
exper: 0.0442
expersq: -0.0009


#### 方法二：
直接求解：$ \beta = (X'Z(Z'Z)^{-1}Z'X)^{-1} X'Z(Z'Z)^{-1}Z'y$

In [5]:
# Closed-form 2SLS estimator:
#   beta = (X' P_Z X)^{-1} X' P_Z y,   where   P_Z = Z (Z'Z)^{-1} Z'.
# X holds the structural regressors (constant, educ, exper, expersq);
# Z holds the full instrument set (constant, exper, expersq, motheduc, fatheduc).
X = np.c_[cons, mroz['educ'], exper, expersq]
Z = np.c_[cons, mroz['exper'], mroz['expersq'], mroz['motheduc'], mroz['fatheduc']]

# P_Z X: project every column of X onto the column space of Z. Computed once
# and reused on both sides of the formula; linear systems are solved instead
# of forming explicit inverses.
X_proj = Z @ np.linalg.solve(Z.T @ Z, Z.T @ X)

# X_proj' X = X' P_Z X  and  X_proj' y = X' P_Z y  (P_Z is symmetric).
beta = np.linalg.solve(X_proj.T @ X, X_proj.T @ ln_wage)

for label, coef in zip(['cons', 'educ', 'exper', 'expersq'], beta):
    print(f'{label}: {coef:.4f}')

cons: 0.0481
educ: 0.0614
exper: 0.0442
expersq: -0.0009


In [6]:
# Cross-check the manual estimates with linearmodels' IV2SLS
# (IV2SLS is imported in the imports cell at the top of the notebook).
#   exog        : constant, exper, expersq
#   endog       : educ
#   instruments : motheduc, fatheduc
# The inputs are plain NumPy arrays, so the summary labels the exogenous
# parameters by position (exog.0, exog.1, ...).
exog = np.c_[cons, exper, expersq]
endog = mroz['educ']
instruments = np.c_[mroz['motheduc'], mroz['fatheduc']]

model = IV2SLS(ln_wage, exog, endog, instruments)
results1 = model.fit(cov_type="homoskedastic")
print(results1.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                  lwage   R-squared:                      0.1357
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1296
No. Observations:                 428   F-statistic:                    24.653
Date:                Thu, Mar 27 2025   P-value (F-stat)                0.0000
Time:                        17:39:10   Distribution:                  chi2(3)
Cov. Estimator:         homoskedastic                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
exog.0         0.0481     0.3985     0.1207     0.9039     -0.7329      0.8291
exog.1         0.0442     0.0134     3.3038     0.00

### 拓展：如果第1阶段只用 $motheduc$, $fatheduc$ 对 $educ$ 回归（不含常数项及 $exper$, $expersq$）呢？

In [7]:
# Extension: the first-stage design matrix here contains only motheduc and
# fatheduc -- there is no constant column and exper / expersq are left out.
# NOTE: this cell rebinds X_1st, beta_1st, educ_hat, ln_wage, X_2ed and
# beta_2ed, replacing the values produced by the earlier cells.
X_1st = np.column_stack([mroz['motheduc'], mroz['fatheduc']])
beta_1st = np.linalg.inv(X_1st.T @ X_1st) @ X_1st.T @ mroz['educ']
educ_hat = X_1st @ beta_1st

# Second stage: log wage (computed from the wage column) on a constant,
# the fitted educ values, exper and expersq.
ln_wage = np.log(mroz['wage'])
X_2ed = np.column_stack([
    np.ones(ln_wage.shape),
    educ_hat,
    mroz['exper'],
    mroz['expersq'],
])
beta_2ed = np.linalg.inv(X_2ed.T @ X_2ed) @ X_2ed.T @ ln_wage

for label, coef in zip(['cons', 'educ', 'exper', 'expersq'], beta_2ed):
    print(f'{label}: {coef:.4f}')

cons: 0.6141
educ: 0.0161
exper: 0.0470
expersq: -0.0010
