In [1]:
import numpy as np
import wooldridge as woo
import pandas as pd
import numpy.linalg as la

In [2]:
# Load the Wooldridge "pension" data set and preview its first five rows.
df = woo.data("pension")
df.head(n=5)

Unnamed: 0,id,pyears,prftshr,choice,female,married,age,educ,finc25,finc35,finc50,finc75,finc100,finc101,wealth89,black,stckin89,irain89,pctstck
0,38,1.0,0,1,0,1,64,12,0,0,1,0,0,0,77.900002,0,1,1,0
1,152,6.0,1,1,1,1,56,13,0,0,0,1,0,0,154.899994,0,1,1,50
2,152,25.0,1,1,0,1,56,12,0,0,0,1,0,0,154.899994,0,1,1,50
3,182,20.0,1,0,1,1,63,12,1,0,0,0,0,0,232.5,0,1,1,100
4,222,35.0,0,1,0,1,67,12,0,1,0,0,0,0,179.0,0,0,1,100


### 课堂练习1：基于矩阵运算计算 OLS 与 WLS 回归系数

- **模型**：
  对 `pension` 数据集构建回归模型：
  $$
  pctstck = \beta_0 + \beta_1 \cdot choice + \beta_2 \cdot age + \beta_3 \cdot educ + \beta_4 \cdot female + \beta_5 \cdot black + \beta_6 \cdot married + \epsilon
  $$

- **OLS 回归**：
  利用矩阵公式计算 OLS 回归系数：
  $$
  \hat{\beta}_{OLS} = (X^TX)^{-1}X^Ty
  $$
  - 构造设计矩阵 $X$ 时，将截距项（全 1 向量）与各解释变量（`choice`, `age`, `educ`, `female`, `black`, `married`）按列拼接在一起。

- **WLS 回归**：
  在计算出 OLS 残差 $u = y - X\hat{\beta}_{OLS}$ 后，构造权重矩阵
  $$
  W = \text{diag}\left(\frac{1}{u_i^2 + \epsilon}\right)
  $$
  其中 $\epsilon$ 是为避免除以零而加入的极小正数（代码中取 $10^{-7}$），与模型中的误差项 $\epsilon$ 无关。随后利用矩阵公式计算 WLS 回归系数：
  $$
  \hat{\beta}_{WLS} = (X^TWX)^{-1}X^TWy
  $$

- **结果**：
  代码输出一个表格，对比显示了每个变量对应的 OLS 和 WLS 回归系数，从而可以观察在加权最小二乘法调整后，各系数的变化情况。

In [3]:
# Model ingredients: dependent variable, regressors, and coefficient labels.
y = df['pctstck']
explanatory_vars = ["choice", "age", "educ", "female", "black", "married"]
var_names = ['cons'] + explanatory_vars  # intercept label first

# Design matrix X: a leading column of ones (intercept) followed by the regressors.
cons = np.ones(len(df))
X = np.column_stack((cons, df[explanatory_vars]))

# OLS through the normal equations: beta_ols = (X'X)^{-1} X'y
XtX = X.T @ X
Xty = X.T @ y
beta_ols = la.inv(XtX) @ Xty
u = y - X @ beta_ols  # OLS residuals

# Weights are the reciprocals of the squared OLS residuals.
epsilon = 1e-7  # keeps the denominator strictly positive
w = 1 / (u**2 + epsilon)
W = np.diag(w)  # dense diagonal weight matrix

# WLS: beta_wls = (X'WX)^{-1} X'Wy
XtW = X.T @ W
beta_wls = la.inv(XtW @ X) @ (XtW @ y)

# Side-by-side table of both coefficient vectors, rounded to four decimals.
coefficient_columns = {
    "变量名": var_names,
    "OLS估计系数": beta_ols.round(4),
    "WLS估计系数": beta_wls.round(4),
}
wls_result_matrix = pd.DataFrame(coefficient_columns)
wls_result_matrix

Unnamed: 0,变量名,OLS估计系数,WLS估计系数
0,cons,124.3346,68.0617
1,choice,9.1492,2.0626
2,age,-1.4884,-0.3428
3,educ,0.3034,0.0167
4,female,0.7593,0.1497
5,black,4.9679,1.4196
6,married,4.4788,0.7776


In [4]:
import statsmodels.api as sm
# Dependent variable and regressors; statsmodels prepends the constant column.
explanatory_vars = ["choice", "age", "educ", "female", "black", "married"]
y = df['pctstck']
X = sm.add_constant(df[explanatory_vars])

# Step 1: OLS fit, used here only to obtain the residuals that define the weights.
ols_model = sm.OLS(endog=y, exog=X)
ols_results = ols_model.fit()
u = ols_results.resid

# Weights are the reciprocals of the squared OLS residuals.
epsilon = 1e-7  # keeps the denominator strictly positive
w = 1 / (u**2 + epsilon)

# Step 2: WLS fit with those weights; print the full regression summary.
wls_model = sm.WLS(endog=y, exog=X, weights=w)
wls_result_sm = wls_model.fit()
print(wls_result_sm.summary())

                            WLS Regression Results                            
Dep. Variable:                pctstck   R-squared:                       0.025
Model:                            WLS   Adj. R-squared:                 -0.006
Method:                 Least Squares   F-statistic:                    0.7955
Date:                Thu, 27 Mar 2025   Prob (F-statistic):              0.575
Time:                        17:39:49   Log-Likelihood:                -823.89
No. Observations:                 194   AIC:                             1662.
Df Residuals:                     187   BIC:                             1685.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         68.0617      9.138      7.448      0.0

In [5]:
# Feasible GLS with a block-diagonal error covariance (one block per `id`),
# computed by Cholesky-whitening each block and running OLS on the result.

# 1. Load the data.
df = woo.data("pension")

# 2. Build the dependent variable and the design matrix.
y = df['pctstck'].values

explanatory_vars = ["choice", "age", "educ", "female", "black", "married"]

X_vars = df[explanatory_vars].values
n = X_vars.shape[0]
cons = np.ones((n, 1))
X = np.hstack((cons, X_vars))  # first column is the intercept
var_names = ['const'] + explanatory_vars

# 3. First-stage OLS, used to obtain the residuals.
beta_ols = la.inv(X.T @ X) @ (X.T @ y)
u = y - X @ beta_ols

# 4. Grouping variable: cluster on 'id' (as a numpy array).
cluster_ids = df['id'].values
cluster_labels = np.unique(cluster_ids)

# 5. Per-group covariance estimate: the outer product u_g u_g^T plus a tiny ridge.
#    The outer product of a single vector has rank one, so the ridge is what makes
#    blocks with more than one member positive definite.
#    NOTE(review): the estimate therefore hinges on this ridge for multi-member
#    groups — consider a better-conditioned Omega estimate.
Omega_blocks = []
for g in cluster_labels:
    idx = (cluster_ids == g)
    u_g = u[idx].reshape(-1, 1)  # residuals of group g as a column vector
    Omega_g = u_g @ u_g.T + 1e-12 * np.eye(len(u_g))
    Omega_blocks.append(Omega_g)

# 6. Whiten each group: with Omega_g = C C^T, pre-multiply X_g and y_g by C^{-1}.
#    The transformed arrays are allocated as float explicitly. `y` keeps the dtype
#    of the 'pctstck' column (displayed as integers by df.head()), and zeros_like
#    would inherit that dtype, truncating the transformed values on assignment.
X_star = np.zeros_like(X, dtype=float)
y_star = np.zeros_like(y, dtype=float)
for i, g in enumerate(cluster_labels):
    idx = np.where(cluster_ids == g)[0]
    # The ridge is added a second time here, on top of the one applied in step 5.
    Omega_g = Omega_blocks[i] + 1e-12 * np.eye(len(idx))
    C = la.cholesky(Omega_g)  # lower-triangular factor
    C_inv = la.inv(C)
    X_star[idx, :] = C_inv @ X[idx, :]
    y_star[idx] = (C_inv @ y[idx].reshape(-1, 1)).flatten()

# 7. FGLS estimate: OLS on the transformed data.
beta_fgls = la.inv(X_star.T @ X_star) @ (X_star.T @ y_star)

# Report the FGLS coefficients rounded to four decimals.
results_df_cholesky = pd.DataFrame({
    "Variable": var_names,
    "FGLS Coefficient": np.round(beta_fgls, 4)
})
print("=== FGLS 回归结果（按 id 聚类）===")
results_df_cholesky

=== FGLS 回归结果（按 id 聚类）===


Unnamed: 0,Variable,FGLS Coefficient
0,const,124.3346
1,choice,9.1492
2,age,-1.4884
3,educ,0.3034
4,female,0.7593
5,black,4.9679
6,married,4.4788


In [6]:
import numpy as np
import pandas as pd
import wooldridge as woo
import numpy.linalg as la

# Feasible GLS with a block-diagonal error covariance (one block per `id`),
# accumulated directly as sum_g X_g' Omega_g^{-1} X_g and sum_g X_g' Omega_g^{-1} y_g.

# 1. Load the data.
df = woo.data("pension")

# 2. Dependent variable and design matrix (intercept column first).
explanatory_vars = ["choice", "age", "educ", "female", "black", "married"]
var_names = ['const'] + explanatory_vars
y = df['pctstck'].values
X_vars = df[explanatory_vars].values
n = len(X_vars)
cons = np.ones((n, 1))
X = np.concatenate([cons, X_vars], axis=1)

# 3. First-stage OLS and its residuals.
beta_ols = la.inv(X.T @ X) @ (X.T @ y)
u = y - X @ beta_ols

# 4. Cluster on 'id'.
cluster_ids = df['id'].values
cluster_labels = np.unique(cluster_ids)

# 5. For every group, estimate Omega_g = u_g u_g^T (plus a tiny ridge for numerical
#    stability), invert it, and add the group's contribution to both moment sums.
n_params = X.shape[1]
XWX = np.zeros((n_params, n_params))
XWy = np.zeros(n_params)

for label in cluster_labels:
    rows = np.flatnonzero(cluster_ids == label)
    X_g, y_g = X[rows, :], y[rows]
    resid_col = u[rows].reshape(-1, 1)  # group residuals as a column vector

    # Within-group covariance estimate and its inverse.
    Omega_g = resid_col @ resid_col.T + 1e-12 * np.eye(len(rows))
    Omega_inv_g = la.inv(Omega_g)

    weighted_Xt = X_g.T @ Omega_inv_g  # X_g' Omega_g^{-1}, shared by both sums
    XWX += weighted_Xt @ X_g
    XWy += weighted_Xt @ y_g

beta_fgls_direct = la.inv(XWX) @ XWy

# Report the FGLS coefficients rounded to four decimals.
results_df_direct = pd.DataFrame({
    "Variable": var_names,
    "FGLS Coefficient": np.round(beta_fgls_direct, 4),
})
results_df_direct

Unnamed: 0,Variable,FGLS Coefficient
0,const,124.3346
1,choice,9.1492
2,age,-1.4884
3,educ,0.3034
4,female,0.7593
5,black,4.9679
6,married,4.4788


In [7]:
import statsmodels.api as sm

# 1. Load the data.
df = woo.data("pension")

# 2. Dependent variable and design matrix; statsmodels prepends the constant column.
explanatory_vars = ["choice", "age", "educ", "female", "black", "married"]
y = df['pctstck']
X = sm.add_constant(df[explanatory_vars])

# 3. Fit GLS and request cluster-robust standard errors grouped by `id`.
#    NOTE(review): no `sigma` is passed to GLS, so the point estimates presumably
#    coincide with OLS — confirm against the statsmodels GLS documentation.
gls_model = sm.GLS(endog=y, exog=X)
cluster_cov_options = {'groups': df['id']}
gls_res = gls_model.fit(cov_type='cluster', cov_kwds=cluster_cov_options)

# 4. Print the full regression summary.
print(gls_res.summary())

                            GLS Regression Results                            
Dep. Variable:                pctstck   R-squared:                       0.042
Model:                            GLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     1.484
Date:                Thu, 27 Mar 2025   Prob (F-statistic):              0.186
Time:                        17:39:49   Log-Likelihood:                -984.95
No. Observations:                 194   AIC:                             1984.
Df Residuals:                     187   BIC:                             2007.
Df Model:                           6                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        124.3346     50.961      2.440      0.0