In [1]:
import wooldridge as woo
import numpy as np

In [2]:
# Load the "ceosal1" data set shipped with the wooldridge package.
ceosal1 = woo.dataWoo('ceosal1')

# Preview the first five rows to get a feel for the columns.
ceosal1.head(n=5)

Unnamed: 0,salary,pcsalary,sales,roe,pcroe,ros,indus,finance,consprod,utility,lsalary,lsales
0,1095,20,27595.0,14.1,106.400002,191,1,0,0,0,6.998509,10.225389
1,1001,32,9958.0,10.9,-30.6,13,1,0,0,0,6.908755,9.206132
2,1122,9,6125.899902,23.5,-16.299999,14,1,0,0,0,7.022868,8.720281
3,578,-9,16246.0,5.9,-25.700001,-21,1,0,0,0,6.359574,9.695602
4,1368,7,21783.199219,13.8,-3.0,56,1,0,0,0,7.221105,9.988894


### 课堂练习1： 基于 矩阵 计算回归系数 $\beta_0$ 和 $\beta_1$ 及其对应的方差
- 模型：$salary = \beta_0+\beta_1 roe + \epsilon$
    - $\hat{\beta} = (X^TX)^{-1}X^Ty$
- 结果：$\hat{salary} = 963.1913 + 18.5012 \cdot roe$
- 标准误：Cons 为 212.2175；Roe 为 11.0699（按 $\hat{\sigma}^2 = u^Tu/n$ 计算；下方代码使用自由度修正 $\hat{\sigma}^2 = u^Tu/(n-k)$，得到 213.2403 和 11.1233）

In [3]:
# OLS by hand: salary = beta_0 + beta_1 * roe + error.
x = ceosal1['roe']  # regressor
y = ceosal1['salary']  # dependent variable
cons = np.ones(x.shape)  # column of ones so the model has an intercept

X = np.c_[cons, x]  # design matrix: column 0 is the constant, column 1 is roe

# beta_hat = (X'X)^{-1} X'y, obtained by solving the normal equations
# (X'X) beta = X'y directly. This is the same estimator, but it avoids forming
# the explicit inverse, which is slower and numerically less stable.
beta = np.linalg.solve(X.T @ X, X.T @ y.to_numpy())
u = y - X @ beta  # OLS residuals, reused by the standard-error cells below

In [4]:
# Classical (homoskedastic) standard errors: Var(beta_hat) = sigma^2 * (X'X)^{-1}.

# Unbiased error-variance estimate: residual sum of squares divided by the
# residual degrees of freedom (observations minus estimated coefficients).
sigma_squared = u.T @ u / (len(y) - X.shape[1])

# [(X'X)^{-1} X'] [(X'X)^{-1} X']' collapses to (X'X)^{-1}, so use it directly.
xtx_inv = np.linalg.inv(X.T @ X)
beta_var = sigma_squared * xtx_inv
beta_var_diag = np.diag(beta_var)  # coefficient variances sit on the diagonal
beta_std = np.sqrt(beta_var_diag)  # standard errors

# Each row prints: coefficient, standard error. The regressor is roe.
print(f'cons: {beta[0]:.4f}  {beta_std[0]:.4f}')
print(f'roe: {beta[1]:.4f}  {beta_std[1]:.4f}')

cons: 963.1913  213.2403
educ: 18.5012  11.1233


In [5]:
import statsmodels.api as sm

# Cross-check the hand-computed classical standard errors with statsmodels.
x = ceosal1['roe']  # regressor
y = ceosal1['salary']  # dependent variable
cons = np.ones(x.shape)  # column of ones for the intercept
X = np.c_[cons, x]  # design matrix: column 0 is the constant, column 1 is roe

model = sm.OLS(y, X)  # set up the OLS model
# fit() with no cov_type uses the default (nonrobust) covariance, so the result
# is named accordingly rather than "robust".
ols_results = model.fit()
print(ols_results.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     2.767
Date:                Sat, 29 Mar 2025   Prob (F-statistic):             0.0978
Time:                        09:26:54   Log-Likelihood:                -1804.5
No. Observations:                 209   AIC:                             3613.
Df Residuals:                     207   BIC:                             3620.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        963.1913    213.240      4.517      0.0

In [6]:
# Heteroskedasticity-robust (White / HC0) standard errors: residuals are treated
# as uncorrelated across observations, but each may have its own variance.
# Var(beta_hat) = (X'X)^{-1} X' diag(u_i^2) X (X'X)^{-1}
u_squared = u ** 2
# n x n diagonal matrix of squared residuals; kept dense to mirror the formula
# (fine for a small sample like this one).
sigma_matrix = np.diag(u_squared)

temp1 = np.linalg.inv(X.T @ X) @ X.T  # (X'X)^{-1} X'
beta_var = temp1 @ sigma_matrix @ temp1.T
beta_var_diag = np.diag(beta_var)  # coefficient variances sit on the diagonal
beta_std = np.sqrt(beta_var_diag)  # standard errors

# Each row prints: coefficient, robust standard error. The regressor is roe.
print(f'cons: {beta[0]:.4f}  {beta_std[0]:.4f}')
print(f'roe: {beta[1]:.4f}  {beta_std[1]:.4f}')

cons: 963.1913  120.5254
educ: 18.5012  6.7967


In [7]:
import statsmodels.api as sm

# Rebuild the regression inputs: salary on a constant and roe.
y = ceosal1['salary']  # dependent variable
x = ceosal1['roe']  # regressor
cons = np.ones(x.shape)  # intercept column, same length as x
X = np.column_stack((cons, x))  # plain NumPy design matrix: [constant, roe]

# Fit OLS and request White (HC0) heteroskedasticity-robust standard errors.
model = sm.OLS(endog=y, exog=X)
robust_results = model.fit(cov_type='HC0')

# The summary table reports the robust standard errors alongside the fit.
print(robust_results.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     7.410
Date:                Sat, 29 Mar 2025   Prob (F-statistic):            0.00704
Time:                        09:26:54   Log-Likelihood:                -1804.5
No. Observations:                 209   AIC:                             3613.
Df Residuals:                     207   BIC:                             3620.
Df Model:                           1                                         
Covariance Type:                  HC0                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        963.1913    120.525      7.992      0.0

In [8]:
# Cluster-robust standard errors: residuals may be heteroskedastic AND
# correlated within a cluster; they are treated as uncorrelated across clusters.
# Var(beta_hat) = (X'X)^{-1} [ sum_g X_g' u_g u_g' X_g ] (X'X)^{-1} * correction

# Work on plain arrays so the boolean indexing below behaves the same whether
# X and u are NumPy arrays or pandas objects.
X_arr = np.asarray(X, dtype=float)
u_arr = np.asarray(u, dtype=float)

N = X_arr.shape[0]  # number of observations
K = X_arr.shape[1]  # number of regressors, including the constant
cluster_ids = ceosal1['indus']  # cluster on the 'indus' column
cluster_labels = np.unique(cluster_ids)  # distinct cluster labels

G = len(cluster_labels)  # number of clusters
if G < 2:
    # The G / (G - 1) correction below is undefined for a single cluster.
    raise ValueError('cluster-robust standard errors need at least two clusters')
S = np.zeros((K, K))  # middle ("meat") matrix of the sandwich, shape (K, K)

# Accumulate each cluster's contribution X_g' u_g u_g' X_g.
for g in cluster_labels:
    idx = np.asarray(cluster_ids == g)  # observations belonging to cluster g
    X_g = X_arr[idx, :]  # rows of X for this cluster
    u_g = u_arr[idx]  # residuals for this cluster
    # X_g' u_g u_g' X_g equals the outer product of the K-vector X_g' u_g with
    # itself, so the (n_g x n_g) matrix u_g u_g' never has to be built.
    score_g = X_g.T @ u_g
    S += np.outer(score_g, score_g)

# (X'X)^{-1}
inv_XTX = np.linalg.inv(X_arr.T @ X_arr)

# Small-sample degrees-of-freedom correction applied to the sandwich estimate.
dof_correction = (G / (G - 1)) * ((N - 1) / (N - K))
beta_var_cluster = inv_XTX @ S @ inv_XTX * dof_correction
beta_var_diag = np.diag(beta_var_cluster)  # coefficient variances
beta_std = np.sqrt(beta_var_diag)  # standard errors

# Each row prints: coefficient, cluster-robust standard error. The regressor is roe.
print(f'cons: {beta[0]:.4f}  {beta_std[0]:.4f}')
print(f'roe: {beta[1]:.4f}  {beta_std[1]:.4f}')

cons: 963.1913  11.5204
educ: 18.5012  6.0881


In [9]:
import statsmodels.api as sm

# Regression inputs: salary on roe, with the intercept added by statsmodels.
y = ceosal1['salary']  # dependent variable
x = ceosal1['roe']  # regressor
X = sm.add_constant(x)  # prepend the constant column

# Observations are grouped by the 'indus' column for clustering.
cluster_ids = ceosal1['indus']

# Fit OLS and ask for cluster-robust standard errors over those groups.
model = sm.OLS(endog=y, exog=X)
clustered_results = model.fit(
    cov_type='cluster',
    cov_kwds=dict(groups=cluster_ids),
)

# Show the fitted model with the clustered standard errors.
print(clustered_results.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     9.235
Date:                Sat, 29 Mar 2025   Prob (F-statistic):              0.202
Time:                        09:26:54   Log-Likelihood:                -1804.5
No. Observations:                 209   AIC:                             3613.
Df Residuals:                     207   BIC:                             3620.
Df Model:                           1                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        963.1913     11.520     83.608      0.0