学习使用R语言进行短面板数据分析。

In [54]:
library(plm)
library(tidyverse)
library(broom)

In [55]:
# 导入数据集
library(foreign)
traffic <- read.dta("./data/traffic.dta")

glimpse(traffic)

Rows: 336
Columns: 54
$ state      [3m[90m<fct>[39m[23m AL, AL, AL, AL, AL, AL, AL, AZ, AZ, AZ, AZ, AZ, AZ, AZ, AR,~
$ year       [3m[90m<int>[39m[23m 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1982, 1983, 1984,~
$ spircons   [3m[90m<dbl>[39m[23m 1.37, 1.36, 1.32, 1.28, 1.23, 1.18, 1.17, 1.97, 1.90, 2.14,~
$ unrate     [3m[90m<dbl>[39m[23m 14.4, 13.7, 11.1, 8.9, 9.8, 7.8, 7.2, 9.9, 9.1, 5.0, 6.5, 6~
$ perinc     [3m[90m<dbl>[39m[23m 10544.15, 10732.80, 11108.79, 11332.63, 11661.51, 11944.00,~
$ emppop     [3m[90m<dbl>[39m[23m 50.69204, 52.14703, 54.16809, 55.27114, 56.51450, 57.50988,~
$ beertax    [3m[90m<dbl>[39m[23m 1.53937948, 1.78899074, 1.71428561, 1.65254235, 1.60990703,~
$ sobapt     [3m[90m<dbl>[39m[23m 30.3557, 30.3336, 30.3115, 30.2895, 30.2674, 30.2453, 30.22~
$ mormon     [3m[90m<dbl>[39m[23m 0.32829, 0.34341, 0.35924, 0.37579, 0.39311, 0.41123, 0.430~
$ mlda       [3m[90m<dbl>[39m[23m 19.00, 19.00, 19.00, 19.67, 21.00, 21.00, 21.00, 

# 混合回归

In [9]:
# 混合回归
fit_pooled <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = traffic,
  index = c("state", "year"),
  model = "pooling"
)

tidy(fit_pooled)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),4.11867355,0.29699504,13.867819,8.568174e-35
beertax,0.0971997,0.06155179,1.579153,0.1152561
spircons,0.16234707,0.04324656,3.753988,0.0002055204
unrate,-0.02910139,0.01271565,-2.288628,0.02273061
perinck,-0.15842906,0.01698686,-9.326566,1.611358e-18


In [11]:
# 手动实现混合回归
fit_pooled_manual <- lm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = traffic
)

# 使用聚类稳健标准误
library(sandwich)
library(lmtest)
CL <- vcovCL(fit_pooled_manual, cluster = ~ state)
coeftest(fit_pooled_manual, vcov = CL) %>% tidy()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),4.11867355,0.67657457,6.0875382,3.17412e-09
beertax,0.0971997,0.11688401,0.8315911,0.4062393
spircons,0.16234707,0.10709883,1.5158622,0.1305086
unrate,-0.02910139,0.02086869,-1.3945,0.1641019
perinck,-0.15842906,0.03714042,-4.2656775,2.604208e-05


# 固定效应模型

## FE估计量

In [31]:
# 混合回归
pdata_traffic <- pdata.frame(traffic, index = c("state", "year"))
fit_fe <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = pdata_traffic,
  model = "within"
)

tidy(fit_fe)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
beertax,-0.48407277,0.162510555,-2.978716,0.0031445531089131
spircons,0.81696515,0.079211792,10.313681,0.0
unrate,-0.02904993,0.009027433,-3.217962,0.0014405725749847
perinck,0.10471027,0.020598576,5.083374,6.737876165e-07


In [44]:
summary(fit_fe)

Oneway (individual) effect Within Model

Call:
plm(formula = fatal ~ beertax + spircons + unrate + perinck, 
    data = pdata_traffic, model = "within")

Balanced Panel: n = 48, T = 7, N = 336

Residuals:
       Min.     1st Qu.      Median     3rd Qu.        Max. 
-0.44378892 -0.07922880  0.00078846  0.06761301  0.56861719 

Coefficients:
           Estimate Std. Error t-value              Pr(>|t|)    
beertax  -0.4840728  0.1625106 -2.9787              0.003145 ** 
spircons  0.8169652  0.0792118 10.3137 < 0.00000000000000022 ***
unrate   -0.0290499  0.0090274 -3.2180              0.001441 ** 
perinck   0.1047103  0.0205986  5.0834          0.0000006738 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Total Sum of Squares:    10.785
Residual Sum of Squares: 6.9816
R-Squared:      0.35265
Adj. R-Squared: 0.2364
F-statistic: 38.6774 on 4 and 284 DF, p-value: < 0.000000000000000222

In [57]:
fixef(fit_fe)

In [32]:
# 使用聚类稳健标准误
CL <- vcovCL(fit_fe, cluster = ~ state)
coeftest(fit_fe, vcov = CL) %>% tidy()

ERROR: Error in UseMethod("estfun"): no applicable method for 'estfun' applied to an object of class "c('plm', 'panelmodel')"


我们发现一件非常要命的事情，{plm}包无法使用{sandwich}和{lmtest}包进行聚类稳健把标准误检验。

但是个体不同时期的数据不太可能认为不存在自相关，因此我们下面手动实现FE估计量：

In [46]:
# 手动计算组内离差
traffic_fe <- traffic %>% 
  select(state, year, fatal, beertax, spircons, unrate, perinck) %>% 
  group_by(state) %>% 
  mutate(
    fatal = fatal - mean(fatal),
    beertax = beertax - mean(beertax),
    spircons = spircons - mean(spircons),
    unrate = unrate - mean(unrate),
    perinck = perinck - mean(perinck)
  ) %>%
  ungroup()

# 拟合FE模型
fit_fe_manual <- lm(
  fatal ~ beertax + spircons + unrate + perinck,  # 注意这里
  data = traffic_fe
)

# 使用聚类稳健标准误
# 不用科学计数法
options(scipen = 999)
CL_fe <- vcovCL(fit_fe_manual, cluster = ~ state)
coeftest(fit_fe_manual, vcov = CL_fe) %>% round(digits = 8) %>% tidy()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.0,0.0,1.051193,0.29393661
beertax,-0.48407277,0.22187542,-2.181732,0.02983231
spircons,0.81696515,0.12726271,6.419517,0.0
unrate,-0.02904993,0.00945812,-3.071428,0.00230694
perinck,0.10471027,0.0341455,3.066591,0.00234353


其实从公式来看，回归公式应该不包括截距项，虽然Stata的结果包含截距项，但是我认为R包的做法是正确的，因此下面的代码去掉截距项进行回归：

In [47]:
# 手动计算组内离差
traffic_fe <- traffic %>% 
  select(state, year, fatal, beertax, spircons, unrate, perinck) %>% 
  group_by(state) %>% 
  mutate(
    fatal = fatal - mean(fatal),
    beertax = beertax - mean(beertax),
    spircons = spircons - mean(spircons),
    unrate = unrate - mean(unrate),
    perinck = perinck - mean(perinck)
  ) %>%
  ungroup()

# 拟合FE模型
fit_fe_manual <- lm(
  fatal ~ beertax + spircons + unrate + perinck -1,  # 不应该包含截距项
  data = traffic_fe
)

# 使用聚类稳健标准误
# 不用科学计数法
options(scipen = 999)
CL_fe <- vcovCL(fit_fe_manual, cluster = ~ state)
coeftest(fit_fe_manual, vcov = CL_fe) %>% round(digits = 8) %>% tidy()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
beertax,-0.48407277,0.22154102,-2.185025,0.02958566
spircons,0.81696515,0.1270709,6.429207,0.0
unrate,-0.02904993,0.00944386,-3.076064,0.00227182
perinck,0.10471027,0.03409404,3.07122,0.00230796


## LSDV法

In [56]:
# 创建虚拟变量
traffic_LSDV <- traffic %>% 
  select(state, year, fatal, beertax, spircons, unrate, perinck) %>% 
  mutate(value = 1, state_temp = state) %>% 
  pivot_wider(
    names_from = state_temp,
    values_from = value,
    values_fill = 0,
    names_prefix = "state_"
  )

# 去掉最后一列，防止共线性
# 去掉最后一列，防止共线性
# 去掉最后一列，防止共线性
traffic_LSDV <- traffic_LSDV[, -ncol(traffic_LSDV)]

# 拟合LSDV模型
fit_LSDV <- lm(
  fatal ~ . - year,
  data = traffic_LSDV
)

# 使用聚类稳健标准误
CL_LSDV <- vcovCL(fit_LSDV, cluster = ~ state)
coeftest(fit_LSDV, vcov = CL_LSDV) %>% round(digits = 8) %>% tidy() %>% tail(4)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
beertax,-0.48407277,0.2395323,-2.020908,0.04422739
spircons,0.81696515,0.1373903,5.94631,1e-08
unrate,-0.02904993,0.0102108,-2.845021,0.00476393
perinck,0.10471027,0.0368628,2.840541,0.00482947


## 混合回归还是FE

In [64]:
# 创建虚拟变量
traffic_LSDV <- traffic %>% 
  select(state, year, fatal, beertax, spircons, unrate, perinck) %>% 
  mutate(value = 1, state_temp = state) %>% 
  pivot_wider(
    names_from = state_temp,
    values_from = value,
    values_fill = 0,
    names_prefix = "state_"
  )

# 去掉最后一列，防止共线性
# 去掉最后一列，防止共线性
# 去掉最后一列，防止共线性
traffic_LSDV <- traffic_LSDV[, -ncol(traffic_LSDV)]

# 拟合LSDV模型
fit_LSDV <- lm(
  fatal ~ . - year,
  data = traffic_LSDV
)

# 使用聚类稳健标准误
CL_LSDV <- vcovCL(fit_LSDV, cluster = ~ state)
coeftest(fit_LSDV, vcov = CL_LSDV) %>% 
  round(digits = 8) %>% 
  tidy() %>%
  # 提取state开头的行
  filter(str_detect(term, "^state")) %>%
  mutate(p = ifelse(p.value < 0.05, 1, 0)) %>% 
  summarise(p = sum(p), n = n())

p,n
<dbl>,<int>
38,47


# 双向固定效应

## 离差法

In [68]:
# 先看一下时间有哪些取值
unique(traffic$year)

In [70]:
fit_twfe <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = pdata_traffic,
  model = "within",
  effect = "twoways"
)

tidy(fit_twfe)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
beertax,-0.43471948,0.15395637,-2.823654,0.0050912543255764
spircons,0.805857,0.11264255,7.154108,7.4882e-12
unrate,-0.05490839,0.01034183,-5.309351,2.254073193e-07
perinck,0.08826358,0.0199988,4.413443,1.45720996101e-05


In [71]:
# 使用聚类稳健标准误
CL_twfe <- vcovCL(fit_twfe, cluster = ~ state)
coeftest(fit_twfe, vcov = CL_twfe) %>% round(digits = 8) %>% tidy()

ERROR: Error in UseMethod("estfun"): no applicable method for 'estfun' applied to an object of class "c('plm', 'panelmodel')"


老问题，无法直接用稳健标准误，我们手动实现：

In [88]:
# 手动实现双向固定效应
traffic_fe <- traffic %>% 
  select(state, year, fatal, beertax, spircons, unrate, perinck) %>% 
  group_by(state) %>% 
  mutate(
    fatal_state = mean(fatal),
    beertax_state = mean(beertax),
    spircons_state = mean(spircons),
    unrate_state = mean(unrate),
    perinck_state = mean(perinck)
  ) %>%
  ungroup() %>%
  group_by(year) %>%
  mutate(
    fatal_year = mean(fatal),
    beertax_year = mean(beertax),
    spircons_year = mean(spircons),
    unrate_year = mean(unrate),
    perinck_year = mean(perinck)
  ) %>%
  ungroup() %>%
  mutate(
    fatal = fatal - fatal_state - fatal_year + mean(fatal),
    beertax = beertax - beertax_state - beertax_year + mean(beertax),
    spircons = spircons - spircons_state - spircons_year + mean(spircons),
    unrate = unrate - unrate_state - unrate_year + mean(unrate),
    perinck = perinck - perinck_state - perinck_year + mean(perinck)
  )

# 拟合TWFE模型
fit_twfe_manual <- lm(
  fatal ~ beertax + spircons + unrate + perinck -1,  # 不应该包含截距项
  data = traffic_fe
)

# 使用聚类稳健标准误
CL_twfe_manual <- vcovCL(fit_twfe_manual, cluster = ~ state)
coeftest(fit_twfe_manual, vcov = CL_twfe_manual) %>% tidy()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
beertax,-0.43471948,0.24168861,-1.798676,0.0729781279807958
spircons,0.805857,0.11487817,7.014884,1.29763e-11
unrate,-0.05490839,0.01163832,-4.717896,3.516392924e-06
perinck,0.08826358,0.03195484,2.762135,0.0060622076030058


## LSDV法

In [82]:
# 手动实现双向固定效应LSDV法
# 创建state虚拟变量
traffic_twfe_LSDV <- traffic %>% 
  select(state, year, fatal, beertax, spircons, unrate, perinck) %>% 
  mutate(value = 1, state_temp = state) %>% 
  pivot_wider(
    names_from = state_temp,
    values_from = value,
    values_fill = 0,
    names_prefix = "state_"
  )
traffic_twfe_LSDV <- traffic_twfe_LSDV[-ncol(traffic_twfe_LSDV)] # 去掉最后一列，防止共线性

# 创建year虚拟变量
traffic_twfe_LSDV <- traffic_twfe_LSDV %>% 
  mutate(value = 1, year_temp = year) %>% 
  pivot_wider(
    names_from = year_temp,
    values_from = value,
    values_fill = 0,
    names_prefix = "year_"
  )
traffic_twfe_LSDV <- traffic_twfe_LSDV[-ncol(traffic_twfe_LSDV)] # 去掉最后一列，防止共线性

# 拟合TWFE LSDV模型
fit_twfe_LSDV <- lm(
  fatal ~ . - year -state,
  data = traffic_twfe_LSDV
)

# 使用聚类稳健标准误
CL_twfe_LSDV <- vcovCL(fit_twfe_LSDV, cluster = ~ state)
coeftest(fit_twfe_LSDV, vcov = CL_twfe_LSDV) %>% 
  round(digits = 8) %>% 
  tidy() %>%
  head()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.94669881,0.6342624,1.492598,0.13667641
beertax,-0.43471948,0.26412094,-1.645911,0.10091219
spircons,0.805857,0.12554059,6.419095,0.0
unrate,-0.05490839,0.01271853,-4.317195,2.2e-05
perinck,0.08826358,0.03492073,2.527541,0.01204089
state_AL,0.67943936,0.42350477,1.604325,0.10977782


# 差分估计量

In [91]:
# 差分估计量
fit_fd <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = pdata_traffic,
  model = "fd"
)

tidy(fit_fd)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-0.044226639,0.0197073,-2.2441754,0.02559407804
beertax,0.049569298,0.27263488,0.1818157,0.8558575313
spircons,0.31626695,0.16759422,1.8870994,0.06017030763
unrate,-0.002437778,0.01190617,-0.2047492,0.83791526412
perinck,0.184922993,0.04170962,4.4335818,1.327425e-05


不会默认去掉截距项，因此需要我们主动设定：

In [94]:
# 差分估计量
fit_fd <- plm(
  fatal ~ beertax + spircons + unrate + perinck -1,  # 去掉截距项
  data = pdata_traffic,
  model = "fd"
)

tidy(fit_fd)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
beertax,0.118770149,0.27280363,0.4353686,0.6636255513
spircons,0.523584049,0.14082489,3.7179796,0.0002419129
unrate,0.003398955,0.0117009,0.2904866,0.7716559762
perinck,0.141798111,0.03728137,3.8034575,0.0001747144


In [95]:
# 使用聚类稳健标准误
CL_fd <- vcovCL(fit_fd, cluster = ~ state)
coeftest(fit_fd, vcov = CL_fd) %>% round(digits = 8) %>% tidy()

ERROR: Error in UseMethod("estfun"): no applicable method for 'estfun' applied to an object of class "c('plm', 'panelmodel')"


In [98]:
# 手动估计FD估计量
# 差分
traffic_fd <- traffic %>% 
  select(state, year, fatal, beertax, spircons, unrate, perinck) %>% 
  group_by(state) %>%
  mutate(across(
    .cols = c(fatal, beertax, spircons, unrate, perinck),
    .fns = ~ .x - lag(.x)
  )) %>%
  ungroup() %>%
  filter(!is.na(fatal))

# 拟合FD模型
fit_fd_manual <- lm(
  fatal ~ beertax + spircons + unrate + perinck -1,  # 去掉截距项
  data = traffic_fd
)

# 使用聚类稳健标准误
CL_fd_manual <- vcovCL(fit_fd_manual, cluster = ~ state)
coeftest(fit_fd_manual, vcov = CL_fd_manual) %>% round(digits = 8) %>% tidy()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
beertax,0.11877015,0.22768748,0.5216367,0.60232985
spircons,0.52358405,0.16331353,3.2060054,0.00149958
unrate,0.00339895,0.0118903,0.2858595,0.77519416
perinck,0.14179811,0.04686992,3.025354,0.0027108


# 随机效应模型

In [17]:
# 实现随机效应模型
pdata_traffic <- pdata.frame(traffic, index = c("state", "year"))
fit_re <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = pdata_traffic,
  model = "random"
)

tidy(fit_re)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),2.00197349,0.38112466,5.2528049,1.498002e-07
beertax,0.04427676,0.12046126,0.3675601,0.7132013
spircons,0.30247114,0.06429535,4.7044012,2.546121e-06
unrate,-0.0491381,0.00981968,-5.0040432,5.614018e-07
perinck,-0.01107273,0.01947461,-0.5685727,0.5696462


In [33]:
# 使用聚类稳健标准误
library(lmtest)
coeftest(fit_re, vcov = vcovHC) %>% round(digits = 8) %>% tidy()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),2.00197349,0.51408083,3.8942776,0.00011916
beertax,0.04427676,0.12621489,0.3508045,0.72595815
spircons,0.30247114,0.1062759,2.8460935,0.00470191
unrate,-0.0491381,0.00890361,-5.5188982,7e-08
perinck,-0.01107273,0.02453987,-0.451214,0.65213081


老问题，无法直接用聚类稳健标准误，考虑手动实现：

我们先用BE结合FE的方法实现。

In [None]:
# 手动实现随机效应模型
T <- length(unique(traffic$year))
N <- length(unique(traffic$state))
# 估计扰动项
fit_fe <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = pdata_traffic,
  model = "within"
)
# e2的方差考虑调整自由度，修正完毕
e2 <- sum(residuals(fit_fe)^2) /
      (nrow(traffic) - length(fit_fe$coefficients) - N)

# 估计随机效应与扰动项
fit_be <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = pdata_traffic,
  model = "between"
)
# 得到u2+e2/T
u2_plus_e2 <- sum(residuals(fit_be)^2) / (N - length(fit_be$coefficients))

# 估计theta
u2 <- u2_plus_e2 - e2 / T
theta <- 1 - sqrt(e2 / (T * u2 + e2))

# 计算差值
traffic_re_manual <- traffic %>%
  group_by(state) %>%
  mutate(
    across(
      .cols = c(fatal, beertax, spircons, unrate, perinck),
      .fns = ~ .x - theta * mean(.x)
    )
  ) %>%
  ungroup()

# 拟合RE模型
fit_re_manual <- lm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = traffic_re_manual
)

# 使用聚类稳健标准误
CL_re_manual <- vcovCL(fit_re_manual, cluster = ~ state)
coeftest(fit_re_manual, vcov = CL_re_manual) %>% round(digits = 8) %>% tidy()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.28183657,0.07357843,3.8304237,0.00015302
beertax,0.04427676,0.12831892,0.3450525,0.7302741
spircons,0.30247114,0.10804754,2.7994265,0.0054197
unrate,-0.0491381,0.00905203,-5.4284055,1.1e-07
perinck,-0.01107273,0.02494896,-0.4438155,0.65746598


再用OLS结合FE实现一遍。

In [53]:
# 手动实现随机效应模型
T <- length(unique(traffic$year))
N <- length(unique(traffic$state))
# 估计扰动项
fit_fe <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = pdata_traffic,
  model = "within"
)
# 得到e2
e2 <- sum(residuals(fit_fe)^2) /
      (nrow(traffic) - length(fit_fe$coefficients) - N)

# 估计随机效应与扰动项
fit_ols <- lm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = traffic
)
# 得到u2+e2
u2_plus_e2 <- sum(residuals(fit_ols)^2) / (nrow(traffic) - length(fit_ols$coefficients))

# 估计theta
u2 <- u2_plus_e2 - e2
theta <- 1 - sqrt(e2 / u2)

# 计算差值
# 计算差值
traffic_re_manual <- traffic %>%
  group_by(state) %>%
  mutate(
    across(
      .cols = c(fatal, beertax, spircons, unrate, perinck),
      .fns = ~ .x - theta * mean(.x)
    )
  ) %>%
  ungroup()

# 拟合RE模型
fit_re_manual <- lm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = traffic_re_manual
)

# 使用聚类稳健标准误
CL_re_manual <- vcovCL(fit_re_manual, cluster = ~ state)
coeftest(fit_re_manual, vcov = CL_re_manual) %>% round(digits = 8) %>% tidy()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),1.24455736,0.16216231,7.6747633,0.0
beertax,0.1379153,0.1060133,1.3009246,0.19418917
spircons,0.08714016,0.10044609,0.8675316,0.3862793
unrate,-0.05988748,0.01131262,-5.2938658,2.2e-07
perinck,-0.09642134,0.02222958,-4.3375247,1.916e-05


In [150]:
ercomp(
  fatal ~ beertax + spircons + unrate + perinck,
  data = traffic,
  index = c("state", "year"),
  method = "swar"
)

                  var std.dev share
idiosyncratic 0.02458 0.15679 0.124
individual    0.17369 0.41676 0.876
theta: 0.8592

# 组间估计量

In [56]:
# 组间估计量
fit_between <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = pdata_traffic,
  model = "between"
)

tidy(fit_between)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),3.79634303,0.75020251,5.0604243,8.303145e-06
beertax,0.07403619,0.14563334,0.5083739,0.6137895
spircons,0.29975174,0.11281346,2.6570566,0.01101864
unrate,0.03223326,0.03800499,0.8481324,0.4010609
perinck,-0.18417475,0.04222407,-4.3618429,7.923493e-05


# 豪斯曼检验

In [57]:
phtest(fit_fe, fit_re)


	Hausman Test

data:  fatal ~ beertax + spircons + unrate + perinck
chisq = 130.93, df = 4, p-value < 2.2e-16
alternative hypothesis: one model is inconsistent


In [60]:
library(plm)
library(sandwich)

phtest(fit_fe, fit_re, vcov = function(x) vcovHC(x, type = "HC1"))


	Hausman Test

data:  fatal ~ beertax + spircons + unrate + perinck
chisq = 130.93, df = 4, p-value < 2.2e-16
alternative hypothesis: one model is inconsistent


In [63]:
# 使用聚类稳健标准误
library(lmtest)
phtest(fit_fe, fit_re, vcov = function(x) vcovCL(x, cluster = ~ state))


	Hausman Test

data:  fatal ~ beertax + spircons + unrate + perinck
chisq = 130.93, df = 4, p-value < 2.2e-16
alternative hypothesis: one model is inconsistent


In [73]:
# 辅助回归替代豪斯曼检验
T <- length(unique(traffic$year))
N <- length(unique(traffic$state))

# 添加变量-均值项，命名格式为变量_gamma
traffic_re_manual <- traffic %>%
  group_by(state) %>%
  mutate(
    across(
      .cols = c(beertax, spircons, unrate, perinck),
      .fns = ~ .x - mean(.x),
      .names = "{col}_gamma"
    )
  ) %>%
  ungroup()

# 估计扰动项
fit_fe <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = pdata_traffic,
  index = c("state", "year"),
  model = "within"
)

# 得到e2
e2 <- sum(residuals(fit_fe)^2) /
      (nrow(traffic) - length(fit_fe$coefficients) - N)

# 估计随机效应与扰动项
fit_be <- plm(
  fatal ~ beertax + spircons + unrate + perinck,
  data = pdata_traffic,
  index = c("state", "year"),
  model = "between"
)
# 得到u2+e2/T
u2_plus_e2 <- sum(residuals(fit_be)^2) / (N - length(fit_be$coefficients))

# 估计theta
u2 <- u2_plus_e2 - e2 / T
theta <- 1 - sqrt(e2 / (T * u2 + e2))

# 计算差值
traffic_re_manual <- traffic_re_manual %>%
  group_by(state) %>%
  mutate(
    across(
      .cols = c(fatal, beertax, spircons, unrate, perinck),
      .fns = ~ .x - theta * mean(.x)
    )
  ) %>%
  ungroup()

# 拟合RE模型
fit_re_manual <- lm(
  fatal ~ beertax + spircons + unrate + perinck +
          beertax_gamma + spircons_gamma + unrate_gamma + perinck_gamma,
  data = traffic_re_manual
)

# 使用聚类稳健标准误
CL_re_manual <- vcovCL(fit_re_manual, cluster = ~ state)
coeftest(fit_re_manual, vcov = CL_re_manual) %>% round(digits = 8) %>% tidy()

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.53444678,0.10299814,5.1888974,3.7e-07
beertax,0.07403619,0.12199449,0.6068815,0.54435071
spircons,0.29975174,0.09356142,3.2037962,0.00148987
unrate,0.03223326,0.03001884,1.0737678,0.28371846
perinck,-0.18417475,0.04105139,-4.4864435,1.004e-05
beertax_gamma,-0.55810896,0.25930126,-2.1523573,0.03210066
spircons_gamma,0.51721342,0.16525684,3.1297549,0.00190716
unrate_gamma,-0.0612832,0.03205472,-1.9118305,0.05677169
perinck_gamma,0.28888502,0.0634226,4.5549226,7.41e-06


In [79]:
# 看_gamma结尾系数是否同时为0
library(car)
linearHypothesis(
  fit_re_manual,
  c("beertax_gamma = 0", "spircons_gamma = 0", "unrate_gamma = 0", "perinck_gamma = 0"),
  vcov = CL_re_manual,
  test = "Chisq"
)

Unnamed: 0_level_0,Res.Df,Df,Chisq,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
1,331,,,
2,327,4.0,63.42695,5.517458e-13
