-
Notifications
You must be signed in to change notification settings - Fork 28
/
mediate.R
3701 lines (3405 loc) · 150 KB
/
mediate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#' Causal Mediation Analysis
#'
#' 'mediate' is used to estimate various quantities for causal mediation
#' analysis, including average causal mediation effects (indirect effect),
#' average direct effects, proportions mediated, and total effect.
#'
#' @details This is the workhorse function for estimating causal mediation
#' effects for a variety of data types. The average causal mediation effect
#' (ACME) represents the expected difference in the potential outcome when the
#' mediator took the value that would realize under the treatment condition as
#' opposed to the control condition, while the treatment status itself is held
#' constant. That is,
#' \deqn{\delta(t) \ = \ E\{Y(t, M(t_1)) - Y(t, M(t_0))\},}{%
#' \delta(t) = E[Y(t, M(t1)) - Y(t, M(t0))],}
#' where \eqn{t, t_1, t_0}{t, t1, t0} are particular values of the treatment
#' \eqn{T} such that \eqn{t_1 \neq t_0}{t1 != t0}, \eqn{M(t)} is the potential
#' mediator, and \eqn{Y(t,m)} is the potential outcome variable. The average
#' direct effect (ADE) is defined similarly as,
#' \deqn{\zeta(t) \ = \ E\{Y(t_1, M(t)) - Y(t_0, M(t))\},}{%
#' \zeta(t) = E[Y(t1, M(t)) - Y(t0, M(t))],}
#' which represents the expected difference in the potential outcome when the
#' treatment is changed but the mediator is held constant at the value that
#' would realize if the treatment equals \eqn{t}. The two quantities on
#' average add up to the total effect of the treatment on the outcome,
#' \eqn{\tau}. See the references for more details.
#'
#' When both the mediator model ('model.m') and outcome model ('model.y') are
#' normal linear regressions, the results will be identical to the usual LSEM
#' method by Baron and Kenny (1986). The function can, however, accommodate
#' other data types including binary, ordered and count outcomes and mediators
#' as well as censored outcomes. Variables can also be modeled
#' nonparametrically, semiparametrically, or using quantile regression.
#'
#' If it is desired that inference be made conditional on specific values of
#' the pre-treatment covariates included in the model, the `covariates'
#' argument can be used to set those values as a list or data frame. The list
#' may contain either the entire set or any strict subset of the covariates in
#' the model; in the latter case, the effects will be averaged over the other
#' covariates. The `covariates' argument will be particularly useful when the
#' models contain interactions between the covariates and the treatment and/or
#' mediator (known as ``moderated mediation'').
#'
#' The prior weights in the mediator and outcome models are taken as sampling
#' weights and the estimated effects will be weighted averages when non-NULL
#' weights are used in fitting 'model.m' and 'model.y'. This will be useful
#' when data does not come from a simple random sample, for example.
#'
#' As of version 4.0, the mediator model can be of either 'lm', 'glm' (or
#' `bayesglm'), 'polr' (or `bayespolr'), 'gam', 'rq', `survreg', or `merMod'
#' class, corresponding respectively to the linear regression models,
#' generalized linear models, ordered response models, generalized additive
#' models, quantile regression models, parametric duration models, or
#' multilevel models.. For binary response models, the 'mediator' must be a
#' numeric variable with values 0 or 1 as opposed to a factor.
#' Quasi-likelihood-based inferences are not allowed for the mediator model
#' because the functional form must be exactly specified for the estimation
#' algorithm to work. The 'binomial' family can only be used for binary
#' response mediators and cannot be used for multiple-trial responses. This
#' is due to conflicts between how the latter type of models are implemented
#' in \code{\link{glm}} and how 'mediate' is currently written.
#'
#' For the outcome model, the censored regression model fitted via package
#' \code{VGAM} (of class 'vglm' with 'family@vfamily' equal to "tobit") can be
#' used in addition to the models listed above for the mediator. The
#' 'mediate' function is not compatible with censored regression models fitted
#' via other packages. When the quantile regression is used for the outcome
#' model ('rq'), the estimated quantities are quantile causal mediation
#' effects, quantile direct effects and etc., instead of the average effects.
#' If the outcome model is of class 'survreg', the name of the outcome
#' variable must be explicitly supplied in the `outcome' argument. This is due
#' to the fact that 'survreg' objects do not contain that information in an
#' easily extractable form. It should also be noted that for
#' \code{\link{survreg}} models, the \code{\link{Surv}} function must be
#' directly used in the model formula in the call to the survreg function, and
#' that censoring types requiring more than two arguments to Surv (e.g.,
#' interval censoring) are not currently supported by 'mediate'.
#'
#' The quasi-Bayesian approximation (King et al. 2000) cannot be used if
#' 'model.m' is of class 'rq' or 'gam', or if 'model.y' is of class 'gam',
#' 'polr' or 'bayespolr'. In these cases, either an error message is returned
#' or use of the nonparametric bootstrap is forced. Users should note that use
#' of the nonparametric bootstrap often requires significant computing time,
#' especially when 'sims' is set to a large value.
#'
#' The 'control' argument must be provided when 'gam' is used for the outcome
#' model and user wants to allow ACME and ADE to vary as functions of the
#' treatment (i.e., to relax the "no interaction" assumption). Note that the
#' outcome model must be fitted via package \code{\link{mgcv}} with
#' appropriate formula using \code{\link{s}} constructs (see Imai et al. 2009
#' in the references). For other model types, the interaction can be allowed
#' by including an interaction term between \eqn{T} and \eqn{M} in the linear
#' predictor of the outcome model. As of version 3.0, the 'INT' argument is
#' deprecated and the existence of the interaction term is automatically
#' detected (except for 'gam' outcome models).
#'
#' When the treatment variable is continuous or a factor with multiple levels,
#' user must specify the values of \eqn{t_1}{t1} and \eqn{t_0}{t0} using the
#' 'treat.value' and 'control.value' arguments, respectively. The value of
#' \eqn{t} in the above expressions is set to \eqn{t_0}{t0} for 'd0', 'z0',
#' etc. and to \eqn{t_1}{t1} for 'd1', 'z1', etc.
#'
#' @param model.m a fitted model object for mediator. Can be of class 'lm',
#' 'polr', 'bayespolr', 'glm', 'bayesglm', 'gam', 'rq', 'survreg', or
#' 'merMod'.
#' @param model.y a fitted model object for outcome. Can be of class 'lm',
#' 'polr', 'bayespolr', 'glm', 'bayesglm', 'gam', 'vglm', 'rq', 'survreg', or
#' 'merMod'.
#' @param sims number of Monte Carlo draws for nonparametric bootstrap or
#' quasi-Bayesian approximation.
#' @param boot a logical value. if 'FALSE' a quasi-Bayesian approximation is
#' used for confidence intervals; if 'TRUE' nonparametric bootstrap will be
#' used. Default is 'FALSE'.
#' @param boot.ci.type a character string indicating the type of bootstrap
#' confidence intervals. If "bca" and boot = TRUE, bias-corrected and
#' accelerated (BCa) confidence intervals will be estimated. If "perc" and
#' boot = TRUE, percentile confidence intervals will be estimated. Default is
#' "perc".
#' @param conf.level level of the returned two-sided confidence intervals.
#' Default is to return the 2.5 and 97.5 percentiles of the simulated
#' quantities.
#' @param treat a character string indicating the name of the treatment variable
#' used in the models. The treatment can be either binary (integer or a
#' two-valued factor) or continuous (numeric).
#' @param mediator a character string indicating the name of the mediator
#' variable used in the models.
#' @param covariates a list or data frame containing values for a subset of the
#' pre-treatment covariates in 'model.m' and 'model.y'. If provided, the
#' function will return the estimates conditional on those covariate values.
#' @param outcome a character string indicating the name of the outcome variable
#' in `model.y'. Only necessary if 'model.y' is of class 'survreg'; otherwise
#' ignored.
#' @param control a character string indicating the name of the control group
#' indicator. Only relevant if 'model.y' is of class 'gam'. If provided, 'd0',
#' 'z0' and 'n0' are allowed to differ from 'd1', 'z1' and 'n1', respectively.
#' @param control.value value of the treatment variable used as the control
#' condition. Default is 0.
#' @param treat.value value of the treatment variable used as the treatment
#' condition. Default is 1.
#' @param long a logical value. If 'TRUE', the output will contain the entire
#' sets of simulation draws of the the average causal mediation effects,
#' direct effects, proportions mediated, and total effect. Default is 'TRUE'.
#' @param dropobs a logical value indicating the behavior when the model frames
#' of 'model.m' and 'model.y' (and the 'cluster' variable if included) are
#' composed of different observations. If 'TRUE', models will be re-fitted
#' using common data rows. If 'FALSE', error is returned. Default is 'FALSE'.
#' @param robustSE a logical value. If 'TRUE', heteroskedasticity-consistent
#' standard errors will be used in quasi-Bayesian simulations. Ignored if
#' 'boot' is 'TRUE' or neither 'model.m' nor 'model.y' has a method for
#' \code{vcovHC} in the \code{sandwich} package. Default is 'FALSE'.
#' @param cluster a variable indicating clusters for standard errors. Note that
#' this should be a vector of cluster indicators itself, not a character
#' string for the name of the variable.
#' @param group.out a character string indicating the name of the lmer/glmer
#' group on which the mediate output is based. Can be used even when a merMod
#' function is applied to only one of the mediator or the outcome. If merMod
#' functions are applied to both the mediator and the outcome, default is the
#' group name used in the outcome model; if the mediator group and the outcome
#' group are different and the user is interested in the mediate output based
#' on the mediator group, then set group.out to the group name used in the
#' mediator merMod model. If a merMod function is applied to only one of the
#' mediator or the outcome, group.out is automatically set to the group name
#' used in the merMod model.
#' @param use_speed a logical value indicating whether, if nonparametric
#' bootstrap is used, \code{lm} and \code{glm} models should be re-fit using
#' functions from the \code{speedglm} package. Ignored if 'boot' is 'FALSE' or
#' if neither 'model.m' nor 'model.y' is of class 'lm' or 'glm'. Default is
#' 'FALSE'.
#' @param ... other arguments passed to \code{vcovHC} in the \code{sandwich}
#' package: typically the 'type' argument, which is ignored if 'robustSE' is
#' 'FALSE'. Arguments to the \code{boot} in the \code{boot} package may also
#' be passed, e.g. 'parallel' and 'ncpus'.
#'
#' @return \code{mediate} returns an object of class "\code{mediate}",
#' "\code{mediate.order}" if the outcome model used is 'polr' or 'bayespolr',
#' or "\code{mediate.mer}" if 'lmer' or 'glmer' is used for the outcome or the
#' mediator model, a list that contains the components listed below. Some of
#' these elements are not available if 'long' is set to 'FALSE' by the user.
#'
#' The function \code{summary} (i.e., \code{summary.mediate},
#' \code{summary.mediate.order}, or \code{summary.mediate.mer}) can be used to
#' obtain a table of the results. The function \code{plot} (i.e.,
#' \code{plot.mediate}, \code{plot.mediate.order}, or \code{plot.mediate.mer})
#' can be used to produce a plot of the estimated average causal mediation,
#' average direct, and total effects along with their confidence intervals.
#'
#' \item{d0, d1}{point estimates for average causal mediation effects under
#' the control and treatment conditions.}
#' \item{d0.ci, d1.ci}{confidence intervals for average causal mediation
#' effects. The confidence level is set at the value specified in
#' 'conf.level'.}
#' \item{d0.p, d1.p}{two-sided p-values for average causal mediation effects.}
#' \item{d0.sims, d1.sims}{vectors of length 'sims' containing simulation
#' draws of average causal mediation effects.}
#' \item{z0, z1}{point estimates for average direct effect under the control
#' and treatment conditions.}
#' \item{z0.ci, z1.ci}{confidence intervals for average direct effects.}
#' \item{z0.p, z1.p}{two-sided p-values for average causal direct effects.}
#' \item{z0.sims, z1.sims}{vectors of length 'sims' containing simulation
#' draws of average direct effects.}
#' \item{n0, n1}{the "proportions mediated", or the size of the average causal
#' mediation effects relative to the total effect.}
#' \item{n0.ci, n1.ci}{confidence intervals for the proportions mediated.}
#' \item{n0.p, n1.p}{two-sided p-values for proportions mediated.}
#' \item{n0.sims, n1.sims}{vectors of length 'sims' containing simulation
#' draws of the proportions mediated.}
#' \item{tau.coef}{point estimate for total effect.}
#' \item{tau.ci}{confidence interval for total effect.}
#' \item{tau.p}{two-sided p-values for total effect.}
#' \item{tau.sims}{a vector of length 'sims' containing simulation draws of
#' the total effect.}
#' \item{d.avg, z.avg, n.avg}{simple averages of d0 and d1, z0 and z1, n0 and
#' n1, respectively, which users may want to use as summary values when those
#' quantities differ.}
#' \item{d.avg.ci, z.avg.ci, n.avg.ci}{confidence intervals for the above.}
#' \item{d.avg.p, z.avg.p, n.avg.p}{two-sided p-values for the above.}
#' \item{d.avg.sims, z.avg.sims, n.avg.sims}{vectors of length 'sims'
#' containing simulation draws of d.avg, z.avg and n.avg, respectively.}
#' \item{d0.group, d1.group}{group-specific point estimates for average
#' causal mediation effects under the control and treatment conditions.}
#' \item{d0.ci.group, d1.ci.group}{group-specific confidence intervals for
#' average causal mediation effects. The confidence level is set at the value
#' specified in 'conf.level'.}
#' \item{d0.p.group, d1.p.group}{group-specific two-sided p-values for average
#' causal mediation effects.}
#' \item{d0.sims.group, d1.sims.group}{group-specific vectors of length 'sims'
#' containing simulation draws of average causal mediation effects.}
#' \item{z0.group, z1.group}{group-specific point estimates for average direct
#' effect under the control and treatment conditions.}
#' \item{z0.ci.group, z1.ci.group}{group-specific confidence intervals for
#' average direct effects.}
#' \item{z0.p.group, z1.p.group}{group-specific two-sided p-values for average
#' causal direct effects.}
#' \item{z0.sims.group, z1.sims.group}{group-specific vectors of length 'sims'
#' containing simulation draws of average direct effects.}
#' \item{n0.group, n1.group}{the group-specific "proportions mediated", or the
#' size of the group-specific average causal mediation effects relative to the
#' total effect.}
#' \item{n0.ci.group, n1.ci.group}{group-specific confidence intervals for the
#' proportions mediated.}
#' \item{n0.p.group, n1.p.group}{group-specific two-sided p-values for
#' proportions mediated.}
#' \item{n0.sims.group, n1.sims.group}{group-specific vectors of length 'sims'
#' containing simulation draws of the proportions mediated.}
#' \item{tau.coef.group}{group-specific point estimate for total effect.}
#' \item{tau.ci.group}{group-specific confidence interval for total effect.}
#' \item{tau.p.group}{group-specific two-sided p-values for total effect.}
#' \item{tau.sims.group}{a group-specific vector of length 'sims' containing
#' simulation draws of the total effect.}
#' \item{d.avg.group, z.avg.group, n.avg.group}{group-specific simple averages
#' of d0 and d1, z0 and z1, n0 and n1, respectively, which users may want to
#' use as summary values when those quantities differ.}
#' \item{d.avg.ci.group, z.avg.ci.group, n.avg.ci.group}{group-specific
#' confidence intervals for the above.}
#' \item{d.avg.p.group, z.avg.p.group, n.avg.p.group}{group-specific two-sided
#' p-values for the above.}
#' \item{d.avg.sims.group, z.avg.sims.group, n.avg.sims.group}{group-specific
#' vectors of length 'sims' containing simulation draws of d.avg, z.avg and
#' n.avg, respectively.}
#' \item{boot}{logical, the 'boot' argument used.}
#' \item{treat}{a character string indicating the name of the 'treat' variable
#' used.}
#' \item{mediator}{a character string indicating the name of the 'mediator'
#' variable used.}
#' \item{INT}{a logical value indicating whether the model specification
#' allows the effects to differ between the treatment and control conditions.}
#' \item{conf.level}{the confidence level used. }
#' \item{model.y}{the outcome model used.}
#' \item{model.m}{the mediator model used.}
#' \item{group.m}{the name of the mediator group used.}
#' \item{group.y}{the name of the outcome group used.}
#' \item{group.name}{the name of the group on which the output is based.}
#' \item{group.id.m}{the data on the mediator group.}
#' \item{group.id.y}{the data on the outcome group.}
#' \item{group.id}{the data on the group on which the output is based.}
#' \item{control.value}{value of the treatment variable used as the control
#' condition.}
#' \item{treat.value}{value of the treatment variable used as the treatment
#' condition.}
#' \item{nobs}{number of observations in the model frame for 'model.m' and
#' 'model.y'. May differ from the numbers in the original models input to
#' 'mediate' if 'dropobs' was 'TRUE'.}
#' \item{robustSE}{`TRUE' or `FALSE'.}
#' \item{cluster}{the clusters used.}
#'
#' @author Dustin Tingley, Harvard University,
#' \email{dtingley@@gov.harvard.edu}; Teppei Yamamoto, Massachusetts Institute
#' of Technology, \email{teppei@@mit.edu}; Luke Keele, Penn State University,
#' \email{ljk20@@psu.edu}; Kosuke Imai, Princeton University,
#' \email{kimai@@princeton.edu}; Kentaro Hirose, Princeton University,
#' \email{hirose@@princeton.edu}.
#'
#' @seealso \code{\link{medsens}}, \code{\link{plot.mediate}},
#' \code{\link{summary.mediate}}, \code{\link{summary.mediate.mer}},
#' \code{\link{plot.mediate.mer}}, \code{\link{mediations}}, \code{vcovHC}
#'
#' @references Tingley, D., Yamamoto, T., Hirose, K., Imai, K. and Keele, L.
#' (2014). "mediation: R package for Causal Mediation Analysis", Journal of
#' Statistical Software, Vol. 59, No. 5, pp. 1-38.
#'
#' Imai, K., Keele, L., Tingley, D. and Yamamoto, T. (2011). Unpacking the
#' Black Box of Causality: Learning about Causal Mechanisms from Experimental
#' and Observational Studies, American Political Science Review, Vol. 105, No.
#' 4 (November), pp. 765-789.
#'
#' Imai, K., Keele, L. and Tingley, D. (2010) A General Approach to Causal
#' Mediation Analysis, Psychological Methods, Vol. 15, No. 4 (December), pp.
#' 309-334.
#'
#' Imai, K., Keele, L. and Yamamoto, T. (2010) Identification, Inference, and
#' Sensitivity Analysis for Causal Mediation Effects, Statistical Science,
#' Vol. 25, No. 1 (February), pp. 51-71.
#'
#' Imai, K., Keele, L., Tingley, D. and Yamamoto, T. (2009) "Causal Mediation
#' Analysis Using R" in Advances in Social Science Research Using R, ed. H. D.
#' Vinod New York: Springer.
#'
#' @export
#' @examples
#' # Examples with JOBS II Field Experiment
#'
#' # **For illustration purposes a small number of simulations are used**
#'
#' data(jobs)
#'
#' ####################################################
#' # Example 1: Linear Outcome and Mediator Models
#' ####################################################
#' b <- lm(job_seek ~ treat + econ_hard + sex + age, data=jobs)
#' c <- lm(depress2 ~ treat + job_seek + econ_hard + sex + age, data=jobs)
#'
#' # Estimation via quasi-Bayesian approximation
#' contcont <- mediate(b, c, sims=50, treat="treat", mediator="job_seek")
#' summary(contcont)
#' plot(contcont)
#'
#' \dontrun{
#' # Estimation via nonparametric bootstrap
#' contcont.boot <- mediate(b, c, boot=TRUE, sims=50, treat="treat", mediator="job_seek")
#' summary(contcont.boot)
#' }
#'
#' # Allowing treatment-mediator interaction
#' d <- lm(depress2 ~ treat + job_seek + treat:job_seek + econ_hard + sex + age, data=jobs)
#' contcont.int <- mediate(b, d, sims=50, treat="treat", mediator="job_seek")
#' summary(contcont.int)
#'
#' # Allowing ``moderated mediation'' with respect to age
#' b.int <- lm(job_seek ~ treat*age + econ_hard + sex, data=jobs)
#' d.int <- lm(depress2 ~ treat*job_seek*age + econ_hard + sex, data=jobs)
#' contcont.age20 <- mediate(b.int, d.int, sims=50, treat="treat", mediator="job_seek",
#' covariates = list(age = 20))
#' contcont.age70 <- mediate(b.int, d.int, sims=50, treat="treat", mediator="job_seek",
#' covariates = list(age = 70))
#' summary(contcont.age20)
#' summary(contcont.age70)
#'
#' # Continuous treatment
#' jobs$treat_cont <- jobs$treat + rnorm(nrow(jobs)) # (hypothetical) continuous treatment
#' b.contT <- lm(job_seek ~ treat_cont + econ_hard + sex + age, data=jobs)
#' c.contT <- lm(depress2 ~ treat_cont + job_seek + econ_hard + sex + age, data=jobs)
#' contcont.cont <- mediate(b.contT, c.contT, sims=50,
#' treat="treat_cont", mediator="job_seek",
#' treat.value = 4, control.value = -2)
#' summary(contcont.cont)
#'
#' # Categorical treatment
#' \dontrun{
#' b <- lm(job_seek ~ educ + sex, data=jobs)
#' c <- lm(depress2 ~ educ + job_seek + sex, data=jobs)
#'
#' # compare two categories of educ --- gradwk and somcol
#' model.cat <- mediate(b, c, treat="educ", mediator="job_seek", sims=50,
#' control.value = "gradwk", treat.value = "somcol")
#' summary(model.cat)
#' }
#'
#' ######################################################
#' # Example 2: Binary Outcome and Ordered Mediator
#' ######################################################
#' \dontrun{
#' jobs$job_disc <- as.factor(jobs$job_disc)
#' b.ord <- polr(job_disc ~ treat + econ_hard + sex + age, data=jobs,
#' method="probit", Hess=TRUE)
#' d.bin <- glm(work1 ~ treat + job_disc + econ_hard + sex + age, data=jobs,
#' family=binomial(link="probit"))
#' ordbin <- mediate(b.ord, d.bin, sims=50, treat="treat", mediator="job_disc")
#' summary(ordbin)
#'
#' # Using heteroskedasticity-consistent standard errors
#' ordbin.rb <- mediate(b.ord, d.bin, sims=50, treat="treat", mediator="job_disc",
#' robustSE=TRUE)
#' summary(ordbin.rb)
#'
#' # Using non-parametric bootstrap
#' ordbin.boot <- mediate(b.ord, d.bin, sims=50, treat="treat", mediator="job_disc",
#' boot=TRUE)
#' summary(ordbin.boot)
#' }
#'
#' ######################################################
#' # Example 3: Quantile Causal Mediation Effect
#' ######################################################
#' require(quantreg)
#' c.quan <- rq(depress2 ~ treat + job_seek + econ_hard + sex + age, data=jobs,
#' tau = 0.5) # median
#' contquan <- mediate(b, c.quan, sims=50, treat="treat", mediator="job_seek")
#' summary(contquan)
#'
#' ######################################################
#' # Example 4: GAM Outcome
#' ######################################################
#' \dontrun{
#' require(mgcv)
#' c.gam <- gam(depress2 ~ treat + s(job_seek, bs="cr") +
#' econ_hard + sex + age, data=jobs)
#' contgam <- mediate(b, c.gam, sims=10, treat="treat",
#' mediator="job_seek", boot=TRUE)
#' summary(contgam)
#'
#' # With interaction
#' d.gam <- gam(depress2 ~ treat + s(job_seek, by = treat) +
#' s(job_seek, by = control) + econ_hard + sex + age, data=jobs)
#' contgam.int <- mediate(b, d.gam, sims=10, treat="treat", mediator="job_seek",
#' control = "control", boot=TRUE)
#' summary(contgam.int)
#' }
#' ######################################################
#' # Example 5: Multilevel Outcome and Mediator Models
#' ######################################################
#' \dontrun{
#' require(lme4)
#'
#' # educ: mediator group
#' # occp: outcome group
#'
#' # Varying intercept for mediator
#' model.m <- glmer(job_dich ~ treat + econ_hard + (1 | educ),
#' family = binomial(link = "probit"), data = jobs)
#'
#' # Varying intercept and slope for outcome
#' model.y <- glmer(work1 ~ treat + job_dich + econ_hard + (1 + treat | occp),
#' family = binomial(link = "probit"), data = jobs)
#'
#' # Output based on mediator group ("educ")
#' multilevel <- mediate(model.m, model.y, treat = "treat",
#' mediator = "job_dich", sims=50, group.out="educ")
#'
#' # Output based on outcome group ("occp")
#' # multilevel <- mediate(model.m, model.y, treat = "treat",
#' mediator = "job_dich", sims=50)
#'
#' # Group-average effects
#' summary(multilevel)
#'
#' # Group-specific effects organized by effect
#' summary(multilevel, output="byeffect")
#' # plot(multilevel, group.plots=TRUE)
#' # See summary.mediate.mer and plot.mediate.mer for detailed explanations
#'
#' # Group-specific effects organized by group
#' summary(multilevel, output="bygroup")
#' # See summary.mediate.mer for detailed explanations
#' }
mediate <- function(model.m, model.y, sims = 1000,
boot = FALSE, boot.ci.type = "perc",
treat = "treat.name", mediator = "med.name",
covariates = NULL, outcome = NULL,
control = NULL, conf.level = .95,
control.value = 0, treat.value = 1,
long = TRUE, dropobs = FALSE,
robustSE = FALSE, cluster = NULL, group.out = NULL,
use_speed = FALSE, ...){
cl <- match.call()
# Warn users who still use INT option
if(match("INT", names(cl), 0L)){
warning("'INT' is deprecated - existence of interaction terms is now automatically detected from model formulas")
}
# Warning for robustSE and cluster used with boot
if(robustSE && boot){
warning("'robustSE' is ignored for nonparametric bootstrap")
}
if(!is.null(cluster) && boot){
warning("'cluster' is ignored for nonparametric bootstrap")
}
if(robustSE & !is.null(cluster)){
stop("choose either `robustSE' or `cluster' option, not both")
}
if(boot.ci.type != "bca" & boot.ci.type != "perc"){
stop("choose either `bca' or `perc' for boot.ci.type")
}
# Drop observations not common to both mediator and outcome models
if(dropobs){
odata.m <- model.frame(model.m)
odata.y <- model.frame(model.y)
if(!is.null(cluster)){
if(is.null(row.names(cluster)) &
(nrow(odata.m)!=length(cluster) | nrow(odata.y)!=length(cluster))
){
warning("cluster IDs may not correctly match original observations due to missing data")
}
odata.y <- merge(odata.y, as.data.frame(cluster), sort=FALSE,
by="row.names")
rownames(odata.y) <- odata.y$Row.names
odata.y <- odata.y[,-1L]
}
newdata <- merge(odata.m, odata.y, sort=FALSE,
by=c("row.names", intersect(names(odata.m), names(odata.y))))
rownames(newdata) <- newdata$Row.names
newdata <- newdata[,-1L]
rm(odata.m, odata.y)
Call.M <- getCall(model.m)
Call.Y <- getCall(model.y)
Call.M$data <- Call.Y$data <- newdata
if(c("(weights)") %in% names(newdata)){
Call.M$weights <- Call.Y$weights <- model.weights(newdata)
}
model.m <- eval.parent(Call.M)
model.y <- eval.parent(Call.Y)
if(!is.null(cluster)){
cluster <- factor(newdata[, ncol(newdata)]) # factor drops missing levels
}
}
# Model type indicators
isGam.y <- inherits(model.y, "gam")
isGam.m <- inherits(model.m, "gam")
isGlm.y <- inherits(model.y, "glm") # Note gam and bayesglm also inherits "glm"
isGlm.m <- inherits(model.m, "glm") # Note gam and bayesglm also inherits "glm"
isLm.y <- inherits(model.y, "lm") # Note gam, glm and bayesglm also inherit "lm"
isLm.m <- inherits(model.m, "lm") # Note gam, glm and bayesglm also inherit "lm"
isVglm.y <- inherits(model.y, "vglm")
isRq.y <- inherits(model.y, "rq")
isRq.m <- inherits(model.m, "rq")
isOrdered.y <- inherits(model.y, "polr") # Note bayespolr also inherits "polr"
isOrdered.m <- inherits(model.m, "polr") # Note bayespolr also inherits "polr"
isSurvreg.y <- inherits(model.y, "survreg")
isSurvreg.m <- inherits(model.m, "survreg")
isMer.y <- inherits(model.y, "merMod") # Note lmer and glmer do not inherit "lm" and "glm"
isMer.m <- inherits(model.m, "merMod") # Note lmer and glmer do not inherit "lm" and "glm"
# Record family and link of model.m if glmer
if(isMer.m && class(model.m)[[1]] == "glmerMod"){
m.family <- as.character(model.m@call$family)
if(m.family[1] == "binomial" && (is.na(m.family[2]) || m.family[2] == "logit")){
M.fun <- binomial(link = "logit")
} else if(m.family[1] == "binomial" && m.family[2] == "probit"){
M.fun <- binomial(link = "probit")
} else if(m.family[1] == "binomial" && m.family[2] == "cloglog"){
M.fun <- binomial(link = "cloglog")
} else if(m.family[1] == "poisson" && (is.na(m.family[2]) || m.family[2] == "log")){
M.fun <- poisson(link = "log")
} else if(m.family[1] == "poisson" && m.family[2] == "identity"){
M.fun <- poisson(link = "identity")
} else if(m.family[1] == "poisson" && m.family[2] == "sqrt"){
M.fun <- poisson(link = "sqrt")
} else {
stop("glmer family for the mediation model not supported")
} ### gamma & inverse gaussian excluded (no function to estimate parameters of S4 glmer vs. S3 glm)
}
# Record family and link of model.y if glmer
if(isMer.y && class(model.y)[[1]] == "glmerMod"){
y.family <- as.character(model.y@call$family)
if(y.family[1] == "binomial" && (is.na(y.family[2]) || y.family[2] == "logit")){
Y.fun <- binomial(link = "logit")
} else if(y.family[1] == "binomial" && y.family[2] == "probit"){
Y.fun <- binomial(link = "probit")
} else if(y.family[1] == "binomial" && y.family[2] == "cloglog"){
Y.fun <- binomial(link = "cloglog")
} else if(y.family[1] == "poisson" && (is.na(y.family[2]) || y.family[2] == "log")){
Y.fun <- poisson(link = "log")
} else if(y.family[1] == "poisson" && y.family[2] == "identity"){
Y.fun <- poisson(link = "identity")
} else if(y.family[1] == "poisson" && y.family[2] == "sqrt"){
Y.fun <- poisson(link = "sqrt")
} else {
stop("glmer family for the outcome model not supported")
} ### gamma & inverse gaussian excluded (no function to estimate parameters of S4 glmer vs. S3 glm)
}
# Record family of model.m if glm
if(isGlm.m){
FamilyM <- model.m$family$family
}
# Record family of model.m if glmer
if(isMer.m && class(model.m)[[1]] == "glmerMod"){
FamilyM <- M.fun$family
}
# Record vfamily of model.y if vglm (currently only tobit)
if(isVglm.y){
VfamilyY <- model.y@family@vfamily
}
# Warning for unused options
if(!is.null(control) && !isGam.y){
warning("'control' is only used for GAM outcome models - ignored")
control <- NULL
}
if(!is.null(outcome) && !(isSurvreg.y && boot)){
warning("'outcome' is only relevant for survival outcome models with bootstrap - ignored")
}
# Model frames for M and Y models
m.data <- model.frame(model.m) # Call.M$data
y.data <- model.frame(model.y) # Call.Y$data
if(!is.null(cluster)){
row.names(m.data) <- 1:nrow(m.data)
row.names(y.data) <- 1:nrow(y.data)
if(!is.null(model.m$weights)){
m.weights <- as.data.frame(model.m$weights)
m.name <- as.character(model.m$call$weights)
names(m.weights) <- m.name
m.data <- cbind(m.data, m.weights)
}
if(!is.null(model.y$weights)){
y.weights <- as.data.frame(model.y$weights)
y.name <- as.character(model.y$call$weights)
names(y.weights) <- y.name
y.data <- cbind(y.data, y.weights)
}
}
# group-level mediator
if(isMer.y & !isMer.m){
m.data <- eval(model.m$call$data, environment(formula(model.m))) ### add group ID to m.data
m.data <- na.omit(m.data)
y.data <- na.omit(y.data)
}
# Specify group names
if(isMer.m && isMer.y){
med.group <- names(model.m@flist)
out.group <- names(model.y@flist)
n.med <- length(med.group)
n.out <- length(out.group)
if(n.med > 1 || n.out > 1){
stop("mediate does not support more than two levels per model")
} else {
group.m <- med.group
group.y <- out.group
if(!is.null(group.out) && !(group.out %in% c(group.m, group.y))){
warning("group.out does not match group names used in merMod")
} else if(is.null(group.out)){
group.out <- group.y
}
}
} else if(!isMer.m && isMer.y){
out.group <- names(model.y@flist)
n.out <- length(out.group)
if(n.out > 1){
stop("mediate does not support more than two levels per model")
} else {
group.m <- NULL
group.y <- out.group
group.out <- group.y
}
} else if(isMer.m && !isMer.y){
med.group <- names(model.m@flist)
n.med <- length(med.group)
if(n.med > 1){
stop("mediate does not support more than two levels per model")
} else {
group.m <- med.group
group.y <- NULL
group.out <- group.m
}
} else {
group.m <- NULL
group.y <- NULL
group.out <- NULL
}
# group data if lmer or glmer
if(isMer.m){
group.id.m <- m.data[,group.m]
} else {
group.id.m <- NULL
}
if(isMer.y){
group.id.y <- y.data[,group.y]
} else {
group.id.y <- NULL
}
# group data to be output in summary and plot if lmer or glmer
if(isMer.y && isMer.m){
if(group.out == group.m){
group.id <- m.data[,group.m]
group.name <- group.m
} else {
group.id <- y.data[,group.y]
group.name <- group.y
}
} else if(!isMer.y && isMer.m){
group.id <- m.data[,group.m]
group.name <- group.m
} else if(isMer.y && !isMer.m){ ### group-level mediator
if(!(group.y %in% names(m.data))){
stop("specify group-level variable in mediator data")
} else {
group.id <- y.data[,group.y]
group.name <- group.y
Y.ID<- sort(unique(group.id))
if(is.character(m.data[,group.y])){
M.ID <- sort(as.factor(m.data[,group.y]))
} else {
M.ID <- sort(as.vector(data.matrix(m.data[group.y])))
}
if(length(Y.ID) != length(M.ID)){
stop("groups do not match between mediator and outcome models")
} else {
if(FALSE %in% unique(Y.ID == M.ID)){
stop("groups do not match between mediator and outcome models")
}
}
}
} else {
group.id <- NULL
group.name <- NULL
}
# Numbers of observations and categories
n.m <- nrow(m.data)
n.y <- nrow(y.data)
if(!(isMer.y & !isMer.m)){ ### n.y and n.m are different when group-level mediator is used
if(n.m != n.y){
stop("number of observations do not match between mediator and outcome models")
} else{
n <- n.m
}
m <- length(sort(unique(model.frame(model.m)[,1])))
}
# Extracting weights from models
weights.m <- model.weights(m.data)
weights.y <- model.weights(y.data)
if(!is.null(weights.m) && isGlm.m && FamilyM == "binomial"){
message("weights taken as sampling weights, not total number of trials")
}
if(!is.null(weights.m) && isMer.m && class(model.m)[[1]] == "glmerMod" && FamilyM == "binomial"){
message("weights taken as sampling weights, not total number of trials")
}
if(is.null(weights.m)){
weights.m <- rep(1,nrow(m.data))
}
if(is.null(weights.y)){
weights.y <- rep(1,nrow(y.data))
}
if(!(isMer.y & !isMer.m)){
if(!all(weights.m == weights.y)) {
stop("weights on outcome and mediator models not identical")
} else {
weights <- weights.m
}
} else{
weights <- weights.y ### group-level mediator
}
# Convert character treatment to factor
if(is.character(m.data[,treat])){
m.data[,treat] <- factor(m.data[,treat])
}
if(is.character(y.data[,treat])){
y.data[,treat] <- factor(y.data[,treat])
}
# Convert character mediator to factor
if(is.character(y.data[,mediator])){
y.data[,mediator] <- factor(y.data[,mediator])
}
# Factor treatment indicator
isFactorT.m <- is.factor(m.data[,treat])
isFactorT.y <- is.factor(y.data[,treat])
if(isFactorT.m != isFactorT.y){
stop("treatment variable types differ in mediator and outcome models")
} else {
isFactorT <- isFactorT.y
}
if(isFactorT){
t.levels <- levels(y.data[,treat])
if(treat.value %in% t.levels & control.value %in% t.levels){
cat.0 <- control.value
cat.1 <- treat.value
} else {
cat.0 <- t.levels[1]
cat.1 <- t.levels[2]
warning("treatment and control values do not match factor levels; using ", cat.0, " and ", cat.1, " as control and treatment, respectively")
}
} else {
cat.0 <- control.value
cat.1 <- treat.value
}
# Factor mediator indicator
isFactorM <- is.factor(y.data[,mediator])
if(isFactorM){
m.levels <- levels(y.data[,mediator])
}
# Eventually, we want to use only objects collected in K,
# passing K into intermediate functions, and clean up all stray objects.
K <- as.list(environment())
# rm(list = ls())
############################################################################
############################################################################
### CASE I: EVERYTHING EXCEPT ORDERED OUTCOME
############################################################################
############################################################################
if (!isOrdered.y) {
########################################################################
## Case I-1: Quasi-Bayesian Monte Carlo
########################################################################
if(!boot){
# Error if gam outcome or quantile mediator
if(isGam.m | isGam.y | isRq.m){
stop("'boot' must be 'TRUE' for models used")
}
# Get mean and variance parameters for mediator simulations
if(isSurvreg.m && is.null(survival::survreg.distributions[[model.m$dist]]$scale)){
MModel.coef <- c(coef(model.m), log(model.m$scale))
scalesim.m <- TRUE
} else if(isMer.m){
MModel.fixef <- lme4::fixef(model.m)
MModel.ranef <- lme4::ranef(model.m)
scalesim.m <- FALSE
} else {
MModel.coef <- coef(model.m)
scalesim.m <- FALSE
}
if(isOrdered.m){
if(is.null(model.m$Hess)){
cat("Mediator model object does not contain 'Hessian';")
}
k <- length(MModel.coef)
MModel.var.cov <- vcov(model.m)[(1:k),(1:k)]
} else if(isSurvreg.m){
MModel.var.cov <- vcov(model.m)
} else {
if(robustSE & !isMer.m){
MModel.var.cov <- vcovHC(model.m, ...)
} else if(robustSE & isMer.m){
MModel.var.cov <- vcov(model.m)
warning("robustSE does not support mer class: non-robust SEs are computed for model.m")
} else if(!is.null(cluster)){
if(nrow(m.data)!=length(cluster)){
warning("length of cluster vector differs from # of obs for mediator model")
}
dta <- merge(m.data, as.data.frame(cluster), sort=FALSE,
by="row.names")
fm <- update(model.m, data=dta)
MModel.var.cov <- sandwich::vcovCL(fm, dta[,ncol(dta)])
} else {
MModel.var.cov <- vcov(model.m)
}
}
# Get mean and variance parameters for outcome simulations
if(isSurvreg.y && is.null(survival::survreg.distributions[[model.y$dist]]$scale)){
YModel.coef <- c(coef(model.y), log(model.y$scale))
scalesim.y <- TRUE # indicates if survreg scale parameter is simulated
} else if(isMer.y){
YModel.fixef <- lme4::fixef(model.y)
YModel.ranef <- lme4::ranef(model.y)
scalesim.y <- FALSE
} else {
YModel.coef <- coef(model.y)
scalesim.y <- FALSE
}
if(isRq.y){
YModel.var.cov <- summary(model.y, covariance=TRUE)$cov
} else if(isSurvreg.y){
YModel.var.cov <- vcov(model.y)
} else {
if(robustSE & !isMer.y){
YModel.var.cov <- vcovHC(model.y, ...)
} else if(robustSE & isMer.y){
YModel.var.cov <- vcov(model.y)
warning("robustSE does not support mer class: non-robust SEs are computed for model.y")
} else if(!is.null(cluster)){
if(nrow(y.data)!=length(cluster)){
warning("length of cluster vector differs from # of obs for outcome model")
}
dta <- merge(y.data, as.data.frame(cluster), sort=FALSE,
by="row.names")
fm <- update(model.y, data=dta)
YModel.var.cov <- sandwich::vcovCL(fm, dta[,ncol(dta)])
} else {
YModel.var.cov <- vcov(model.y)
}
}
# Draw model coefficients from normal
se.ranef.new <- function (object) {
se.bygroup <- lme4::ranef(object, condVar = TRUE)
n.groupings <- length(se.bygroup)
for (m in 1:n.groupings) {
vars.m <- attr(se.bygroup[[m]], "postVar")
K <- dim(vars.m)[1]
J <- dim(vars.m)[3]
names.full <- dimnames(se.bygroup[[m]])
se.bygroup[[m]] <- array(NA, c(J, K))
for (j in 1:J) {
se.bygroup[[m]][j, ] <- sqrt(diag(as.matrix(vars.m[,
, j])))
}
dimnames(se.bygroup[[m]]) <- list(names.full[[1]], names.full[[2]])
}
return(se.bygroup)
}
if(isMer.m){
MModel.fixef.vcov <- as.matrix(vcov(model.m))
MModel.fixef.sim <- rmvnorm(sims,mean=MModel.fixef,sigma=MModel.fixef.vcov)
Nm.ranef <- ncol(lme4::ranef(model.m)[[1]])
MModel.ranef.sim <- vector("list",Nm.ranef)
for (d in 1:Nm.ranef){
MModel.ranef.sim[[d]] <- matrix(rnorm(sims*nrow(lme4::ranef(model.m)[[1]]), mean = lme4::ranef(model.m)[[1]][,d], sd = se.ranef.new(model.m)[[1]][,d]), nrow = sims, byrow = TRUE)
}
} else {
if(sum(is.na(MModel.coef)) > 0){
stop("NA in model coefficients; rerun models with nonsingular design matrix")
}
MModel <- rmvnorm(sims, mean=MModel.coef, sigma=MModel.var.cov)
}
if(isMer.y){
YModel.fixef.vcov <- as.matrix(vcov(model.y))
YModel.fixef.sim <- rmvnorm(sims,mean=YModel.fixef,sigma=YModel.fixef.vcov)
Ny.ranef <- ncol(lme4::ranef(model.y)[[1]])
YModel.ranef.sim <- vector("list",Ny.ranef)
for (d in 1:Ny.ranef){
YModel.ranef.sim[[d]] <- matrix(rnorm(sims*nrow(lme4::ranef(model.y)[[1]]), mean = lme4::ranef(model.y)[[1]][,d], sd = se.ranef.new(model.y)[[1]][,d]), nrow = sims, byrow = TRUE)
}
} else {
if(sum(is.na(YModel.coef)) > 0){
stop("NA in model coefficients; rerun models with nonsingular design matrix")
}
YModel <- rmvnorm(sims, mean=YModel.coef, sigma=YModel.var.cov)
}
if(robustSE && (isSurvreg.m | isSurvreg.y)){
warning("`robustSE' ignored for survival models; fit the model with `robust' option instead\n")
}
if(!is.null(cluster) && (isSurvreg.m | isSurvreg.y)){
warning("`cluster' ignored for survival models; fit the model with 'cluster()' term in the formula\n")
}
#####################################
## Mediator Predictions
#####################################
### number of observations are different when group-level mediator is used
if(isMer.y & !isMer.m){
n <- n.m
}
pred.data.t <- pred.data.c <- m.data
if(isFactorT){
pred.data.t[,treat] <- factor(cat.1, levels = t.levels)
pred.data.c[,treat] <- factor(cat.0, levels = t.levels)
} else {
pred.data.t[,treat] <- cat.1
pred.data.c[,treat] <- cat.0
}
if(!is.null(covariates)){
for(p in 1:length(covariates)){
vl <- names(covariates[p])
if(is.factor(pred.data.t[,vl])){
pred.data.t[,vl] <- pred.data.c[,vl] <- factor(covariates[[p]], levels = levels(m.data[,vl]))
} else {
pred.data.t[,vl] <- pred.data.c[,vl] <- covariates[[p]]
}
}
}