Made comparison statements into a function <<predictInterval.test()>> that takes a model formula, model data, and prediction data as arguments, with the ability to specify a unique observation id variable and pass further arguments to lmer.

I think it works, but the results should be checked by hand at least once to double-check.
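
For reference, a call using the canonical verbal aggression example defined in the script would look something like this (the REML flag is only an illustration of passing further arguments through to lmer; it is not part of this commit):

predictInterval.test(m2.form, m2.df, m2.new.df, idvar = NULL, REML = FALSE)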
carlbfrederick committed May 13, 2015
1 parent 0fa90db commit 2c0cdf9
Showing 1 changed file with 156 additions and 147 deletions: tests/testthat/Compare_bootMer_KF.R
@@ -17,95 +17,23 @@
data(sleepstudy)
data(VerbAgg)

#Build out model
m1.form <- Reaction ~ Days + (Days | Subject)
m1.df <- sleepstudy
m1.new.df <- m1.df

#Estimate model
m1.estimation.time <- system.time(
m1 <- lmer(m1.form, data=m1.df)
)
print(m1)

#Step 1: Get bootMer... gold standard ####
##Define function for parameter extraction (inspired from bootMer help file)
##Pulling:
## Individual level predicted values conditional on ALL random effects
mySumm <- function(.) {
predict(., newdata=m1.new.df, re.form=NULL, type="link")
}

##Maybe try all three (valid) ways listed in the bootMer doc, in the order given in its Details section
boot1.time <- system.time(
boot1.m1 <- bootMer(m1, mySumm, nsim=1000,
use.u=FALSE, type="parametric",
.progress = "txt", PBargs=list(style=3))
)

boot2.time <- system.time(
boot2.m1 <- bootMer(m1, mySumm, nsim=1000,
use.u=TRUE, type="parametric",
.progress = "txt", PBargs=list(style=3))
)

boot3.time <- system.time(
boot3.m1 <- bootMer(m1, mySumm, nsim=1000,
use.u=TRUE, type="semiparametric",
.progress = "txt", PBargs=list(style=3))
)

sumBoot <- function(merBoot) {
data.frame(merBoot$data,
median = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.5, na.rm=TRUE))),
lwr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.025, na.rm=TRUE))),
upr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.975, na.rm=TRUE))),
colNAs = apply(merBoot$t, 2, function(x) sum(is.na(x)))
)
}

#Step 2: Our method ####
##"sub" functions
source("F:/RSandbox/merTools/R/merExtract.R")

reMatrixExtract <- function(model, group, case){
slotNum <- ranef(model)[[group]]
slotNum$seq <- 1:nrow(slotNum)
slotNum <- slotNum[case, ]$seq
mat <- as.matrix(attr(ranef(model, condVar=TRUE)[[group]],
which = "postVar")[, , slotNum])
row.names(mat) <- names(ranef(model)[[group]])
colnames(mat) <- row.names(mat)
return(mat)
}
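##(e.g., for the sleepstudy model m1, reMatrixExtract(m1, "Subject", "308") should
## return the 2x2 conditional covariance matrix of the intercept and Days slope for
## Subject 308; illustrative call, not part of this commit)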

reMeansExtract <- function(model, group, case){
rerow <- ranef(model)[[group]][case, ]
out <- as.numeric(rerow)
names(out) <- names(ranef(model)[[group]])
return(out)
}

combineREFE <- function(reSim, betaSim){
comboCols <- intersect(colnames(reSim), colnames(betaSim))
REonly <- setdiff(colnames(reSim), colnames(betaSim))
FEonly <- setdiff(colnames(betaSim), colnames(reSim))
totCoef <- cbind(reSim[, comboCols] + betaSim[, comboCols])
colnames(totCoef) <- comboCols
totCoef <- cbind(totCoef, reSim[, REonly])
totCoef <- cbind(totCoef, betaSim[, FEonly])
return(totCoef)
}
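
##Illustrative sketch (not part of this commit) of how the helpers above could be
##combined for one grouping factor and case, mirroring the mvtnorm dependency of
##predictInterval() below; nsim and the "Subject"/"308" case are assumptions
nsim <- 1000
betaSim <- mvtnorm::rmvnorm(nsim, mean = fixef(m1), sigma = as.matrix(vcov(m1)))
colnames(betaSim) <- names(fixef(m1))
reSim <- mvtnorm::rmvnorm(nsim,
                          mean = reMeansExtract(m1, "Subject", "308"),
                          sigma = reMatrixExtract(m1, "Subject", "308"))
colnames(reSim) <- names(ranef(m1)[["Subject"]])
totCoefSim <- combineREFE(reSim, betaSim)  #nsim draws of combined (fixed + random) coefficients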

##Main Function
#Canonical models for testing purposes
##1) Sleepstudy eg - lmer with random slope and random intercept
m1.form <- Reaction ~ Days + (Days | Subject)
m1.df <- sleepstudy
m1.new.df <- m1.df
##2) Verbal Aggression eg - lmer (could also do glmer(logit)) with 2 grouping factors
m2.form <- r2 ~ (Anger + Gender + btype + situ)^2 + (1|id) + (1|item)
m2.df <- VerbAgg
m2.new.df <- m2.df

#Step 1: Our method ####
predictInterval <- function(model, newdata, level = 0.95){
#Depends
require(mvtnorm)
require(lme4)
#Prep Output
outs <- data.frame(fit = rep(NA, nrow(newdata)),
lwr = rep(NA, nrow(newdata)),
upr = rep(NA, nrow(newdata)))
outs <- newdata

#Sort out all the levels
reTerms <- names(ngrps(model))
@@ -138,79 +66,160 @@
yhat <- fixed.xb + apply(simplify2array(reSim), c(1,2), sum)

#Output prediction intervals
outs$fit <- apply(yhat,1,mean)
outs$fit <- apply(yhat,1,function(x) as.numeric(quantile(x, .5)))
outs$upr <- apply(yhat,1,function(x) as.numeric(quantile(x, 1 - ((1-level)/2))))
outs$lwr <- apply(yhat,1,function(x) as.numeric(quantile(x, ((1-level)/2))))
#Close it out
return(outs)
}
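##Note: with the default level = 0.95, the interval runs from the 0.025 to the 0.975
##quantile of the simulated yhat for each row of newdata, with fit reported as the median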

kf.time <- system.time(
kf.method <- predictInterval(m1, m1.new.df)
)
#Step 2: Unit Test Function####
predictInterval.test <- function(model.form, model.df, predict.df, idvar=NULL, ...) {
require(lme4); require(dplyr); require(tidyr); require(ggplot2);
##Estimate model
modelEstimation.time <- system.time(
m1 <- lmer(model.form, data=model.df, ...)
)
##If predict.df does not already have a unique row identifier, create one;
##otherwise treat the column named in idvar as that identifier
if (is.null(idvar)) {
predict.df$.newID <- paste("newID", rownames(predict.df), sep="")
} else {
predict.df$.newID <- predict.df[[idvar]]
}
##Functions for bootMer() and objects
####Return predicted values from bootstrap
mySumm <- function(.) {
predict(., newdata=predict.df, re.form=NULL, type="link")
}
####Collapse bootstrap into median, 95% PI
sumBoot <- function(merBoot) {
data.frame(merBoot$data,
median = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.5, na.rm=TRUE))),
lwr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.025, na.rm=TRUE))),
upr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.975, na.rm=TRUE))),
colNAs = apply(merBoot$t, 2, function(x) sum(is.na(x)))
)
}
##Bootstrap
####Method 1: parametric, re-estimate BLUPS
cat("\n Bootstrap Method 1")
boot1.time <- system.time(
boot1 <- bootMer(m1, mySumm, nsim=1000,
use.u=FALSE, type="parametric",
.progress = "txt", PBargs=list(style=3))
)
####Method 2: parametric, conditional on estimated BLUPS
cat("\n Bootstrap Method 2")
boot2.time <- system.time(
boot2 <- bootMer(m1, mySumm, nsim=1000,
use.u=TRUE, type="parametric",
.progress = "txt", PBargs=list(style=3))
)
####Method 3: semiparametric (draw from resid), conditional on estimated BLUPS
cat("\n Bootstrap Method 3")
boot3.time <- system.time(
boot3 <- bootMer(m1, mySumm, nsim=1000,
use.u=TRUE, type="semiparametric",
.progress = "txt", PBargs=list(style=3))
)
##Our Method
kf.time <- system.time(
kf.method <- predictInterval(m1, predict.df)
)
##Compare Times
compare.time <- rbind(modelEstimation.time, kf.time, boot1.time, boot2.time, boot3.time)

##Summarize and compare results
boot1.sum <- sumBoot(boot1)
boot2.sum <- sumBoot(boot2)
boot3.sum <- sumBoot(boot3)

eval <- merge(predict.df, boot1.sum) %>%
rename(boot1.fit=median, boot1.lwr=lwr, boot1.upr=upr)
eval <- merge(eval, boot2.sum) %>%
rename(boot2.fit=median, boot2.lwr=lwr, boot2.upr=upr)
eval <- merge(eval, boot3.sum) %>%
rename(boot3.fit=median, boot3.lwr=lwr, boot3.upr=upr)
eval <- merge(eval, kf.method) %>%
rename(KF.fit=fit, KF.lwr=lwr, KF.upr=upr)

#Check if nrow(eval) still equals nrow(predict.df) because it should
if (nrow(eval)!=nrow(predict.df)) {
stop("Something happened when merging bootstrap summaries together ...")
}

##Compare Times
compare.time <- rbind(kf.time, boot1.time, boot2.time, boot3.time)
rm(kf.time, boot1.time, boot2.time, boot3.time)
##Add lmer yhats (predict.merMod) on there
eval$with.u <- predict(m1, newdata=predict.df, re.form=NULL, type="link")
eval$no.u <- predict(m1, newdata=predict.df, re.form=NA, type="link")
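####(re.form=NULL conditions these predictions on all estimated random effects;
#### re.form=NA sets them to zero, i.e. fixed-effects-only, population-level predictions)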

##Diagnostic plots
####Data prep
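####(e.g. the column "boot1.fit" is split into simMethod = "boot1" and stat = "fit";
#### the small x offsets computed below jitter each method horizontally around its
#### observation index so the four methods' intervals sit side by side in p1)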
Eval <- sample_n(eval, 30) %>%
gather(var, value, starts_with("boot"), starts_with("KF")) %>%
mutate(
simMethod = sub("[[:punct:]][0-9A-Za-z]*", "", var),
stat = sub("[0-9A-Za-z]*[[:punct:]]", "", var)
) %>%
select(-var) %>%
spread(stat, value) %>%
group_by(.newID) %>%
arrange(simMethod) %>%
mutate(
x=row_number(simMethod),
x=(x-mean(x))/10
) %>%
group_by(simMethod) %>%
mutate(
x=row_number(.newID)+x
)


####Direct Comparison Plot
p1 <- ggplot(aes(y=fit,x=x, color=simMethod), data=Eval) +
geom_point(size=I(3)) +
geom_linerange(aes(ymax=upr, ymin=lwr), size=I(1)) +
geom_point(shape="with.u", color="black", size=I(4),
data=summarize(group_by(Eval, .newID), x=mean(x), fit=mean(with.u))) +
geom_point(shape="no.u", color="black", size=I(4),
data=summarize(group_by(Eval, .newID), x=mean(x), fit=mean(no.u))) +
theme_bw() +
labs(x="Index", y="Prediction Interval", title="95% Prediction interval by method for 30 random obs")

#####Distribution of fitted values
p2 <- ggplot(aes(x=fit), data=Eval) +
geom_density(aes(color=simMethod)) +
geom_vline(xintercept = mean(m1@resp$y)) +
geom_density(data=summarize(
group_by_(
cbind(y=m1@resp$y, as.data.frame(m1@flist)),
names(m1@flist)),
fit = mean(y)), color="black") +
theme_bw() +
labs(x = "Estimate", y="Density",
title="Average point estimates by prediction type \n (vertical black line is sample grand mean, \n black density is distribution of sample group means)")


##Close it out
return(
list(
compareTimes=compare.time,
bootstraps = list(boot1, boot2, boot3),
kf.method = kf.method,
model=m1,
evalData = eval,
plot.directCompare = p1,
plot.pointEstimateDist = p2
)
)
}

#Step 3: Summarize and Compare Results####
##Summarize and Combine
boot1.sum <- sumBoot(boot1.m1)
boot1.sum$.new.id <- 1:nrow(boot1.sum)
boot1.sum <- select(boot1.sum, .new.id, fit=median, lwr, upr) %>%
gather(stat, Boot.1, fit, lwr, upr)

boot2.sum <- sumBoot(boot2.m1)
boot2.sum$.new.id <- 1:nrow(boot2.sum)
boot2.sum <- select(boot2.sum, .new.id, fit=median, lwr, upr) %>%
gather(stat, Boot.2, fit, lwr, upr)

boot3.sum <- sumBoot(boot3.m1)
boot3.sum$.new.id <- 1:nrow(boot3.sum)
boot3.sum <- select(boot3.sum, .new.id, fit=median, lwr, upr) %>%
gather(stat, Boot.3, fit, lwr, upr)

kf.method$.new.id <- 1:nrow(kf.method)
kf.method <- select(kf.method, .new.id, fit, lwr, upr) %>%
gather(stat, KF, fit, lwr, upr)

eval <- merge(boot1.sum, boot2.sum, by=c(".new.id", "stat"))
eval <- merge(eval, boot3.sum, by=c(".new.id","stat"))
eval <- merge(eval, kf.method, by=c(".new.id","stat"))

eval <- eval %>%
gather(Sim.Method, value, Boot.1, Boot.2, Boot.3, KF) %>%
spread(stat, value)

eval$.new.id[eval$Sim.Method=="Boot.1"] <- eval$.new.id[eval$Sim.Method=="Boot.1"] - .15
eval$.new.id[eval$Sim.Method=="Boot.2"] <- eval$.new.id[eval$Sim.Method=="Boot.2"] - .05
eval$.new.id[eval$Sim.Method=="Boot.3"] <- eval$.new.id[eval$Sim.Method=="Boot.3"] + .05
eval$.new.id[eval$Sim.Method=="KF"] <- eval$.new.id[eval$Sim.Method=="KF"] + .15

#Gen both point predictions from predict.merMod for reference
m1.new.df$.new.id <- 1:nrow(m1.new.df)
m1.new.df$with.u <- predict(m1, newdata=m1.new.df, re.form=NULL, type="link")
m1.new.df$no.u <- predict(m1, newdata=m1.new.df, re.form=NA, type="link")
m1.new.df <- gather(m1.new.df, pred.merMod.type, fit, with.u, no.u) %>%
arrange(.new.id)

#Diagnostic plots
ggplot(aes(y=fit,x=.new.id, color=Sim.Method), data=eval[1:100,]) +
geom_point(size=I(3)) +
geom_linerange(aes(ymax=upr, ymin=lwr), size=I(1)) +
geom_point(aes(shape=pred.merMod.type), data=m1.new.df[1:50,], color="black") +
theme_bw() +
labs(x="Index", y="Prediction Interval", title="95% Prediction interval by method for select individuals")


ggplot(aes(x=fit), data=eval) +
geom_density(aes(color=Sim.Method)) +
geom_vline(xintercept = mean(test$y)) +
geom_density(data=summarize(group_by(test, d), fit = mean(y)), color="black") +
theme_bw() +
labs(x = "Estimate", y="Density",
title="Average point estimates by prediction type \n (vertical black line is sample grand mean, \n black density is distribution of sample group means)")
#debug(predictInterval.test)
cannonical.1 <- predictInterval.test(m1.form, m1.df, m1.new.df)
cannonical.1$plot.directCompare

cannonical.2 <- predictInterval.test(m2.form, m2.df, m2.new.df)
cannonical.2$plot.directCompare
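
#Other elements of the returned list can be inspected the same way, e.g.
#(illustrative, not in the original script):
#cannonical.1$compareTimes            #timing comparison across methods
#cannonical.1$plot.pointEstimateDist  #density plot of point estimates
#head(cannonical.1$evalData)          #merged bootstrap / predictInterval results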

save.image()


