refactor predictInterval for memory efficiency

jknowles · Jun 18, 2015 · 1cd075f · 1cd075f
1 parent fd36a07
commit 1cd075f
Show file tree

Hide file tree

Showing 7 changed files with 133 additions and 68 deletions.
diff --git a/R/helpers.R b/R/helpers.R
@@ -0,0 +1,16 @@
+#Helpers
+
+# Function to take only rows that form distinct levels of factors
+trimModelFrame <- function(data){
+  # Identify numerics
+  nums <- sapply(data, is.numeric)
+  vars <- names(nums[!nums == TRUE])
+  dataList <- vector(mode = "list", length = length(vars))
+  names(dataList) <- vars
+    for(i in vars){
+      dataList[[i]] <- data[!duplicated(data[, i]),]
+    }
+    newdat <- do.call(rbind, dataList)
+    newdat <- newdat[!duplicated(newdat),]
+    return(newdat)
+}
diff --git a/R/merPredict.R b/R/merPredict.R
@@ -47,38 +47,40 @@ predictInterval <- function(model, newdata, level = 0.95,
     stop("    Prediction for this NLMMs or GLMMs that are not mixed logistic regressions is not yet implemented.")
   }
 
-  ##This chunk of code draws from empirical bays distributions for each level of the random effects
+  ##This chunk of code draws from empirical bayes distributions for each level of the random effects
   ##and merges it back onto newdata.
   ##
   ##Right now I am not multiplying the BLUP variance covariance matrices by our
   ##draw of sigma (for linear models) because their variation is unique.  If anything,
   ##this is where one would multiply them by draws of theta from the model.
   reTerms <- names(ngrps(model))
   n.reTerms = length(reTerms)
-  reSim <- NULL
+  reSimA <- array(data = NA, dim = c(nrow(newdata), nsim, n.reTerms))
   for (j in seq_along(reTerms)) {
     group=reTerms[j]
     reMeans <- array(ranef(model)[[group]])
     reMatrix <- attr(ranef(model, condVar=TRUE)[[group]], which = "postVar")
-    reSim[[group]] <- data.frame(rownames(reMeans), matrix(NA, nrow=nrow(reMeans),
-                                                           ncol=nsim))
-    colnames(reSim[[group]]) <- c(group, paste("sim", 1:nsim, sep=""))
+    tmp <- data.frame(rownames(reMeans), matrix(NA, nrow=nrow(reMeans),
+                                                ncol=nsim))
+    colnames(tmp) <- c(group, paste("sim", 1:nsim, sep=""))
     for (k in 1:nrow(reMeans)) {
-      lvl = rownames(reMeans)[k]
-      reSim[[group]][k,2:ncol(reSim[[group]])] <- mvtnorm::rmvnorm(nsim,
-                                                          mean=as.matrix(reMeans[k,]),
-                                                          sigma=as.matrix(reMatrix[,,k]))
+        meanTmp <- as.matrix(reMeans[k, ])
+        matrixTmp <- as.matrix(reMatrix[,,k])
+        tmp[k, 2:ncol(tmp)] <- mvtnorm::rmvnorm(nsim,
+                                             mean=meanTmp,
+                                             sigma=matrixTmp)
     }
-    cnames <- colnames(reSim[[group]])
+    cnames <- colnames(tmp)
     #This is where to check for groups that only exist in newdata
-    new.levels <- setdiff(newdata[,group], reSim[[group]][,1])
+    new.levels <- setdiff(newdata[,group], tmp[,1])
     if (length(new.levels)>0) {
       msg <- paste("     The following levels of ", group, " from newdata -- ", paste0(new.levels, collapse=", "),
                    " -- are not in the model data. \n     Currently, predictions for these values are based only on the fixed coefficients \n     and the observation-level error.", sep="")
       warning(msg, call.=FALSE)
     }
-    reSim[[group]] <- merge(newdata, reSim[[group]], by=group, all.x=TRUE)
-    reSim[[group]] <- as.matrix(reSim[[group]][,setdiff(cnames, group)])
+    tmp <- merge(newdata, tmp, by=group, all.x=TRUE)
+    tmp <- as.matrix(tmp[,setdiff(cnames, group)])
+    reSimA[, , j] <- tmp
   }
 
   ##This chunk of code produces matrix of linear predictors created from the fixed coefs
@@ -93,11 +95,22 @@ predictInterval <- function(model, newdata, level = 0.95,
   }
 
   betaSim <- abind(lapply(1:nsim, function(x) mvtnorm::rmvnorm(1, mean = fixef(model), sigma = sigmahat[x]*as.matrix(vcov(model)))), along=1)
-  newdata.modelMatrix <- lFormula(formula = model@call, data=newdata)$X
+  # If we do it this way, the function will fail on new data without all levels
+  # of X
+  # newdata.modelMatrix <- lFormula(formula = model@call, data=newdata)$X
+  # To be sensitive to this, we can take a performance hit and do:
+  if(identical(newdata, model@frame)){
+    newdata.modelMatrix <- lFormula(formula = model@call, data = model@frame)$X
+  } else{
+    tmp <- plyr::rbind.fill(newdata, trimModelFrame(model@frame))
+    newdata.modelMatrix <- lFormula(formula = model@call, data = tmp)$X[1:nrow(newdata), ]
+    rm(tmp)
+  }
   fixed.xb <- newdata.modelMatrix %*% t(betaSim)
 
   ##Calculate yhat as sum of the components (fixed plus all groupling factors)
-  yhat <- fixed.xb + apply(simplify2array(reSim), c(1,2), function(x) sum(x, na.rm=TRUE))
+  # apply(reSim, c(1,2), function(x), sum(x,na.rm=TRUE))
+  yhat <- fixed.xb + apply(reSimA, c(1,2), function(x) sum(x, na.rm=TRUE))
   if (include.resid.var==TRUE)
     yhat <- abind::abind(lapply(1:nsim, function(x) rnorm(nrow(newdata), yhat[,x], sigmahat[x])), along = 2)
 

diff --git a/tests/testthat/test_helpers.R b/tests/testthat/test_helpers.R
@@ -0,0 +1,12 @@
+# Test helper functions
+
+context("Trimming data frame")
+
+
+
+test_that("Trimming results in correct size", {
+  data(InstEval)
+  trimDat <- trimModelFrame(InstEval)
+  expect_more_than(nrow(InstEval), nrow(trimModelFrame(InstEval)))
+  expect_equal(nrow(trimDat), 4065)
+})
diff --git a/tests/testthat/test_predict.R b/tests/testthat/test_predict.R
@@ -87,3 +87,20 @@ test_that("Prediction interval respects user input", {
 })
 
 
+test_that("Predict handles unused and subset of factor levels", {
+  skip_on_cran()
+  set.seed(101)
+
+  g1 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=InstEval)
+  d1 <- InstEval[1:100, ]
+  outs1 <- predictInterval(g1, newdata = d1, level = 0.8, nsim = 500,
+                           stat = 'mean', include.resid.var = TRUE)
+  d2 <- rbind(d1, InstEval[670:900,])
+  outs1a <- predictInterval(g1, newdata = d2, level = 0.8, nsim = 500,
+                            stat = 'mean', include.resid.var=TRUE)[1:100,]
+  expect_equal(outs1, outs1a, tolerance = 0.01)
+  expect_equal(nrow(outs1), nrow(outs1a))
+})
+
+
+
diff --git a/tests/testthat/Compare_bootMer_KF.R → tests/timings/Compare_bootMer_KF.R b/tests/testthat/Compare_bootMer_KF.R → tests/timings/Compare_bootMer_KF.R
@@ -38,59 +38,59 @@
 #    m4.new.df <- m4.df
 #
 # #Step 1: Our method ####
-#  predictInterval <- function(model, newdata, level = 0.95, nsim=1000, stat="median", predict.type="link"){
-#    #Depends
-#    require(mvtnorm)
-#    require(lme4)
-#    #Prep Output
-#    outs <- newdata
-#
-#    #Sort out all the levels
-#    reTerms <- names(ngrps(model))
-#    n.reTerms = length(reTerms)
-#
-#    ##The following 3 lines produce a matrix of linear predictors created from the fixed coefs
-#    betaSim <- rmvnorm(nsim, mean = fixef(model), sigma = as.matrix(vcov(model)))
-#    newdata.modelMatrix <- lFormula(formula = model@call, data=newdata)$X
-#    fixed.xb <- newdata.modelMatrix %*% t(betaSim)
-#
-#    ##Draw from random effects distributions for each level and merge onto newdata
-#    reSim <- NULL
-#    for (j in seq_along(reTerms)) {
-#      group=reTerms[j]
-#      reMeans <- array(ranef(model)[[group]])
-#      reMatrix <- attr(ranef(model, condVar=TRUE)[[group]], which = "postVar")
-#      reSim[[group]] <- data.frame(rownames(reMeans), matrix(NA, nrow=nrow(reMeans), ncol=nsim))
-#      colnames(reSim[[group]]) <- c(group, paste("sim", 1:nsim, sep=""))
-#      for (k in 1:nrow(reMeans)) {
-#        lvl = rownames(reMeans)[k]
-#        reSim[[group]][k,2:ncol(reSim[[group]])] <- rmvnorm(nsim, mean=as.matrix(reMeans[k,]), sigma=as.matrix(reMatrix[,,k]))
-#      }
-#      cnames <- colnames(reSim[[group]])
-#      reSim[[group]] <- merge(newdata, reSim[[group]], by=group, all.x=TRUE)
-#      reSim[[group]] <- as.matrix(reSim[[group]][,setdiff(cnames, group)])
-#    }
-#
-#    #Calculate yhat as sum of components
-#    yhat <- fixed.xb + apply(simplify2array(reSim), c(1,2), sum)
-#
-#    #Output prediction intervals
-#    if (stat=="median") {
-#      outs$fit <- apply(yhat,1,function(x) as.numeric(quantile(x, .5)))
-#    }
-#    if (stat=="mean") {
-#      outs$fit <- apply(yhat,1,mean)
-#    }
-#    outs$upr <- apply(yhat,1,function(x) as.numeric(quantile(x, 1 - ((1-level)/2))))
-#    outs$lwr <- apply(yhat,1,function(x) as.numeric(quantile(x, ((1-level)/2))))
-#    if (predict.type=="response") {
-#      outs$fit <- model@resp$family$linkinv(outs$fit)
-#      outs$upr <- model@resp$family$linkinv(outs$upr)
-#      outr$lwr <- model@resp$family$linkinc(outs$lwr)
-#    }
-#    #Close it out
-#    return(outs)
-#  }
+# #  predictInterval <- function(model, newdata, level = 0.95, nsim=1000, stat="median", predict.type="link"){
+# #    #Depends
+# #    require(mvtnorm)
+# #    require(lme4)
+# #    #Prep Output
+# #    outs <- newdata
+# #
+# #    #Sort out all the levels
+# #    reTerms <- names(ngrps(model))
+# #    n.reTerms = length(reTerms)
+# #
+# #    ##The following 3 lines produce a matrix of linear predictors created from the fixed coefs
+# #    betaSim <- rmvnorm(nsim, mean = fixef(model), sigma = as.matrix(vcov(model)))
+# #    newdata.modelMatrix <- lFormula(formula = model@call, data=newdata)$X
+# #    fixed.xb <- newdata.modelMatrix %*% t(betaSim)
+# #
+# #    ##Draw from random effects distributions for each level and merge onto newdata
+# #    reSim <- NULL
+# #    for (j in seq_along(reTerms)) {
+# #      group=reTerms[j]
+# #      reMeans <- array(ranef(model)[[group]])
+# #      reMatrix <- attr(ranef(model, condVar=TRUE)[[group]], which = "postVar")
+# #      reSim[[group]] <- data.frame(rownames(reMeans), matrix(NA, nrow=nrow(reMeans), ncol=nsim))
+# #      colnames(reSim[[group]]) <- c(group, paste("sim", 1:nsim, sep=""))
+# #      for (k in 1:nrow(reMeans)) {
+# #        lvl = rownames(reMeans)[k]
+# #        reSim[[group]][k,2:ncol(reSim[[group]])] <- rmvnorm(nsim, mean=as.matrix(reMeans[k,]), sigma=as.matrix(reMatrix[,,k]))
+# #      }
+# #      cnames <- colnames(reSim[[group]])
+# #      reSim[[group]] <- merge(newdata, reSim[[group]], by=group, all.x=TRUE)
+# #      reSim[[group]] <- as.matrix(reSim[[group]][,setdiff(cnames, group)])
+# #    }
+# #
+# #    #Calculate yhat as sum of components
+# #    yhat <- fixed.xb + apply(simplify2array(reSim), c(1,2), sum)
+# #
+# #    #Output prediction intervals
+# #    if (stat=="median") {
+# #      outs$fit <- apply(yhat,1,function(x) as.numeric(quantile(x, .5)))
+# #    }
+# #    if (stat=="mean") {
+# #      outs$fit <- apply(yhat,1,mean)
+# #    }
+# #    outs$upr <- apply(yhat,1,function(x) as.numeric(quantile(x, 1 - ((1-level)/2))))
+# #    outs$lwr <- apply(yhat,1,function(x) as.numeric(quantile(x, ((1-level)/2))))
+# #    if (predict.type=="response") {
+# #      outs$fit <- model@resp$family$linkinv(outs$fit)
+# #      outs$upr <- model@resp$family$linkinv(outs$upr)
+# #      outr$lwr <- model@resp$family$linkinc(outs$lwr)
+# #    }
+# #    #Close it out
+# #    return(outs)
+# #  }
 #
 # #Step 2: Unit Test Function####
 #  predictInterval.test <- function(model.form, model.df, model.type="lmer",

diff --git a/tests/timings/predictSpeed.R b/tests/timings/predictSpeed.R
@@ -0,0 +1,2 @@
+library(microbenchmark)
+
diff --git a/vignettes/merToolsIntro.Rmd b/vignettes/merToolsIntro.Rmd
@@ -26,3 +26,8 @@ As we have found ourselves using these models more and more within our work, we,
 the authors, have developed a set of tools for simplifying and speeding up common 
 tasks for interacting with `merMod` objects from `lme4`. This package provides 
 those tools. 
+
+## Illustrating Model Effects
+
+As the complexity of the model fit grows, it becomes harder and harder to 
+interpret the substantive effect of parameters in the model.