Skip to content

Commit

Permalink
refactor predictInterval for memory efficiency
Browse files Browse the repository at this point in the history
  • Loading branch information
jknowles committed Jun 18, 2015
1 parent fd36a07 commit 1cd075f
Show file tree
Hide file tree
Showing 7 changed files with 133 additions and 68 deletions.
16 changes: 16 additions & 0 deletions R/helpers.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#Helpers

# Function to take only rows that form distinct levels of factors
trimModelFrame <- function(data){
# Identify numerics
nums <- sapply(data, is.numeric)
vars <- names(nums[!nums == TRUE])
dataList <- vector(mode = "list", length = length(vars))
names(dataList) <- vars
for(i in vars){
dataList[[i]] <- data[!duplicated(data[, i]),]
}
newdat <- do.call(rbind, dataList)
newdat <- newdat[!duplicated(newdat),]
return(newdat)
}
43 changes: 28 additions & 15 deletions R/merPredict.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,38 +47,40 @@ predictInterval <- function(model, newdata, level = 0.95,
stop(" Prediction for this NLMMs or GLMMs that are not mixed logistic regressions is not yet implemented.")
}

##This chunk of code draws from empirical bays distributions for each level of the random effects
##This chunk of code draws from empirical bayes distributions for each level of the random effects
##and merges it back onto newdata.
##
##Right now I am not multiplying the BLUP variance covariance matrices by our
##draw of sigma (for linear models) because their variation is unique. If anything,
##this is where one would multiply them by draws of theta from the model.
reTerms <- names(ngrps(model))
n.reTerms = length(reTerms)
reSim <- NULL
reSimA <- array(data = NA, dim = c(nrow(newdata), nsim, n.reTerms))
for (j in seq_along(reTerms)) {
group=reTerms[j]
reMeans <- array(ranef(model)[[group]])
reMatrix <- attr(ranef(model, condVar=TRUE)[[group]], which = "postVar")
reSim[[group]] <- data.frame(rownames(reMeans), matrix(NA, nrow=nrow(reMeans),
ncol=nsim))
colnames(reSim[[group]]) <- c(group, paste("sim", 1:nsim, sep=""))
tmp <- data.frame(rownames(reMeans), matrix(NA, nrow=nrow(reMeans),
ncol=nsim))
colnames(tmp) <- c(group, paste("sim", 1:nsim, sep=""))
for (k in 1:nrow(reMeans)) {
lvl = rownames(reMeans)[k]
reSim[[group]][k,2:ncol(reSim[[group]])] <- mvtnorm::rmvnorm(nsim,
mean=as.matrix(reMeans[k,]),
sigma=as.matrix(reMatrix[,,k]))
meanTmp <- as.matrix(reMeans[k, ])
matrixTmp <- as.matrix(reMatrix[,,k])
tmp[k, 2:ncol(tmp)] <- mvtnorm::rmvnorm(nsim,
mean=meanTmp,
sigma=matrixTmp)
}
cnames <- colnames(reSim[[group]])
cnames <- colnames(tmp)
#This is where to check for groups that only exist in newdata
new.levels <- setdiff(newdata[,group], reSim[[group]][,1])
new.levels <- setdiff(newdata[,group], tmp[,1])
if (length(new.levels)>0) {
msg <- paste(" The following levels of ", group, " from newdata -- ", paste0(new.levels, collapse=", "),
" -- are not in the model data. \n Currently, predictions for these values are based only on the fixed coefficients \n and the observation-level error.", sep="")
warning(msg, call.=FALSE)
}
reSim[[group]] <- merge(newdata, reSim[[group]], by=group, all.x=TRUE)
reSim[[group]] <- as.matrix(reSim[[group]][,setdiff(cnames, group)])
tmp <- merge(newdata, tmp, by=group, all.x=TRUE)
tmp <- as.matrix(tmp[,setdiff(cnames, group)])
reSimA[, , j] <- tmp
}

##This chunk of code produces matrix of linear predictors created from the fixed coefs
Expand All @@ -93,11 +95,22 @@ predictInterval <- function(model, newdata, level = 0.95,
}

betaSim <- abind(lapply(1:nsim, function(x) mvtnorm::rmvnorm(1, mean = fixef(model), sigma = sigmahat[x]*as.matrix(vcov(model)))), along=1)
newdata.modelMatrix <- lFormula(formula = model@call, data=newdata)$X
# If we do it this way, the function will fail on new data without all levels
# of X
# newdata.modelMatrix <- lFormula(formula = model@call, data=newdata)$X
# To be sensitive to this, we can take a performance hit and do:
if(identical(newdata, model@frame)){
newdata.modelMatrix <- lFormula(formula = model@call, data = model@frame)$X
} else{
tmp <- plyr::rbind.fill(newdata, trimModelFrame(model@frame))
newdata.modelMatrix <- lFormula(formula = model@call, data = tmp)$X[1:nrow(newdata), ]
rm(tmp)
}
fixed.xb <- newdata.modelMatrix %*% t(betaSim)

##Calculate yhat as sum of the components (fixed plus all groupling factors)
yhat <- fixed.xb + apply(simplify2array(reSim), c(1,2), function(x) sum(x, na.rm=TRUE))
# apply(reSim, c(1,2), function(x), sum(x,na.rm=TRUE))
yhat <- fixed.xb + apply(reSimA, c(1,2), function(x) sum(x, na.rm=TRUE))
if (include.resid.var==TRUE)
yhat <- abind::abind(lapply(1:nsim, function(x) rnorm(nrow(newdata), yhat[,x], sigmahat[x])), along = 2)

Expand Down
12 changes: 12 additions & 0 deletions tests/testthat/test_helpers.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Test helper functions

context("Trimming data frame")



test_that("Trimming results in correct size", {
data(InstEval)
trimDat <- trimModelFrame(InstEval)
expect_more_than(nrow(InstEval), nrow(trimModelFrame(InstEval)))
expect_equal(nrow(trimDat), 4065)
})
17 changes: 17 additions & 0 deletions tests/testthat/test_predict.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,20 @@ test_that("Prediction interval respects user input", {
})


test_that("Predict handles unused and subset of factor levels", {
skip_on_cran()
set.seed(101)

g1 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=InstEval)
d1 <- InstEval[1:100, ]
outs1 <- predictInterval(g1, newdata = d1, level = 0.8, nsim = 500,
stat = 'mean', include.resid.var = TRUE)
d2 <- rbind(d1, InstEval[670:900,])
outs1a <- predictInterval(g1, newdata = d2, level = 0.8, nsim = 500,
stat = 'mean', include.resid.var=TRUE)[1:100,]
expect_equal(outs1, outs1a, tolerance = 0.01)
expect_equal(nrow(outs1), nrow(outs1a))
})



Original file line number Diff line number Diff line change
Expand Up @@ -38,59 +38,59 @@
# m4.new.df <- m4.df
#
# #Step 1: Our method ####
# predictInterval <- function(model, newdata, level = 0.95, nsim=1000, stat="median", predict.type="link"){
# #Depends
# require(mvtnorm)
# require(lme4)
# #Prep Output
# outs <- newdata
#
# #Sort out all the levels
# reTerms <- names(ngrps(model))
# n.reTerms = length(reTerms)
#
# ##The following 3 lines produce a matrix of linear predictors created from the fixed coefs
# betaSim <- rmvnorm(nsim, mean = fixef(model), sigma = as.matrix(vcov(model)))
# newdata.modelMatrix <- lFormula(formula = model@call, data=newdata)$X
# fixed.xb <- newdata.modelMatrix %*% t(betaSim)
#
# ##Draw from random effects distributions for each level and merge onto newdata
# reSim <- NULL
# for (j in seq_along(reTerms)) {
# group=reTerms[j]
# reMeans <- array(ranef(model)[[group]])
# reMatrix <- attr(ranef(model, condVar=TRUE)[[group]], which = "postVar")
# reSim[[group]] <- data.frame(rownames(reMeans), matrix(NA, nrow=nrow(reMeans), ncol=nsim))
# colnames(reSim[[group]]) <- c(group, paste("sim", 1:nsim, sep=""))
# for (k in 1:nrow(reMeans)) {
# lvl = rownames(reMeans)[k]
# reSim[[group]][k,2:ncol(reSim[[group]])] <- rmvnorm(nsim, mean=as.matrix(reMeans[k,]), sigma=as.matrix(reMatrix[,,k]))
# }
# cnames <- colnames(reSim[[group]])
# reSim[[group]] <- merge(newdata, reSim[[group]], by=group, all.x=TRUE)
# reSim[[group]] <- as.matrix(reSim[[group]][,setdiff(cnames, group)])
# }
#
# #Calculate yhat as sum of components
# yhat <- fixed.xb + apply(simplify2array(reSim), c(1,2), sum)
#
# #Output prediction intervals
# if (stat=="median") {
# outs$fit <- apply(yhat,1,function(x) as.numeric(quantile(x, .5)))
# }
# if (stat=="mean") {
# outs$fit <- apply(yhat,1,mean)
# }
# outs$upr <- apply(yhat,1,function(x) as.numeric(quantile(x, 1 - ((1-level)/2))))
# outs$lwr <- apply(yhat,1,function(x) as.numeric(quantile(x, ((1-level)/2))))
# if (predict.type=="response") {
# outs$fit <- model@resp$family$linkinv(outs$fit)
# outs$upr <- model@resp$family$linkinv(outs$upr)
# outr$lwr <- model@resp$family$linkinc(outs$lwr)
# }
# #Close it out
# return(outs)
# }
# # predictInterval <- function(model, newdata, level = 0.95, nsim=1000, stat="median", predict.type="link"){
# # #Depends
# # require(mvtnorm)
# # require(lme4)
# # #Prep Output
# # outs <- newdata
# #
# # #Sort out all the levels
# # reTerms <- names(ngrps(model))
# # n.reTerms = length(reTerms)
# #
# # ##The following 3 lines produce a matrix of linear predictors created from the fixed coefs
# # betaSim <- rmvnorm(nsim, mean = fixef(model), sigma = as.matrix(vcov(model)))
# # newdata.modelMatrix <- lFormula(formula = model@call, data=newdata)$X
# # fixed.xb <- newdata.modelMatrix %*% t(betaSim)
# #
# # ##Draw from random effects distributions for each level and merge onto newdata
# # reSim <- NULL
# # for (j in seq_along(reTerms)) {
# # group=reTerms[j]
# # reMeans <- array(ranef(model)[[group]])
# # reMatrix <- attr(ranef(model, condVar=TRUE)[[group]], which = "postVar")
# # reSim[[group]] <- data.frame(rownames(reMeans), matrix(NA, nrow=nrow(reMeans), ncol=nsim))
# # colnames(reSim[[group]]) <- c(group, paste("sim", 1:nsim, sep=""))
# # for (k in 1:nrow(reMeans)) {
# # lvl = rownames(reMeans)[k]
# # reSim[[group]][k,2:ncol(reSim[[group]])] <- rmvnorm(nsim, mean=as.matrix(reMeans[k,]), sigma=as.matrix(reMatrix[,,k]))
# # }
# # cnames <- colnames(reSim[[group]])
# # reSim[[group]] <- merge(newdata, reSim[[group]], by=group, all.x=TRUE)
# # reSim[[group]] <- as.matrix(reSim[[group]][,setdiff(cnames, group)])
# # }
# #
# # #Calculate yhat as sum of components
# # yhat <- fixed.xb + apply(simplify2array(reSim), c(1,2), sum)
# #
# # #Output prediction intervals
# # if (stat=="median") {
# # outs$fit <- apply(yhat,1,function(x) as.numeric(quantile(x, .5)))
# # }
# # if (stat=="mean") {
# # outs$fit <- apply(yhat,1,mean)
# # }
# # outs$upr <- apply(yhat,1,function(x) as.numeric(quantile(x, 1 - ((1-level)/2))))
# # outs$lwr <- apply(yhat,1,function(x) as.numeric(quantile(x, ((1-level)/2))))
# # if (predict.type=="response") {
# # outs$fit <- model@resp$family$linkinv(outs$fit)
# # outs$upr <- model@resp$family$linkinv(outs$upr)
# # outr$lwr <- model@resp$family$linkinc(outs$lwr)
# # }
# # #Close it out
# # return(outs)
# # }
#
# #Step 2: Unit Test Function####
# predictInterval.test <- function(model.form, model.df, model.type="lmer",
Expand Down
2 changes: 2 additions & 0 deletions tests/timings/predictSpeed.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
library(microbenchmark)

5 changes: 5 additions & 0 deletions vignettes/merToolsIntro.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,8 @@ As we have found ourselves using these models more and more within our work, we,
the authors, have developed a set of tools for simplifying and speeding up common
tasks for interacting with `merMod` objects from `lme4`. This package provides
those tools.

## Illustrating Model Effects

As the complexity of the model fit grows, it becomes harder and harder to
interpret the substantive effect of parameters in the model.

0 comments on commit 1cd075f

Please sign in to comment.