Made comparison statements into a function <<predictInterval.test()>> that takes a model formula, model data, and prediction data as arguments, with the ability to specify a unique observation id variable and pass further arguments to lmer.

I think it works, but the results should be checked by hand at least once to double-check.
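
For reference, a call using the canonical verbal aggression example defined in the script would look something like this (the REML flag is only an illustration of passing further arguments through to lmer; it is not part of this commit):

predictInterval.test(m2.form, m2.df, m2.new.df, idvar = NULL, REML = FALSE)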
carlbfrederick committed May 13, 2015
1 parent 0fa90db commit 2c0cdf9
Showing 1 changed file with 156 additions and 147 deletions: tests/testthat/Compare_bootMer_KF.R
@@ -17,95 +17,23 @@
data(sleepstudy)
data(VerbAgg)

#Build out model
m1.form <- Reaction ~ Days + (Days | Subject)
m1.df <- sleepstudy
m1.new.df <- m1.df

#Estimate model
m1.estimation.time <- system.time(
m1 <- lmer(m1.form, data=m1.df)
)
print(m1)

#Step 1: Get bootMer... gold standard ####
##Define function for parameter extraction (inspired from bootMer help file)
##Pulling:
## Individual level predicted values conditional on ALL random effects
mySumm <- function(.) {
predict(., newdata=m1.new.df, re.form=NULL, type="link")
}

##Maybe try all three (valid) ways listed in the bootMer doc, in the order given in its Details section
boot1.time <- system.time(
boot1.m1 <- bootMer(m1, mySumm, nsim=1000,
use.u=FALSE, type="parametric",
.progress = "txt", PBargs=list(style=3))
)

boot2.time <- system.time(
boot2.m1 <- bootMer(m1, mySumm, nsim=1000,
use.u=TRUE, type="parametric",
.progress = "txt", PBargs=list(style=3))
)

boot3.time <- system.time(
boot3.m1 <- bootMer(m1, mySumm, nsim=1000,
use.u=TRUE, type="semiparametric",
.progress = "txt", PBargs=list(style=3))
)

sumBoot <- function(merBoot) {
data.frame(merBoot$data,
median = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.5, na.rm=TRUE))),
lwr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.025, na.rm=TRUE))),
upr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.975, na.rm=TRUE))),
colNAs = apply(merBoot$t, 2, function(x) sum(is.na(x)))
)
}

#Step 2: Our method ####
##"sub" functions
source("F:/RSandbox/merTools/R/merExtract.R")

reMatrixExtract <- function(model, group, case){
slotNum <- ranef(model)[[group]]
slotNum$seq <- 1:nrow(slotNum)
slotNum <- slotNum[case, ]$seq
mat <- as.matrix(attr(ranef(model, condVar=TRUE)[[group]],
which = "postVar")[, , slotNum])
row.names(mat) <- names(ranef(model)[[group]])
colnames(mat) <- row.names(mat)
return(mat)
}
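##(e.g., for the sleepstudy model m1, reMatrixExtract(m1, "Subject", "308") should
## return the 2x2 conditional covariance matrix of the intercept and Days slope for
## Subject 308; illustrative call, not part of this commit)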

reMeansExtract <- function(model, group, case){
rerow <- ranef(model)[[group]][case, ]
out <- as.numeric(rerow)
names(out) <- names(ranef(model)[[group]])
return(out)
}

combineREFE <- function(reSim, betaSim){
comboCols <- intersect(colnames(reSim), colnames(betaSim))
REonly <- setdiff(colnames(reSim), colnames(betaSim))
FEonly <- setdiff(colnames(betaSim), colnames(reSim))
totCoef <- cbind(reSim[, comboCols] + betaSim[, comboCols])
colnames(totCoef) <- comboCols
totCoef <- cbind(totCoef, reSim[, REonly])
totCoef <- cbind(totCoef, betaSim[, FEonly])
return(totCoef)
}
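
##Illustrative sketch (not part of this commit) of how the helpers above could be
##combined for one grouping factor and case, mirroring the mvtnorm dependency of
##predictInterval() below; nsim and the "Subject"/"308" case are assumptions
nsim <- 1000
betaSim <- mvtnorm::rmvnorm(nsim, mean = fixef(m1), sigma = as.matrix(vcov(m1)))
colnames(betaSim) <- names(fixef(m1))
reSim <- mvtnorm::rmvnorm(nsim,
                          mean = reMeansExtract(m1, "Subject", "308"),
                          sigma = reMatrixExtract(m1, "Subject", "308"))
colnames(reSim) <- names(ranef(m1)[["Subject"]])
totCoefSim <- combineREFE(reSim, betaSim)  #nsim draws of combined (fixed + random) coefficients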

##Main Function
#Canonical models for testing purposes
##1) Sleepstudy eg - lmer with random slope and random intercept
m1.form <- Reaction ~ Days + (Days | Subject)
m1.df <- sleepstudy
m1.new.df <- m1.df
##2) Verbal Aggression eg - lmer (could also do glmer(logit)) with 2 grouping factors
m2.form <- r2 ~ (Anger + Gender + btype + situ)^2 + (1|id) + (1|item)
m2.df <- VerbAgg
m2.new.df <- m2.df

#Step 1: Our method ####
predictInterval <- function(model, newdata, level = 0.95){
#Depends
require(mvtnorm)
require(lme4)
#Prep Output
outs <- data.frame(fit = rep(NA, nrow(newdata)),
lwr = rep(NA, nrow(newdata)),
upr = rep(NA, nrow(newdata)))
outs <- newdata

#Sort out all the levels
reTerms <- names(ngrps(model))
@@ -138,79 +66,160 @@
yhat <- fixed.xb + apply(simplify2array(reSim), c(1,2), sum)

#Output prediction intervals
outs$fit <- apply(yhat,1,mean)
outs$fit <- apply(yhat,1,function(x) as.numeric(quantile(x, .5)))
outs$upr <- apply(yhat,1,function(x) as.numeric(quantile(x, 1 - ((1-level)/2))))
outs$lwr <- apply(yhat,1,function(x) as.numeric(quantile(x, ((1-level)/2))))
#Close it out
return(outs)
}
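##Note: with the default level = 0.95, the interval runs from the 0.025 to the 0.975
##quantile of the simulated yhat for each row of newdata, with fit reported as the median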

kf.time <- system.time(
kf.method <- predictInterval(m1, m1.new.df)
)
#Step 2: Unit Test Function####
predictInterval.test <- function(model.form, model.df, predict.df, idvar=NULL, ...) {
require(lme4); require(dplyr); require(tidyr); require(ggplot2);
##Estimate model
modelEstimation.time <- system.time(
m1 <- lmer(model.form, data=model.df, ...)
)
##If predict.df does not already have a unique row identifier, create one;
##otherwise treat the column named in idvar as that identifier
if (is.null(idvar)) {
predict.df$.newID <- paste("newID", rownames(predict.df), sep="")
} else {
predict.df$.newID <- predict.df[[idvar]]
}
##Functions for bootMer() and objects
####Return predicted values from bootstrap
mySumm <- function(.) {
predict(., newdata=predict.df, re.form=NULL, type="link")
}
####Collapse bootstrap into median, 95% PI
sumBoot <- function(merBoot) {
data.frame(merBoot$data,
median = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.5, na.rm=TRUE))),
lwr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.025, na.rm=TRUE))),
upr = apply(merBoot$t, 2, function(x) as.numeric(quantile(x, probs=.975, na.rm=TRUE))),
colNAs = apply(merBoot$t, 2, function(x) sum(is.na(x)))
)
}
##Bootstrap
####Method 1: parametric, re-estimate BLUPS
cat("\n Bootstrap Method 1")
boot1.time <- system.time(
boot1 <- bootMer(m1, mySumm, nsim=1000,
use.u=FALSE, type="parametric",
.progress = "txt", PBargs=list(style=3))
)
####Method 2: parametric, conditional on estimated BLUPS
cat("\n Bootstrap Method 2")
boot2.time <- system.time(
boot2 <- bootMer(m1, mySumm, nsim=1000,
use.u=TRUE, type="parametric",
.progress = "txt", PBargs=list(style=3))
)
####Method 3: semiparametric (draw from resid), conditional on estimated BLUPS
cat("\n Bootstrap Method 3")
boot3.time <- system.time(
boot3 <- bootMer(m1, mySumm, nsim=1000,
use.u=TRUE, type="semiparametric",
.progress = "txt", PBargs=list(style=3))
)
##Our Method
kf.time <- system.time(
kf.method <- predictInterval(m1, predict.df)
)
##Compare Times
compare.time <- rbind(modelEstimation.time, kf.time, boot1.time, boot2.time, boot3.time)

##Summarize and compare results
boot1.sum <- sumBoot(boot1)
boot2.sum <- sumBoot(boot2)
boot3.sum <- sumBoot(boot3)

eval <- merge(predict.df, boot1.sum) %>%
rename(boot1.fit=median, boot1.lwr=lwr, boot1.upr=upr)
eval <- merge(eval, boot2.sum) %>%
rename(boot2.fit=median, boot2.lwr=lwr, boot2.upr=upr)
eval <- merge(eval, boot3.sum) %>%
rename(boot3.fit=median, boot3.lwr=lwr, boot3.upr=upr)
eval <- merge(eval, kf.method) %>%
rename(KF.fit=fit, KF.lwr=lwr, KF.upr=upr)

#Check if nrow(eval) still equals nrow(predict.df) because it should
if (nrow(eval)!=nrow(predict.df)) {
stop("Something happened when merging bootstrap summaries together ...")
}

##Compare Times
compare.time <- rbind(kf.time, boot1.time, boot2.time, boot3.time)
rm(kf.time, boot1.time, boot2.time, boot3.time)
##Add lmer yhats (predict.merMod) on there
eval$with.u <- predict(m1, newdata=predict.df, re.form=NULL, type="link")
eval$no.u <- predict(m1, newdata=predict.df, re.form=NA, type="link")
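####(re.form=NULL conditions these predictions on all estimated random effects;
#### re.form=NA sets them to zero, i.e. fixed-effects-only, population-level predictions)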

##Diagnostic plots
####Data prep
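####(e.g. the column "boot1.fit" is split into simMethod = "boot1" and stat = "fit";
#### the small x offsets computed below jitter each method horizontally around its
#### observation index so the four methods' intervals sit side by side in p1)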
Eval <- sample_n(eval, 30) %>%
gather(var, value, starts_with("boot"), starts_with("KF")) %>%
mutate(
simMethod = sub("[[:punct:]][0-9A-Za-z]*", "", var),
stat = sub("[0-9A-Za-z]*[[:punct:]]", "", var)
) %>%
select(-var) %>%
spread(stat, value) %>%
group_by(.newID) %>%
arrange(simMethod) %>%
mutate(
x=row_number(simMethod),
x=(x-mean(x))/10
) %>%
group_by(simMethod) %>%
mutate(
x=row_number(.newID)+x
)


####Direct Comparison Plot
p1 <- ggplot(aes(y=fit,x=x, color=simMethod), data=Eval) +
geom_point(size=I(3)) +
geom_linerange(aes(ymax=upr, ymin=lwr), size=I(1)) +
geom_point(shape="with.u", color="black", size=I(4),
data=summarize(group_by(Eval, .newID), x=mean(x), fit=mean(with.u))) +
geom_point(shape="no.u", color="black", size=I(4),
data=summarize(group_by(Eval, .newID), x=mean(x), fit=mean(no.u))) +
theme_bw() +
labs(x="Index", y="Prediction Interval", title="95% Prediction interval by method for 30 random obs")

#####Distribution of fitted values
p2 <- ggplot(aes(x=fit), data=Eval) +
geom_density(aes(color=simMethod)) +
geom_vline(xintercept = mean(m1@resp$y)) +
geom_density(data=summarize(
group_by_(
cbind(y=m1@resp$y, as.data.frame(m1@flist)),
names(m1@flist)),
fit = mean(y)), color="black") +
theme_bw() +
labs(x = "Estimate", y="Density",
title="Average point estimates by prediction type \n (vertical black line is sample grand mean, \n black density is distribution of sample group means)")


##Close it out
return(
list(
compareTimes=compare.time,
bootstraps = list(boot1, boot2, boot3),
kf.method = kf.method,
model=m1,
evalData = eval,
plot.directCompare = p1,
plot.pointEstimateDist = p2
)
)
}

#Step 3: Summarize and Compare Results####
##Summarize and Combine
boot1.sum <- sumBoot(boot1.m1)
boot1.sum$.new.id <- 1:nrow(boot1.sum)
boot1.sum <- select(boot1.sum, .new.id, fit=median, lwr, upr) %>%
gather(stat, Boot.1, fit, lwr, upr)

boot2.sum <- sumBoot(boot2.m1)
boot2.sum$.new.id <- 1:nrow(boot2.sum)
boot2.sum <- select(boot2.sum, .new.id, fit=median, lwr, upr) %>%
gather(stat, Boot.2, fit, lwr, upr)

boot3.sum <- sumBoot(boot3.m1)
boot3.sum$.new.id <- 1:nrow(boot3.sum)
boot3.sum <- select(boot3.sum, .new.id, fit=median, lwr, upr) %>%
gather(stat, Boot.3, fit, lwr, upr)

kf.method$.new.id <- 1:nrow(kf.method)
kf.method <- select(kf.method, .new.id, fit, lwr, upr) %>%
gather(stat, KF, fit, lwr, upr)

eval <- merge(boot1.sum, boot2.sum, by=c(".new.id", "stat"))
eval <- merge(eval, boot3.sum, by=c(".new.id","stat"))
eval <- merge(eval, kf.method, by=c(".new.id","stat"))

eval <- eval %>%
gather(Sim.Method, value, Boot.1, Boot.2, Boot.3, KF) %>%
spread(stat, value)

eval$.new.id[eval$Sim.Method=="Boot.1"] <- eval$.new.id[eval$Sim.Method=="Boot.1"] - .15
eval$.new.id[eval$Sim.Method=="Boot.2"] <- eval$.new.id[eval$Sim.Method=="Boot.2"] - .05
eval$.new.id[eval$Sim.Method=="Boot.3"] <- eval$.new.id[eval$Sim.Method=="Boot.3"] + .05
eval$.new.id[eval$Sim.Method=="KF"] <- eval$.new.id[eval$Sim.Method=="KF"] + .15

#Gen both point predictions from predict.merMod for reference
m1.new.df$.new.id <- 1:nrow(m1.new.df)
m1.new.df$with.u <- predict(m1, newdata=m1.new.df, re.form=NULL, type="link")
m1.new.df$no.u <- predict(m1, newdata=m1.new.df, re.form=NA, type="link")
m1.new.df <- gather(m1.new.df, pred.merMod.type, fit, with.u, no.u) %>%
arrange(.new.id)

#Diagnostic plots
ggplot(aes(y=fit,x=.new.id, color=Sim.Method), data=eval[1:100,]) +
geom_point(size=I(3)) +
geom_linerange(aes(ymax=upr, ymin=lwr), size=I(1)) +
geom_point(aes(shape=pred.merMod.type), data=m1.new.df[1:50,], color="black") +
theme_bw() +
labs(x="Index", y="Prediction Interval", title="95% Prediction interval by method for select individuals")


ggplot(aes(x=fit), data=eval) +
geom_density(aes(color=Sim.Method)) +
geom_vline(xintercept = mean(test$y)) +
geom_density(data=summarize(group_by(test, d), fit = mean(y)), color="black") +
theme_bw() +
labs(x = "Estimate", y="Density",
title="Average point estimates by prediction type \n (vertical black line is sample grand mean, \n black density is distribution of sample group means)")
#debug(predictInterval.test)
cannonical.1 <- predictInterval.test(m1.form, m1.df, m1.new.df)
cannonical.1$plot.directCompare

cannonical.2 <- predictInterval.test(m2.form, m2.df, m2.new.df)
cannonical.2$plot.directCompare
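
#Other elements of the returned list can be inspected the same way, e.g.
#(illustrative, not in the original script):
#cannonical.1$compareTimes            #timing comparison across methods
#cannonical.1$plot.pointEstimateDist  #density plot of point estimates
#head(cannonical.1$evalData)          #merged bootstrap / predictInterval results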

save.image()


