simulateData.Rmd

---
title: "Analysis of simulated data"
output: html_document
author: Jeff Leek
---

`r library(knitr); opts_chunk$set(cache=TRUE)`

### Load packages

You will need the RSkittleBrewer, polyester, and ballgown packages for this vignette to run. Installation instructions are available here:

* https://github.com/alyssafrazee/RskittleBrewer
* https://github.com/alyssafrazee/polyester
* https://github.com/alyssafrazee/ballgown

You will also need R version 3.1.0 or greater and Bioconductor 3.0 or greater. The zebrafishRNASeq package might need to be installed from source. These analyses are based on the devel version of sva (version 3.11.2 or greater).

```{r load,message=FALSE}
library(zebrafishRNASeq)
library(RSkittleBrewer)
library(genefilter)
library(polyester)
library(RUVSeq)
library(edgeR)
library(sva)
library(ffpe)
library(RColorBrewer)
library(corrplot)
library(limma)
trop = RSkittleBrewer('tropical')
```


## Load and filter zebrafish data

Here we are going to load the zebrafish data to get an overall feeling
for the count data and relationship between mean and variance. We first filter
data according to the RUVSeq vignette: http://bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.R.


```{r data,dependson="load"}
data(zfGenes)
filter <- apply(zfGenes, 1, function(x) length(x[x>5])>=2)
counts <- zfGenes[filter,]
```

## Look at mean variance relationship

```{r, dependson="data",fig.align="center"}
plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=trop[1])
```

## Estimate zero inflated negative binomial parameters from the zebrafish data


```{r nbparams,dependson="data"}

## Estimate the zero inflated negative binomial parameters
params = get_params(counts)

plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=trop[1],main="zebrafish data w/fit")
lines(params$fit,col=trop[2])
```

## Generate an equal sized simulated data set and compare


```{r simdata,dependson="nbparams"}

## Create some data from the model

dat0 = create_read_numbers(params$mu,params$fit,
                           params$p0,m=dim(counts)[1],n=dim(counts)[2],seed=53535)

par(mfrow=c(1,2))
plot(rowMeans(log(dat0+1)),rowVars(log(dat0+1)),pch=19,col=trop[2],main="simulated data",xlim=c(0,15))
plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=trop[1],main="zebrafish data",xlim=c(0,15))

```

## Make sure there is no differential expression using voom

```{r nodiffexp,dependson="simdata"}
group = rep(c(0,1),each=3)
design = model.matrix(~group)
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
v <- voom(dge,design,plot=TRUE)
fit <- lmFit(v,design)
fit <- eBayes(fit)
hist(fit$p.value[,2],col=trop[3],main="No genes DE",breaks=100)
```

## Generate a data set with some differential expression and test

```{r diffexp,dependson="nbparams"}
group = rep(c(-1,1),each=10)
mod = model.matrix(~-1 + group)
coeffs = cbind(c(rnorm(200), rep(0,800)))
dat0 = create_read_numbers(params$mu,params$fit,
                           params$p0,m=dim(counts)[1],n=dim(counts)[2],
                           beta=coeffs,mod=mod,seed=32332)

design = model.matrix(~group)
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
v <- voom(dge,design,plot=TRUE)
fit <- lmFit(v,design)
fit <- eBayes(fit)
hist(fit$p.value[,2],col=trop[3],main="200 genes DE",breaks=100)
```


## Generate a data set with a batch effect uncorrelated with group

```{r firstbatch,dependson="simdata"}
group = rep(c(-1,1),each=10)
batch = rep(c(-1,1),10)
gcoeffs = rep(0,1000)
bcoeffs = c(rep(0,100),rnorm(400,sd=2),rep(0,500))
coeffs = cbind(bcoeffs,gcoeffs)
mod = model.matrix(~-1 + batch + group)

dat0 = create_read_numbers(params$mu,params$fit,
                           params$p0,m=dim(counts)[1],n=dim(counts)[2],
                           beta=coeffs,mod=mod,seed=323)

design = model.matrix(~group)
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
v <- voom(dge,design,plot=TRUE)
fit <- lmFit(v,design)
fit <- eBayes(fit)
hist(fit$p.value[,2],col=trop[3],main="400 genes batch/0 DE",breaks=100)
```


## Generate a data set with orthogonal batch and group effects and compare methods.

Here we compare unsupervised svaseq, supervised svaseq, ruv with control probes, ruv with empirical controls, and principal components analysis for estimating batch


```{r batchestimates,dependson="simdata"}

group = rep(c(-1,1),each=10)
batch = 1 - 2*rbinom(20,size=1,prob=0.5)
gcoeffs = c(rnorm(400),rep(0,600))
bcoeffs = c(rep(0,100),rnorm(400,sd=1),rep(0,500))
coeffs = cbind(bcoeffs,gcoeffs)
controls = (bcoeffs != 0) & (gcoeffs==0)
mod = model.matrix(~-1 + batch + group)

dat0 = create_read_numbers(params$mu,params$fit,
                           params$p0,m=dim(counts)[1],n=dim(counts)[2],
                           beta=coeffs,mod=mod,seed=4353)
rm(mod)


## Set null and alternative models (ignore batch)
mod1 = model.matrix(~group)
mod0 = cbind(mod1[,1])

## Estimate batch with svaseq (unsupervised)
batch_unsup_sva = svaseq(dat0,mod1,mod0)$sv

## Estimate batch with svaseq (supervised)
batch_sup_sva = svaseq(dat0,mod1,mod0,controls=controls)$sv

## Estimate batch with pca
ldat0 = log(dat0 + 1)
batch_pca = svd(ldat0 - rowMeans(ldat0))$v[,1]

## Estimate batch with ruv (controls known)
batch_ruv_cp <- RUVg(dat0, cIdx= controls, k=1)$W

## Estimate batch with ruv (residuals)
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

x <- as.factor(group)
design <- model.matrix(~x)
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
seqUQ <- betweenLaneNormalization(dat0, which="upper")
controls = rep(TRUE,dim(dat0)[1])
batch_ruv_res = RUVr(seqUQ,controls,k=1,res)$W

## Estimate batch with ruv empirical controls
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)

fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)

controls =rank(lrt$table$LR) <= 400
batch_ruv_emp <- RUVg(dat0, controls, k=1)$W


## Plot the results
plot(batch,col=trop[1],pch=19,main="batch")
plot(batch_unsup_sva,pch=19,col=trop[2],main="unsupervised sva")
plot(batch_sup_sva,pch=19,col=trop[2],main="supervised sva")
plot(batch_pca,pch=19,col=trop[3],main="pca")
plot(batch_ruv_cp,pch=19,col=trop[4],main="control probes ruv")
plot(batch_ruv_res,pch=19,col=trop[4],main="residual ruv")
plot(batch_ruv_emp,pch=19,col=trop[4],main="empirical controls ruv")

```


## Plot absolute correlation between batch estimates

In this case, all the estimates are highly correlated

```{r corbatch,dependson="batchestimates"}

batchEstimates = cbind(batch,group,batch_unsup_sva,batch_sup_sva,
                       batch_pca,batch_ruv_cp,batch_ruv_res,batch_ruv_emp)
colnames(batchEstimates) = c("batch","group","usva","ssva","pca","ruvcp","ruvres","ruvemp")

corr = abs(cor(batchEstimates))
cols = colorRampPalette(c(trop[2],"white",trop[1]))
corrplot(corr,method="ellipse",type="lower",col=cols(100),tl.pos="d")
```


```{r comparede, dependson="batchestimates",fig.align="center",fig.height=7,fig.width=7}
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
catplots = tstats = vector("list",8)
truevals = -abs(gcoeffs)
names(truevals) = as.character(1:1000)
adj = c("+ batch","+ batch_pca", "+ batch_sup_sva",
                "+ batch_unsup_sva", "+ batch_ruv_cp", "+ batch_ruv_res",
                "+ batch_ruv_emp", "")

for(i in 1:8){
  design = model.matrix(as.formula(paste0("~ group",adj[i])))
  v <- voom(dge,design,plot=FALSE)
  fit <- lmFit(v,design)
  fit <- eBayes(fit)
  tstats[[i]] = abs(fit$t[,2])
  names(tstats[[i]]) = as.character(1:1000)
  catplots[[i]] = CATplot(-rank(tstats[[i]]),truevals,maxrank=400,make.plot=F)
}

plot(catplots[[1]],ylim=c(0,1),col=trop[1],lwd=3,type="l",ylab="Concordance Between True rank and rank with different methods",xlab="Rank")
lines(catplots[[2]],col=trop[2],lwd=3)
lines(catplots[[3]],col=trop[3],lwd=3,lty=2)
lines(catplots[[4]],col=trop[3],lwd=3,lty=1)
lines(catplots[[5]],col=trop[4],lwd=3,lty=2)
lines(catplots[[6]],col=trop[4],lwd=3,lty=1)
lines(catplots[[7]],col=trop[4],lwd=3,lty=3)
lines(catplots[[8]],col=trop[5],lwd=3)

legend(200,0.5,legend=c("Known batch","PCA","Sup. svaseq", "Unsup. svaseq","RUV CP","RUV Res.", "RUV Emp.", "No adjustment"),col=trop[c(1,2,3,3,4,4,4,5)],lty=c(1,1,1,2,2,1,3),lwd=3)
```


## Generate a data set with correlated batch and group effects and compare methods.

Here we compare unsupervised svaseq, supervised svaseq, ruv with control probes, ruv with empirical controls, and principal components analysis for estimating batch


```{r batchestimates2,dependson="simdata"}

group = rep(c(-1,1),each=10)
coinflip = rbinom(20,size=1,prob=0.9)
batch = group*coinflip + -group*(1-coinflip)
gcoeffs = c(rnorm(400),rep(0,600))
bcoeffs = c(rep(0,100),rnorm(400,sd=1),rep(0,500))
coeffs = cbind(bcoeffs,gcoeffs)
controls = (bcoeffs != 0) & (gcoeffs==0)
mod = model.matrix(~-1 + batch + group)

dat0 = create_read_numbers(params$mu,params$fit,
                           params$p0,m=dim(counts)[1],n=dim(counts)[2],
                           beta=coeffs,mod=mod,seed=333)
rm(mod)


## Set null and alternative models (ignore batch)
mod1 = model.matrix(~group)
mod0 = cbind(mod1[,1])

## Estimate batch with svaseq (unsupervised)
batch_unsup_sva = svaseq(dat0,mod1,mod0)$sv

## Estimate batch with svaseq (supervised)
batch_sup_sva = svaseq(dat0,mod1,mod0,controls=controls)$sv

## Estimate batch with pca
ldat0 = log(dat0 + 1)
batch_pca = svd(ldat0 - rowMeans(ldat0))$v[,1]

## Estimate batch with ruv (controls known)
batch_ruv_cp <- RUVg(dat0, cIdx= controls, k=1)$W

## Estimate batch with ruv (residuals)
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

x <- as.factor(group)
design <- model.matrix(~x)
y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
res <- residuals(fit, type="deviance")
seqUQ <- betweenLaneNormalization(dat0, which="upper")
controls = rep(TRUE,dim(dat0)[1])
batch_ruv_res = RUVr(seqUQ,controls,k=1,res)$W

## Estimate batch with ruv empirical controls
## this procedure follows the RUVSeq vignette
## http://www.bioconductor.org/packages/devel/bioc/vignettes/RUVSeq/inst/doc/RUVSeq.pdf

y <- DGEList(counts=dat0, group=x)
y <- calcNormFactors(y, method="upperquartile")
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)

fit <- glmFit(y, design)
lrt <- glmLRT(fit, coef=2)

controls =rank(lrt$table$LR) <= 400
batch_ruv_emp <- RUVg(dat0, controls, k=1)$W


## Plot the results
plot(batch,col=trop[1],pch=19,main="batch")
plot(batch_unsup_sva,pch=19,col=trop[2],main="unsupervised sva")
plot(batch_sup_sva,pch=19,col=trop[2],main="supervised sva")
plot(batch_pca,pch=19,col=trop[3],main="pca")
plot(batch_ruv_cp,pch=19,col=trop[4],main="control probes ruv")
plot(batch_ruv_res,pch=19,col=trop[4],main="residual ruv")
plot(batch_ruv_emp,pch=19,col=trop[4],main="empirical controls ruv")

```


## Plot absolute correlation between batch estimates

```{r corbatch2,dependson="batchestimates2"}

batchEstimates = cbind(batch,group,batch_unsup_sva,batch_sup_sva,
                       batch_pca,batch_ruv_cp,batch_ruv_res,batch_ruv_emp)
colnames(batchEstimates) = c("batch","group","usva","ssva","pca","ruvcp","ruvres","ruvemp")

corr = abs(cor(batchEstimates))
cols = colorRampPalette(c(trop[2],"white",trop[1]))
par(mar=c(5,5,5,5))
corrplot(corr,method="ellipse",type="lower",col=cols(100),tl.pos="d")
```

## Compare differential expression patterns with different adjustments

```{r comparede2, dependson="batchestimates2",fig.align="center",fig.height=7,fig.width=7}
dge <- DGEList(counts=dat0)
dge <- calcNormFactors(dge)
catplots = tstats = vector("list",8)
truevals = -abs(gcoeffs)
names(truevals) = as.character(1:1000)
adj = c("+ batch","+ batch_pca", "+ batch_sup_sva",
                "+ batch_unsup_sva", "+ batch_ruv_cp", "+ batch_ruv_res",
                "+ batch_ruv_emp", "")

for(i in 1:8){
  design = model.matrix(as.formula(paste0("~ group",adj[i])))
  v <- voom(dge,design,plot=FALSE)
  fit <- lmFit(v,design)
  fit <- eBayes(fit)
  tstats[[i]] = abs(fit$t[,2])
  names(tstats[[i]]) = as.character(1:1000)
  catplots[[i]] = CATplot(-rank(tstats[[i]]),truevals,maxrank=400,make.plot=F)
}

plot(catplots[[1]],ylim=c(0,1),col=trop[1],lwd=3,type="l",ylab="Concordance Between True rank and rank with different methods",xlab="Rank")
lines(catplots[[2]],col=trop[2],lwd=3)
lines(catplots[[3]],col=trop[3],lwd=3,lty=1)
lines(catplots[[4]],col=trop[3],lwd=3,lty=2)
lines(catplots[[5]],col=trop[4],lwd=3,lty=2)
lines(catplots[[6]],col=trop[4],lwd=3,lty=1)
lines(catplots[[7]],col=trop[4],lwd=3,lty=3)
lines(catplots[[8]],col=trop[5],lwd=3)

legend(200,0.5,legend=c("Known batch","PCA","Sup. svaseq", "Unsup. svaseq","RUV CP","RUV Res.", "RUV Emp.", "No adjustment"),col=trop[c(1,2,3,3,4,4,4,5)],lty=c(1,1,1,2,2,1,3),lwd=3)
```


### Session Info

```{r}
sessionInfo()
```