Skip to content

Commit

Permalink
Merge pull request #39 from zhangyuqing/master
Browse files Browse the repository at this point in the history
Adding combat-seq to sva
  • Loading branch information
zhangyuqing committed Mar 22, 2020
2 parents cbfaa32 + df7ca8b commit 3f50493
Show file tree
Hide file tree
Showing 9 changed files with 470 additions and 13 deletions.
9 changes: 5 additions & 4 deletions DESCRIPTION
@@ -1,10 +1,10 @@
Package: sva
Title: Surrogate Variable Analysis
Version: 3.35.1
Version: 3.35.2
Author: Jeffrey T. Leek <jtleek@gmail.com>, W. Evan Johnson <wej@bu.edu>,
Hilary S. Parker <hiparker@jhsph.edu>, Elana J. Fertig <ejfertig@jhmi.edu>,
Andrew E. Jaffe <ajaffe@jhsph.edu>, John D. Storey <jstorey@princeton.edu>,
Yuqing Zhang <zhangyuqing.pkusms@gmail.com>,
Andrew E. Jaffe <ajaffe@jhsph.edu>, Yuqing Zhang <zhangyuqing.pkusms@gmail.com>,
John D. Storey <jstorey@princeton.edu>,
Leonardo Collado Torres <lcolladotor@gmail.com>
Description: The sva package contains functions for removing batch
effects and other unwanted variation in high-throughput
Expand Down Expand Up @@ -32,13 +32,14 @@ Depends:
R (>= 3.2),
mgcv,
genefilter,
BiocParallel
BiocParallel
Imports:
matrixStats,
stats,
graphics,
utils,
limma,
edgeR
Suggests:
pamr,
bladderbatch,
Expand Down
12 changes: 12 additions & 0 deletions NAMESPACE
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(ComBat)
export(ComBat_seq)
export(empirical.controls)
export(f.pvalue)
export(fstats)
Expand All @@ -21,23 +22,34 @@ import(genefilter)
import(matrixStats, except = c(rowSds, rowVars))

import(mgcv)
importFrom(edgeR,DGEList)
importFrom(edgeR,estimateGLMCommonDisp)
importFrom(edgeR,estimateGLMTagwiseDisp)
importFrom(edgeR,getOffset)
importFrom(edgeR,glmFit)
importFrom(edgeR,glmFit.default)
importFrom(graphics,lines)
importFrom(graphics,par)
importFrom(limma,lmFit)
importFrom(stats,cor)
importFrom(stats,density)
importFrom(stats,dnbinom)
importFrom(stats,dnorm)
importFrom(stats,lm)
importFrom(stats,model.matrix)
importFrom(stats,pf)
importFrom(stats,pnbinom)
importFrom(stats,ppoints)
importFrom(stats,prcomp)
importFrom(stats,predict)
importFrom(stats,qgamma)
importFrom(stats,qnbinom)
importFrom(stats,qnorm)
importFrom(stats,qqline)
importFrom(stats,qqnorm)
importFrom(stats,qqplot)
importFrom(stats,smooth.spline)
importFrom(stats,var)
importFrom(utils,capture.output)
importFrom(utils,read.delim)
useDynLib(sva, .registration = TRUE)
45 changes: 36 additions & 9 deletions R/ComBat.R
Expand Up @@ -43,16 +43,40 @@
#' @export
#'

ComBat <- function (dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALSE,
mean.only = FALSE, ref.batch = NULL, BPPARAM = bpparam("SerialParam")) {
ComBat <- function(dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALSE,
mean.only = FALSE, ref.batch = NULL, BPPARAM = bpparam("SerialParam")) {
if(length(dim(batch))>1){
stop("This version of ComBat only allows one batch variable")
} ## to be updated soon!

## coerce dat into a matrix
dat <- as.matrix(dat)

## find genes with zero variance in any of the batches
batch <- as.factor(batch)
zero.rows.lst <- lapply(levels(batch), function(batch_level){
if(sum(batch==batch_level)>1){
return(which(apply(dat[, batch==batch_level], 1, function(x){var(x)==0})))
}else{
return(which(rep(1,3)==2))
}
})
zero.rows <- Reduce(union, zero.rows.lst)
keep.rows <- setdiff(1:nrow(dat), zero.rows)

if (length(zero.rows) > 0) {
cat(sprintf("Found %d genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.\n", length(zero.rows)))
# keep a copy of the original data matrix and remove zero var rows
dat.orig <- dat
dat <- dat[keep.rows, ]
}

## make batch a factor and make a set of indicators for batch
if(any(table(batch)==1)){mean.only=TRUE}
if(mean.only==TRUE){
message("Using the 'mean only' version of ComBat")
}
if(length(dim(batch))>1){
stop("This version of ComBat only allows one batch variable")
} ## to be updated soon!
batch <- as.factor(batch)

batchmod <- model.matrix(~-1+batch)
if (!is.null(ref.batch)){
## check for reference batch, check value, and make appropriate changes
Expand Down Expand Up @@ -267,13 +291,16 @@ ComBat <- function (dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALS

bayesdata <- (bayesdata*(sqrt(var.pooled)%*%t(rep(1,n.array))))+stand.mean # FIXME

## tiny change still exist when tested on bladder data
## total sum of change within each batch around 1e-15
## (could be computational system error).
## Do not change ref batch at all in reference version
if(!is.null(ref.batch)){
bayesdata[, batches[[ref]]] <- dat[, batches[[ref]]]
}

## put genes with 0 variance in any batch back in data
if (length(zero.rows) > 0) {
dat.orig[keep.rows, ] <- bayesdata
bayesdata <- dat.orig
}

return(bayesdata)
}
216 changes: 216 additions & 0 deletions R/ComBat_seq.R
@@ -0,0 +1,216 @@
#' Adjust for batch effects using an empirical Bayes framework in RNA-seq raw counts
#'
#' ComBat_seq is an improved model from ComBat using negative binomial regression,
#' which specifically targets RNA-Seq count data.
#'
#' @param counts Raw count matrix from genomic studies (dimensions gene x sample)
#' @param batch Vector / factor for batch
#' @param group Vector / factor for biological condition of interest
#' @param covar_mod Model matrix for multiple covariates to include in linear model (signals from these variables are kept in data after adjustment)
#' @param full_mod Boolean, if TRUE include condition of interest in model
#' @param shrink Boolean, whether to apply shrinkage on parameter estimation
#' @param shrink.disp Boolean, whether to apply shrinkage on dispersion
#' @param gene.subset.n Number of genes to use in empirical Bayes estimation, only useful when shrink = TRUE
#'
#' @return data A gene x sample count matrix, adjusted for batch effects.
#'
#' @importFrom edgeR DGEList estimateGLMCommonDisp estimateGLMTagwiseDisp glmFit glmFit.default getOffset
#' @importFrom stats dnbinom lm pnbinom qnbinom
#' @importFrom utils capture.output
#'
#' @examples
#'
#' count_matrix <- matrix(rnbinom(400, size=10, prob=0.1), nrow=50, ncol=8)
#' batch <- c(rep(1, 4), rep(2, 4))
#' group <- rep(c(0,1), 4)
#'
#' # include condition (group variable)
#' adjusted_counts <- ComBat_seq(count_matrix, batch=batch, group=group, full_mod=TRUE)
#'
#' # do not include condition
#' adjusted_counts <- ComBat_seq(count_matrix, batch=batch, group=NULL, full_mod=FALSE)
#'
#' @export
#'

ComBat_seq <- function(counts, batch, group=NULL, covar_mod=NULL, full_mod=TRUE,
shrink=FALSE, shrink.disp=FALSE, gene.subset.n=NULL){
######## Preparation ########
## Does not support 1 sample per batch yet
batch <- as.factor(batch)
if(any(table(batch)<=1)){
stop("ComBat-seq doesn't support 1 sample per batch yet")
}

## Remove genes with only 0 counts in any batch
keep_lst <- lapply(levels(batch), function(b){
which(apply(counts[, batch==b], 1, function(x){!all(x==0)}))
})
keep <- Reduce(intersect, keep_lst)
rm <- setdiff(1:nrow(counts), keep)
countsOri <- counts
counts <- counts[keep, ]

# require bioconductor 3.7, edgeR 3.22.1
dge_obj <- DGEList(counts=counts)

## Prepare characteristics on batches
n_batch <- nlevels(batch) # number of batches
batches_ind <- lapply(1:n_batch, function(i){which(batch==levels(batch)[i])}) # list of samples in each batch
n_batches <- sapply(batches_ind, length)
#if(any(n_batches==1)){mean_only=TRUE; cat("Note: one batch has only one sample, setting mean.only=TRUE\n")}
n_sample <- sum(n_batches)
cat("Found",n_batch,'batches\n')

## Make design matrix
# batch
batchmod <- model.matrix(~-1+batch) # colnames: levels(batch)
# covariate
group <- as.factor(group)
if(full_mod & nlevels(group)>1){
cat("Using full model in ComBat-seq.\n")
mod <- model.matrix(~group)
}else{
cat("Using null model in ComBat-seq.\n")
mod <- model.matrix(~1, data=as.data.frame(t(counts)))
}
# drop intercept in covariate model
if(!is.null(covar_mod)){
if(is.data.frame(covar_mod)){
covar_mod <- do.call(cbind, lapply(1:ncol(covar_mod), function(i){model.matrix(~covar_mod[,i])}))
}
covar_mod <- covar_mod[, !apply(covar_mod, 2, function(x){all(x==1)})]
}
# bind with biological condition of interest
mod <- cbind(mod, covar_mod)
# combine
design <- cbind(batchmod, mod)

## Check for intercept in covariates, and drop if present
check <- apply(design, 2, function(x) all(x == 1))
#if(!is.null(ref)){check[ref]=FALSE} ## except don't throw away the reference batch indicator
design <- as.matrix(design[,!check])
cat("Adjusting for",ncol(design)-ncol(batchmod),'covariate(s) or covariate level(s)\n')

## Check if the design is confounded
if(qr(design)$rank<ncol(design)){
#if(ncol(design)<=(n_batch)){stop("Batch variables are redundant! Remove one or more of the batch variables so they are no longer confounded")}
if(ncol(design)==(n_batch+1)){stop("The covariate is confounded with batch! Remove the covariate and rerun ComBat-Seq")}
if(ncol(design)>(n_batch+1)){
if((qr(design[,-c(1:n_batch)])$rank<ncol(design[,-c(1:n_batch)]))){stop('The covariates are confounded! Please remove one or more of the covariates so the design is not confounded')
}else{stop("At least one covariate is confounded with batch! Please remove confounded covariates and rerun ComBat-Seq")}}
}

## Check for missing values in count matrix
NAs = any(is.na(counts))
if(NAs){cat(c('Found',sum(is.na(counts)),'Missing Data Values\n'),sep=' ')}


######## Estimate gene-wise dispersions within each batch ########
cat("Estimating dispersions\n")
## Estimate common dispersion within each batch as an initial value
disp_common <- sapply(1:n_batch, function(i){
if((n_batches[i] <= ncol(design)-ncol(batchmod)+1) | qr(mod[batches_ind[[i]], ])$rank < ncol(mod)){
# not enough residual degree of freedom
return(estimateGLMCommonDisp(counts[, batches_ind[[i]]], design=NULL, subset=nrow(counts)))
}else{
return(estimateGLMCommonDisp(counts[, batches_ind[[i]]], design=mod[batches_ind[[i]], ], subset=nrow(counts)))
}
})

## Estimate gene-wise dispersion within each batch
genewise_disp_lst <- lapply(1:n_batch, function(j){
if((n_batches[j] <= ncol(design)-ncol(batchmod)+1) | qr(mod[batches_ind[[j]], ])$rank < ncol(mod)){
# not enough residual degrees of freedom - use the common dispersion
return(rep(disp_common[j], nrow(counts)))
}else{
return(estimateGLMTagwiseDisp(counts[, batches_ind[[j]]], design=mod[batches_ind[[j]], ],
dispersion=disp_common[j], prior.df=0))
}
})
names(genewise_disp_lst) <- paste0('batch', levels(batch))

## construct dispersion matrix
phi_matrix <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
for(k in 1:n_batch){
phi_matrix[, batches_ind[[k]]] <- vec2mat(genewise_disp_lst[[k]], n_batches[k])
}


######## Estimate parameters from NB GLM ########
cat("Fitting the GLM model\n")
glm_f <- glmFit(dge_obj, design=design, dispersion=phi_matrix, prior.count=1e-4) #no intercept - nonEstimable; compute offset (library sizes) within function
alpha_g <- glm_f$coefficients[, 1:n_batch] %*% as.matrix(n_batches/n_sample) #compute intercept as batch-size-weighted average from batches
new_offset <- t(vec2mat(getOffset(dge_obj), nrow(counts))) + # original offset - sample (library) size
vec2mat(alpha_g, ncol(counts)) # new offset - gene background expression # getOffset(dge_obj) is the same as log(dge_obj$samples$lib.size)
glm_f2 <- glmFit.default(dge_obj$counts, design=design, dispersion=phi_matrix, offset=new_offset, prior.count=1e-4)

gamma_hat <- glm_f2$coefficients[, 1:n_batch]
mu_hat <- glm_f2$fitted.values
phi_hat <- do.call(cbind, genewise_disp_lst)


######## In each batch, compute posterior estimation through Monte-Carlo integration ########
if(shrink){
cat("Apply shrinkage - computing posterior estimates for parameters\n")
mcint_fun <- monte_carlo_int_NB
monte_carlo_res <- lapply(1:n_batch, function(ii){
if(ii==1){
mcres <- mcint_fun(dat=counts[, batches_ind[[ii]]], mu=mu_hat[, batches_ind[[ii]]],
gamma=gamma_hat[, ii], phi=phi_hat[, ii], gene.subset.n=gene.subset.n)
}else{
invisible(capture.output(mcres <- mcint_fun(dat=counts[, batches_ind[[ii]]], mu=mu_hat[, batches_ind[[ii]]],
gamma=gamma_hat[, ii], phi=phi_hat[, ii], gene.subset.n=gene.subset.n)))
}
return(mcres)
})
names(monte_carlo_res) <- paste0('batch', levels(batch))

gamma_star_mat <- lapply(monte_carlo_res, function(res){res$gamma_star})
gamma_star_mat <- do.call(cbind, gamma_star_mat)
phi_star_mat <- lapply(monte_carlo_res, function(res){res$phi_star})
phi_star_mat <- do.call(cbind, phi_star_mat)

if(!shrink.disp){
cat("Apply shrinkage to mean only\n")
phi_star_mat <- phi_hat
}
}else{
cat("Shrinkage off - using GLM estimates for parameters\n")
gamma_star_mat <- gamma_hat
phi_star_mat <- phi_hat
}


######## Obtain adjusted batch-free distribution ########
mu_star <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
for(jj in 1:n_batch){
mu_star[, batches_ind[[jj]]] <- exp(log(mu_hat[, batches_ind[[jj]]])-vec2mat(gamma_star_mat[, jj], n_batches[jj]))
}
phi_star <- rowMeans(phi_star_mat)


######## Adjust the data ########
cat("Adjusting the data\n")
adjust_counts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
for(kk in 1:n_batch){
counts_sub <- counts[, batches_ind[[kk]]]
old_mu <- mu_hat[, batches_ind[[kk]]]
old_phi <- phi_hat[, kk]
new_mu <- mu_star[, batches_ind[[kk]]]
new_phi <- phi_star
adjust_counts[, batches_ind[[kk]]] <- match_quantiles(counts_sub=counts_sub,
old_mu=old_mu, old_phi=old_phi,
new_mu=new_mu, new_phi=new_phi)
}

#dimnames(adjust_counts) <- dimnames(counts)
#return(adjust_counts)

## Add back genes with only 0 counts in any batch (so that dimensions won't change)
adjust_counts_whole <- matrix(NA, nrow=nrow(countsOri), ncol=ncol(countsOri))
dimnames(adjust_counts_whole) <- dimnames(countsOri)
adjust_counts_whole[keep, ] <- adjust_counts
adjust_counts_whole[rm, ] <- countsOri[rm, ]
return(adjust_counts_whole)
}

0 comments on commit 3f50493

Please sign in to comment.