Merge pull request #39 from zhangyuqing/master

Adding combat-seq to sva
jtleek · Mar 22, 2020 · 3f50493 · 3f50493
2 parents cbfaa32 + df7ca8b
commit 3f50493
Show file tree

Hide file tree

Showing 9 changed files with 470 additions and 13 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,10 @@
 Package: sva
 Title: Surrogate Variable Analysis
-Version: 3.35.1
+Version: 3.35.2
 Author: Jeffrey T. Leek <jtleek@gmail.com>, W. Evan Johnson <wej@bu.edu>,
     Hilary S. Parker <hiparker@jhsph.edu>, Elana J. Fertig <ejfertig@jhmi.edu>,
-    Andrew E. Jaffe <ajaffe@jhsph.edu>, John D. Storey <jstorey@princeton.edu>,
-    Yuqing Zhang <zhangyuqing.pkusms@gmail.com>,
+    Andrew E. Jaffe <ajaffe@jhsph.edu>, Yuqing Zhang <zhangyuqing.pkusms@gmail.com>, 
+    John D. Storey <jstorey@princeton.edu>, 
     Leonardo Collado Torres <lcolladotor@gmail.com>
 Description: The sva package contains functions for removing batch
     effects and other unwanted variation in high-throughput
@@ -32,13 +32,14 @@ Depends:
     R (>= 3.2),
     mgcv,
     genefilter,
-  	BiocParallel
+    BiocParallel
 Imports:
     matrixStats,
     stats,
     graphics,
     utils,
     limma,
+    edgeR
 Suggests:
     pamr,
     bladderbatch,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(ComBat)
+export(ComBat_seq)
 export(empirical.controls)
 export(f.pvalue)
 export(fstats)
@@ -21,23 +22,34 @@ import(genefilter)
 import(matrixStats, except = c(rowSds, rowVars))
 
 import(mgcv)
+importFrom(edgeR,DGEList)
+importFrom(edgeR,estimateGLMCommonDisp)
+importFrom(edgeR,estimateGLMTagwiseDisp)
+importFrom(edgeR,getOffset)
+importFrom(edgeR,glmFit)
+importFrom(edgeR,glmFit.default)
 importFrom(graphics,lines)
 importFrom(graphics,par)
 importFrom(limma,lmFit)
 importFrom(stats,cor)
 importFrom(stats,density)
+importFrom(stats,dnbinom)
 importFrom(stats,dnorm)
+importFrom(stats,lm)
 importFrom(stats,model.matrix)
 importFrom(stats,pf)
+importFrom(stats,pnbinom)
 importFrom(stats,ppoints)
 importFrom(stats,prcomp)
 importFrom(stats,predict)
 importFrom(stats,qgamma)
+importFrom(stats,qnbinom)
 importFrom(stats,qnorm)
 importFrom(stats,qqline)
 importFrom(stats,qqnorm)
 importFrom(stats,qqplot)
 importFrom(stats,smooth.spline)
 importFrom(stats,var)
+importFrom(utils,capture.output)
 importFrom(utils,read.delim)
 useDynLib(sva, .registration = TRUE)
diff --git a/R/ComBat.R b/R/ComBat.R
@@ -43,16 +43,40 @@
 #' @export
 #'
 
-ComBat <- function (dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALSE,
-                    mean.only = FALSE, ref.batch = NULL, BPPARAM = bpparam("SerialParam")) {
+ComBat <- function(dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALSE,
+                   mean.only = FALSE, ref.batch = NULL, BPPARAM = bpparam("SerialParam")) {
+    if(length(dim(batch))>1){
+      stop("This version of ComBat only allows one batch variable")
+    }  ## to be updated soon!  
+
+    ## coerce dat into a matrix
+    dat <- as.matrix(dat)
+
+    ## find genes with zero variance in any of the batches
+    batch <- as.factor(batch)
+    zero.rows.lst <- lapply(levels(batch), function(batch_level){
+      if(sum(batch==batch_level)>1){
+        return(which(apply(dat[, batch==batch_level], 1, function(x){var(x)==0})))
+      }else{
+        return(which(rep(1,3)==2))
+      }
+    })
+    zero.rows <- Reduce(union, zero.rows.lst)
+    keep.rows <- setdiff(1:nrow(dat), zero.rows)
+
+    if (length(zero.rows) > 0) {
+      cat(sprintf("Found %d genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.\n", length(zero.rows)))
+      # keep a copy of the original data matrix and remove zero var rows
+      dat.orig <- dat
+      dat <- dat[keep.rows, ]
+    }
+
     ## make batch a factor and make a set of indicators for batch
+    if(any(table(batch)==1)){mean.only=TRUE}
     if(mean.only==TRUE){
         message("Using the 'mean only' version of ComBat")
     }
-    if(length(dim(batch))>1){
-        stop("This version of ComBat only allows one batch variable")
-    }  ## to be updated soon!
-    batch <- as.factor(batch)
+
     batchmod <- model.matrix(~-1+batch)  
     if (!is.null(ref.batch)){
         ## check for reference batch, check value, and make appropriate changes
@@ -267,13 +291,16 @@ ComBat <- function (dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALS
 
     bayesdata <- (bayesdata*(sqrt(var.pooled)%*%t(rep(1,n.array))))+stand.mean # FIXME
 
-    ## tiny change still exist when tested on bladder data
-    ## total sum of change within each batch around 1e-15 
-    ## (could be computational system error).  
     ## Do not change ref batch at all in reference version
     if(!is.null(ref.batch)){
         bayesdata[, batches[[ref]]] <- dat[, batches[[ref]]]
     }
 
+    ## put genes with 0 variance in any batch back in data
+    if (length(zero.rows) > 0) {
+      dat.orig[keep.rows, ] <- bayesdata
+      bayesdata <- dat.orig
+    }
+
     return(bayesdata)
 }
diff --git a/R/ComBat_seq.R b/R/ComBat_seq.R
@@ -0,0 +1,216 @@
+#' Adjust for batch effects using an empirical Bayes framework in RNA-seq raw counts
+#' 
+#' ComBat_seq is an improved model from ComBat using negative binomial regression, 
+#' which specifically targets RNA-Seq count data.
+#' 
+#' @param counts Raw count matrix from genomic studies (dimensions gene x sample) 
+#' @param batch Vector / factor for batch
+#' @param group Vector / factor for biological condition of interest 
+#' @param covar_mod Model matrix for multiple covariates to include in linear model (signals from these variables are kept in data after adjustment) 
+#' @param full_mod Boolean, if TRUE include condition of interest in model
+#' @param shrink Boolean, whether to apply shrinkage on parameter estimation
+#' @param shrink.disp Boolean, whether to apply shrinkage on dispersion
+#' @param gene.subset.n Number of genes to use in empirical Bayes estimation, only useful when shrink = TRUE
+#' 
+#' @return data A gene x sample count matrix, adjusted for batch effects.
+#' 
+#' @importFrom edgeR DGEList estimateGLMCommonDisp estimateGLMTagwiseDisp glmFit glmFit.default getOffset
+#' @importFrom stats dnbinom lm pnbinom qnbinom
+#' @importFrom utils capture.output
+#' 
+#' @examples 
+#' 
+#' count_matrix <- matrix(rnbinom(400, size=10, prob=0.1), nrow=50, ncol=8)
+#' batch <- c(rep(1, 4), rep(2, 4))
+#' group <- rep(c(0,1), 4)
+#' 
+#' # include condition (group variable)
+#' adjusted_counts <- ComBat_seq(count_matrix, batch=batch, group=group, full_mod=TRUE)
+#' 
+#' # do not include condition
+#' adjusted_counts <- ComBat_seq(count_matrix, batch=batch, group=NULL, full_mod=FALSE)
+#' 
+#' @export
+#' 
+
+ComBat_seq <- function(counts, batch, group=NULL, covar_mod=NULL, full_mod=TRUE, 
+                       shrink=FALSE, shrink.disp=FALSE, gene.subset.n=NULL){  
+  ########  Preparation  ########  
+  ## Does not support 1 sample per batch yet
+  batch <- as.factor(batch)
+  if(any(table(batch)<=1)){
+    stop("ComBat-seq doesn't support 1 sample per batch yet")
+  }
+
+  ## Remove genes with only 0 counts in any batch
+  keep_lst <- lapply(levels(batch), function(b){
+    which(apply(counts[, batch==b], 1, function(x){!all(x==0)}))
+  })
+  keep <- Reduce(intersect, keep_lst)
+  rm <- setdiff(1:nrow(counts), keep)
+  countsOri <- counts
+  counts <- counts[keep, ]
+
+  # require bioconductor 3.7, edgeR 3.22.1
+  dge_obj <- DGEList(counts=counts)
+
+  ## Prepare characteristics on batches
+  n_batch <- nlevels(batch)  # number of batches
+  batches_ind <- lapply(1:n_batch, function(i){which(batch==levels(batch)[i])}) # list of samples in each batch  
+  n_batches <- sapply(batches_ind, length)
+  #if(any(n_batches==1)){mean_only=TRUE; cat("Note: one batch has only one sample, setting mean.only=TRUE\n")}
+  n_sample <- sum(n_batches)
+  cat("Found",n_batch,'batches\n')
+
+  ## Make design matrix 
+  # batch
+  batchmod <- model.matrix(~-1+batch)  # colnames: levels(batch)
+  # covariate
+  group <- as.factor(group)
+  if(full_mod & nlevels(group)>1){
+    cat("Using full model in ComBat-seq.\n")
+    mod <- model.matrix(~group)
+  }else{
+    cat("Using null model in ComBat-seq.\n")
+    mod <- model.matrix(~1, data=as.data.frame(t(counts)))
+  }
+  # drop intercept in covariate model
+  if(!is.null(covar_mod)){
+    if(is.data.frame(covar_mod)){
+      covar_mod <- do.call(cbind, lapply(1:ncol(covar_mod), function(i){model.matrix(~covar_mod[,i])}))
+    }
+    covar_mod <- covar_mod[, !apply(covar_mod, 2, function(x){all(x==1)})]
+  }
+  # bind with biological condition of interest
+  mod <- cbind(mod, covar_mod)
+  # combine
+  design <- cbind(batchmod, mod)
+
+  ## Check for intercept in covariates, and drop if present
+  check <- apply(design, 2, function(x) all(x == 1))
+  #if(!is.null(ref)){check[ref]=FALSE} ## except don't throw away the reference batch indicator
+  design <- as.matrix(design[,!check])
+  cat("Adjusting for",ncol(design)-ncol(batchmod),'covariate(s) or covariate level(s)\n')
+
+  ## Check if the design is confounded
+  if(qr(design)$rank<ncol(design)){
+    #if(ncol(design)<=(n_batch)){stop("Batch variables are redundant! Remove one or more of the batch variables so they are no longer confounded")}
+    if(ncol(design)==(n_batch+1)){stop("The covariate is confounded with batch! Remove the covariate and rerun ComBat-Seq")}
+    if(ncol(design)>(n_batch+1)){
+      if((qr(design[,-c(1:n_batch)])$rank<ncol(design[,-c(1:n_batch)]))){stop('The covariates are confounded! Please remove one or more of the covariates so the design is not confounded')
+      }else{stop("At least one covariate is confounded with batch! Please remove confounded covariates and rerun ComBat-Seq")}}
+  }
+
+  ## Check for missing values in count matrix
+  NAs = any(is.na(counts))
+  if(NAs){cat(c('Found',sum(is.na(counts)),'Missing Data Values\n'),sep=' ')}
+
+
+  ########  Estimate gene-wise dispersions within each batch  ########
+  cat("Estimating dispersions\n")
+  ## Estimate common dispersion within each batch as an initial value
+  disp_common <- sapply(1:n_batch, function(i){
+    if((n_batches[i] <= ncol(design)-ncol(batchmod)+1) | qr(mod[batches_ind[[i]], ])$rank < ncol(mod)){ 
+      # not enough residual degree of freedom
+      return(estimateGLMCommonDisp(counts[, batches_ind[[i]]], design=NULL, subset=nrow(counts)))
+    }else{
+      return(estimateGLMCommonDisp(counts[, batches_ind[[i]]], design=mod[batches_ind[[i]], ], subset=nrow(counts)))
+    }
+  })
+
+  ## Estimate gene-wise dispersion within each batch 
+  genewise_disp_lst <- lapply(1:n_batch, function(j){
+    if((n_batches[j] <= ncol(design)-ncol(batchmod)+1) | qr(mod[batches_ind[[j]], ])$rank < ncol(mod)){
+      # not enough residual degrees of freedom - use the common dispersion
+      return(rep(disp_common[j], nrow(counts)))
+    }else{
+      return(estimateGLMTagwiseDisp(counts[, batches_ind[[j]]], design=mod[batches_ind[[j]], ], 
+                                    dispersion=disp_common[j], prior.df=0))
+    }
+  })
+  names(genewise_disp_lst) <- paste0('batch', levels(batch))
+
+  ## construct dispersion matrix
+  phi_matrix <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
+  for(k in 1:n_batch){
+    phi_matrix[, batches_ind[[k]]] <- vec2mat(genewise_disp_lst[[k]], n_batches[k]) 
+  }
+
+
+  ########  Estimate parameters from NB GLM  ########
+  cat("Fitting the GLM model\n")
+  glm_f <- glmFit(dge_obj, design=design, dispersion=phi_matrix, prior.count=1e-4) #no intercept - nonEstimable; compute offset (library sizes) within function
+  alpha_g <- glm_f$coefficients[, 1:n_batch] %*% as.matrix(n_batches/n_sample) #compute intercept as batch-size-weighted average from batches
+  new_offset <- t(vec2mat(getOffset(dge_obj), nrow(counts))) +   # original offset - sample (library) size
+    vec2mat(alpha_g, ncol(counts))  # new offset - gene background expression # getOffset(dge_obj) is the same as log(dge_obj$samples$lib.size)
+  glm_f2 <- glmFit.default(dge_obj$counts, design=design, dispersion=phi_matrix, offset=new_offset, prior.count=1e-4) 
+
+  gamma_hat <- glm_f2$coefficients[, 1:n_batch]
+  mu_hat <- glm_f2$fitted.values
+  phi_hat <- do.call(cbind, genewise_disp_lst)
+
+
+  ########  In each batch, compute posterior estimation through Monte-Carlo integration  ########  
+  if(shrink){
+    cat("Apply shrinkage - computing posterior estimates for parameters\n")
+    mcint_fun <- monte_carlo_int_NB
+    monte_carlo_res <- lapply(1:n_batch, function(ii){
+      if(ii==1){
+        mcres <- mcint_fun(dat=counts[, batches_ind[[ii]]], mu=mu_hat[, batches_ind[[ii]]], 
+                           gamma=gamma_hat[, ii], phi=phi_hat[, ii], gene.subset.n=gene.subset.n)
+      }else{
+        invisible(capture.output(mcres <- mcint_fun(dat=counts[, batches_ind[[ii]]], mu=mu_hat[, batches_ind[[ii]]], 
+                                                    gamma=gamma_hat[, ii], phi=phi_hat[, ii], gene.subset.n=gene.subset.n)))
+      }
+      return(mcres)
+    })
+    names(monte_carlo_res) <- paste0('batch', levels(batch))
+
+    gamma_star_mat <- lapply(monte_carlo_res, function(res){res$gamma_star})
+    gamma_star_mat <- do.call(cbind, gamma_star_mat)
+    phi_star_mat <- lapply(monte_carlo_res, function(res){res$phi_star})
+    phi_star_mat <- do.call(cbind, phi_star_mat)
+
+    if(!shrink.disp){
+      cat("Apply shrinkage to mean only\n")
+      phi_star_mat <- phi_hat
+    }
+  }else{
+    cat("Shrinkage off - using GLM estimates for parameters\n")
+    gamma_star_mat <- gamma_hat
+    phi_star_mat <- phi_hat
+  }
+
+
+  ########  Obtain adjusted batch-free distribution  ########
+  mu_star <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
+  for(jj in 1:n_batch){
+    mu_star[, batches_ind[[jj]]] <- exp(log(mu_hat[, batches_ind[[jj]]])-vec2mat(gamma_star_mat[, jj], n_batches[jj]))
+  }
+  phi_star <- rowMeans(phi_star_mat)
+
+
+  ########  Adjust the data  ########  
+  cat("Adjusting the data\n")
+  adjust_counts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
+  for(kk in 1:n_batch){
+    counts_sub <- counts[, batches_ind[[kk]]]
+    old_mu <- mu_hat[, batches_ind[[kk]]]
+    old_phi <- phi_hat[, kk]
+    new_mu <- mu_star[, batches_ind[[kk]]]
+    new_phi <- phi_star
+    adjust_counts[, batches_ind[[kk]]] <- match_quantiles(counts_sub=counts_sub, 
+                                                          old_mu=old_mu, old_phi=old_phi, 
+                                                          new_mu=new_mu, new_phi=new_phi)
+  }
+
+  #dimnames(adjust_counts) <- dimnames(counts)
+  #return(adjust_counts)
+
+  ## Add back genes with only 0 counts in any batch (so that dimensions won't change)
+  adjust_counts_whole <- matrix(NA, nrow=nrow(countsOri), ncol=ncol(countsOri))
+  dimnames(adjust_counts_whole) <- dimnames(countsOri)
+  adjust_counts_whole[keep, ] <- adjust_counts
+  adjust_counts_whole[rm, ] <- countsOri[rm, ]
+  return(adjust_counts_whole)
+}