Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #39 from zhangyuqing/master
Adding combat-seq to sva
- Loading branch information
Showing
9 changed files
with
470 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
#' Adjust for batch effects using an empirical Bayes framework in RNA-seq raw counts | ||
#' | ||
#' ComBat_seq is an improved model from ComBat using negative binomial regression, | ||
#' which specifically targets RNA-Seq count data. | ||
#' | ||
#' @param counts Raw count matrix from genomic studies (dimensions gene x sample) | ||
#' @param batch Vector / factor for batch | ||
#' @param group Vector / factor for biological condition of interest | ||
#' @param covar_mod Model matrix for multiple covariates to include in linear model (signals from these variables are kept in data after adjustment) | ||
#' @param full_mod Boolean, if TRUE include condition of interest in model | ||
#' @param shrink Boolean, whether to apply shrinkage on parameter estimation | ||
#' @param shrink.disp Boolean, whether to apply shrinkage on dispersion | ||
#' @param gene.subset.n Number of genes to use in empirical Bayes estimation, only useful when shrink = TRUE | ||
#' | ||
#' @return data A gene x sample count matrix, adjusted for batch effects. | ||
#' | ||
#' @importFrom edgeR DGEList estimateGLMCommonDisp estimateGLMTagwiseDisp glmFit glmFit.default getOffset | ||
#' @importFrom stats dnbinom lm pnbinom qnbinom | ||
#' @importFrom utils capture.output | ||
#' | ||
#' @examples | ||
#' | ||
#' count_matrix <- matrix(rnbinom(400, size=10, prob=0.1), nrow=50, ncol=8) | ||
#' batch <- c(rep(1, 4), rep(2, 4)) | ||
#' group <- rep(c(0,1), 4) | ||
#' | ||
#' # include condition (group variable) | ||
#' adjusted_counts <- ComBat_seq(count_matrix, batch=batch, group=group, full_mod=TRUE) | ||
#' | ||
#' # do not include condition | ||
#' adjusted_counts <- ComBat_seq(count_matrix, batch=batch, group=NULL, full_mod=FALSE) | ||
#' | ||
#' @export | ||
#' | ||
|
||
ComBat_seq <- function(counts, batch, group=NULL, covar_mod=NULL, full_mod=TRUE, | ||
shrink=FALSE, shrink.disp=FALSE, gene.subset.n=NULL){ | ||
######## Preparation ######## | ||
## Does not support 1 sample per batch yet | ||
batch <- as.factor(batch) | ||
if(any(table(batch)<=1)){ | ||
stop("ComBat-seq doesn't support 1 sample per batch yet") | ||
} | ||
|
||
## Remove genes with only 0 counts in any batch | ||
keep_lst <- lapply(levels(batch), function(b){ | ||
which(apply(counts[, batch==b], 1, function(x){!all(x==0)})) | ||
}) | ||
keep <- Reduce(intersect, keep_lst) | ||
rm <- setdiff(1:nrow(counts), keep) | ||
countsOri <- counts | ||
counts <- counts[keep, ] | ||
|
||
# require bioconductor 3.7, edgeR 3.22.1 | ||
dge_obj <- DGEList(counts=counts) | ||
|
||
## Prepare characteristics on batches | ||
n_batch <- nlevels(batch) # number of batches | ||
batches_ind <- lapply(1:n_batch, function(i){which(batch==levels(batch)[i])}) # list of samples in each batch | ||
n_batches <- sapply(batches_ind, length) | ||
#if(any(n_batches==1)){mean_only=TRUE; cat("Note: one batch has only one sample, setting mean.only=TRUE\n")} | ||
n_sample <- sum(n_batches) | ||
cat("Found",n_batch,'batches\n') | ||
|
||
## Make design matrix | ||
# batch | ||
batchmod <- model.matrix(~-1+batch) # colnames: levels(batch) | ||
# covariate | ||
group <- as.factor(group) | ||
if(full_mod & nlevels(group)>1){ | ||
cat("Using full model in ComBat-seq.\n") | ||
mod <- model.matrix(~group) | ||
}else{ | ||
cat("Using null model in ComBat-seq.\n") | ||
mod <- model.matrix(~1, data=as.data.frame(t(counts))) | ||
} | ||
# drop intercept in covariate model | ||
if(!is.null(covar_mod)){ | ||
if(is.data.frame(covar_mod)){ | ||
covar_mod <- do.call(cbind, lapply(1:ncol(covar_mod), function(i){model.matrix(~covar_mod[,i])})) | ||
} | ||
covar_mod <- covar_mod[, !apply(covar_mod, 2, function(x){all(x==1)})] | ||
} | ||
# bind with biological condition of interest | ||
mod <- cbind(mod, covar_mod) | ||
# combine | ||
design <- cbind(batchmod, mod) | ||
|
||
## Check for intercept in covariates, and drop if present | ||
check <- apply(design, 2, function(x) all(x == 1)) | ||
#if(!is.null(ref)){check[ref]=FALSE} ## except don't throw away the reference batch indicator | ||
design <- as.matrix(design[,!check]) | ||
cat("Adjusting for",ncol(design)-ncol(batchmod),'covariate(s) or covariate level(s)\n') | ||
|
||
## Check if the design is confounded | ||
if(qr(design)$rank<ncol(design)){ | ||
#if(ncol(design)<=(n_batch)){stop("Batch variables are redundant! Remove one or more of the batch variables so they are no longer confounded")} | ||
if(ncol(design)==(n_batch+1)){stop("The covariate is confounded with batch! Remove the covariate and rerun ComBat-Seq")} | ||
if(ncol(design)>(n_batch+1)){ | ||
if((qr(design[,-c(1:n_batch)])$rank<ncol(design[,-c(1:n_batch)]))){stop('The covariates are confounded! Please remove one or more of the covariates so the design is not confounded') | ||
}else{stop("At least one covariate is confounded with batch! Please remove confounded covariates and rerun ComBat-Seq")}} | ||
} | ||
|
||
## Check for missing values in count matrix | ||
NAs = any(is.na(counts)) | ||
if(NAs){cat(c('Found',sum(is.na(counts)),'Missing Data Values\n'),sep=' ')} | ||
|
||
|
||
######## Estimate gene-wise dispersions within each batch ######## | ||
cat("Estimating dispersions\n") | ||
## Estimate common dispersion within each batch as an initial value | ||
disp_common <- sapply(1:n_batch, function(i){ | ||
if((n_batches[i] <= ncol(design)-ncol(batchmod)+1) | qr(mod[batches_ind[[i]], ])$rank < ncol(mod)){ | ||
# not enough residual degree of freedom | ||
return(estimateGLMCommonDisp(counts[, batches_ind[[i]]], design=NULL, subset=nrow(counts))) | ||
}else{ | ||
return(estimateGLMCommonDisp(counts[, batches_ind[[i]]], design=mod[batches_ind[[i]], ], subset=nrow(counts))) | ||
} | ||
}) | ||
|
||
## Estimate gene-wise dispersion within each batch | ||
genewise_disp_lst <- lapply(1:n_batch, function(j){ | ||
if((n_batches[j] <= ncol(design)-ncol(batchmod)+1) | qr(mod[batches_ind[[j]], ])$rank < ncol(mod)){ | ||
# not enough residual degrees of freedom - use the common dispersion | ||
return(rep(disp_common[j], nrow(counts))) | ||
}else{ | ||
return(estimateGLMTagwiseDisp(counts[, batches_ind[[j]]], design=mod[batches_ind[[j]], ], | ||
dispersion=disp_common[j], prior.df=0)) | ||
} | ||
}) | ||
names(genewise_disp_lst) <- paste0('batch', levels(batch)) | ||
|
||
## construct dispersion matrix | ||
phi_matrix <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts)) | ||
for(k in 1:n_batch){ | ||
phi_matrix[, batches_ind[[k]]] <- vec2mat(genewise_disp_lst[[k]], n_batches[k]) | ||
} | ||
|
||
|
||
######## Estimate parameters from NB GLM ######## | ||
cat("Fitting the GLM model\n") | ||
glm_f <- glmFit(dge_obj, design=design, dispersion=phi_matrix, prior.count=1e-4) #no intercept - nonEstimable; compute offset (library sizes) within function | ||
alpha_g <- glm_f$coefficients[, 1:n_batch] %*% as.matrix(n_batches/n_sample) #compute intercept as batch-size-weighted average from batches | ||
new_offset <- t(vec2mat(getOffset(dge_obj), nrow(counts))) + # original offset - sample (library) size | ||
vec2mat(alpha_g, ncol(counts)) # new offset - gene background expression # getOffset(dge_obj) is the same as log(dge_obj$samples$lib.size) | ||
glm_f2 <- glmFit.default(dge_obj$counts, design=design, dispersion=phi_matrix, offset=new_offset, prior.count=1e-4) | ||
|
||
gamma_hat <- glm_f2$coefficients[, 1:n_batch] | ||
mu_hat <- glm_f2$fitted.values | ||
phi_hat <- do.call(cbind, genewise_disp_lst) | ||
|
||
|
||
######## In each batch, compute posterior estimation through Monte-Carlo integration ######## | ||
if(shrink){ | ||
cat("Apply shrinkage - computing posterior estimates for parameters\n") | ||
mcint_fun <- monte_carlo_int_NB | ||
monte_carlo_res <- lapply(1:n_batch, function(ii){ | ||
if(ii==1){ | ||
mcres <- mcint_fun(dat=counts[, batches_ind[[ii]]], mu=mu_hat[, batches_ind[[ii]]], | ||
gamma=gamma_hat[, ii], phi=phi_hat[, ii], gene.subset.n=gene.subset.n) | ||
}else{ | ||
invisible(capture.output(mcres <- mcint_fun(dat=counts[, batches_ind[[ii]]], mu=mu_hat[, batches_ind[[ii]]], | ||
gamma=gamma_hat[, ii], phi=phi_hat[, ii], gene.subset.n=gene.subset.n))) | ||
} | ||
return(mcres) | ||
}) | ||
names(monte_carlo_res) <- paste0('batch', levels(batch)) | ||
|
||
gamma_star_mat <- lapply(monte_carlo_res, function(res){res$gamma_star}) | ||
gamma_star_mat <- do.call(cbind, gamma_star_mat) | ||
phi_star_mat <- lapply(monte_carlo_res, function(res){res$phi_star}) | ||
phi_star_mat <- do.call(cbind, phi_star_mat) | ||
|
||
if(!shrink.disp){ | ||
cat("Apply shrinkage to mean only\n") | ||
phi_star_mat <- phi_hat | ||
} | ||
}else{ | ||
cat("Shrinkage off - using GLM estimates for parameters\n") | ||
gamma_star_mat <- gamma_hat | ||
phi_star_mat <- phi_hat | ||
} | ||
|
||
|
||
######## Obtain adjusted batch-free distribution ######## | ||
mu_star <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts)) | ||
for(jj in 1:n_batch){ | ||
mu_star[, batches_ind[[jj]]] <- exp(log(mu_hat[, batches_ind[[jj]]])-vec2mat(gamma_star_mat[, jj], n_batches[jj])) | ||
} | ||
phi_star <- rowMeans(phi_star_mat) | ||
|
||
|
||
######## Adjust the data ######## | ||
cat("Adjusting the data\n") | ||
adjust_counts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts)) | ||
for(kk in 1:n_batch){ | ||
counts_sub <- counts[, batches_ind[[kk]]] | ||
old_mu <- mu_hat[, batches_ind[[kk]]] | ||
old_phi <- phi_hat[, kk] | ||
new_mu <- mu_star[, batches_ind[[kk]]] | ||
new_phi <- phi_star | ||
adjust_counts[, batches_ind[[kk]]] <- match_quantiles(counts_sub=counts_sub, | ||
old_mu=old_mu, old_phi=old_phi, | ||
new_mu=new_mu, new_phi=new_phi) | ||
} | ||
|
||
#dimnames(adjust_counts) <- dimnames(counts) | ||
#return(adjust_counts) | ||
|
||
## Add back genes with only 0 counts in any batch (so that dimensions won't change) | ||
adjust_counts_whole <- matrix(NA, nrow=nrow(countsOri), ncol=ncol(countsOri)) | ||
dimnames(adjust_counts_whole) <- dimnames(countsOri) | ||
adjust_counts_whole[keep, ] <- adjust_counts | ||
adjust_counts_whole[rm, ] <- countsOri[rm, ] | ||
return(adjust_counts_whole) | ||
} |
Oops, something went wrong.