This notebook implements an efficient version of pseudobulk nb-glm based differential expression analysis with DESeq2. Pseudobulk means that all reads from a single batch group (e.g. donor) get pooled into a single observation. 

In general, pseudobulk is statistically preferable to but much slower than Wilcoxon, especially when you need to consider covariates. A more robust but considerably slower alternative to pseudobulk is including donors as random effects. Random effects are preferable for small cell count groups but likely give similar results to pseudobulk estimates for large groups. 

This idea is not at all new. The earliest reference I know of is from Lun et al.: 
https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0947-7. 


A few implementation notes: 

1) To find markers most upregulated in a cluster, I divide samples into those in and out of the cluster. An alternative is to let each out group remain an independent pseudobulk sample. This is in fact the recommended way from Mike Love: https://support.bioconductor.org/p/118090/. While this is certainly faster than re-estimating size factors for each cluster-specific analysis, I find it gives strange results. Namely, I get more inflated p-values and significant p-values for the wrong canonical marker genes (e.g. CD14 for B cells).

2) On my laptop, it takes ~20 seconds to run ~3000 genes from 2700 cells, 3 donors, 2 batches, and 9 cell types. 

# Load some data

In [1]:
suppressPackageStartupMessages({
    library(tidyverse)
#     library(presto)
    library(singlecellmethods)
    library(SeuratData)
    library(Seurat)
    library(DESeq2)    
})

# Set the notebook figure dimensions used by the repr plotting options.
#   h: plot height, stored in the `repr.plot.height` option
#   w: plot width, stored in the `repr.plot.width` option
# Returns (invisibly) the previous values of the two options, as `options()` does.
fig.size <- function(h, w) {
    plot_dims <- list(repr.plot.height = h, repr.plot.width = w)
    options(plot_dims)
}

Load small dataset for exposition

In [2]:
# Install the pbmc3k dataset only when SeuratData does not report it as installed,
# then load it into the session.
dataset_catalog <- SeuratData::AvailableData()
if (!dataset_catalog['pbmc3k.SeuratData', 'Installed']) {
    SeuratData::InstallData("pbmc3k")
}
data("pbmc3k")

Add fake donor and batch columns

In [3]:
# Assign every cell to a random fake donor (A/B/C) and a random fake batch (A/B).
# No seed is set here, so the assignments differ from run to run.
pbmc3k@meta.data$donor <- factor(
    sample(c('A', 'B', 'C'), size = ncol(pbmc3k), replace = TRUE)
)
pbmc3k@meta.data$batch <- factor(
    sample(c('A', 'B'), size = ncol(pbmc3k), replace = TRUE)
)

In [4]:
# Preview the first rows of the per-cell metadata, including the new donor/batch columns
pbmc3k@meta.data %>% 
    head(n = 6)

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,seurat_annotations,donor,batch
Unnamed: 0_level_1,<fct>,<dbl>,<int>,<fct>,<fct>,<fct>
AAACATACAACCAC,pbmc3k,2419,779,Memory CD4 T,A,A
AAACATTGAGCTAC,pbmc3k,4903,1352,B,B,B
AAACATTGATCAGC,pbmc3k,3147,1129,Memory CD4 T,A,A
AAACCGTGCTTCCG,pbmc3k,2639,960,CD14+ Mono,C,B
AAACCGTGTATGCG,pbmc3k,980,521,NK,C,A
AAACGCACTGGTAC,pbmc3k,2163,781,Memory CD4 T,A,B


In [12]:
# Load the in-development package code that contains this file
# (presumably provides collapse_counts / pseudobulk_deseq2 used below — verify)
devtools::load_all(path = '../R/pseudobulk.R')

Loading presto


# Functions

## Collapse to pseudobulk

In [5]:
# compute_hash <- function(data_df, vars_use) {
#     base <- 1
#     hash <- rep(0, nrow(data_df))
#     for (varname in vars_use) {
#         vals <- factor(data.frame(data_df)[, varname, drop = TRUE])
#         nlevel <- nlevels(vals)
#         hash <- hash + (as.integer(vals) - 1) * base
#         base <- base * nlevel
#     }
#     return(hash)
# }


In [6]:
# collapse_counts <- function(counts_mat, meta_data, varnames) {
#     ## give each unique row a hash value for indexing
#     hash <- compute_hash(meta_data, varnames)
#     idx_keep <- which(!is.na(hash))
#     hash <- hash[idx_keep]
#     hash <- factor(sprintf('sample_%d', as.integer(hash)))
#     meta_data <- meta_data[idx_keep, ]
#     counts_mat <- counts_mat[, idx_keep]
    
#     ## one hot encoded design matrix, sample level
#     design_collapsed <- data.frame(meta_data)[, varnames, drop = FALSE] %>% 
#         cbind(sample_id = hash) %>% 
#         unique()
#     row.names(design_collapsed) <- design_collapsed$sample_id

#     ## sum over samples
#     counts_collapsed <- presto:::sumGroups(counts_mat, hash, 1) %>% t()
#     row.names(counts_collapsed) <- row.names(counts_mat)
#     colnames(counts_collapsed) <- levels(hash)

#     ## reorder to match design matrix
#     counts_collapsed <- counts_collapsed[, design_collapsed$sample_id]
#     design_collapsed$sample_id <- NULL
#     return(list(counts_mat = counts_collapsed, meta_data = design_collapsed))
# }

## DESeq2 wrappers

In [7]:
# pseudobulk_deseq2 <- function(dge_formula, meta_data, counts_df, verbose=TRUE, 
#                    min_counts_per_sample=10, present_in_min_samples=5, collapse_background=TRUE) {
#     message('WARNING: meta_data should only contain pseudobulk identifying variables')
    
#     ## filter low expressed genes
#     genes_keep <- which(Matrix::rowSums(counts_df >= min_counts_per_sample) >= present_in_min_samples)
#     if (verbose) {
#         message(sprintf('Filtered out %d genes, analyzing %d genes', nrow(counts_df) - length(genes_keep), length(genes_keep)))
#     }
#     counts_df <- counts_df[genes_keep, ]
    
#     ## assume that the first variable in formula is the main contrast variable
#     all_vars <- unlist(strsplit(tail(as.character(dge_formula), 1), split = ' \\+ '))
#     if (verbose) {
#         message(sprintf('All vars: %s', paste(all_vars, collapse = ', ')))
#     }
#     contrast_var <- head(all_vars, 1)
#     if (verbose) {
#         message(sprintf('Contrast var: %s', contrast_var))
#     }
#     Reduce(rbind, lapply(unique(meta_data[[contrast_var]]), function(foreground_id) {
#         if (verbose) {
#             message(foreground_id)      
#         }
#         suppressMessages({suppressWarnings({
#             ## setup design 
#             design <- meta_data            
#             design[[contrast_var]] <- factor(ifelse(design[[contrast_var]] == foreground_id,
#                                                paste0('cluster_', foreground_id), 
#                                                'background'))
            
#             ## background clusters should not be treated as independent observations
#             if (collapse_background) {
#                 res <- collapse_counts(counts_df, design, colnames(design))
#                 design <- res$meta_data
#                 counts_df <- res$counts_mat                
#             }
                        
#             ## Do DGE with DESeq2
#             dds <- DESeqDataSetFromMatrix(
#                 countData = counts_df,
#                 colData = design,
#                 design = dge_formula) %>% 
#                 DESeq2::DESeq()

#             ## Get results 
#             contrast_name <- grep('cluster.*_vs_background', resultsNames(dds), value = TRUE)
#             dge_res <- results(dds, name = contrast_name) %>% 
#                     data.frame() %>% 
#                     tibble::rownames_to_column('feature') %>% 
#                     dplyr::arrange(-stat) %>% 
#                     dplyr::mutate(group = foreground_id)
#         })})
#         return(dge_res)
#     })) %>% 
#     dplyr::select(group, feature, dplyr::everything())

# }

In [8]:
# res_mat <- pseudobulk_deseq2(~seurat_annotations + donor, data_collapsed$meta_data, data_collapsed$counts_mat, verbose = TRUE)


ERROR: Error in pseudobulk_deseq2(~seurat_annotations + donor, data_collapsed$meta_data, : could not find function "pseudobulk_deseq2"


In [9]:
# top_markers_dds <- function(res, n=10, pval_max=1, padj_max=1, lfc_min=1) {
#     res %>% 
#         dplyr::filter(
#             .data$pvalue <= pval_max & 
#             .data$padj <= padj_max  &
#             log2FoldChange >= lfc_min
#         ) %>%
#         dplyr::group_by(.data$group) %>%
#         dplyr::top_n(n = n, wt = .data$stat) %>% 
#         dplyr::mutate(rank = rank(-.data$stat, ties.method = 'random')) %>% 
#         dplyr::ungroup() %>% 
#         dplyr::select(.data$feature, .data$group, .data$rank) %>% 
#         tidyr::spread(.data$group, .data$feature, fill = NA) %>% 
#         identity()
# }

# Test 

## Collapse to pseudobulk

In [17]:
# Pool raw counts into one pseudobulk sample per
# (cell type, donor, batch) combination and preview the sample-level design.
data_collapsed <- collapse_counts(
    pbmc3k@assays$RNA@counts, 
    pbmc3k@meta.data, 
    c('seurat_annotations', 'donor', 'batch')
)
data_collapsed[['meta_data']] %>% 
    head()

Unnamed: 0_level_0,seurat_annotations,donor,batch
Unnamed: 0_level_1,<fct>,<fct>,<fct>
sample_1,Memory CD4 T,A,A
sample_39,B,B,B
sample_47,CD14+ Mono,C,B
sample_24,NK,C,A
sample_28,Memory CD4 T,A,B
sample_4,CD8 T,A,A


## Do DESeq2

In [None]:
# DESeq2 on the pseudobulk samples; cell type is listed first in the formula,
# with donor and batch as additional covariates.
res_mat <- pseudobulk_deseq2(
    ~seurat_annotations + donor + batch, 
    data_collapsed[['meta_data']],
    data_collapsed[['counts_mat']], 
    verbose = TRUE
)


In [None]:
# Preview the first rows of the differential expression results
res_mat %>% 
    head(n = 6)

In [None]:
# Top markers per group, restricted to padj <= 0.05 and log2 fold change >= 1
res_mat %>% 
    top_markers_dds(padj_max = .05, lfc_min = 1)

## Volcano plots

In [None]:
# Volcano plot per group: log2 fold change against -log10(p-value).
# Points are coloured by whether they pass padj < .01 and |log2FC| > 1.
fig.size(6, 8)
res_mat %>% 
    ggplot(aes(log2FoldChange, -log10(pvalue), color = padj < .01 & abs(log2FoldChange) > 1)) + 
        geom_point(shape = 21) + 
        facet_wrap(~group, scales = 'free') + 
        # `guides(color = FALSE)` is deprecated in recent ggplot2; 'none' hides the legend
        guides(color = 'none') + 
        NULL

# Comparison to Wilcoxon

In this artificial example, donor and batch are fictitious, so DESeq2's GLM $\beta$ estimates should not be that different from the Wilcoxon estimates. Here, we'll compare $\beta$s to auROC, which is essentially equivalent to the Wilcoxon statistic. 

In [None]:
## Wilcoxon on CP10K normalized counts 
# Log-normalize the raw counts with a scaling factor of 1e4, then run the
# per-cluster Wilcoxon/AUC test on the cell type annotations.
exprs_norm <- pbmc3k@assays$RNA@counts %>% 
    singlecellmethods::normalizeData(scaling_factor = 1e4, method = 'log')
dge_wilxocon <- wilcoxauc(
    exprs_norm, 
    factor(pbmc3k@meta.data[['seurat_annotations']])
)


In [None]:
# Preview the first rows of the Wilcoxon results
dge_wilxocon %>% 
    head(n = 6)

In [None]:
# Match Wilcoxon and DESeq2 results on (feature, group) and plot AUC against the
# DESeq2 `stat` column, one panel per group. Reference lines mark AUC = 0.5 and stat = 0.
# NOTE(review): the y axis is labelled 'GLM beta' but the plotted column is `stat`,
# not `log2FoldChange` — confirm which one is intended.
fig.size(6, 8)
dge_wilxocon %>% 
    dplyr::inner_join(res_mat, by = c('feature', 'group')) %>% 
    ggplot(aes(auc, stat)) + 
        geom_point(shape = '.') + 
        facet_wrap(~group, scales = 'free') + 
        geom_vline(xintercept = .5) + 
        geom_hline(yintercept = 0) + 
        labs(x = 'AUC', y = 'GLM beta')

Most of the results agree, more or less. Interestingly, the Wilcoxon labels almost all genes as upregulated in DCs and CD16+ Monocytes and downregulated in Platelets. What's going on here? It turns out that DCs and CD16+ Monos are mRNA rich cells while platelets are mRNA poor cells overall. DESeq2 is able to account for this effect better than CP10K normalization. 


In [None]:
# Per-cell total counts (nCount_RNA) by annotated cell type, ordered by nCount_RNA.
# Cells without an annotation are dropped first.
fig.size(3, 5)
ggplot(
    subset(pbmc3k@meta.data, !is.na(seurat_annotations)), 
    aes(reorder(seurat_annotations, nCount_RNA), nCount_RNA)
) + 
    geom_boxplot(outlier.shape = NA) + 
    geom_jitter(shape = '.', height = 0) + 
    coord_flip() + 
    labs(x = '')

# Pairwise tests

Instead of 1-vs-all, let's do pairwise tests and then summarize statistics conservatively. 

In [None]:
# devtools::document('..')

In [None]:
# Reload the in-development package from the parent directory
devtools::load_all(path = '..')

In [None]:
# Rebuild the pseudobulk samples (one per cell type / donor / batch combination)
# with the freshly loaded code and preview the sample-level design.
data_collapsed <- collapse_counts(
    pbmc3k@assays$RNA@counts, 
    pbmc3k@meta.data, 
    c('seurat_annotations', 'donor', 'batch')
)
data_collapsed[['meta_data']] %>% 
    head()

BUG: when testing all vs all pairwise, crashes

In [None]:
# table(data_collapsed$meta_data$seurat_annotations)

In [None]:
# Same model as the one-vs-all run, but requesting the 'pairwise' mode
res_pair <- pseudobulk_deseq2(
    ~seurat_annotations + donor + batch, 
    data_collapsed[['meta_data']],
    data_collapsed[['counts_mat']], 
    verbose = TRUE, 
    mode = 'pairwise'
)#, vals_test = c('B', 'NK'))


In [None]:
# Summarize the pairwise results per group using the 'min' summary
res_min <- res_pair %>% 
    summarize_dge_pairs('min')
# res_max <- summarize_dge_pairs(res_pair, 'max')

In [None]:
# Show the first rows of the summarized pairwise results for every group.
# data.table is not attached by the notebook's library() calls, so the
# constructor is namespace-qualified; `.SD` is resolved inside `[.data.table`.
data.table::data.table(res_min)[, head(.SD), by = group]

In [None]:
# dge_formula <- ~seurat_annotations + donor + batch
# meta_data <- data_collapsed$meta_data
# counts_df <- data_collapsed$counts_mat
# verbose = TRUE
# contrast_var <- 'seurat_annotations'
# vals_test <- 'B'


In [None]:
# dge_1va_no <- pseudobulk_one_vs_all(dge_formula, counts_df, meta_data, contrast_var, vals_test, collapse_background = FALSE, verbose = TRUE)
# dge_1va_col <- pseudobulk_one_vs_all(dge_formula, counts_df, meta_data, contrast_var, vals_test, collapse_background = TRUE, verbose = TRUE)

In [None]:
# Conservative ('min') and liberal ('max') summaries of the pairwise tests.
# The pairwise results are stored in `res_pair` above; the previously referenced
# `dge_pairs` object is never created in this notebook.
dge_pairs_min <- summarize_dge_pairs(res_pair, 'min')
dge_pairs_max <- summarize_dge_pairs(res_pair, 'max')


In [None]:
## Compare to other modes

In [None]:
# Stack the results of the four testing strategies, tagging each with a `mode` label.
# NOTE(review): dge_1va_col and dge_1va_no are only produced by the commented-out
# pseudobulk_one_vs_all() calls earlier in this notebook — run those first.
dge <- Reduce(rbind, list(
    dge_pairs_min %>% dplyr::mutate(mode = 'Pairs_min'), 
    dge_pairs_max %>% dplyr::mutate(mode = 'Pairs_max'), 
    dge_1va_col %>% dplyr::mutate(mode = 'Onevall_collapse'), 
    dge_1va_no %>% dplyr::mutate(mode = 'Onevall_no')
))

In [None]:
# One row per feature, one log2 fold change column per testing mode
plt_df <- dge %>% 
    dplyr::select(feature, log2FoldChange, mode) %>% 
    tidyr::spread(key = mode, value = log2FoldChange)

# Keep only features that have an estimate under every mode
plt_df <- plt_df[stats::complete.cases(plt_df), ]

In [None]:
# Scatterplot matrix of log2 fold changes across the four testing modes.
# Layer order matters: `layer.diag = 2` places the second layer (the density)
# on the diagonal panels.
library(ggforce)
fig.size(6, 8)
ggplot(plt_df, aes(x = .panel_x, y = .panel_y)) + 
    geom_point(shape = '.') + 
    geom_autodensity(alpha = 0.3, position = 'identity') + 
    facet_matrix(
        vars(Pairs_min, Pairs_max, Onevall_collapse, Onevall_no), 
        layer.diag = 2
    ) + 
    # dashed axes through zero plus the identity line for reference
    geom_vline(aes(xintercept = 0), linetype = 2) + 
    geom_hline(aes(yintercept = 0), linetype = 2) + 
    geom_abline(aes(slope = 1, intercept = 0))

In [None]:
# Strongest positive hits under the 'min' summary, largest statistic first
dge_pairs_min %>% 
    subset(stat > 0) %>% 
    dplyr::arrange(dplyr::desc(stat)) %>% 
    head(n = 6)

In [None]:
# Strongest positive hits under the 'max' summary, largest statistic first
dge_pairs_max %>% 
    subset(stat > 0) %>% 
    dplyr::arrange(dplyr::desc(stat)) %>% 
    head(n = 6)

# 