In [2]:
library(tidyverse)
library(Seurat)

library(nebula)
library(fixest)
library(glmGamPoi)
library(limma)
library(edgeR)
library(SingleCellExperiment)
library(data.table)

── [1mAttaching core tidyverse packages[22m ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Attaching SeuratObject


Attaching package: ‘glmGamPoi’


The following object is masked 

In [3]:
selectCol <- function(mat, j.col){    
    x.col.dense <- rep(0,nrow(mat))
    p.begin <- mat@p[j.col]+1
    p.end <- mat@p[j.col+1]
    i.col <- mat@i[p.begin:p.end]+1 # i counts from 0
    x.col <- mat@x[p.begin:p.end]
    x.col.dense[i.col] <- x.col
    return(x.col.dense)
    }

selectCols <- function(mat, j.cols){    
    return(sapply(j.cols, selectCol, mat=mat))
    }

fixest.mult <- function(formula, count, df){
    df.result <- data.frame(matrix(nrow=0, ncol=4))
    colnames(df.result) <- c('Estimate', 'Std. Error', 't value', 'Pr(>|t|)')
    for (j in 1:ncol(count)){
        df$y <- selectCol(count, j)
        fit <- fixest::fepois(formula, vcov='hetero', data=df)
        df.result[colnames(count)[j],] <- coeftable(fit)['tx_cell',] # fixed effect o/x 에 따라 다르게 들어가야함
        } 
    return(df.result)
    }

nebula.mult <- function(formula, count, df){
    pred <- model.matrix(formula, data=df)
    sid <- df$id
    fit.nebula <- nebula::nebula(
        count,
        sid,
        pred=pred,
        cpc=0,
        mincp=0
        )
    fit.result <- fit.nebula$summary
    rownames(fit.result) <- fit.result$gene
    return(
            fit.result %>%
            mutate(
                Estimate=logFC_tx_cell,
                'Std. Error'=se_tx_cell,
                't value'=logFC_tx_cell/se_tx_cell,
                'Pr(>|t|)'=p_tx_cell
                ) %>%
            select(Estimate, 'Std. Error', 't value', 'Pr(>|t|)')
        )
    }

glmgp.mult <- function(formula, count, df){
    sce.obj <- SingleCellExperiment::SingleCellExperiment(list(counts=count), colData=df)
    sce.pb <- glmGamPoi::pseudobulk(
        sce.obj,
        group_by=vars(id, tx_cell),
        verbose=FALSE
        )

    fit <- glmGamPoi::glm_gp(sce.pb, design=~1+tx_cell)
    test <- glmGamPoi::test_de(fit, reduced_design=~1)
    
    beta <- fit$Beta[,'tx_cell']
    pval <- test$pval
    tval <- qnorm(1-pval/2) * sign(beta)
    se <- beta/tval
    result <- cbind(beta, se, tval, pval)
    colnames(result) <- c('Estimate', 'Std. Error', 't value', 'Pr(>|t|)')
    return(result)
    }

glmgp.cell.mult <- function(formula, count, df){
    sce.obj <- SingleCellExperiment::SingleCellExperiment(list(counts=count), colData=df)
    fit <- glmGamPoi::glm_gp(sce.obj, design=~1+tx_cell, on_disk=FALSE, size_factors=FALSE)
    test <- glmGamPoi::test_de(fit, reduced_design=~1)
    
    beta <- fit$Beta[,'tx_cell']
    pval <- test$pval
    tval <- qnorm(1-pval/2) * sign(beta)
    se <- beta/tval
    result <- cbind(beta, se, tval, pval)
    colnames(result) <- c('Estimate', 'Std. Error', 't value', 'Pr(>|t|)')
    return(result)
    }

edger.mult <- function(formula, count, df){
    sce.obj <- SingleCellExperiment::SingleCellExperiment(list(counts=count), colData=df)
    sce.pb <- glmGamPoi::pseudobulk(
        sce.obj,
        group_by=vars(id, tx_cell),
        verbose=FALSE
        )

    design <- model.matrix(~1+tx_cell, data=colData(sce.pb))
    edger.obj <- edgeR::DGEList(counts(sce.pb))
    edger.obj <- edgeR::estimateDisp(edger.obj, design)
    fit <- edgeR::glmQLFit(y=edger.obj, design=design)
    test <- edgeR::glmTreat(fit, coef=2)

    beta <- test$coefficients[,'tx_cell']
    pval <- test$table[,'PValue']
    tval <- qnorm(1-pval/2) * sign(beta)
    se <- beta/tval

    result <- cbind(beta, se, tval, pval)
    colnames(result) <- c('Estimate', 'Std. Error', 't value', 'Pr(>|t|)')
    
    return(result)
    }

limma.mult <- function(formula, count, df){
    sce.obj <- SingleCellExperiment::SingleCellExperiment(list(counts=count), colData=df)
    sce.pb <- glmGamPoi::pseudobulk(
        sce.obj,
        group_by=vars(id, tx_cell),
        verbose=FALSE
        )

    design <- model.matrix(~1+tx_cell, data=colData(sce.pb))
    edger.obj <- edgeR::DGEList(counts(sce.pb))
    v <- limma::voom(edger.obj, design)
    vfit <- limma::lmFit(v, design)
    efit <- limma::eBayes(vfit)
    
    beta <- efit$coefficients[,'tx_cell'] * log(2)
    pval <- efit$p.value[,'tx_cell']
    tval <- qnorm(1-pval/2) * sign(beta)
    se <- beta/tval

    result <- cbind(beta, se, tval, pval)
    colnames(result) <- c('Estimate', 'Std. Error', 't value', 'Pr(>|t|)')
    
    return(result)
    }

In [3]:
obj <- readRDS("/data02/hanbin973/deg_practical/datasets/kang.2018.sce.rds")

In [5]:
colData(obj)

DataFrame with 28458 rows and 4 columns
                 group_id patient_id sample_id      cluster_id
                 <factor>   <factor>  <factor>        <factor>
AAACATACAATGCC-1     ctrl       107   ctrl107  CD4 T cells    
AAACATACATTTCC-1     ctrl       1016  ctrl1016 CD14+ Monocytes
AAACATACCAGAAA-1     ctrl       1256  ctrl1256 CD14+ Monocytes
AAACATACCAGCTA-1     ctrl       1256  ctrl1256 CD14+ Monocytes
AAACATACCATGCA-1     ctrl       1488  ctrl1488 CD4 T cells    
...                   ...        ...       ...             ...
TTTGCATGCTAAGC-1     stim       107   stim107      CD4 T cells
TTTGCATGGGACGA-1     stim       1488  stim1488     CD4 T cells
TTTGCATGGTGAGG-1     stim       1488  stim1488     CD4 T cells
TTTGCATGGTTTGG-1     stim       1244  stim1244     CD4 T cells
TTTGCATGTCTTAC-1     stim       1016  stim1016     CD4 T cells

In [6]:
as.data.frame(colData(obj)) %>% 
    group_by(group_id, patient_id, .drop=FALSE) %>% 
    summarise(n=n()) %>% 
    pivot_wider(names_from=patient_id, values_from=n)

[1m[22m`summarise()` has grouped output by 'group_id'. You can override using the `.groups` argument.


group_id,101,107,1015,1016,1039,1244,1256,1488
<fct>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
ctrl,1022,641,3131,2173,476,2241,2439,2362
stim,1321,619,2631,1938,689,1717,2217,2841


In [53]:
# adopted from the authors
target_gene_overexpression <- function(obj){
    
    # extract information
    cell_types <- unique(colData(obj)$cluster_id)
    result.list <- list()
    for (cell_type in cell_types){
        obj.ct <- obj[,colData(obj)$cluster_id == cell_type]
        cnt <- counts(obj.ct)
        col.data <- as.data.frame(colData(obj.ct))[,c('group_id', 'patient_id', 'cluster_id')]
        colnames(col.data) <- c('tx_cell', 'donor_id', 'cell_type')
        col.data$tx_cell <- ifelse(col.data$tx_cell == 'ctrl', 0, 1)

        col.data <- col.data %>% arrange(donor_id)
        col.data$id <- col.data$donor_id
        cnt <- cnt[,rownames(col.data)]
        cnt <- cnt[rowMeans(cnt) > 0.01,]

        func.list <- list(nebula.mult, glmgp.mult, edger.mult, limma.mult, glmgp.cell.mult, fixest.mult)
        #func.list <- list(glmgp.mult)
        data.list <- list(cnt, cnt, cnt, cnt, cnt, t(cnt))
        form.list <- list(
            as.formula('~tx_cell'),
            as.formula('~tx_cell'),
            as.formula('~tx_cell'),
            as.formula('~tx_cell'),
            as.formula('~tx_cell'),
            as.formula('y~tx_cell | id') # 
            )

        result.list.ct <- list()
        for (i in 1:length(func.list)){
            df <- as.data.frame(func.list[[i]](
                form.list[[i]],
                data.list[[i]],
                col.data
                ))
            df$gene <- rownames(df)
            result.list.ct[[i]] <- df
            }
        names(result.list.ct) <- c('NB GLMM', 'glmGamPoi (Pb)', 'edgeR (Pb)', 'limma (Pb)', 'glmGamPoi (cell)', 'robust GLM (cell)')
        #names(result.list.ct) <- c('glmGamPoi (Pb)')
        result.list[[cell_type]] <- rbindlist(result.list.ct, idcol='method')
    }
    
    return(rbindlist(result.list, idcol='celltype'))
}

In [None]:
x <- target_gene_overexpression(obj)
saveRDS(x, 'kang.result.rds')

Remove  0  genes having low expression.
Analyzing  5738  genes with  8  subjects and  11848  cells.


NOTE: 1 fixed-effect (770 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (420 observations) removed because of only 0 outcomes.

NOTE: 3 fixed-effects (3,483 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (520 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (420 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (770 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (420 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (520 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (420 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (420 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (420 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (420 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (420 observations) removed because of only 0 outcome

Remove  0  genes having low expression.
Analyzing  6147  genes with  8  subjects and  6295  cells.


NOTE: 1 fixed-effect (458 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (458 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (315 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (315 observations) removed because of only 0 outcomes.

NOTE: 2 fixed-effects (1,166 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (458 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (315 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (315 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (315 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (315 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (742 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (458 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (315 observations) removed because of only 0 outcome

Remove  0  genes having low expression.
Analyzing  7439  genes with  8  subjects and  455  cells.


NOTE: 6 fixed-effects (307 observations) removed because of only 0 outcomes.

NOTE: 2 fixed-effects (63 observations) removed because of only 0 outcomes.

NOTE: 4 fixed-effects (225 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (23 observations) removed because of only 0 outcomes.

NOTE: 2 fixed-effects (50 observations) removed because of only 0 outcomes.

NOTE: 3 fixed-effects (86 observations) removed because of only 0 outcomes.

NOTE: 3 fixed-effects (175 observations) removed because of only 0 outcomes.

NOTE: 5 fixed-effects (268 observations) removed because of only 0 outcomes.

NOTE: 3 fixed-effects (148 observations) removed because of only 0 outcomes.

NOTE: 4 fixed-effects (147 observations) removed because of only 0 outcomes.

NOTE: 4 fixed-effects (145 observations) removed because of only 0 outcomes.

NOTE: 3 fixed-effects (111 observations) removed because of only 0 outcomes.

NOTE: 3 fixed-effects (132 observations) removed because of only 0 ou

Remove  0  genes having low expression.
Analyzing  5863  genes with  8  subjects and  2289  cells.


NOTE: 1 fixed-effect (99 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (54 observations) removed because of only 0 outcomes.

NOTE: 2 fixed-effects (302 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (54 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (54 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (54 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (54 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (54 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (99 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (54 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (99 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (54 observations) removed because of only 0 outcomes.

NOTE: 1 fixed-effect (99 observations) removed because of only 0 outcomes.

NOTE: 1 fi

In [4]:
result <- readRDS('kang.result.rds')

In [4]:
head(result)

celltype,method,Estimate,Std. Error,t value,Pr(>|t|),gene
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
CD4 T cells,NB GLMM,0.05662706,0.06440173,0.8792787,0.3792502,NOC2L
CD4 T cells,NB GLMM,1.30265284,0.14964196,8.7051307,3.172091e-18,HES4
CD4 T cells,NB GLMM,3.43307187,0.02901797,118.3084857,0.0,ISG15
CD4 T cells,NB GLMM,-0.06544499,0.1040499,-0.628977,0.5293641,TNFRSF18
CD4 T cells,NB GLMM,0.17667338,0.07957735,2.2201466,0.02640882,TNFRSF4
CD4 T cells,NB GLMM,-0.18788198,0.07383974,-2.5444563,0.0109448,SDF4


In [21]:
df.hm <- result %>%
    mutate(gene_celltype = paste(gene, celltype, sep='_')) %>%
    select(gene_celltype, method, `Pr(>|t|)`) %>%
    pivot_wider(names_from=method, values_from=`Pr(>|t|)`)
df.hm <- as.data.frame(df.hm[,2:ncol(df.hm)])
df.hm[!is.finite(df.hm[,1]),] <- 1
mat <- as.matrix(df.hm)
pval <- 0.05/6257
mat <- (mat < pval)
cor.mat <- round((t(mat) %*% mat)/nrow(mat) * 100,2)
name.ord <- c('glmGamPoi (Pb)', 'edgeR (Pb)', 'limma (Pb)', 'glmGamPoi (cell)', 'fixest (cell)', 'NB GLMM')
write.csv(cor.mat[name.ord, name.ord], 'kang.pow.csv')
cor.mat[name.ord, name.ord]


Unnamed: 0,glmGamPoi (Pb),edgeR (Pb),limma (Pb),glmGamPoi (cell),fixest (cell),NB GLMM
glmGamPoi (Pb),6.54,5.34,6.25,6.53,6.47,6.45
edgeR (Pb),5.34,5.55,5.32,5.49,5.42,5.38
limma (Pb),6.25,5.32,6.76,6.72,6.63,6.62
glmGamPoi (cell),6.53,5.49,6.72,12.32,11.23,11.56
fixest (cell),6.47,5.42,6.63,11.23,12.55,11.15
NB GLMM,6.45,5.38,6.62,11.56,11.15,11.68


In [24]:
df.hm <- result %>%
    mutate(gene_celltype = paste(gene, celltype, sep='_')) %>%
    select(gene_celltype, method, `Pr(>|t|)`) %>%
    pivot_wider(names_from=method, values_from=`Pr(>|t|)`)
df.hm <- as.data.frame(df.hm[,2:ncol(df.hm)])
df.hm[!is.finite(df.hm[,1]),] <- 1
mat <- as.matrix(df.hm)
cor.mat <- cor(mat, method = c("spearman"))
name.ord <- c('glmGamPoi (Pb)', 'edgeR (Pb)', 'limma (Pb)', 'glmGamPoi (cell)', 'fixest (cell)', 'NB GLMM')
write.csv(cor.mat[name.ord, name.ord], 'kang.sp.csv')
cor.mat[name.ord, name.ord]



Unnamed: 0,glmGamPoi (Pb),edgeR (Pb),limma (Pb),glmGamPoi (cell),fixest (cell),NB GLMM
glmGamPoi (Pb),1.0,0.9729917,0.9496521,0.903343,0.8848327,0.8608584
edgeR (Pb),0.9729917,1.0,0.9118087,0.8792684,0.8672583,0.8265981
limma (Pb),0.9496521,0.9118087,1.0,0.8571769,0.8319463,0.8378374
glmGamPoi (cell),0.903343,0.8792684,0.8571769,1.0,0.97816,0.9592919
fixest (cell),0.8848327,0.8672583,0.8319463,0.97816,1.0,0.9159255
NB GLMM,0.8608584,0.8265981,0.8378374,0.9592919,0.9159255,1.0
