# B cells: process reference for integration with MERFISH

In [None]:
require(Seurat)
require(tidyverse)
require(readxl)
require(patchwork)
require(sf)
require(ggpubr)
require(ggthemes)
require(harmony)
require(presto)
require(ComplexHeatmap)
require(circlize)
require(glue)
require(e1071) 
require(caTools) 
require(class) 
require(gghighlight)
options(repr.matrix.max.cols=100, repr.matrix.max.rows=100)
set.seed(1)

# load processed scRNA reference dataset

In [None]:
# Read the processed scRNA-seq reference object from disk.
completeReference <- readr::read_rds(
    '/n/scratch/users/m/mup728/mup728/Cell_Typing_CRC_MERFISH/Pelka_reference_cleaning/pelka_dataset_with_merfish_genes.rds'
)


In [None]:
# Label every cell as scRNA and copy batchID into biosample_id, the
# per-sample key used by the rest of the notebook.
completeReference@meta.data[['technology']] <- 'scRNA'
completeReference@meta.data[['biosample_id']] <- completeReference@meta.data$batchID

In [None]:
# Keep the cell names as a metadata column and count the distinct names.
completeReference@meta.data[['combined_cell_names']] <- colnames(completeReference)
length(unique(completeReference@meta.data$combined_cell_names))

In [None]:
# Distribution of total counts and detected features per cell.
summary(completeReference@meta.data$nCount_RNA)
summary(completeReference@meta.data$nFeature_RNA)

In [None]:
unique(completeReference@meta.data$orig.ident)

## select B cells for finetyping

In [None]:
# Take the metadata rows whose ClusterTop is 'B' and build a new Seurat object
# from the raw counts of exactly those cells, carrying the metadata along.
temp <- dplyr::filter(completeReference@meta.data, ClusterTop == 'B')
rownames(temp) <- temp$combined_cell_names
scRNA_BCells <- CreateSeuratObject(
    GetAssayData(completeReference, 'counts')[, as.vector(temp$combined_cell_names)],
    meta.data = temp
)
scRNA_BCells
rm(temp)

In [None]:
# Number of distinct batches represented among the B cells.
length(unique(scRNA_BCells@meta.data$batchID))

In [None]:
# Cells per sample, split by Pelka fine type.
as.matrix(table(scRNA_BCells@meta.data$biosample_id, scRNA_BCells@meta.data$ClusterFull))

In [None]:
# Cells per sample.
as.matrix(table(scRNA_BCells@meta.data$biosample_id))

In [None]:
# Histogram and numeric summary of the per-sample B-cell counts.
options(repr.plot.width = 5, repr.plot.height = 5)
ggplot(as.data.frame(table(scRNA_BCells@meta.data$biosample_id))) +
    geom_histogram(aes(x = Freq))
summary(as.data.frame(table(scRNA_BCells@meta.data$biosample_id))$Freq)

#### filter out donors with <= 100 B cells (keep donors with > 100)

In [None]:
# Samples contributing more than 100 B cells (character vector of sample IDs).
donorsToRetain <- names(which(table(scRNA_BCells@meta.data$biosample_id) > 100))
# Samples before vs. after the filter.
length(unique(scRNA_BCells@meta.data$biosample_id))
length(donorsToRetain)

In [None]:
# Drop every cell from a sample that did not pass the filter.
scRNA_BCells <- subset(
    scRNA_BCells,
    subset = biosample_id %in% donorsToRetain
)
scRNA_BCells

In [None]:
# QC scatter of detected features vs. total counts per cell on log10 axes,
# one panel per Pelka fine type. Every type is mapped to red so that the
# highlighted points look the same in each panel. The blue guides sit at
# nFeature_RNA = 15 and nCount_RNA = 50.
options(repr.plot.width = 20, repr.plot.height = 10)
constantPalette <- setNames(
    rep('red', length(unique(scRNA_BCells@meta.data$ClusterFull))),
    unique(scRNA_BCells@meta.data$ClusterFull)
)
ggplot(data = scRNA_BCells@meta.data) +
    geom_point(
        mapping = aes(x = nFeature_RNA, y = nCount_RNA, colour = ClusterFull),
        shape = '.',
        alpha = 0.5
    ) +
    geom_vline(xintercept = 15, colour = 'blue') +
    geom_hline(yintercept = 50, colour = 'blue') +
    scale_x_continuous(trans = 'log10') +
    scale_y_continuous(trans = 'log10') +
    facet_wrap(~ClusterFull) +
    gghighlight() +
    scale_colour_manual(values = constantPalette) +
    theme_minimal(base_size = 30) +
    theme(legend.position = "none")

## scale/normalize/pca

In [None]:
# Scale factor for LogNormalize: the median nCount_RNA is taken within each
# technology and those medians are then averaged.
# NOTE(review): this is the MEAN of the per-technology medians, not a median
# of medians. The value is left as computed because later cells hard-code
# cluster numbers that depend on it -- confirm which statistic was intended.
normFactor = scRNA_BCells@meta.data %>%
    select(technology, nCount_RNA) %>%
    group_by(technology) %>%
    summarize(medianCounts = median(nCount_RNA))
normFactor = mean(normFactor$medianCounts) # mean of the per-technology medians
normFactor
# Log-normalise, scale every gene, then run a 15-component PCA weighted by
# biosample_id.
scRNA_BCells = NormalizeData(scRNA_BCells,
                          normalization.method = "LogNormalize",
                          scale.factor = normFactor) %>%
    ScaleData(features = rownames(scRNA_BCells)) %>%
    singlecellmethods::RunBalancedPCA(weight.by='biosample_id', npcs=15)
scRNA_BCells

### in scrna, are the b subtypes distinguishable?

#### qualitative look - umap

In [None]:
# UMAP on the first 15 PCs. ret_extra = 'fgraph' also returns the fuzzy
# neighbour graph, which is stored on the object below.
U = uwot::umap(scRNA_BCells@reductions$pca@cell.embeddings[, 1:15],
               min_dist = 0.05,
               spread = 0.30,
               ret_extra = 'fgraph',
               # Namespace-qualified: the setup cell does not attach `future`,
               # so a bare nbrOfWorkers() is not guaranteed to be found here.
               n_sgd_threads = future::nbrOfWorkers(),
               fast_sgd = TRUE)
# Name the columns after the reduction key so they are PCAUMAP_1 / PCAUMAP_2,
# the names the plotting cells refer to.
colnames(U$embedding) = paste0('PCAUMAP_', 1:2)
rownames(U$fgraph) = colnames(U$fgraph) = Cells(scRNA_BCells)
scRNA_BCells[['pcaumap']] = Seurat::CreateDimReducObject(
    embeddings = U$embedding,
    assay = 'RNA',
    key = 'PCAUMAP_',
    global = TRUE
)
# Store the fuzzy graph as a Seurat Graph tied to the default assay.
new_graph = Seurat::as.Graph(U$fgraph)
DefaultAssay(new_graph) = DefaultAssay(scRNA_BCells)
scRNA_BCells[['pcaumap_fgraph']] = new_graph

In [None]:
# Attach cell-level annotations to the pre-harmony UMAP coordinates, keyed on
# the cell name.
temp <- as.data.frame(Embeddings(scRNA_BCells, 'pcaumap'))
temp[['combined_cell_names']] <- rownames(temp)
pcaumap_embeddings <- scRNA_BCells@meta.data %>%
    select(combined_cell_names, ClusterTop, ClusterFull, biosample_id) %>%
    right_join(temp)
head(pcaumap_embeddings)
tail(pcaumap_embeddings)

In [None]:
# Pre-harmony UMAP coloured by sample (legend suppressed).
options(repr.plot.width = 5, repr.plot.height = 5)
ggplot(data = pcaumap_embeddings,
       mapping = aes(x = PCAUMAP_1, y = PCAUMAP_2, colour = biosample_id)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Pre-harmony UMAP') +
    guides(colour = guide_legend(override.aes = list(size = 10, shape = 16))) +
    ggpubr::theme_pubr(base_size = 10) +
    theme(legend.position = 'none')

In [None]:
# Same embedding, one panel per sample; every sample is mapped to red.
options(repr.plot.width = 20, repr.plot.height = 20)
constantPalette <- setNames(
    rep('red', length(unique(pcaumap_embeddings$biosample_id))),
    unique(pcaumap_embeddings$biosample_id)
)
ggplot(data = pcaumap_embeddings,
       mapping = aes(x = PCAUMAP_1, y = PCAUMAP_2, colour = biosample_id)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Pre-harmony UMAP') +
    scale_colour_manual(values = constantPalette) +
    ggpubr::theme_pubr(base_size = 10) +
    theme(legend.position = 'none') +
    facet_wrap(~biosample_id) +
    gghighlight::gghighlight()

In [None]:
# Pre-harmony UMAP coloured by Pelka fine type.
options(repr.plot.width = 5, repr.plot.height = 5)
ggplot(data = pcaumap_embeddings,
       mapping = aes(x = PCAUMAP_1, y = PCAUMAP_2, colour = ClusterFull)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Pre-harmony UMAP') +
    ggthemes::scale_color_tableau('Tableau 20', name = "") +
    guides(colour = guide_legend(override.aes = list(size = 10, shape = 16))) +
    ggpubr::theme_pubr(base_size = 10) +
    theme(legend.position = 'top')

In [None]:
# Same colouring, one panel per fine type.
options(repr.plot.width = 5, repr.plot.height = 5)
ggplot(data = pcaumap_embeddings,
       mapping = aes(x = PCAUMAP_1, y = PCAUMAP_2, colour = ClusterFull)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Pre-harmony UMAP') +
    ggthemes::scale_color_tableau('Tableau 20', name = "") +
    guides(colour = guide_legend(override.aes = list(size = 10, shape = 16))) +
    ggpubr::theme_pubr(base_size = 10) +
    theme(legend.position = 'top') +
    facet_wrap(~ClusterFull) +
    gghighlight::gghighlight()

### harmonize over donor

In [None]:
# Integrate over samples with Harmony: always run the full 10 iterations
# (early stopping disabled) and plot the convergence trace.
options(repr.plot.width = 5, repr.plot.height = 5)
scRNA_BCells <- harmony::RunHarmony(
    scRNA_BCells,
    "biosample_id",
    plot_convergence = TRUE,
    lambda = NULL,
    max.iter = 10,
    early_stop = FALSE,
    sigma = 0.2
)

### post harmony umap

In [None]:
# UMAP on the first 15 harmony dimensions. ret_extra = 'fgraph' also returns
# the fuzzy neighbour graph that the clustering cells use.
U = uwot::umap(scRNA_BCells@reductions$harmony@cell.embeddings[, 1:15],
               min_dist = 0.05,
               spread = 0.30,
               ret_extra = 'fgraph',
               # Namespace-qualified: the setup cell does not attach `future`,
               # so a bare nbrOfWorkers() is not guaranteed to be found here.
               n_sgd_threads = future::nbrOfWorkers(),
               fast_sgd = TRUE)
# Name the columns after the reduction key so they are HUMAP_1 / HUMAP_2,
# the names the plotting cells refer to.
colnames(U$embedding) = paste0('HUMAP_', 1:2)
rownames(U$fgraph) = colnames(U$fgraph) = Cells(scRNA_BCells)
scRNA_BCells[['humap']] = Seurat::CreateDimReducObject(
    embeddings = U$embedding,
    assay = 'RNA',
    key = 'HUMAP_',
    global = TRUE
)
# Store the fuzzy graph as a Seurat Graph tied to the default assay.
new_graph = Seurat::as.Graph(U$fgraph)
DefaultAssay(new_graph) = DefaultAssay(scRNA_BCells)
scRNA_BCells[['humap_fgraph']] = new_graph

In [None]:
# Attach cell-level annotations to the post-harmony UMAP coordinates, keyed
# on the cell name.
temp <- as.data.frame(Embeddings(scRNA_BCells, 'humap'))
temp[['combined_cell_names']] <- rownames(temp)
humap_embeddings <- scRNA_BCells@meta.data %>%
    select(combined_cell_names, ClusterTop, ClusterFull, biosample_id) %>%
    right_join(temp)
head(humap_embeddings)
tail(humap_embeddings)

In [None]:
# Post-harmony UMAP coloured by sample (legend suppressed).
options(repr.plot.width = 5, repr.plot.height = 5)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = biosample_id)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Post-harmony UMAP') +
    ggpubr::theme_pubr(base_size = 10) +
    theme(legend.position = 'none')

In [None]:
# One panel per sample; every sample is mapped to red.
options(repr.plot.width = 20, repr.plot.height = 20)
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$biosample_id))),
    unique(humap_embeddings$biosample_id)
)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = biosample_id)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Post-harmony UMAP') +
    ggpubr::theme_pubr(base_size = 10) +
    theme(legend.position = 'none') +
    scale_colour_manual(values = constantPalette) +
    facet_wrap(~biosample_id) +
    gghighlight::gghighlight()

In [None]:
# Post-harmony UMAP coloured by Pelka fine type.
options(repr.plot.width = 5, repr.plot.height = 5)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = ClusterFull)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Post-harmony UMAP') +
    ggthemes::scale_color_tableau('Tableau 20', name = "") +
    guides(colour = guide_legend(override.aes = list(size = 10, shape = 16, alpha = 1))) +
    ggpubr::theme_pubr(base_size = 10) +
    theme(legend.position = 'top')

## DE genes for clusterfull b cells

In [None]:
# Wilcoxon/AUC markers for each Pelka fine type (groups come from the active
# identities), keeping genes with adjusted p <= 0.05 and AUC >= 0.55.
Idents(scRNA_BCells) = 'ClusterFull'
presto::top_markers(presto::wilcoxauc(scRNA_BCells),
                    n = Inf,
                    # spelled out in full: `padj` only reached this argument
                    # through partial argument matching
                    padj_max = 0.05,
                    auc_min = 0.55)

## Cluster at a few resolutions to remove non B cells

In [None]:
# Cluster on the post-harmony UMAP graph at resolutions 0.1, 0.2, ..., 1.5;
# each resolution adds its own metadata column.
set.seed(1)
resolutions_test <- seq(from = 0.1, to = 1.5, by = 0.1)
resolutions_test
scRNA_BCells <- Seurat::FindClusters(
    object = scRNA_BCells,
    graph.name = 'humap_fgraph',
    resolution = resolutions_test,
    verbose = TRUE
)
scRNA_BCells

### stash the results that you used to filter out non B cells

In [None]:
# Keep the resolution-1.5 labels under a stable name.
scRNA_BCells@meta.data[['Louvain_for_filtering']] <- droplevels(scRNA_BCells@meta.data$humap_fgraph_res.1.5)
scRNA_BCells@meta.data$Louvain_for_filtering
scRNA_BCells

## Cache

In [None]:
# Checkpoint: this file is re-read before the non-B clusters are removed.
readr::write_rds(scRNA_BCells, 'Bcells_fineTyping_all_genes.rds')

## DE with wilcox to figure out which of the ~30 clusters are non B cells

In [None]:
# Wilcoxon/AUC markers for each over-clustered group, keeping genes with
# adjusted p <= 0.05 and AUC >= 0.6.
Idents(scRNA_BCells) = 'Louvain_for_filtering'
presto::top_markers(presto::wilcoxauc(scRNA_BCells),
                    n = Inf,
                    # spelled out in full: `padj` only reached this argument
                    # through partial argument matching
                    padj_max = 0.05,
                    auc_min = 0.6)

In [None]:
# UMAP coordinates joined to the filtering clusters, keyed on the cell name.
humap_embeddings <- as.data.frame(Embeddings(scRNA_BCells, 'humap'))
humap_embeddings$combined_cell_names <- rownames(humap_embeddings)
humap_embeddings <- left_join(
    humap_embeddings,
    select(scRNA_BCells@meta.data, combined_cell_names, Louvain_for_filtering, biosample_id)
)
rbind(head(humap_embeddings), tail(humap_embeddings))

In [None]:
# One panel per filtering cluster; every cluster is mapped to red.
options(repr.plot.width = 20, repr.plot.height = 20)
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$Louvain_for_filtering))),
    unique(humap_embeddings$Louvain_for_filtering)
)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = Louvain_for_filtering)) +
    geom_point(shape = '.', alpha = 1) +
    labs(title = 'For filtering non B cells:\nOver-clustering of B cells UMAP') +
    ggpubr::theme_pubr(base_size = 20) +
    theme(legend.position = 'none') +
    scale_colour_manual(values = constantPalette) +
    facet_wrap(~Louvain_for_filtering) +
    gghighlight::gghighlight()

### Remove cluster 12 - these are T cells - and cluster 15 - distinguished only by heatshock proteins

In [None]:
# Start again from the cached object so this cell can be re-run safely.
scRNA_BCells <- readr::read_rds('Bcells_fineTyping_all_genes.rds')

In [None]:
# invert = TRUE keeps every cell that is NOT in filtering cluster 12 or 15.
scRNA_BCells <- subset(
    scRNA_BCells,
    subset = Louvain_for_filtering %in% c(12, 15),
    invert = TRUE
)
scRNA_BCells

## Redo scale/normalize/PCA/harmony/UMAP after removing the non-B cells

In [None]:
# Scale factor for LogNormalize: the median nCount_RNA is taken within each
# sample and those medians are then averaged.
# NOTE(review): this is the MEAN of the per-sample medians, not a median of
# medians. The value is left as computed because later steps depend on the
# resulting clustering -- confirm which statistic was intended.
normFactor = scRNA_BCells@meta.data %>%
    select(biosample_id, nCount_RNA) %>%
    group_by(biosample_id) %>%
    summarize(medianCounts = median(nCount_RNA))
normFactor = mean(normFactor$medianCounts) # mean of the per-sample medians
normFactor
# Log-normalise, scale every gene, then run a 15-component PCA weighted by
# biosample_id.
scRNA_BCells = NormalizeData(scRNA_BCells,
                          normalization.method = "LogNormalize",
                          scale.factor = normFactor) %>%
    ScaleData(features = rownames(scRNA_BCells)) %>%
    singlecellmethods::RunBalancedPCA(weight.by='biosample_id', npcs=15)
scRNA_BCells

In [None]:
options(repr.plot.width = 5, repr.plot.height = 5)
scRNA_BCells = scRNA_BCells %>% harmony::RunHarmony("biosample_id", 
    plot_convergence = TRUE, 
    lambda = NULL,
    max.iter = 10,
    early_stop = F,
    sigma = 0.2
) 

In [None]:
# UMAP on the first 15 harmony dimensions of the cleaned cells.
# ret_extra = 'fgraph' also returns the fuzzy neighbour graph that the
# clustering cells use.
U = uwot::umap(scRNA_BCells@reductions$harmony@cell.embeddings[, 1:15],
               min_dist = 0.05,
               spread = 0.30,
               ret_extra = 'fgraph',
               # Namespace-qualified: the setup cell does not attach `future`,
               # so a bare nbrOfWorkers() is not guaranteed to be found here.
               n_sgd_threads = future::nbrOfWorkers(),
               fast_sgd = TRUE)
# Name the columns after the reduction key so they are HUMAP_1 / HUMAP_2,
# the names the plotting cells refer to.
colnames(U$embedding) = paste0('HUMAP_', 1:2)
rownames(U$fgraph) = colnames(U$fgraph) = Cells(scRNA_BCells)
scRNA_BCells[['humap']] = Seurat::CreateDimReducObject(
    embeddings = U$embedding,
    assay = 'RNA',
    key = 'HUMAP_',
    global = TRUE
)
# Store the fuzzy graph as a Seurat Graph tied to the default assay.
new_graph = Seurat::as.Graph(U$fgraph)
DefaultAssay(new_graph) = DefaultAssay(scRNA_BCells)
scRNA_BCells[['humap_fgraph']] = new_graph

In [None]:
# UMAP coordinates joined to fine type and sample, keyed on the cell name.
humap_embeddings <- as.data.frame(Embeddings(scRNA_BCells, 'humap'))
humap_embeddings$combined_cell_names <- rownames(humap_embeddings)
humap_embeddings <- left_join(
    humap_embeddings,
    select(scRNA_BCells@meta.data, combined_cell_names, ClusterFull, biosample_id)
)
rbind(head(humap_embeddings), tail(humap_embeddings))

In [None]:
# One panel per sample; every sample is mapped to red.
options(repr.plot.width = 20, repr.plot.height = 20)
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$biosample_id))),
    unique(humap_embeddings$biosample_id)
)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = biosample_id)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Post-harmony UMAP') +
    ggpubr::theme_pubr(base_size = 10) +
    theme(legend.position = 'none') +
    scale_colour_manual(values = constantPalette) +
    facet_wrap(~biosample_id) +
    gghighlight::gghighlight()

In [None]:
# One panel per Pelka fine type; every type is mapped to red.
options(repr.plot.width = 10, repr.plot.height = 7)
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$ClusterFull))),
    unique(humap_embeddings$ClusterFull)
)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = ClusterFull)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Post-harmony UMAP') +
    ggpubr::theme_pubr(base_size = 10) +
    theme(legend.position = 'none') +
    scale_colour_manual(values = constantPalette) +
    facet_wrap(~ClusterFull) +
    gghighlight::gghighlight()

## one more round of clustering to get clusters that will map back to fine types

In [None]:
# Re-cluster the cleaned cells on the new UMAP graph at resolutions
# 0.1, 0.2, ..., 1.5.
set.seed(1)
resolutions_test <- seq(from = 0.1, to = 1.5, by = 0.1)
scRNA_BCells <- Seurat::FindClusters(
    object = scRNA_BCells,
    graph.name = 'humap_fgraph',
    resolution = resolutions_test,
    verbose = TRUE
)
scRNA_BCells

## stash clusters used for mapping fine cell types to clean louvain clusters

In [None]:
levels(scRNA_BCells@meta.data$humap_fgraph_res.0.5)

In [None]:
# Keep the resolution-0.5 labels under a stable name.
scRNA_BCells@meta.data[['Louvain_for_fine_types']] <- droplevels(scRNA_BCells@meta.data$humap_fgraph_res.0.5)
scRNA_BCells@meta.data$Louvain_for_fine_types

## concordance of new clusters with pelka fine types

In [None]:
# Concordance between Pelka fine types (rows) and the new clusters (columns):
# counts are converted to within-fine-type proportions (prop.table over rows)
# and each column is then centred and scaled by scale().
options(repr.plot.width = 15, repr.plot.height = 6)
oldClustersInNew3 <- scale(
    prop.table(
        t(table(scRNA_BCells@meta.data$Louvain_for_fine_types,
                scRNA_BCells@meta.data$ClusterFull)),
        1
    )
)
# Order rows and columns by Ward hierarchical clustering on Euclidean distance.
colOrder <- hclust(dist(t(oldClustersInNew3), method = "euclidean"), method = "ward.D")$order
rowOrder <- hclust(dist(oldClustersInNew3, method = "euclidean"), method = "ward.D")$order
as.data.frame(oldClustersInNew3[rowOrder, colOrder]) %>%
    rename(`Fine Type` = Var1,
           `Louvain Cluster` = Var2,
           `Scaled\nProportion` = Freq) %>%
    ggplot(mapping = aes(x = `Louvain Cluster`, y = `Fine Type`)) +
    geom_tile(mapping = aes(fill = `Scaled\nProportion`), colour = 'black', alpha = 1) +
    geom_label(mapping = aes(label = round(`Scaled\nProportion`, 1)), colour = 'red') +
    scale_fill_viridis_c(direction = -1) +
    theme_minimal(base_size = 16) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    labs(title = 'Louvain to map to fine types - step 1\nIdentify mixed clusters')

## Select clusters for sub-clustering

4

### subcluster 0 and 4

In [None]:
# Sub-cluster clusters 0 and 4 in turn. After each pass the new sub-cluster
# column becomes the active identity, so the second pass starts from the
# first pass's labels and sub.cluster_4 carries both splits.
Idents(scRNA_BCells) <- 'Louvain_for_fine_types'
for (cluster in c('0', '4')) {
    scRNA_BCells <- FindSubCluster(
        scRNA_BCells,
        cluster,
        graph.name = 'humap_fgraph',
        subcluster.name = paste0("sub.cluster_", cluster),
        resolution = 0.5,
        algorithm = 1
    )
    scRNA_BCells <- SetIdent(scRNA_BCells, value = paste0("sub.cluster_", cluster))
}

## stash subclustered Louvain clusters

In [None]:
# Store the final sub-cluster labels (from the last pass, sub.cluster_4) as a
# factor and make them the active identity.
scRNA_BCells@meta.data[['subclustered_Louvain_for_fine_types']] <- droplevels(
    as.factor(scRNA_BCells@meta.data$sub.cluster_4)
)
scRNA_BCells@meta.data$subclustered_Louvain_for_fine_types
scRNA_BCells <- SetIdent(scRNA_BCells, value = 'subclustered_Louvain_for_fine_types')

## plot concordance

In [None]:
# Concordance between Pelka fine types (rows) and the sub-clustered labels
# (columns): counts are converted to within-fine-type proportions and each
# column is then centred and scaled by scale().
options(repr.plot.width = 20, repr.plot.height = 6)
oldClustersInNew3 <- scale(
    prop.table(
        t(table(scRNA_BCells@meta.data$subclustered_Louvain_for_fine_types,
                scRNA_BCells@meta.data$ClusterFull)),
        1
    )
)
# Order rows and columns by Ward hierarchical clustering on Euclidean distance.
colOrder <- hclust(dist(t(oldClustersInNew3), method = "euclidean"), method = "ward.D")$order
rowOrder <- hclust(dist(oldClustersInNew3, method = "euclidean"), method = "ward.D")$order
as.data.frame(oldClustersInNew3[rowOrder, colOrder]) %>%
    rename(`Fine Type` = Var1,
           `Louvain Cluster` = Var2,
           `Scaled\nProportion` = Freq) %>%
    ggplot(mapping = aes(x = `Louvain Cluster`, y = `Fine Type`)) +
    geom_tile(mapping = aes(fill = `Scaled\nProportion`), colour = 'black', alpha = 1) +
    geom_label(mapping = aes(label = round(`Scaled\nProportion`, 1)), colour = 'red') +
    scale_fill_viridis_c(direction = -1) +
    theme_minimal(base_size = 16) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    labs(title = 'Louvain to map to fine types - step 2\nMap subclustered Louvain clusters back to Pelka fine types')

### plot umap of louvain subclusters

In [None]:
# UMAP coordinates joined to the sub-clustered labels, one panel per label;
# every label is mapped to red.
options(repr.plot.width = 20, repr.plot.height = 20)
humap_embeddings <- as.data.frame(Embeddings(scRNA_BCells, 'humap'))
humap_embeddings$combined_cell_names <- rownames(humap_embeddings)
humap_embeddings <- left_join(
    humap_embeddings,
    select(scRNA_BCells@meta.data, combined_cell_names, subclustered_Louvain_for_fine_types)
)
rbind(head(humap_embeddings), tail(humap_embeddings))
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$subclustered_Louvain_for_fine_types))),
    unique(humap_embeddings$subclustered_Louvain_for_fine_types)
)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = subclustered_Louvain_for_fine_types)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Post-harmony UMAP') +
    ggpubr::theme_pubr(base_size = 20) +
    theme(legend.position = 'none') +
    scale_colour_manual(values = constantPalette) +
    facet_wrap(~subclustered_Louvain_for_fine_types) +
    gghighlight::gghighlight()

### remove ambiguous clusters

0_0, 4_0 and 4_2
NOTE: I tried to subcluster these and wasn't successful, so I'm going to just remove these cells

In [None]:
scRNA_BCells@meta.data$subclustered_Louvain_for_fine_types

In [None]:
# invert = TRUE keeps every cell that is NOT in sub-cluster 0_0, 4_0 or 4_2;
# the now-empty factor levels are dropped afterwards.
scRNA_BCells <- subset(
    scRNA_BCells,
    subset = subclustered_Louvain_for_fine_types %in% c('0_0', '4_0', '4_2'),
    invert = TRUE
)
scRNA_BCells@meta.data[['subclustered_Louvain_for_fine_types']] <- droplevels(
    scRNA_BCells@meta.data$subclustered_Louvain_for_fine_types
)
scRNA_BCells

## Relabel and merge clusters

In [None]:
# Cells per remaining sub-clustered label.
as.matrix(table(scRNA_BCells@meta.data$subclustered_Louvain_for_fine_types))

### Here are the new labels:

In [None]:
# Fine types (rows) by sub-clustered labels (columns): within-fine-type
# proportions, then each column centred and scaled by scale().
oldClustersInNew3 <- scale(
    prop.table(
        t(table(scRNA_BCells@meta.data$subclustered_Louvain_for_fine_types,
                scRNA_BCells@meta.data$ClusterFull)),
        1
    )
)
oldClustersInNew3

In [None]:
# For every sub-clustered label (column) pick the fine type (row) with the
# largest scaled proportion. The result holds the cluster labels as values
# and the chosen fine types as names, i.e. the new = old pairs that
# fct_recode() expects.
mergedClusters <- local({
    best_row <- apply(oldClustersInNew3, 2, which.max)
    setNames(names(best_row), rownames(oldClustersInNew3)[best_row])
})
mergedClusters

In [None]:
# Rename each sub-clustered label to its best-matching fine type (labels that
# map to the same fine type are merged) and make that the active identity.
scRNA_BCells@meta.data[['cleaned_fine_types']] <- droplevels(
    fct_recode(scRNA_BCells@meta.data$subclustered_Louvain_for_fine_types, !!!mergedClusters)
)
scRNA_BCells <- SetIdent(scRNA_BCells, value = 'cleaned_fine_types')
table(scRNA_BCells@meta.data$cleaned_fine_types)

In [None]:
# Concordance between Pelka fine types (rows) and the sub-clustered labels
# (columns) after the ambiguous clusters were removed: within-fine-type
# proportions, then each column centred and scaled by scale().
options(repr.plot.width = 20, repr.plot.height = 6)
oldClustersInNew3 <- scale(
    prop.table(
        t(table(scRNA_BCells@meta.data$subclustered_Louvain_for_fine_types,
                scRNA_BCells@meta.data$ClusterFull)),
        1
    )
)
# Order rows and columns by Ward hierarchical clustering on Euclidean distance.
colOrder <- hclust(dist(t(oldClustersInNew3), method = "euclidean"), method = "ward.D")$order
rowOrder <- hclust(dist(oldClustersInNew3, method = "euclidean"), method = "ward.D")$order
as.data.frame(oldClustersInNew3[rowOrder, colOrder]) %>%
    rename(`Fine Type` = Var1,
           `Louvain Cluster` = Var2,
           `Scaled\nProportion` = Freq) %>%
    ggplot(mapping = aes(x = `Louvain Cluster`, y = `Fine Type`)) +
    geom_tile(mapping = aes(fill = `Scaled\nProportion`), colour = 'black', alpha = 1) +
    geom_label(mapping = aes(label = round(`Scaled\nProportion`, 1)), colour = 'red') +
    scale_fill_viridis_c(direction = -1) +
    theme_minimal(base_size = 16) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    labs(title = 'Louvain to map to fine types - step 2\nMap subclustered Louvain clusters back to Pelka fine types')

#### discard conflicts

In [None]:
# Keep only the cells whose cleaned label equals their original Pelka fine
# type. The two columns are compared as character because `==` between two
# factors with different level sets is an error; the matching cells are then
# selected by name.
scRNA_BCells = subset(
    scRNA_BCells,
    cells = rownames(scRNA_BCells@meta.data)[which(
        as.character(scRNA_BCells@meta.data$cleaned_fine_types) ==
            as.character(scRNA_BCells@meta.data$ClusterFull)
    )]
)

## Subcluster cB2 and cB3

#### need to recalculate graph

In [None]:
# Recompute the UMAP and its fuzzy graph on the first 15 harmony dimensions
# of the cells that remain, so the graph matches the current cell set.
U = uwot::umap(scRNA_BCells@reductions$harmony@cell.embeddings[, 1:15],
               min_dist = 0.05,
               spread = 0.30,
               ret_extra = 'fgraph',
               # Namespace-qualified: the setup cell does not attach `future`,
               # so a bare nbrOfWorkers() is not guaranteed to be found here.
               n_sgd_threads = future::nbrOfWorkers(),
               fast_sgd = TRUE)
# Name the columns after the reduction key so they are HUMAP_1 / HUMAP_2,
# the names the plotting cells refer to.
colnames(U$embedding) = paste0('HUMAP_', 1:2)
rownames(U$fgraph) = colnames(U$fgraph) = Cells(scRNA_BCells)
scRNA_BCells[['humap']] = Seurat::CreateDimReducObject(
    embeddings = U$embedding,
    assay = 'RNA',
    key = 'HUMAP_',
    global = TRUE
)
# Store the fuzzy graph as a Seurat Graph tied to the default assay.
new_graph = Seurat::as.Graph(U$fgraph)
DefaultAssay(new_graph) = DefaultAssay(scRNA_BCells)
scRNA_BCells[['humap_fgraph']] = new_graph

In [None]:
# Sub-cluster cB2 and then cB3. After each pass the new column becomes the
# active identity, so the column from the last pass carries both splits.
Idents(scRNA_BCells) <- 'cleaned_fine_types'
for (cluster in c('cB2 (B GC-like)', 'cB3 (B CD40+ GC-like)')) {
    # make.names() turns the label into a syntactically valid column name
    new_cluster <- make.names(paste0("sub.cluster_", cluster))
    scRNA_BCells <- FindSubCluster(
        scRNA_BCells,
        cluster,
        graph.name = 'humap_fgraph',
        subcluster.name = new_cluster,
        resolution = 0.5,
        algorithm = 1
    )
    scRNA_BCells <- SetIdent(scRNA_BCells, value = new_cluster)
}
# Keep the pre-split labels, then replace cleaned_fine_types with the column
# produced by the last pass.
scRNA_BCells@meta.data[['original_cleaned_fine_types']] <- scRNA_BCells@meta.data$cleaned_fine_types
print(new_cluster)
scRNA_BCells@meta.data[['cleaned_fine_types']] <- scRNA_BCells@meta.data[, new_cluster]

In [None]:
# UMAP split by cleaned fine type (including the new cB2/cB3 sub-clusters);
# every label is mapped to red.
options(repr.plot.width = 20, repr.plot.height = 10)
humap_embeddings <- as.data.frame(Embeddings(scRNA_BCells, 'humap'))
humap_embeddings$combined_cell_names <- rownames(humap_embeddings)
humap_embeddings <- left_join(
    humap_embeddings,
    select(scRNA_BCells@meta.data, combined_cell_names, cleaned_fine_types)
)
rbind(head(humap_embeddings), tail(humap_embeddings))
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$cleaned_fine_types))),
    unique(humap_embeddings$cleaned_fine_types)
)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = cleaned_fine_types)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Cleaned fine types') +
    ggpubr::theme_pubr(base_size = 20) +
    theme(legend.position = 'none') +
    scale_colour_manual(values = constantPalette) +
    facet_wrap(~cleaned_fine_types) +
    gghighlight::gghighlight()

In [None]:
# Same embedding split by the original Pelka labels, for comparison.
options(repr.plot.width = 20, repr.plot.height = 10)
humap_embeddings <- as.data.frame(Embeddings(scRNA_BCells, 'humap'))
humap_embeddings$combined_cell_names <- rownames(humap_embeddings)
humap_embeddings <- left_join(
    humap_embeddings,
    select(scRNA_BCells@meta.data, combined_cell_names, ClusterFull)
)
rbind(head(humap_embeddings), tail(humap_embeddings))
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$ClusterFull))),
    unique(humap_embeddings$ClusterFull)
)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = ClusterFull)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Pelka labels') +
    ggpubr::theme_pubr(base_size = 20) +
    theme(legend.position = 'none') +
    scale_colour_manual(values = constantPalette) +
    facet_wrap(~ClusterFull) +
    gghighlight::gghighlight()

In [None]:
# Markers with AUC >= 0.7 for the active identities.
top_markers(wilcoxauc(scRNA_BCells), n = Inf, auc_min = 0.7)

## Merge subclusters for cB2 and cB3

In [None]:
colnames(scRNA_BCells@meta.data)
as.data.frame(table(scRNA_BCells@meta.data$sub.cluster_cB3..B.CD40..GC.like.))

In [None]:
# Collapse the cB2/cB3 sub-clusters into fewer groups (new label = old label):
#   cB3 _3       -> cB3 _2
#   cB3 _0       -> cB3 _1
#   cB2 _2       -> cB2 _0
#   cB2 _3, _4   -> cB2 _1
scRNA_BCells@meta.data[['merged_sub.cluster_cB3..B.CD40..GC.like.']] <- fct_recode(
    scRNA_BCells@meta.data$sub.cluster_cB3..B.CD40..GC.like.,
    "cB3 (B CD40+ GC-like)_2" = "cB3 (B CD40+ GC-like)_3",
    "cB3 (B CD40+ GC-like)_1" = "cB3 (B CD40+ GC-like)_0",
    "cB2 (B GC-like)_0" = "cB2 (B GC-like)_2",
    "cB2 (B GC-like)_1" = "cB2 (B GC-like)_3",
    "cB2 (B GC-like)_1" = "cB2 (B GC-like)_4"
)
# Markers with AUC >= 0.7 for the merged groups.
scRNA_BCells <- SetIdent(object = scRNA_BCells, value = 'merged_sub.cluster_cB3..B.CD40..GC.like.')
top_markers(wilcoxauc(scRNA_BCells), n = Inf, auc_min = 0.7)

In [None]:
# Give the merged sub-cluster groups descriptive names (new label = old label).
# NOTE(review): 'MK167+' in the last label looks like a typo for the gene
# MKI67 (the FeaturePlot cell further down uses 'MKI67'); the label is left
# untouched here because it is a runtime value -- confirm before renaming.
scRNA_BCells@meta.data$merged_sub.cluster_cB3..B.CD40..GC.like. = fct_recode(scRNA_BCells@meta.data$merged_sub.cluster_cB3..B.CD40..GC.like.,                 
 "cB2 (B GC-like)_GPR183+" = "cB2 (B GC-like)_0", # group _0 (which absorbed _2 in the previous cell) becomes the GPR183+ label
"cB2 (B GC-like)" = "cB2 (B GC-like)_1", # group _1 (which absorbed _3 and _4) keeps the plain cB2 name
"cB3 (B CD40+ GC-like)_CD40+" = "cB3 (B CD40+ GC-like)_1", # group _1 (which absorbed _0) becomes the CD40+ label
"cB3 (B CD40+ GC-like)_MK167+" = "cB3 (B CD40+ GC-like)_2" # group _2 (which absorbed _3) becomes the MK167+ label
)

In [None]:
# UMAP split by the merged and renamed sub-clusters; every label is mapped to
# red.
options(repr.plot.width = 20, repr.plot.height = 10)
humap_embeddings <- as.data.frame(Embeddings(scRNA_BCells, 'humap'))
humap_embeddings$combined_cell_names <- rownames(humap_embeddings)
humap_embeddings <- left_join(
    humap_embeddings,
    select(scRNA_BCells@meta.data, combined_cell_names, merged_sub.cluster_cB3..B.CD40..GC.like.)
)
rbind(head(humap_embeddings), tail(humap_embeddings))
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$merged_sub.cluster_cB3..B.CD40..GC.like.))),
    unique(humap_embeddings$merged_sub.cluster_cB3..B.CD40..GC.like.)
)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = merged_sub.cluster_cB3..B.CD40..GC.like.)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Cleaned fine types') +
    ggpubr::theme_pubr(base_size = 20) +
    theme(legend.position = 'none') +
    scale_colour_manual(values = constantPalette) +
    facet_wrap(~merged_sub.cluster_cB3..B.CD40..GC.like.) +
    gghighlight::gghighlight()

In [None]:
# Markers with AUC >= 0.7 for the renamed groups.
scRNA_BCells <- SetIdent(object = scRNA_BCells, value = 'merged_sub.cluster_cB3..B.CD40..GC.like.')
top_markers(wilcoxauc(scRNA_BCells), n = Inf, auc_min = 0.7)

In [None]:
FeaturePlot(scRNA_BCells, features = c('CD40', 'GPR183', 'CD69', 'MKI67'))

In [None]:
# Adopt the merged labels as the final cleaned fine types.
scRNA_BCells@meta.data[['cleaned_fine_types']] <- scRNA_BCells@meta.data$merged_sub.cluster_cB3..B.CD40..GC.like.
table(scRNA_BCells@meta.data$cleaned_fine_types)

## Justify new labels
- Once we have labels, we want to justify them to collaborators
- DGE analysis in original labels (post-QC cells) 
- DGE analysis in new labels 
- Correlate logFC in matched clusters

### DGE analysis in original labels (post-QC cells) 

In [None]:
# Wilcoxon/AUC statistics grouped by the original Pelka labels; the markers
# shown have adjusted p <= 0.05 and AUC >= 0.6.
clusterFull_markers = presto::wilcoxauc(scRNA_BCells, 'ClusterFull')
presto::top_markers(clusterFull_markers,
    n = Inf,
    # spelled out in full: `padj` only reached this argument through
    # partial argument matching
    padj_max = 0.05,
    auc_min = 0.6)

### DGE analysis in new labels 

In [None]:
# Same statistics and thresholds, grouped by the cleaned fine types.
cleaned_fine_types_markers = presto::wilcoxauc(scRNA_BCells, 'cleaned_fine_types')
presto::top_markers(cleaned_fine_types_markers,
    n = Inf,
    padj_max = 0.05,
    auc_min = 0.6)

### Correlate logFC in matched clusters

In [None]:
# Restrict the Pelka-label results to groups that also exist among the
# cleaned labels, and tag each table with its origin.
clusterFull_markers <- dplyr::filter(
    clusterFull_markers,
    group %in% cleaned_fine_types_markers$group
)
clusterFull_markers[['Comparison']] <- 'ClusterFull'
cleaned_fine_types_markers[['Comparison']] <- 'cleaned_fine_types'

In [None]:
# One row per (gene, group) holding the logFC under both labelings.
comparison_markers <- dplyr::left_join(
    x = select(clusterFull_markers, feature, group, clusterFull = logFC),
    y = select(cleaned_fine_types_markers, feature, group, cleaned_fine_types = logFC),
    by = join_by(feature, group)
)
rbind(head(comparison_markers), tail(comparison_markers))

In [None]:
# logFC under the Pelka labels (x) against logFC under the cleaned labels (y),
# one panel per group, with reference lines through zero.
options(repr.plot.width = 10, repr.plot.height = 5)
ggplot(data = comparison_markers) +
    geom_point(mapping = aes(x = clusterFull, y = cleaned_fine_types), shape = '.', alpha = 0.5) +
    facet_wrap(~group, nrow = 1) +
    theme_pubr(base_size = 16) +
    xlab('Pelka cell types') +
    ylab('Cleaned fine types') +
    geom_hline(mapping = aes(yintercept = 0)) +
    geom_vline(mapping = aes(xintercept = 0))

### Pearson correlations

In [None]:
# Correlation of the two logFC vectors within each group (cor() default).
cellTypeCor <- comparison_markers %>%
    group_by(group) %>%
    dplyr::summarize(correlation = cor(clusterFull, cleaned_fine_types))
cellTypeCor

## Visualize final clusters in UMAP space, and also the original labels (after merge)

In [None]:
# UMAP coordinates joined to both the cleaned and the original labels.
options(repr.plot.width = 20, repr.plot.height = 20)
humap_embeddings <- as.data.frame(Embeddings(scRNA_BCells, 'humap'))
humap_embeddings$combined_cell_names <- rownames(humap_embeddings)
humap_embeddings <- left_join(
    humap_embeddings,
    select(scRNA_BCells@meta.data, combined_cell_names, cleaned_fine_types, ClusterFull)
)
rbind(head(humap_embeddings), tail(humap_embeddings))

#### Cleaned clusters

In [None]:
# One panel per cleaned fine type; every label is mapped to red.
options(repr.plot.width = 21, repr.plot.height = 7)
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$cleaned_fine_types))),
    unique(humap_embeddings$cleaned_fine_types)
)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = cleaned_fine_types)) +
    geom_point(shape = '.', alpha = 0.5) +
    ggpubr::theme_pubr(base_size = 20) +
    theme(legend.position = 'none') +
    scale_colour_manual(values = constantPalette) +
    facet_wrap(~cleaned_fine_types) +
    gghighlight::gghighlight() +
    labs(title = 'Cleaned fine types - B cells')

#### Pelka clusters

In [None]:
# One panel per original Pelka fine type; every label is mapped to red.
options(repr.plot.width = 21, repr.plot.height = 7)
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$ClusterFull))),
    unique(humap_embeddings$ClusterFull)
)
ggplot(data = humap_embeddings,
       mapping = aes(x = HUMAP_1, y = HUMAP_2, colour = ClusterFull)) +
    geom_point(shape = '.', alpha = 0.5) +
    ggpubr::theme_pubr(base_size = 20) +
    theme(legend.position = 'none') +
    scale_colour_manual(values = constantPalette) +
    facet_wrap(~ClusterFull) +
    gghighlight::gghighlight() +
    labs(title = 'Pelka fine types - B cells')

## GLMM to find DE genes between clusters

In [None]:
require(tidyverse)
require(Seurat)
require(data.table)
require(lme4)
require(presto)
require(singlecellmethods)
require(future)
require(furrr)
require(gghighlight)
options(future.globals.maxSize = 1000 * 1024 ^2)
set.seed(1)

In [None]:
require(presto)
require(singlecellmethods)
# Collapse raw counts into one pseudobulk profile per (sample, cleaned fine
# type) combination, requiring at least 3 cells per combination.
pb = presto::collapse_counts(
    GetAssayData(scRNA_BCells, 'counts'),
    scRNA_BCells@meta.data,
    c("biosample_id", "cleaned_fine_types"),
    min_cells_per_group = 3
)
pb$meta_data %>% head()
# Use the full element name: `pb$counts` only resolved to `counts_mat`
# through `$` partial matching on the list.
dim(pb$counts_mat)

colnames(scRNA_BCells@meta.data)
dim(pb$meta_data)

In [None]:
# NOTE(review): nothing visible in this notebook creates pb$exprs_norm before
# this point. If it is NULL, the subsetting below returns NULL and this cell
# does nothing -- confirm whether a normalisation step is missing.
pb$exprs_norm = pb$exprs_norm[rownames(pb$counts_mat), colnames(pb$counts_mat)]
dim(pb$exprs_norm)
pb$exprs_norm[1:5, 1:5]

In [None]:
# Fit the presto GLMM on the pseudobulk counts: a Poisson model with random
# intercepts for cell type, sample, and the cell type x sample combination,
# and logUMI as the offset. system.time() reports how long the fit takes.
# NOTE(review): suppressWarnings() hides every warning raised during the fit,
# including any convergence warnings -- consider narrowing it.
system.time({
    suppressWarnings({
        presto_res = presto::presto.presto(
            y ~ 1 + (1|cleaned_fine_types) + (1|cleaned_fine_types:biosample_id) + (1|biosample_id) + offset(logUMI), 
            pb$meta_data, 
            pb$counts_mat,
            size_varname = "logUMI", 
            effects_cov = "cleaned_fine_types",
            ncore = 1, 
            min_sigma = .05,
            family = "poisson",
            nsim = 1000
        )    
    })
})
# Cache the fitted result; the next cell reloads it so the fit can be skipped.
readr::write_rds(presto_res, 'Bcells_fineTyping_GLMM.rds')

In [None]:
presto_res = readr::read_rds('Bcells_fineTyping_GLMM.rds')

### Make contrasts

In [None]:
# Contrast matrix over the cleaned fine types.
contrasts_mat <- make_contrast.presto(presto_res, var_contrast = 'cleaned_fine_types')
contrasts_mat

### Find marginal effects

In [None]:
# One-tailed contrasts per (gene, cluster). beta and sigma are converted from
# the natural-log scale to log2, and rows are sorted by p-value.
effects_marginal <- contrasts.presto(presto_res, contrasts_mat, one_tailed = TRUE) %>%
    dplyr::mutate(
        cluster = contrast,
        logFC = sign(beta) * log2(exp(abs(beta))),
        SD = log2(exp(sigma)),
        zscore = logFC / SD
    ) %>%
    arrange(pvalue)

# Benjamini-Hochberg FDR. FDR values of exactly 0 are replaced by the smallest
# non-zero FDR so that -log10() stays finite.
effects_marginal$fdr <- p.adjust(effects_marginal$pvalue, method = 'BH')
effects_marginal$corr_fdr <- replace(
    effects_marginal$fdr,
    effects_marginal$fdr == 0,
    min(effects_marginal$fdr[effects_marginal$fdr != 0])
)
effects_marginal$`-log10_fdr` <- -log10(effects_marginal$corr_fdr)
dim(effects_marginal)
head(effects_marginal)


In [None]:
# Mean normalised expression of every gene across all cells.
meanExp = rowMeans(GetAssayData(scRNA_BCells, 'data'))
meanExp = data.frame(feature = names(meanExp), meanExp = meanExp)
# For every cluster, add a column holding the fraction of that cluster's
# cells in which each gene has a raw count of zero.
counts_all = GetAssayData(scRNA_BCells, 'counts')
cell_labels = as.character(scRNA_BCells@meta.data$cleaned_fine_types)
for (cluster in unique(effects_marginal$cluster)) {
    # Select cells by the metadata row names (the cell names) rather than a
    # `sampleID` column, whose contents are not guaranteed to be cell barcodes.
    cluster_cells = rownames(scRNA_BCells@meta.data)[which(cell_labels == cluster)]
    cluster_counts = counts_all[, cluster_cells, drop = FALSE]
    # Count the non-zero entries per gene on the sparse matrix instead of
    # converting it to a dense data frame and looping over rows.
    n_zeros = ncol(cluster_counts) - Matrix::rowSums(cluster_counts != 0)
    meanExp[, cluster] = n_zeros / ncol(cluster_counts)
}
head(meanExp)

#effects_marginal = effects_marginal %>% left_join(meanExp, by = join_by(feature), relationship = 'many-to-one') 
#tail(effects_marginal)

In [None]:
# Reshape the per-cluster zero fractions to long format (one row per gene x cluster),
# convert them into the fraction of cells expressing the gene, and attach both the
# overall mean expression and that fraction to the matching gene x contrast row.
# Every column except `feature` and `meanExp` is a cluster column, so the selection
# does not depend on a second, separately derived list of cluster names, and the
# output columns are named explicitly instead of by position.
temp = meanExp %>% 
    pivot_longer(
        cols = -c(feature, meanExp),
        names_to = 'contrast',
        values_to = 'prop.expressed'
    ) %>%
    mutate(prop.expressed = 1 - prop.expressed)
effects_marginal = left_join(effects_marginal, temp, by = join_by(feature, contrast)) 
rm(temp)
head(effects_marginal)

# volcano plot

In [None]:
# Number of gene x cluster results whose log2 fold change exceeds 2, 0.5 and 0.1.
sum(effects_marginal$logFC > 2)
sum(effects_marginal$logFC > 0.5)
sum(effects_marginal$logFC > 0.1)
# Per cluster: median logFC and number of results with FDR < 0.05.
effects_marginal%>% group_by(cluster) %>% filter(fdr < 0.05) %>% summarize(medianLFC = median(logFC), n = n())

In [None]:
options(repr.plot.width = 15, repr.plot.height = 7, repr.plot.res = 200)
library(ggrepel)
# Volcano plot per cluster: log2 fold change against -log10(FDR).
# The significant results (FDR < 0.05) are split once and reused for both the coloured
# points and their labels, so the two always agree:
#   red  - logFC > 0.1
#   blue - logFC < 0.1
# filter() also drops rows with NA in the condition; logical `[` subsetting would keep
# them as all-NA rows, which show up as an extra "NA" facet.
volcano_sig = effects_marginal %>% filter(fdr < 0.05)
volcano_up = volcano_sig %>% filter(logFC > 0.1)
volcano_rest = volcano_sig %>% filter(logFC < 0.1)
ggplot(effects_marginal, aes(x = logFC, y = `-log10_fdr`)) +
geom_point(shape = '.') +
geom_point(data = volcano_rest, color = 'blue', shape = 16) +
geom_point(data = volcano_up, color = 'red', shape = 16) +
geom_label_repel(data = volcano_up, aes(label = feature), color = 'red', max.overlaps = 50) +
geom_label_repel(data = volcano_rest, aes(label = feature), color = 'blue', max.overlaps = 50) +
facet_wrap(~cluster, scales = 'free_y', ncol= 3) +
theme_bw(base_size = 20) +
# constant reference lines are passed as parameters: wrapping them in aes() draws one
# line per data row
geom_hline(yintercept = (-1)*log10(0.05), color = 'lightgrey') +
geom_vline(xintercept = 2, color = 'lightgrey') 

In [None]:
#effects_marginal$`-log10_fdr`[effects_marginal$`-log10_fdr` > 50] = 50

## MA plot

In [None]:
# MA-style plot per cluster: log2 fold change against mean normalized expression.
# Highlighted in red: logFC > 2; in blue: mean expression > 1.
ma_high_lfc = effects_marginal[effects_marginal$logFC > 2,]
ma_high_mean = effects_marginal[effects_marginal$meanExp > 1,]
ggplot(effects_marginal, aes(x = meanExp, y = logFC)) +
    geom_point(shape = '.') +
    geom_point(data = ma_high_lfc, color = 'red', shape = 16) +
    geom_label_repel(data = ma_high_lfc, aes(label = feature), color = 'red') +
    geom_point(data = ma_high_mean, color = 'blue', shape = 16) +
    geom_label_repel(data = ma_high_mean, aes(label = feature), color = 'blue') +
    facet_wrap(~cluster) +
    theme_bw(base_size = 20) +
    labs(x = 'Mean normalized expression')

In [None]:
# Distribution of mean expression, one panel per cluster, counts on a log10 axis.
ggplot(effects_marginal, aes(meanExp)) +
    geom_histogram() +
    facet_wrap(~cluster) +
    theme_bw(base_size = 20) +
    scale_y_log10() +
    labs(x = 'Mean expression', y = 'Frequency')

In [None]:
# List the cluster labels present in the test results next to the labels stored in the
# ClusterFull metadata column, for visual comparison.
unique(effects_marginal$cluster)
scRNA_BCells@meta.data$ClusterFull %>% unique()

In [None]:
# Per cluster: log2 fold change against the fraction of cells expressing the gene.
# Highlighted in red: logFC > 2; in blue: expressed in more than half of the cells.
prop_high_lfc = effects_marginal[effects_marginal$logFC > 2,]
prop_widely_expressed = effects_marginal[effects_marginal$prop.expressed > 0.5,]
ggplot(effects_marginal, aes(x = prop.expressed, y = logFC)) +
    geom_point(shape = '.') +
    geom_point(data = prop_high_lfc, color = 'red', shape = 16) +
    geom_label_repel(data = prop_high_lfc, aes(label = feature), color = 'red') +
    geom_point(data = prop_widely_expressed, color = 'blue', shape = 16) +
    geom_label_repel(data = prop_widely_expressed, aes(label = feature), color = 'blue') +
    facet_wrap(~cluster) +
    theme_bw(base_size = 20) +
    labs(x = 'Proportions of cells expressing the gene')

In [None]:
# Export the full (unfiltered) results table to the working directory.
# write.csv() also writes the row names as the first column by default.
write.csv(effects_marginal, 'effects_marginal_B.csv')

## Filter results by fold change and fraction of expressing cells

In [None]:
# Preview the results table before filtering.
head(effects_marginal)

In [None]:
# Keep results with log2 fold change above 1 for genes detected in more than 5% of the
# cluster's cells.
effects_marginal2 = filter(effects_marginal, logFC > 1, prop.expressed > 0.05)

In [None]:
# Dimensions of the unfiltered results table.
# NOTE(review): `effects_marginal2` (the filtered table) may have been intended here —
# confirm.
dim(effects_marginal)

### Plot the number of cells retained after QC for each number of genes kept

- x-axis: number of genes kept
- y-axis: number of cells kept

In [None]:
# Show the working directory; the relative output paths in this notebook resolve
# against it.
getwd()

In [None]:
# Dimensions of the B-cell object (features x cells).
dim(scRNA_BCells)

In [None]:
# Rank genes by their largest z-score over all clusters (after the logFC / expression
# filters): one row per gene, best gene first.
# The grouping is removed right after the per-gene maximum is computed so that the
# resulting table is a plain, ungrouped tibble for the steps that follow.
genes_ranked_by_max_zscore = effects_marginal2 %>% 
        group_by(feature) %>% 
        mutate(max_zscore = max(zscore)) %>% 
        ungroup() %>%
        select(feature, max_zscore) %>% 
        distinct() %>%
        arrange(desc(max_zscore))
dim(genes_ranked_by_max_zscore)
# show the top and the bottom of the ranking
rbind(head(genes_ranked_by_max_zscore), tail(genes_ranked_by_max_zscore))

In [None]:
# Number of ranked genes (rows) and columns of the ranking table.
dim(genes_ranked_by_max_zscore)

In [None]:
# For every panel size from 10 genes up to the number of ranked genes, count how many
# cells would pass QC (> 10 detected genes and > 30 counts) if only the top-ranked
# genes were kept.
n_ranked = min(nrow(genes_ranked_by_max_zscore), nrow(scRNA_BCells))
# Each panel size appears exactly once: a repeated value would give `nCells_left` a
# duplicated name and leave its last entry unfilled (NA). seq() counts downwards when
# `to` < `from`, hence the guard for fewer than 10 ranked genes.
nGenes_used = if (n_ranked >= 10) seq(from = 10, to = n_ranked) else n_ranked
nGenes_used
nCells_left = rep(NA_integer_, length(nGenes_used))
names(nCells_left) = as.character(nGenes_used)
countMat = GetAssayData(scRNA_BCells, 'counts')
dim(countMat)

for(trial in nGenes_used){
    print(trial)
    selectedGenes = genes_ranked_by_max_zscore$feature[seq_len(trial)]
    print(length(selectedGenes %>% unique()))
    # drop = FALSE keeps the matrix shape even for a single gene
    counts = countMat[selectedGenes, , drop = FALSE]
    # filter
    nFeatures = colSums(counts > 0)
    nCounts = colSums(counts)
    nCells_left[[as.character(trial)]] = sum(nFeatures > 10 & nCounts > 30)
    print(nCells_left[[as.character(trial)]])
}

# Number of cells passing QC (log10 axis) against the number of genes used.
options(repr.plot.width=9, repr.plot.height=9)
data.frame(
    nGenes_used = as.integer(nGenes_used),
    nCells_left = unname(nCells_left)
) %>%
ggplot() + geom_point(aes(x = nGenes_used, y=nCells_left)) + 
ggpubr::theme_pubr() + 
scale_y_continuous(trans='log10') + 
xlab('Number of genes used') + 
ylab('Number of cells left') + 
geom_segment(aes(xend=nGenes_used, 
                 x = nGenes_used, 
                 y=nCells_left), 
             yend=0, 
             linetype=3) + 
ggtitle('GLMM DEGs - B cells') 

In [None]:
# Print the ranking table (gene and its maximum z-score).
genes_ranked_by_max_zscore

## Select the top 50 genes, then keep cells with more than 30 counts and more than 10 detected features

In [None]:
# Take the 50 top-ranked genes. head() returns fewer entries when the ranking is shorter
# than 50, whereas `[1:50]` would pad the vector with NA and break the subsetting below.
selectedGenes = head(genes_ranked_by_max_zscore$feature, 50)
selectedGenes %>% length()
selectedGenes

In [None]:
# Restrict the object to the selected genes (all cells kept at this point).
scRNA_BCells_selected_genes = scRNA_BCells[selectedGenes, ]

In [None]:
# Keep the cells that still have > 10 detected genes and > 30 counts within the selected
# gene panel. The thresholds are evaluated on the count matrix of the subsetted object,
# the same way as in the gene-number sweep, instead of on meta.data$nFeature_RNA /
# meta.data$nCount_RNA.
# NOTE(review): those metadata columns appear to be carried over from the full reference
# and do not seem to be recomputed when a Seurat object is subset by features — confirm.
selected_counts = GetAssayData(scRNA_BCells_selected_genes, 'counts')
cells_pass_qc = colSums(selected_counts > 0) > 10 & colSums(selected_counts) > 30
selectedCells = colnames(selected_counts)[cells_pass_qc]
length(selectedCells)
c(head(selectedCells), tail(selectedCells))

In [None]:
# Final reference object: selected genes x cells that passed QC.
scRNA_BCells_selected_genes = scRNA_BCells[selectedGenes, selectedCells]
# Summaries of the per-cell metadata columns.
# NOTE(review): these read meta.data as stored in the object; whether the values reflect
# only the selected genes depends on Seurat recomputing them on subsetting — confirm.
summary(scRNA_BCells_selected_genes@meta.data$nFeature_RNA)
summary(scRNA_BCells_selected_genes@meta.data$nCount_RNA)

In [None]:
options(repr.plot.height = 7, repr.plot.width = 14)
# Side-by-side histograms (log10 counts) of the nFeature_RNA and nCount_RNA metadata
# columns of the selected-gene object; the two panels are combined with patchwork's `+`.
qc_meta = scRNA_BCells_selected_genes@meta.data
feature_hist = ggplot(qc_meta) +
    geom_histogram(aes(nFeature_RNA)) +
    labs(title = 'nFeature_RNA') +
    scale_y_log10()
count_hist = ggplot(qc_meta) +
    geom_histogram(aes(nCount_RNA)) +
    labs(title = 'nCount_RNA') +
    scale_y_log10()
feature_hist + count_hist

## Cache

### B cells with all genes

In [None]:
# Save the B-cell object with its full gene set to the working directory.
readr::write_rds(scRNA_BCells, 'Bcells_fineTyping_all_genes.rds')

### B cells with selected genes

In [None]:
# Save the object restricted to the selected genes and QC-passing cells.
readr::write_rds(scRNA_BCells_selected_genes, 'Bcells_fineTyping_selected_genes.rds')

In [None]:
# Reload the cached selected-gene object. This rebinds `scRNA_BCells`: from here on the
# name refers to the object read from the selected-genes file, not to the all-genes
# object saved above.
scRNA_BCells = readr::read_rds('Bcells_fineTyping_selected_genes.rds')

In [None]:
# Print the object summary to check what was loaded.
scRNA_BCells

In [None]:
# List the available per-cell metadata columns.
colnames(scRNA_BCells@meta.data)

In [None]:
# Preview the fine-type label of the first few cells.
scRNA_BCells@meta.data %>% select(cleaned_fine_types) %>% head()

In [None]:
# tidyverse is (re)loaded here so the cell can be run in a fresh session.
require(tidyverse)
# Number of cells per fine type in the loaded object.
scRNA_BCells@meta.data %>% select(cleaned_fine_types) %>% table()
# use biosample id instead of donor as the batch variable, including for harmonization

In [None]:
# List the genes (row names) retained in the loaded object.
rownames(scRNA_BCells)