# TNKILC cells: process reference for integration with MERFISH

In [None]:
# Attach the packages used by this notebook. `library()` stops with an error
# when a package is missing; `require()` would only return FALSE with a
# warning and let a later cell fail with "could not find function".
library(Seurat)
library(tidyverse)
library(readxl)
library(patchwork)
library(sf)
library(ggpubr)
library(ggthemes)
library(harmony)
library(presto)
library(ComplexHeatmap)
library(circlize)
library(glue)
library(e1071)
library(caTools)
library(class)
library(gghighlight)
library(UpSetR)
# Display limits for printed tables and the resolution of inline plots
options(repr.matrix.max.cols=100, repr.matrix.max.rows=100, repr.plot.res=300)
# Fix the RNG seed for the stochastic steps that follow
set.seed(1)

# load processed scRNA reference dataset

In [None]:
completeReference = readr::read_rds('/n/scratch3/users/m/mup728/Cell_Typing_CRC_MERFISH/Pelka_reference_cleaning/pelka_dataset_with_merfish_genes.rds')

In [None]:
# Mark every reference cell as scRNA and copy batchID into biosample_id
completeReference@meta.data[['technology']] <- 'scRNA'
completeReference@meta.data[['biosample_id']] <- completeReference@meta.data$batchID

In [None]:
# Store the cell names (object column names) as an explicit metadata column
completeReference@meta.data[['combined_cell_names']] <- colnames(completeReference)
# Number of distinct cell names
completeReference@meta.data$combined_cell_names %>%
    unique() %>%
    length()

In [None]:
summary(completeReference@meta.data$nCount_RNA)
summary(completeReference@meta.data$nFeature_RNA)

## select TNKILC cells for finetyping

In [None]:
# Metadata rows of the cells whose top-level cluster is TNKILC, with row names
# set to the cell names so they line up with the count-matrix columns.
temp <- completeReference@meta.data %>%
    filter(ClusterTop == 'TNKILC')
rownames(temp) <- temp$combined_cell_names
# Build a fresh Seurat object from the raw counts of those cells. `slot` is
# passed by name so the call does not depend on the positional order of
# GetAssayData()'s arguments.
scRNA_TNKILC <- CreateSeuratObject(
    GetAssayData(completeReference, slot = 'counts')[, temp$combined_cell_names],
    meta.data = temp
)
scRNA_TNKILC
rm(temp)

In [None]:
table(scRNA_TNKILC@meta.data$orig.ident, scRNA_TNKILC@meta.data$ClusterFull) %>% as.matrix()

In [None]:
table(scRNA_TNKILC@meta.data$orig.ident) %>% as.matrix()

In [None]:
options(repr.plot.width=5, repr.plot.height=5)
# Histogram of the number of TNKILC cells per orig.ident value
as.data.frame(table(scRNA_TNKILC@meta.data$orig.ident)) %>%
    ggplot(aes(x = Freq)) +
    geom_histogram()
# Numeric summary of the same per-orig.ident cell counts
scRNA_TNKILC@meta.data$orig.ident %>%
    table() %>%
    as.data.frame() %>%
    pull(Freq) %>%
    summary()

In [None]:
options(repr.plot.width=20, repr.plot.height=20)
# One 'red' entry per ClusterFull value, so each facet draws its own cells in red
constantPalette <- setNames(
    rep('red', length(unique(scRNA_TNKILC@meta.data$ClusterFull))),
    unique(scRNA_TNKILC@meta.data$ClusterFull)
)
# Genes detected vs. counts per cell on log10 axes, one facet per fine type;
# the blue lines mark nFeature_RNA = 15 and nCount_RNA = 50.
ggplot(scRNA_TNKILC@meta.data) +
    geom_point(
        aes(x = nFeature_RNA, y = nCount_RNA, color = ClusterFull),
        shape = '.',
        alpha = 0.5
    ) +
    geom_vline(xintercept = 15, color = 'blue') +
    geom_hline(yintercept = 50, color = 'blue') +
    scale_x_log10() +
    scale_y_log10() +
    facet_wrap(~ClusterFull) +
    gghighlight() +
    scale_color_manual(values = constantPalette) +
    theme_minimal(base_size=20) +
    theme(legend.position="none")

## scale/normalize/pca

In [None]:
# Scale factor for log-normalisation: the MEAN of the per-technology median
# counts. Every cell was tagged technology == 'scRNA' earlier, so there is a
# single group here and the mean equals that group's median.
normFactor <- scRNA_TNKILC@meta.data %>%
    group_by(technology) %>%
    summarize(medianCounts = median(nCount_RNA)) %>%
    pull(medianCounts) %>%
    mean()
normFactor
# Log-normalise, scale every gene, then run a 15-PC PCA weighted by orig.ident
scRNA_TNKILC <- scRNA_TNKILC %>%
    NormalizeData(normalization.method = "LogNormalize",
                  scale.factor = normFactor) %>%
    ScaleData(features = rownames(scRNA_TNKILC)) %>%
    singlecellmethods::RunBalancedPCA(weight.by='orig.ident', npcs=15)
scRNA_TNKILC

In [None]:
options(repr.plot.width=25, repr.plot.height=5)
ElbowPlot(scRNA_TNKILC, ndims = 15)

### In scRNA, are the TNKILC cell subtypes distinguishable?

#### qualitative look - umap

In [None]:
# UMAP of the first 15 (pre-harmony) PCs. `ret_extra = 'fgraph'` additionally
# returns the fuzzy graph, which is stored on the object as a Graph below.
U <- uwot::umap(
    scRNA_TNKILC@reductions$pca@cell.embeddings[, 1:15],
    min_dist = 0.05,
    spread = 0.30,
    ret_extra = 'fgraph',
    # Namespace-qualified because the setup cell never attaches `future`.
    # NOTE(review): uwot documents that fast_sgd = TRUE sets n_sgd_threads
    # itself - confirm whether this argument has any effect.
    n_sgd_threads = future::nbrOfWorkers(),
    fast_sgd = TRUE
)
# Name the columns <key><dimension> so they already match the reduction key
colnames(U$embedding) <- c('PCAUMAP_1', 'PCAUMAP_2')
rownames(U$fgraph) <- colnames(U$fgraph) <- Cells(scRNA_TNKILC)
scRNA_TNKILC[['pcaumap']] <- Seurat::CreateDimReducObject(
    embeddings = U$embedding,
    assay = 'RNA',
    key = 'PCAUMAP_',
    global = TRUE
)
# Store the fuzzy graph as a Seurat Graph tied to the default assay
new_graph <- Seurat::as.Graph(U$fgraph)
DefaultAssay(new_graph) <- DefaultAssay(scRNA_TNKILC)
scRNA_TNKILC[['pcaumap_fgraph']] <- new_graph

In [None]:
# Pre-harmony UMAP coordinates with the cell name as an explicit column
temp <- Embeddings(scRNA_TNKILC, 'pcaumap') %>% as.data.frame()
temp$combined_cell_names <- rownames(temp)
# Attach the cluster / sample annotations; the join key is stated explicitly
# rather than inferred from whichever column names happen to be shared.
pcaumap_embeddings <- scRNA_TNKILC@meta.data %>%
    select(combined_cell_names, ClusterTop, ClusterFull, orig.ident) %>%
    right_join(temp, by = 'combined_cell_names')
pcaumap_embeddings %>% head()
pcaumap_embeddings %>% tail()

In [None]:
options(repr.plot.width = 5, repr.plot.height = 5)
ggplot(pcaumap_embeddings, aes(x = PCAUMAP_1, y = PCAUMAP_2, color = orig.ident)) + 
geom_point(shape = '.', alpha = 0.5) + 
ggtitle('Pre-harmony UMAP - TNKILC cells') +
guides(color = guide_legend(override.aes = list(size = 10, shape = 16))) +
ggpubr::theme_pubr(base_size=10) +
theme(legend.position = 'none') 

In [None]:
options(repr.plot.width = 20, repr.plot.height = 20)
# One 'red' entry per orig.ident value, so each facet draws its own cells in red
constantPalette <- setNames(
    rep('red', length(unique(pcaumap_embeddings$orig.ident))),
    unique(pcaumap_embeddings$orig.ident)
)
# Pre-harmony UMAP, one facet per orig.ident value
ggplot(pcaumap_embeddings, aes(x = PCAUMAP_1, y = PCAUMAP_2, color = orig.ident)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Pre-harmony UMAP - TNKILC cells') +
    scale_color_manual(values = constantPalette) +
    ggpubr::theme_pubr(base_size=10) +
    theme(legend.position = 'none') +
    facet_wrap(~orig.ident) +
    gghighlight::gghighlight()

In [None]:
options(repr.plot.width = 10, repr.plot.height = 10)
ggplot(pcaumap_embeddings, aes(x = PCAUMAP_1, y = PCAUMAP_2, color = ClusterFull)) + 
geom_point(shape = '.', alpha = 0.5) + 
ggtitle('Pre-harmony UMAP - TNKILC cells') +
#ggthemes::scale_color_tableau('Tableau 20' , name = "") + 
guides(color = guide_legend(override.aes = list(size = 10, shape = 16))) +
ggpubr::theme_pubr(base_size=10) +
theme(legend.position = 'top') 

In [None]:
options(repr.plot.width = 10, repr.plot.height = 10)
ggplot(pcaumap_embeddings, aes(x = PCAUMAP_1, y = PCAUMAP_2, color = ClusterFull)) + 
geom_point(shape = '.', alpha = 0.5) + 
ggtitle('Pre-harmony UMAP - TNKILC cells') +
#ggthemes::scale_color_tableau('Tableau 20' , name = "") + 
guides(color = guide_legend(override.aes = list(size = 10, shape = 16))) +
ggpubr::theme_pubr(base_size=10) +
theme(legend.position = 'top') +
facet_wrap(~ClusterFull) +
gghighlight::gghighlight()

### harmonize over donor

In [None]:
options(repr.plot.width = 5, repr.plot.height = 5)
# Harmony integration over orig.ident, with the convergence plot shown.
# `FALSE` is spelled out: `F` is an ordinary variable that can be reassigned.
# NOTE(review): confirm `max.iter` is an argument name the installed harmony
# version recognises.
scRNA_TNKILC <- harmony::RunHarmony(
    scRNA_TNKILC,
    "orig.ident",
    plot_convergence = TRUE,
    lambda = NULL,
    max.iter = 10,
    early_stop = FALSE
)

### post harmony umap

In [None]:
# UMAP of the first 15 harmony dimensions. `ret_extra = 'fgraph'` additionally
# returns the fuzzy graph, stored below as 'humap_fgraph'.
U <- uwot::umap(
    scRNA_TNKILC@reductions$harmony@cell.embeddings[, 1:15],
    min_dist = 0.05,
    spread = 0.30,
    ret_extra = 'fgraph',
    # Namespace-qualified because the setup cell never attaches `future`.
    # NOTE(review): uwot documents that fast_sgd = TRUE sets n_sgd_threads
    # itself - confirm whether this argument has any effect.
    n_sgd_threads = future::nbrOfWorkers(),
    fast_sgd = TRUE
)
# Name the columns <key><dimension> so they already match the reduction key
colnames(U$embedding) <- c('HUMAP_1', 'HUMAP_2')
rownames(U$fgraph) <- colnames(U$fgraph) <- Cells(scRNA_TNKILC)
scRNA_TNKILC[['humap']] <- Seurat::CreateDimReducObject(
    embeddings = U$embedding,
    assay = 'RNA',
    key = 'HUMAP_',
    global = TRUE
)
# Store the fuzzy graph as a Seurat Graph tied to the default assay
new_graph <- Seurat::as.Graph(U$fgraph)
DefaultAssay(new_graph) <- DefaultAssay(scRNA_TNKILC)
scRNA_TNKILC[['humap_fgraph']] <- new_graph

In [None]:
# Post-harmony UMAP coordinates with the cell name as an explicit column
temp <- Embeddings(scRNA_TNKILC, 'humap') %>% as.data.frame()
temp$combined_cell_names <- rownames(temp)
# Attach the cluster / sample annotations; the join key is stated explicitly
# rather than inferred from whichever column names happen to be shared.
humap_embeddings <- scRNA_TNKILC@meta.data %>%
    select(combined_cell_names, ClusterTop, ClusterFull, orig.ident) %>%
    right_join(temp, by = 'combined_cell_names')
humap_embeddings %>% head()
humap_embeddings %>% tail()

In [None]:
options(repr.plot.width = 5, repr.plot.height = 5)
ggplot(humap_embeddings %>% sample_n(nrow(.)), aes(x = HUMAP_1, y = HUMAP_2, color = orig.ident)) + 
geom_point(shape = '.', alpha = 0.5) + 
ggtitle('Post-harmony UMAP - TNKILC cells') +
ggpubr::theme_pubr(base_size=10) +
theme(legend.position = 'none') 

In [None]:
options(repr.plot.width = 20, repr.plot.height = 20)
constantPalette = rep('red', length(unique(humap_embeddings$orig.ident)))
names(constantPalette) = unique(humap_embeddings$orig.ident)
ggplot(humap_embeddings, aes(x = HUMAP_1, y = HUMAP_2, color = orig.ident)) + 
geom_point(shape = '.', alpha = 0.5) + 
ggtitle('Post-harmony UMAP - TNKILC cells') +
ggpubr::theme_pubr(base_size=10) +
theme(legend.position = 'none') +
scale_color_manual(values = constantPalette) +
facet_wrap(~orig.ident) +
gghighlight::gghighlight()

In [None]:
options(repr.plot.width = 10, repr.plot.height = 10)
ggplot(humap_embeddings, aes(x = HUMAP_1, y = HUMAP_2, color = ClusterFull)) + 
geom_point(shape = '.', alpha = 0.5) + 
ggtitle('Post-harmony UMAP - TNKILC cells') +
#ggthemes::scale_color_tableau('Tableau 20' , name = "") + 
guides(color = guide_legend(override.aes = list(size = 10, shape = 16, alpha = 1))) +
ggpubr::theme_pubr(base_size=10) +
theme(legend.position = 'top') 

In [None]:
options(repr.plot.width = 15, repr.plot.height = 15)
ggplot(humap_embeddings, aes(x = HUMAP_1, y = HUMAP_2, color = ClusterFull)) + 
geom_point(shape = '.', alpha = 0.5) + 
ggtitle('Post-harmony UMAP - TNKILC cells') +
#ggthemes::scale_color_tableau('Tableau 20' , name = "") + 
guides(color = guide_legend(override.aes = list(size = 10, shape = 16, alpha = 1))) +
ggpubr::theme_pubr(base_size=10) +
theme(legend.position = 'top') +
facet_wrap(~ClusterFull) +
gghighlight::gghighlight()

## DE genes for clusterfull TNKILC cells

In [None]:
# Differential expression between the ClusterFull groups (the active identity)
Idents(scRNA_TNKILC) <- 'ClusterFull'
# Functions are namespace-qualified instead of re-running `require(presto)`.
# The adjusted-p cutoff is passed under its full argument name, `padj_max`;
# the previous `padj =` only worked through partial argument matching.
presto::top_markers(
    presto::wilcoxauc(scRNA_TNKILC),
    n = Inf,
    padj_max = 0.05,
    auc_min = 0.55
)

## Cluster at a few resolutions to remove non TNKILC cells

In [None]:
set.seed(1)
#resolutions_test = seq(0.1, 3.5, by = 0.1)
#resolutions_test
# Cluster on the post-harmony UMAP fuzzy graph at resolution 3; the labels
# are read back from the `humap_fgraph_res.3` metadata column in the next step.
scRNA_TNKILC <- scRNA_TNKILC %>%
    Seurat::FindClusters(
        graph.name = 'humap_fgraph',
        resolution = 3,
        verbose = TRUE
    )
scRNA_TNKILC

In [None]:
scRNA_TNKILC@meta.data %>% colnames()

### stash the results that you used to filter out non TNKILC cells

In [None]:
# Keep a copy of the resolution-3 labels (unused factor levels dropped) under
# a stable name, so later FindClusters runs cannot overwrite them.
scRNA_TNKILC@meta.data[['Louvain_for_filtering']] <-
    scRNA_TNKILC@meta.data$humap_fgraph_res.3 %>%
    droplevels()
scRNA_TNKILC@meta.data$Louvain_for_filtering
scRNA_TNKILC

## DE with wilcox to figure out which of the ~30 clusters are non TNKILC cells

In [None]:
# Differential expression between the over-clustered Louvain groups
Idents(scRNA_TNKILC) <- 'Louvain_for_filtering'
# Functions are namespace-qualified instead of re-running `require(presto)`.
# The adjusted-p cutoff is passed under its full argument name, `padj_max`;
# the previous `padj =` only worked through partial argument matching.
presto::top_markers(
    presto::wilcoxauc(scRNA_TNKILC),
    n = Inf,
    padj_max = 0.05,
    auc_min = 0.6
)

In [None]:
# UMAP coordinates joined to the filtering clusters and sample ids. The join
# key is stated explicitly rather than inferred from shared column names.
humap_embeddings <- Embeddings(scRNA_TNKILC, 'humap') %>%
    as.data.frame() %>%
    mutate(combined_cell_names = rownames(.)) %>%
    left_join(
        scRNA_TNKILC@meta.data %>%
            select(combined_cell_names, Louvain_for_filtering, orig.ident),
        by = 'combined_cell_names'
    )
# Preview the first and last rows
rbind(humap_embeddings %>% head(), humap_embeddings %>% tail())

In [None]:
options(repr.plot.width = 20, repr.plot.height = 20)
# One 'red' entry per filtering cluster, so each facet draws its own cells in red
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$Louvain_for_filtering))),
    unique(humap_embeddings$Louvain_for_filtering)
)
# Post-harmony UMAP, one facet per over-clustered Louvain cluster
ggplot(humap_embeddings, aes(x = HUMAP_1, y = HUMAP_2, color = Louvain_for_filtering)) +
    geom_point(shape = '.', alpha = 1) +
    labs(title = 'For filtering non TNKILC cells:\nOver-clustering of TNKILC cells UMAP') +
    ggpubr::theme_pubr(base_size=20) +
    theme(legend.position = 'none') +
    scale_color_manual(values = constantPalette) +
    facet_wrap(~Louvain_for_filtering) +
    gghighlight::gghighlight()

## cache

In [None]:
readr::write_rds(scRNA_TNKILC, 'TNKILC_fineTyping_all_genes.rds')

## Redo scale/normalize/PCA/harmony/UMAP after removing the non-TNKILC cells

In [None]:
# NOTE(review): no cell in the notebook removes any cells between the
# over-clustering step and this point - confirm whether a subset step is
# missing or no clusters needed removing.
# Scale factor for log-normalisation: the MEAN of the per-orig.ident median
# counts (a mean of medians, not a median of medians).
normFactor <- scRNA_TNKILC@meta.data %>%
    group_by(orig.ident) %>%
    summarize(medianCounts = median(nCount_RNA)) %>%
    pull(medianCounts) %>%
    mean()
normFactor
# Log-normalise, scale every gene, then run a 15-PC PCA weighted by orig.ident
scRNA_TNKILC <- scRNA_TNKILC %>%
    NormalizeData(normalization.method = "LogNormalize",
                  scale.factor = normFactor) %>%
    ScaleData(features = rownames(scRNA_TNKILC)) %>%
    singlecellmethods::RunBalancedPCA(weight.by='orig.ident', npcs=15)
scRNA_TNKILC

In [None]:
options(repr.plot.width = 5, repr.plot.height = 5)
# Re-run Harmony over orig.ident on the recomputed PCA.
# `FALSE` is spelled out: `F` is an ordinary variable that can be reassigned.
# NOTE(review): confirm `max.iter` is an argument name the installed harmony
# version recognises.
scRNA_TNKILC <- harmony::RunHarmony(
    scRNA_TNKILC,
    "orig.ident",
    plot_convergence = TRUE,
    lambda = NULL,
    max.iter = 10,
    early_stop = FALSE
)

In [None]:
# UMAP of the first 15 harmony dimensions. The results are written back under
# the same names ('humap', 'humap_fgraph') as the earlier post-harmony UMAP
# and therefore replace it.
U <- uwot::umap(
    scRNA_TNKILC@reductions$harmony@cell.embeddings[, 1:15],
    min_dist = 0.05,
    spread = 0.30,
    ret_extra = 'fgraph',
    # Namespace-qualified because the setup cell never attaches `future`.
    # NOTE(review): uwot documents that fast_sgd = TRUE sets n_sgd_threads
    # itself - confirm whether this argument has any effect.
    n_sgd_threads = future::nbrOfWorkers(),
    fast_sgd = TRUE
)
# Name the columns <key><dimension> so they already match the reduction key
colnames(U$embedding) <- c('HUMAP_1', 'HUMAP_2')
rownames(U$fgraph) <- colnames(U$fgraph) <- Cells(scRNA_TNKILC)
scRNA_TNKILC[['humap']] <- Seurat::CreateDimReducObject(
    embeddings = U$embedding,
    assay = 'RNA',
    key = 'HUMAP_',
    global = TRUE
)
# Store the fuzzy graph as a Seurat Graph tied to the default assay
new_graph <- Seurat::as.Graph(U$fgraph)
DefaultAssay(new_graph) <- DefaultAssay(scRNA_TNKILC)
scRNA_TNKILC[['humap_fgraph']] <- new_graph

In [None]:
# UMAP coordinates joined to the fine types and sample ids. The join key is
# stated explicitly rather than inferred from shared column names.
humap_embeddings <- Embeddings(scRNA_TNKILC, 'humap') %>%
    as.data.frame() %>%
    mutate(combined_cell_names = rownames(.)) %>%
    left_join(
        scRNA_TNKILC@meta.data %>%
            select(combined_cell_names, ClusterFull, orig.ident),
        by = 'combined_cell_names'
    )
# Preview the first and last rows
rbind(humap_embeddings %>% head(), humap_embeddings %>% tail())

In [None]:
options(repr.plot.width = 20, repr.plot.height = 20)
# One 'red' entry per orig.ident value, so each facet draws its own cells in red
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$orig.ident))),
    unique(humap_embeddings$orig.ident)
)
# UMAP, one facet per orig.ident value
ggplot(humap_embeddings, aes(x = HUMAP_1, y = HUMAP_2, color = orig.ident)) +
    geom_point(shape = '.', alpha = 0.5) +
    labs(title = 'Post-harmony, post-filter (step 1) UMAP - TNKILC cells') +
    ggpubr::theme_pubr(base_size=10) +
    theme(legend.position = 'none') +
    scale_color_manual(values = constantPalette) +
    facet_wrap(~orig.ident) +
    gghighlight::gghighlight()

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)
constantPalette = rep('red', length(unique(humap_embeddings$ClusterFull)))
names(constantPalette) = unique(humap_embeddings$ClusterFull)
ggplot(humap_embeddings, aes(x = HUMAP_1, y = HUMAP_2, color = ClusterFull)) + 
geom_point(shape = '.', alpha = 0.5) + 
ggtitle('Post-harmony, post-filter (step 1) UMAP - TNKILC cells') +
ggpubr::theme_pubr(base_size=10) +
theme(legend.position = 'none') +
scale_color_manual(values = constantPalette) +
facet_wrap(~ClusterFull) +
gghighlight::gghighlight()

## one more round of clustering to get clusters that will map back to fine types

In [None]:
set.seed(1)
# Cluster on the UMAP fuzzy graph at resolution 0.5; the labels are read back
# from the `humap_fgraph_res.0.5` metadata column in the following cells.
scRNA_TNKILC <- scRNA_TNKILC %>%
    Seurat::FindClusters(
        graph.name = 'humap_fgraph',
        resolution = 0.5,
        verbose = TRUE
    )
scRNA_TNKILC

## stash clusters used for mapping fine cell types to clean louvain clusters

In [None]:
levels(scRNA_TNKILC@meta.data$humap_fgraph_res.0.5)

In [None]:
# Keep a copy of the resolution-0.5 labels (unused factor levels dropped)
# under a stable name.
scRNA_TNKILC@meta.data[['Louvain_for_fine_types']] <-
    scRNA_TNKILC@meta.data$humap_fgraph_res.0.5 %>%
    droplevels()
scRNA_TNKILC@meta.data$Louvain_for_fine_types

## concordance of new clusters with pelka fine types

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)
# Cross-tabulate Louvain clusters against ClusterFull, transpose so that rows
# are fine types and columns are Louvain clusters, convert each row to
# proportions (prop.table margin 1), then centre and scale every column.
oldClustersInNew3 = table(scRNA_TNKILC@meta.data$Louvain_for_fine_types, 
                          scRNA_TNKILC@meta.data$ClusterFull) %>% 
    t() %>% 
    prop.table(1) %>% 
    scale()
# Column (Louvain cluster) order from Ward clustering on Euclidean distances
colOrder = hclust(dist(t(oldClustersInNew3), 
                       method = "euclidean"), 
                  method = "ward.D" )$order
# Row (fine type) order from the same kind of clustering
rowOrder = hclust(dist(oldClustersInNew3, 
                       method = "euclidean"), 
                  method = "ward.D" )$order
# Reorder, reshape to long form and draw a labelled tile heatmap.
# The rename() call expects the long-form columns Var1 / Var2 / Freq.
oldClustersInNew3[rowOrder, colOrder] %>% 
as.data.frame() %>% 
rename('Fine Type' = Var1, 
       'Louvain Cluster' = Var2, 
       'Scaled\nProportion' = Freq) %>%
ggplot(aes(y = `Fine Type`, 
           x = `Louvain Cluster`)) +
geom_tile(aes(fill = `Scaled\nProportion`), 
          alpha = 1, 
          color = 'black') +
geom_label(aes(label = round(.data$`Scaled\nProportion`, 1)), 
           color = 'red') +
scale_fill_viridis_c(direction = -1) +
theme_minimal(base_size = 16) +
theme(axis.text.x = element_text(angle = 90, 
                                 vjust = 0.5, 
                                 hjust=1)) +   
ggtitle('Louvain to map to fine types - step 1\nIdentify mixed clusters')

## Select clusters for sub-clustering

In [None]:
# Start from the resolution-0.5 Louvain labels as the active identity
Idents(scRNA_TNKILC) <- 'Louvain_for_fine_types'
# Sub-cluster the listed parent clusters one at a time (cluster 5 is not in
# the list). Each pass writes its labels to `subcluster_<i>` and makes that
# column the active identity for the next pass.
for (i in c(0,1,2,3,4,6,7,8,9,10)) {
    scRNA_TNKILC <- FindSubCluster(
        scRNA_TNKILC,
        cluster = i,
        graph.name = 'humap_fgraph',
        subcluster.name = paste0("subcluster_", i),
        resolution = 0.5,
        algorithm = 1
    )
    scRNA_TNKILC <- SetIdent(scRNA_TNKILC, value = paste0("subcluster_", i))
}
scRNA_TNKILC

## stash subclustered Louvain clusters

In [None]:
# `subcluster_10` is the column written by the last pass of the sub-clustering
# loop; keep it as a factor (unused levels dropped) under a stable name.
scRNA_TNKILC@meta.data[['subclustered_Louvain_for_fine_types']] <-
    scRNA_TNKILC@meta.data$subcluster_10 %>%
    as.factor() %>%
    droplevels()
levels(scRNA_TNKILC@meta.data$subclustered_Louvain_for_fine_types)
# Make the stashed labels the active identity
scRNA_TNKILC <- SetIdent(scRNA_TNKILC, value = 'subclustered_Louvain_for_fine_types')

## plot concordance

In [None]:
options(repr.plot.width = 40, repr.plot.height = 15)
# Cross-tabulate the sub-clustered Louvain labels against ClusterFull,
# transpose so that rows are fine types, convert each row to proportions
# (prop.table margin 1), then centre and scale every column.
oldClustersInNew3 = table(scRNA_TNKILC@meta.data$subclustered_Louvain_for_fine_types, 
                          scRNA_TNKILC@meta.data$ClusterFull) %>% 
    t() %>% 
    prop.table(1) %>% 
    scale()
# Column reordering is disabled here; only the rows (fine types) are reordered.
#colOrder = hclust(dist(t(oldClustersInNew3), 
#                       method = "euclidean"), 
#                  method = "ward.D" )$order
rowOrder = hclust(dist(oldClustersInNew3, 
                       method = "euclidean"), 
                  method = "ward.D" )$order
# Reshape to long form and draw a labelled tile heatmap.
# The rename() call expects the long-form columns Var1 / Var2 / Freq.
oldClustersInNew3[rowOrder, ] %>% #[rowOrder, colOrder] %>% 
as.data.frame() %>% 
rename('Fine Type' = Var1, 
       'Louvain Cluster' = Var2, 
       'Scaled\nProportion' = Freq) %>%
ggplot(aes(y = `Fine Type`, 
           x = `Louvain Cluster`)) +
geom_tile(aes(fill = `Scaled\nProportion`), 
          alpha = 1, 
          color = 'black') +
geom_label(aes(label = round(.data$`Scaled\nProportion`, 1)), 
           color = 'red') +
scale_fill_viridis_c(direction = -1) +
theme_minimal(base_size = 16) +
theme(axis.text.x = element_text(angle = 90, 
                                 vjust = 0.5, 
                                 hjust=1)) +
ggtitle('Louvain to map to fine types - step 2\nMap subclustered Louvain clusters back to Pelka fine types')

### plot umap of louvain subclusters

In [None]:
options(repr.plot.width = 20, repr.plot.height = 20)
# UMAP coordinates joined to the sub-clustered Louvain labels. The join key
# is stated explicitly rather than inferred from shared column names.
humap_embeddings <- Embeddings(scRNA_TNKILC, 'humap') %>%
    as.data.frame() %>%
    mutate(combined_cell_names = rownames(.)) %>%
    left_join(
        scRNA_TNKILC@meta.data %>%
            select(combined_cell_names, subclustered_Louvain_for_fine_types),
        by = 'combined_cell_names'
    )
# Preview the first and last rows
rbind(humap_embeddings %>% head(), humap_embeddings %>% tail())
# One 'red' entry per sub-cluster, so each facet draws its own cells in red
constantPalette <- setNames(
    rep('red', length(unique(humap_embeddings$subclustered_Louvain_for_fine_types))),
    unique(humap_embeddings$subclustered_Louvain_for_fine_types)
)
# UMAP, one facet per sub-clustered Louvain label
ggplot(humap_embeddings, aes(x = HUMAP_1, y = HUMAP_2, color = subclustered_Louvain_for_fine_types)) +
    geom_point(shape = '.', alpha = 0.5) +
    ggtitle('Louvain to map to fine types - step 2\nMap subclustered Louvain clusters back to Pelka fine types') +
    ggpubr::theme_pubr(base_size=20) +
    theme(legend.position = 'none') +
    scale_color_manual(values = constantPalette) +
    facet_wrap(~subclustered_Louvain_for_fine_types) +
    gghighlight::gghighlight()

### remove ambiguous sub clusters

- 4_2
- 4_1 (also removed, by the second `subset()` cell below)

NOTE: I tried to subcluster these and wasn't successful, so I'm going to just remove these cells

In [None]:
scRNA_TNKILC@meta.data$subclustered_Louvain_for_fine_types

In [None]:
# Drop every cell labelled 4_2: `invert = TRUE` keeps the cells that do NOT
# satisfy the subset expression.
scRNA_TNKILC <- subset(
    scRNA_TNKILC,
    subset = subclustered_Louvain_for_fine_types == '4_2',
    invert = TRUE
)
# Remove the now-empty factor level
scRNA_TNKILC@meta.data[['subclustered_Louvain_for_fine_types']] <-
    scRNA_TNKILC@meta.data$subclustered_Louvain_for_fine_types %>%
    droplevels()
scRNA_TNKILC

In [None]:
# Drop every cell labelled 4_1: `invert = TRUE` keeps the cells that do NOT
# satisfy the subset expression.
scRNA_TNKILC <- subset(
    scRNA_TNKILC,
    subset = subclustered_Louvain_for_fine_types == '4_1',
    invert = TRUE
)
# Remove the now-empty factor level
scRNA_TNKILC@meta.data[['subclustered_Louvain_for_fine_types']] <-
    scRNA_TNKILC@meta.data$subclustered_Louvain_for_fine_types %>%
    droplevels()
scRNA_TNKILC

## Merge and remove ambiguous fine types

In [None]:
# Make ClusterFull a factor, then start merged_fine_types as a copy of it
scRNA_TNKILC@meta.data[['ClusterFull']] <- as.factor(scRNA_TNKILC@meta.data$ClusterFull)
scRNA_TNKILC@meta.data[['merged_fine_types']] <- scRNA_TNKILC@meta.data$ClusterFull

### remove

In [None]:
table(scRNA_TNKILC@meta.data$ClusterFull) %>% as.matrix() 

In [None]:
# Drop the two IL17+ fine types: `invert = TRUE` keeps the cells that do NOT
# satisfy the subset expression.
scRNA_TNKILC <- subset(
    scRNA_TNKILC,
    subset = merged_fine_types %in% c("cTNI13 (CD8+ T IL17+)", "cTNI05 (CD4+ IL17+)"),
    invert = TRUE
)
# Remove the now-empty factor levels
scRNA_TNKILC@meta.data[['merged_fine_types']] <-
    scRNA_TNKILC@meta.data$merged_fine_types %>%
    as.factor() %>%
    droplevels()
scRNA_TNKILC

#### merge

In [None]:
scRNA_TNKILC@meta.data$merged_fine_types %>% levels()

In [None]:
# Recode merged_fine_types. Each entry reads 'new level' = 'old level'.
# Entries sharing a new level merge those fine types; entries whose two sides
# are identical leave that level as it is.
scRNA_TNKILC@meta.data[['merged_fine_types']] <- fct_recode(
    scRNA_TNKILC@meta.data$merged_fine_types,
    !!! c(
        # cTNI01-04 -> one CD4+ IL7R+ group
        'cTNI01 & 02 & 03 & 04 (CD4+ IL7R+)' = 'cTNI01 (CD4+ IL7R+)',
        'cTNI01 & 02 & 03 & 04 (CD4+ IL7R+)' = 'cTNI02 (CD4+ IL7R+SELL+)',
        'cTNI01 & 02 & 03 & 04 (CD4+ IL7R+)' = 'cTNI03 (CD4+ IL7R+HSP+)',
        'cTNI01 & 02 & 03 & 04 (CD4+ IL7R+)' = 'cTNI04 (CD4+ IL7R+CCL5+)',
        'cTNI06 (CD4+ TFH)' = 'cTNI06 (CD4+ TFH)',
        'cTNI07 (CD4+ CXCL13+)' = 'cTNI07 (CD4+ CXCL13+)',
        'cTNI08 (CD4+ Treg)' = 'cTNI08 (CD4+ Treg)',
        'cTNI09 (CD4+ Treg prolif)' = 'cTNI09 (CD4+ Treg prolif)',
        # cTNI10 and cTNI12 -> one CD8+ IL7R+ group
        'cTNI10 & 12 (CD8+ IL7R+)' = 'cTNI10 (CD8+ IL7R+)',
        'cTNI11 (CD8+GZMK+)' = 'cTNI11 (CD8+GZMK+)',
        'cTNI10 & 12 (CD8+ IL7R+)' = 'cTNI12 (CD8+ IL7R+)',
        # cTNI14 and cTNI15 -> one CD8+ CXCL13+ group
        'cTNI14 (CD8+ CXCL13+) & cTNI15 (CD8+ CXCL13+ HSP+)' = 'cTNI14 (CD8+ CXCL13+)',
        'cTNI14 (CD8+ CXCL13+) & cTNI15 (CD8+ CXCL13+ HSP+)' = 'cTNI15 (CD8+ CXCL13+ HSP+)',
        'cTNI16 (CD8+ CXCL13+ prolif)' = 'cTNI16 (CD8+ CXCL13+ prolif)',
        'cTNI17 (gd-like T)' = 'cTNI17 (gd-like T)',
        # cTNI18 and cTNI19 -> one gd-like T group
        'cTNI18 (gd-like T PDCD1+) & cTNI19 (gd-like T prolif)' = 'cTNI18 (gd-like T PDCD1+)',
        'cTNI18 (gd-like T PDCD1+) & cTNI19 (gd-like T prolif)' = 'cTNI19 (gd-like T prolif)',
        'cTNI20 (PLZF+ T)' = 'cTNI20 (PLZF+ T)',
        'cTNI21 (PLZF+ T prolif)' = 'cTNI21 (PLZF+ T prolif)',
        'cTNI22 (cTNI22)' = 'cTNI22 (cTNI22)',
        'cTNI23 (NK CD16A+)' = 'cTNI23 (NK CD16A+)',
        'cTNI24 (NK GZMK+)' = 'cTNI24 (NK GZMK+)',
        'cTNI25 (NK XCL1+)' = 'cTNI25 (NK XCL1+)',
        'cTNI26 (ILC3)' = 'cTNI26 (ILC3)'
    )
)

In [None]:
options(repr.plot.width = 40, repr.plot.height = 15)
# Cross-tabulate the sub-clustered Louvain labels against merged_fine_types,
# transpose so that rows are merged fine types, convert each row to
# proportions (prop.table margin 1), then centre and scale every column.
oldClustersInNew3 = table(scRNA_TNKILC@meta.data$subclustered_Louvain_for_fine_types, 
                          scRNA_TNKILC@meta.data$merged_fine_types) %>% 
    t() %>% 
    prop.table(1) %>% 
    scale()
# Column reordering is disabled here; only the rows are reordered.
#colOrder = hclust(dist(t(oldClustersInNew3), 
#                       method = "euclidean"), 
#                  method = "ward.D" )$order
rowOrder = hclust(dist(oldClustersInNew3, 
                       method = "euclidean"), 
                  method = "ward.D" )$order
# Reshape to long form and draw a labelled tile heatmap.
# NOTE(review): the title is the same 'step 2' text used for the ClusterFull
# heatmap, although this plot uses merged_fine_types - confirm it is intended.
oldClustersInNew3[rowOrder, ] %>% #[rowOrder, colOrder] %>% 
as.data.frame() %>% 
rename('Fine Type' = Var1, 
       'Louvain Cluster' = Var2, 
       'Scaled\nProportion' = Freq) %>%
ggplot(aes(y = `Fine Type`, 
           x = `Louvain Cluster`)) +
geom_tile(aes(fill = `Scaled\nProportion`), 
          alpha = 1, 
          color = 'black') +
geom_label(aes(label = round(.data$`Scaled\nProportion`, 1)), 
           color = 'red') +
scale_fill_viridis_c(direction = -1) +
theme_minimal(base_size = 16) +
theme(axis.text.x = element_text(angle = 90, 
                                 vjust = 0.5, 
                                 hjust=1)) +
ggtitle('Louvain to map to fine types - step 2\nMap subclustered Louvain clusters back to Pelka fine types')

## Relabel clusters

In [None]:
# Number of cells in each subclustered Louvain cluster, shown as a one-column matrix.
as.matrix(table(scRNA_TNKILC@meta.data$subclustered_Louvain_for_fine_types))

### Here are the new labels:

In [None]:
# Fine type (rows) x Louvain cluster (columns): row-wise proportions of the
# transposed contingency table, then scale() applied per column.
oldClustersInNew3 <- scale(
    prop.table(
        t(table(scRNA_TNKILC@meta.data$subclustered_Louvain_for_fine_types,
                scRNA_TNKILC@meta.data$merged_fine_types)),
        1))
oldClustersInNew3

In [None]:
# For every Louvain cluster (column) pick the fine type (row) with the largest
# scaled proportion. The result is a character vector whose values are Louvain
# cluster names and whose names are the chosen fine types, i.e. the
# `new = old` layout that fct_recode() expects.
mergedClusters <- local({
    topRow <- apply(oldClustersInNew3, 2, which.max)
    setNames(names(topRow), rownames(oldClustersInNew3)[topRow])
})
mergedClusters

In [None]:
# Relabel every Louvain cluster with its best-matching fine type, drop levels
# that are no longer used, and make the new labels the active identity.
scRNA_TNKILC@meta.data$cleaned_fine_types <-
    scRNA_TNKILC@meta.data$subclustered_Louvain_for_fine_types %>%
    fct_recode(!!! mergedClusters) %>%
    droplevels()
scRNA_TNKILC <- SetIdent(scRNA_TNKILC, value = 'cleaned_fine_types')

In [None]:
# Side-by-side cell counts per label: cleaned fine types vs. merged Pelka labels.
# NOTE(review): the two count columns are bound by position, so this assumes both
# factors have the same levels in the same order - confirm.
as.data.frame(
    cbind(as.matrix(table(scRNA_TNKILC@meta.data$cleaned_fine_types)),
          as.matrix(table(scRNA_TNKILC@meta.data$merged_fine_types)))) %>%
    rename("Cleaned Fine Types" = V1, "Pelka Labels" = V2)

In [None]:
# UpSet plot comparing cell membership between the merged Pelka labels and the
# cleaned fine types. Every set holds the names of the cells that carry one
# label in one of the two annotations.
options(repr.plot.width=40, repr.plot.height=15)

# Labels to compare, in the order the sets are listed: all "Pelka ..." sets
# first, followed by the matching "Cleaned ..." sets.
fineTypesToCompare = c(
    'cTNI10 & 12 (CD8+ IL7R+)',
    'cTNI20 (PLZF+ T)',
    'cTNI17 (gd-like T)',
    'cTNI21 (PLZF+ T prolif)',
    'cTNI18 (gd-like T PDCD1+) & cTNI19 (gd-like T prolif)',
    'cTNI14 (CD8+ CXCL13+) & cTNI15 (CD8+ CXCL13+ HSP+)',
    'cTNI26 (ILC3)',
    'cTNI01 & 02 & 03 & 04 (CD4+ IL7R+)',
    'cTNI07 (CD4+ CXCL13+)',
    'cTNI08 (CD4+ Treg)',
    'cTNI06 (CD4+ TFH)',
    'cTNI24 (NK GZMK+)',
    'cTNI23 (NK CD16A+)',
    'cTNI22 (cTNI22)',
    'cTNI25 (NK XCL1+)',
    'cTNI16 (CD8+ CXCL13+ prolif)',
    'cTNI09 (CD4+ Treg prolif)'
)

# Names of the cells (metadata row names) whose `column` value equals `label`.
# which() discards NA comparisons; indexing the data frame with a logical mask
# that contains NA would instead add phantom "NA" rows to the set.
cellsWithLabel = function(column, label) {
    rownames(scRNA_TNKILC@meta.data)[which(scRNA_TNKILC@meta.data[[column]] == label)]
}

listInput = c(
    setNames(lapply(fineTypesToCompare,
                    function(label) cellsWithLabel('merged_fine_types', label)),
             paste('Pelka', fineTypesToCompare)),
    setNames(lapply(fineTypesToCompare,
                    function(label) cellsWithLabel('cleaned_fine_types', label)),
             paste('Cleaned', fineTypesToCompare))
)
upset(fromList(listInput), 
      nsets = length(listInput), 
      keep.order = TRUE, 
      text.scale = c(2, 2, 2, 1, 2, 4),
      number.angles = 60,
      cutoff = 0)

## Justify new labels
- Once we have labels, we want to justify them to collaborators
- DGE analysis in original labels (post-QC cells) 
- DGE analysis in new labels 
- Correlate logFC in matched clusters

### DGE analysis in original labels (post-QC cells) 

In [None]:
# Display the row names (features) of the TNKILC object.
rownames(scRNA_TNKILC)

In [None]:
# Per-label marker statistics from presto::wilcoxauc using the merged Pelka
# labels, then list the markers passing the padj and AUC thresholds.
clusterFull_markers <- presto::wilcoxauc(scRNA_TNKILC, 'merged_fine_types')
top_markers(clusterFull_markers, n = Inf, padj = 0.05, auc_min = 0.6)

### DGE analysis in new labels 

In [None]:
# Same marker analysis as for the Pelka labels, but grouped by the cleaned fine types.
cleaned_fine_types_markers <- presto::wilcoxauc(scRNA_TNKILC, 'cleaned_fine_types')
top_markers(cleaned_fine_types_markers, n = Inf, padj = 0.05, auc_min = 0.6)

### Correlate logFC in matched clusters

In [None]:
# Keep only Pelka groups that also exist among the cleaned fine types, and tag
# each marker table with the labelling it came from.
clusterFull_markers <- clusterFull_markers %>%
    filter(group %in% cleaned_fine_types_markers$group)
clusterFull_markers$Comparison <- 'ClusterFull'
cleaned_fine_types_markers$Comparison <- 'cleaned_fine_types'

In [None]:
# Pair the logFC of every (feature, group) under the Pelka labels with the logFC
# under the cleaned fine types.
comparison_markers = dplyr::left_join(x = clusterFull_markers %>% select(feature, group, logFC), 
                                      y = cleaned_fine_types_markers %>% select(feature, group, logFC),
                                    by = join_by(feature, group))
colnames(comparison_markers) = c('feature', 'group', 'clusterFull', 'cleaned_fine_types')
# Global linear fit of the cleaned logFC (response) on the Pelka logFC (predictor).
getColors = coef(lm(cleaned_fine_types ~ clusterFull, 
                    data = comparison_markers))
getIntercept = as.numeric(getColors)[1]
getSlope = as.numeric(getColors)[2]
comparison_markers = comparison_markers %>% 
    # `ye` is the fitted value on the regression line, so it must be evaluated at
    # the predictor (clusterFull), not at the response; `color` then flags
    # points lying below the line.
    mutate(ye = getIntercept + (getSlope*clusterFull), color = cleaned_fine_types < ye) %>% 
    # Candidate labels: features above the first quartile of logFC in either labelling.
    mutate(labelTRUE = (cleaned_fine_types > quantile(cleaned_fine_types, 0.25)) | (clusterFull > quantile(clusterFull, 0.25))) %>% 
    mutate(label = if_else(labelTRUE, feature, NA)) 
rbind(head(comparison_markers), tail(comparison_markers))

In [None]:
# Per-group scatter of logFC under the Pelka labels (x) against logFC under the
# cleaned fine types (y), with axes through zero and a per-facet linear fit.
# NOTE(review): labels are drawn from `feature` for every row; the `label`
# column in comparison_markers is not used here - confirm this is intended.
options(repr.plot.width = 40, repr.plot.height = 40)
ggplot(comparison_markers, aes(x = clusterFull, y = cleaned_fine_types)) +
    geom_point(shape = '.', alpha = 0.5) +
    facet_wrap(~group, nrow = 10) +
    theme_pubr(base_size = 16) +
    xlab('Pelka cell types') +
    ylab('Cleaned fine types') +
    geom_hline(aes(yintercept = 0)) +
    geom_vline(aes(xintercept = 0)) +
    ggrepel::geom_label_repel(aes(label = feature, color = color)) +
    geom_smooth(method = "lm", se = FALSE, linetype = 1,
                alpha = 0.15, linewidth = 0.1)

### Pearson correlations

In [None]:
# Correlation (cor() default) between the two sets of logFC values, one row per group.
cellTypeCor <- comparison_markers %>%
    group_by(group) %>%
    dplyr::summarize(correlation = cor(clusterFull, cleaned_fine_types))
cellTypeCor

## Visualize final clusters in UMAP space, and also the original labels (after merge)

In [None]:
# Collect the 'humap' embedding coordinates per cell and attach the cleaned and
# original (ClusterFull) labels; the join uses the columns the two tables share.
options(repr.plot.width = 20, repr.plot.height = 20)
humap_embeddings <- as.data.frame(Embeddings(scRNA_TNKILC, 'humap'))
humap_embeddings$combined_cell_names <- rownames(humap_embeddings)
humap_embeddings <- left_join(
    humap_embeddings,
    scRNA_TNKILC@meta.data %>%
        select(combined_cell_names, cleaned_fine_types, ClusterFull))
rbind(head(humap_embeddings), tail(humap_embeddings))

#### Cleaned clusters

In [None]:
# One facet per cleaned fine type; gghighlight emphasises the facet's own cells,
# which are all drawn in red.
options(repr.plot.width = 21, repr.plot.height = 21)
constantPalette <- rep('red', length(unique(humap_embeddings$cleaned_fine_types)))
names(constantPalette) <- unique(humap_embeddings$cleaned_fine_types)
ggplot(humap_embeddings,
       aes(x = HUMAP_1, y = HUMAP_2, color = cleaned_fine_types)) +
    geom_point(shape = '.', alpha = 0.5) +
    ggpubr::theme_pubr(base_size = 20) +
    theme(legend.position = 'none') +
    scale_color_manual(values = constantPalette) +
    facet_wrap(~cleaned_fine_types) +
    gghighlight::gghighlight() +
    ggtitle('Cleaned fine types - TNKILC cells')

#### Pelka clusters

In [None]:
# Same faceted embedding plot, grouped by the original ClusterFull labels.
options(repr.plot.width = 21, repr.plot.height = 21)
constantPalette <- rep('red', length(unique(humap_embeddings$ClusterFull)))
names(constantPalette) <- unique(humap_embeddings$ClusterFull)
ggplot(humap_embeddings,
       aes(x = HUMAP_1, y = HUMAP_2, color = ClusterFull)) +
    geom_point(shape = '.', alpha = 0.5) +
    ggpubr::theme_pubr(base_size = 20) +
    theme(legend.position = 'none') +
    scale_color_manual(values = constantPalette) +
    facet_wrap(~ClusterFull) +
    gghighlight::gghighlight() +
    ggtitle('Pelka fine types - TNKILC cells')

## Cache

### TNKILC cells with all genes

In [None]:
# Save the relabelled TNKILC object to an RDS file in the working directory.
readr::write_rds(scRNA_TNKILC, 'TNKILC_fineTyping_all_genes.rds')

## GLMM to find DE genes between clusters

In [None]:
# Packages for the GLMM differential-expression section; several of them are
# already loaded at the top of the notebook, so this cell lets the section run
# from a fresh session.
require(tidyverse)
require(Seurat)
require(data.table)
require(lme4)
require(presto)
require(singlecellmethods)
require(future)
require(furrr)
require(gghighlight)
# Raise the `future` limit on exported globals to 1000 * 1024^2 bytes.
options(future.globals.maxSize = 1000 * 1024 ^2)
# Fixed RNG seed.
set.seed(1)

In [None]:
# Reload the cached TNKILC object written earlier in the notebook.
scRNA_TNKILC = readr::read_rds('TNKILC_fineTyping_all_genes.rds')

In [None]:
# Pseudobulk step: collapse the raw counts by biosample and cleaned fine type
# with presto::collapse_counts, keeping groups with at least
# `min_cells_per_group` cells, then inspect the pieces of the result.
require(presto)
require(singlecellmethods)
pb <- presto::collapse_counts(
    GetAssayData(scRNA_TNKILC, 'counts'),
    scRNA_TNKILC@meta.data,
    c("biosample_id", "cleaned_fine_types"),
    min_cells_per_group = 3
)
head(pb$meta_data)
# NOTE(review): `pb$counts` relies on `$` partial matching (presumably resolving
# to `counts_mat`) - confirm against the structure returned by collapse_counts().
dim(pb$counts)

colnames(scRNA_TNKILC@meta.data)
dim(pb$meta_data)
head(pb$meta_data)
dim(pb$counts_mat)

In [None]:
# Align the normalized pseudobulk matrix with the rows and columns of the count
# matrix, then preview it.
# NOTE(review): `pb$exprs_norm` is not assigned anywhere in the visible cells; unless
# collapse_counts() returns it, it is NULL here and this cell does nothing - confirm.
pb$exprs_norm = pb$exprs_norm[rownames(pb$counts_mat), colnames(pb$counts_mat)]
dim(pb$exprs_norm)
pb$exprs_norm[1:5, 1:5]

In [None]:
# Fit the presto GLMM on the pseudobulk counts and report the elapsed time.
# The formula has random intercepts for fine type, fine type within biosample,
# and biosample, with logUMI as an offset; the family is Poisson.
# Warnings raised during the fit are suppressed for the whole call.
# NOTE(review): `logUMI` must be a column of pb$meta_data; it is not created in the
# visible cells - presumably collapse_counts() adds it, confirm.
system.time({
    suppressWarnings({
        presto_res = presto::presto.presto(
            y ~ 1 + (1|cleaned_fine_types) + (1|cleaned_fine_types:biosample_id) + (1|biosample_id) + offset(logUMI), 
            pb$meta_data, 
            pb$counts_mat,
            size_varname = "logUMI", 
            effects_cov = "cleaned_fine_types",
            ncore = 1, 
            min_sigma = .05,
            family = "poisson",
            nsim = 1000
        )    
    })
})
# Cache the fitted model so later cells can reload it without refitting.
readr::write_rds(presto_res, 'TNKILC_fineTyping_GLMM.rds')

In [None]:
# Reload the cached GLMM fit.
presto_res = readr::read_rds('TNKILC_fineTyping_GLMM.rds')

### Make contrasts

In [None]:
# Build the contrast matrix over the cleaned fine types from the fitted model.
contrasts_mat <- make_contrast.presto(presto_res, var_contrast = 'cleaned_fine_types')
contrasts_mat

### Find marginal effects

In [None]:
# Marginal effect of each contrast (one-tailed), with the statistics converted
# to log2 units for interpretability, sorted by p-value.
effects_marginal <- contrasts.presto(presto_res, contrasts_mat, one_tailed = TRUE) %>%
    dplyr::mutate(
        cluster = contrast,
        logFC = sign(beta) * log2(exp(abs(beta))),
        SD = log2(exp(sigma)),
        zscore = logFC / SD
    ) %>%
    arrange(pvalue)

# Benjamini-Hochberg FDR across all rows.
effects_marginal$fdr <- p.adjust(effects_marginal$pvalue, method = 'BH')
# Replace exact zeros with the smallest non-zero FDR so that -log10 stays finite.
effects_marginal$corr_fdr <- effects_marginal$fdr
effects_marginal$corr_fdr[effects_marginal$fdr == 0] <- min(effects_marginal$fdr[effects_marginal$fdr != 0])
effects_marginal$`-log10_fdr` <- (-1) * log10(effects_marginal$corr_fdr)
dim(effects_marginal)
head(effects_marginal)


In [None]:
# Compare the contrast names with the original ClusterFull labels.
effects_marginal$cluster %>% unique()
scRNA_TNKILC@meta.data$ClusterFull %>% unique()

In [None]:
# Per-gene summary table: mean of the 'data' slot across all cells, plus one
# column per cluster holding the proportion of that cluster's cells with a zero
# count for the gene.
meanExp = rowMeans(GetAssayData(scRNA_TNKILC, 'data')) 
meanExp = data.frame(feature = names(meanExp), meanExp = meanExp)
allCounts = GetAssayData(scRNA_TNKILC, 'counts')
for (cluster in unique(effects_marginal$cluster)) {
    print(cluster)
    # NOTE(review): cells are addressed through `sampleID`, whereas other cells of this
    # notebook use `combined_cell_names` - confirm that sampleID holds the cell names.
    # which() ignores cells whose cleaned_fine_types is NA.
    clusterCells = scRNA_TNKILC@meta.data$sampleID[which(scRNA_TNKILC@meta.data$cleaned_fine_types == cluster)]
    # drop = FALSE keeps a matrix even when the cluster has a single cell.
    clusterCounts = allCounts[, clusterCells, drop = FALSE]
    print(dim(clusterCounts))
    # Zeros per gene = cells in cluster - cells with a non-zero count. Counting the
    # non-zero entries avoids converting the count matrix to a data frame and
    # scanning it row by row.
    nZeros = ncol(clusterCounts) - Matrix::rowSums(clusterCounts != 0)
    meanExp[,cluster] = unname(nZeros) / ncol(clusterCounts)
}
head(meanExp)

#effects_marginal = effects_marginal %>% left_join(meanExp, by = join_by(feature), relationship = 'many-to-one') 
#tail(effects_marginal)

In [None]:
# Reshape the per-cluster zero proportions to long form, turn them into the
# proportion of cells expressing the gene, and attach them (together with
# meanExp) to the marginal effects by feature and contrast.
temp <- meanExp %>%
    pivot_longer(cols = unique(effects_marginal$cluster),
                 names_to = 'contrast',
                 values_to = 'prop.expressed') %>%
    mutate(prop.expressed = 1 - prop.expressed)
effects_marginal <- effects_marginal %>%
    left_join(temp, by = join_by(feature, contrast))
rm(temp)
head(effects_marginal)

## volcano plot

In [None]:
# Number of rows above three logFC thresholds, then the median logFC and count
# of FDR-significant rows per cluster.
sum(effects_marginal$logFC > 2)
sum(effects_marginal$logFC > 0.5)
sum(effects_marginal$logFC > 0.1)
effects_marginal %>%
    filter(fdr < 0.05) %>%
    group_by(cluster) %>%
    summarize(medianLFC = median(logFC), n = n())

In [None]:
# Volcano plot per cluster: blue = FDR below 0.05, red and labelled = logFC above 2.
options(repr.plot.width = 20, repr.plot.height = 20, repr.plot.res = 200)
require(ggrepel)
ggplot(effects_marginal, aes(x = logFC, y = `-log10_fdr`)) +
    geom_point(shape = '.') +
    geom_point(data = effects_marginal[effects_marginal$`-log10_fdr` > (-1) * log10(0.05), ],
               color = 'blue', shape = 16) +
    geom_point(data = effects_marginal[effects_marginal$logFC > 2, ],
               color = 'red', shape = 16) +
    facet_wrap(~cluster, scales = 'free_y', ncol = 3) +
    theme_bw(base_size = 20) +
    # Reference lines at the FDR and logFC thresholds.
    geom_hline(aes(yintercept = (-1) * log10(0.05)), color = 'lightgrey') +
    geom_vline(aes(xintercept = 2), color = 'lightgrey') +
    geom_label_repel(data = effects_marginal[effects_marginal$logFC > 2, ],
                     aes(label = feature))

In [None]:
#effects_marginal$`-log10_fdr`[effects_marginal$`-log10_fdr` > 50] = 50

## MA plot

In [None]:
# MA-style plot per cluster: logFC against mean normalized expression.
# Red = logFC above 2, blue = mean expression above 1; both groups are labelled.
ggplot(effects_marginal, aes(x = meanExp, y = logFC)) +
    geom_point(shape = '.') +
    geom_point(data = effects_marginal[effects_marginal$logFC > 2, ],
               color = 'red', shape = 16) +
    geom_label_repel(data = effects_marginal[effects_marginal$logFC > 2, ],
                     aes(label = feature)) +
    geom_point(data = effects_marginal[effects_marginal$meanExp > 1, ],
               color = 'blue', shape = 16) +
    geom_label_repel(data = effects_marginal[effects_marginal$meanExp > 1, ],
                     aes(label = feature)) +
    facet_wrap(~cluster) +
    theme_bw(base_size = 20) +
    xlab('Mean normalized expression')

In [None]:
# Distribution of mean expression per cluster, with a log10 frequency axis.
ggplot(effects_marginal, aes(meanExp)) +
    geom_histogram() +
    facet_wrap(~cluster) +
    theme_bw(base_size = 20) +
    scale_y_continuous(trans = 'log10') +
    xlab('Mean expression') +
    ylab('Frequency')

In [None]:
# Contrast names next to the original ClusterFull labels.
effects_marginal$cluster %>% unique()
unique(scRNA_TNKILC@meta.data$ClusterFull)

In [None]:
# logFC against the proportion of cells expressing each gene, per cluster.
# Red = logFC above 2, blue = expressed in more than half of the cells.
ggplot(effects_marginal, aes(x = prop.expressed, y = logFC)) +
    geom_point(shape = '.') +
    geom_point(data = effects_marginal[effects_marginal$logFC > 2, ],
               color = 'red', shape = 16) +
    geom_label_repel(data = effects_marginal[effects_marginal$logFC > 2, ],
                     aes(label = feature), color = 'red') +
    geom_point(data = effects_marginal[effects_marginal$prop.expressed > 0.5, ],
               color = 'blue', shape = 16) +
    geom_label_repel(data = effects_marginal[effects_marginal$prop.expressed > 0.5, ],
                     aes(label = feature), color = 'blue') +
    facet_wrap(~cluster) +
    theme_bw(base_size = 20) +
    xlab('Proportions of cells expressing the gene')

In [None]:
# Export the full marginal-effects table to CSV in the working directory.
write.csv(effects_marginal, 'effects_marginal_TNKILC.csv')

## filters 

In [None]:
# Preview the marginal-effects table before filtering.
head(effects_marginal)

In [None]:
# Keep rows with logFC above 0.1 and expression in more than 5% of the cluster's cells.
effects_marginal2 <- effects_marginal %>%
    filter(logFC > 0.1, prop.expressed > 0.05)

In [None]:
# Size of the unfiltered marginal-effects table.
dim(effects_marginal)

### Make a plot of cells lost after QC at every nGene cutoff

- x-axis: number of genes kept
- y-axis: number of cells kept

In [None]:
# Object dimensions, and the number of distinct genes that have a marginal effect.
dim(scRNA_TNKILC)
length(unique(effects_marginal$feature))

In [None]:
# Rank genes by their largest z-score across all contrasts: compute the per-gene
# maximum, keep one row per gene, and sort in decreasing order.
# The order of tied genes follows the row order of effects_marginal, so the
# pipeline is kept exactly as written.
genes_ranked_by_max_zscore = effects_marginal %>% 
        group_by(feature) %>% 
        mutate(max_zscore = max(zscore)) %>% 
        select(feature, max_zscore) %>% 
        distinct() %>%
        arrange(desc(max_zscore))
dim(genes_ranked_by_max_zscore)
# Show the best- and worst-ranked genes.
rbind(head(genes_ranked_by_max_zscore), tail(genes_ranked_by_max_zscore))

In [None]:
# For each gene-count cutoff, count the cells that would survive QC
# (more than 10 detected genes and more than 30 counts) if only the top-ranked
# genes were kept.
nGenesAvailable = min(nrow(genes_ranked_by_max_zscore), nrow(scRNA_TNKILC))
# Cutoffs run from 100 up to all available genes. unique() is applied to the
# whole vector so the explicitly appended maximum is not duplicated (a duplicated
# cutoff would leave an NA entry in nCells_left), and the start is capped so the
# sequence never counts downward when fewer than 100 genes are available.
nGenes_used = unique(c(seq(from = min(100, nGenesAvailable), to = nGenesAvailable), nGenesAvailable))
nGenes_used
nCells_left = rep(NA, length(nGenes_used))
names(nCells_left) = as.character(nGenes_used)
countMat = GetAssayData(scRNA_TNKILC, 'counts')
dim(countMat)

for(trial in nGenes_used){
    print(trial)
    selectedGenes = genes_ranked_by_max_zscore$feature[seq_len(trial)]
    print(length(selectedGenes %>% unique()))
    # drop = FALSE keeps a matrix even for a single gene.
    counts = countMat[selectedGenes, , drop = FALSE]
    # filter
    nFeatures = colSums(counts > 0)
    nCounts = colSums(counts)
    nCells_left[[as.character(trial)]] = sum(nFeatures > 10 & nCounts > 30)
    print(nCells_left[[as.character(trial)]])
}

# Cells passing QC (log10 y axis) as a function of the number of top-ranked
# genes kept, drawn as points with dotted drop lines.
options(repr.plot.width = 9, repr.plot.height = 9)
data.frame(nCells_left = unname(nCells_left),
           nGenes_used = as.integer(nGenes_used)) %>%
    ggplot() +
    geom_point(aes(x = nGenes_used, y = nCells_left)) +
    ggpubr::theme_pubr() +
    scale_y_continuous(trans = 'log10') +
    xlab('Number of genes used') +
    ylab('Number of cells left') +
    geom_segment(aes(x = nGenes_used, xend = nGenes_used, y = nCells_left),
                 yend = 0,
                 linetype = 3) +
    ggtitle('GLMM DEGs - TNKILCs')

## Select the top 150 genes and cells with counts > 30 and features > 10

In [None]:
# Take the 150 best-ranked genes and show how many and which they are.
selectedGenes <- genes_ranked_by_max_zscore$feature[1:150]
length(selectedGenes)
selectedGenes

In [None]:
# Restrict the object to the selected genes (all cells kept).
scRNA_TNKILC_selected_genes = scRNA_TNKILC[selectedGenes, ]

In [None]:
# Cells that pass QC on the selected genes only: more than 10 detected genes
# and more than 30 counts, matching the thresholds used in the cutoff loop above.
# The per-cell totals are recomputed from the subset count matrix because
# subsetting a Seurat object by features leaves the nFeature_RNA / nCount_RNA
# metadata columns at their whole-transcriptome values.
selectedCounts = GetAssayData(scRNA_TNKILC_selected_genes, 'counts')
passesQC = Matrix::colSums(selectedCounts > 0) > 10 & Matrix::colSums(selectedCounts) > 30
selectedCells = colnames(selectedCounts)[which(passesQC)]
length(selectedCells)
c(head(selectedCells), tail(selectedCells))

In [None]:
# Final reference object: selected genes x cells that passed QC.
scRNA_TNKILC_selected_genes = scRNA_TNKILC[selectedGenes, selectedCells]
# NOTE(review): these metadata columns may still hold the values computed on all
# genes rather than on the selected genes - confirm before interpreting them.
summary(scRNA_TNKILC_selected_genes@meta.data$nFeature_RNA)
summary(scRNA_TNKILC_selected_genes@meta.data$nCount_RNA)

In [None]:
# Side-by-side histograms (combined with `+`) of the per-cell nFeature_RNA and
# nCount_RNA metadata, each with a log10 frequency axis.
options(repr.plot.height = 7, repr.plot.width = 14)
(
    ggplot(scRNA_TNKILC_selected_genes@meta.data) +
        geom_histogram(aes(nFeature_RNA)) +
        ggtitle('nFeature_RNA') +
        scale_y_continuous(trans = 'log10')
) + (
    ggplot(scRNA_TNKILC_selected_genes@meta.data) +
        geom_histogram(aes(nCount_RNA)) +
        ggtitle('nCount_RNA') +
        scale_y_continuous(trans = 'log10')
)

## Cache

### TNKILC cells with all genes

In [None]:
# Write the all-genes TNKILC object back to its cache file.
readr::write_rds(scRNA_TNKILC, 'TNKILC_fineTyping_all_genes.rds')

### TNKILC cells with selected genes

In [None]:
# Cache the reduced object (selected genes, QC-passing cells).
readr::write_rds(scRNA_TNKILC_selected_genes, 'TNKILC_fineTyping_selected_genes.rds')

In [None]:
# Read the all-genes object back from the cache file.
scRNA_TNKILC = readr::read_rds('TNKILC_fineTyping_all_genes.rds')

In [None]:
# Print a summary of the reloaded object.
scRNA_TNKILC