In [None]:
# Identifier of the MERFISH sample processed throughout this notebook.
sampleID <- "G4712_Beta10"

## libraries

In [None]:
require(Seurat)
require(tidyverse)
require(readxl)
require(patchwork)
require(sf)
require(ggpubr)
require(ggthemes)
require(harmony)
require(presto)
require(ComplexHeatmap)
require(circlize)
require(glue)
require(e1071) 
require(caTools) 
require(class) 
require(gghighlight)
set.seed(1)

## load post-qc merged seurat object (object created in 'Initial_Quality_Control.ipynb')

In [None]:
# Read the merged post-QC Seurat object from disk, then display it.
merged_datasets <- readr::read_rds(
  file = "/n/scratch3/users/m/mup728/Pelka_Baysor_segmentation/data_and_ingest/merged_merfish_pelka_seurat_after_qc.rds"
)
merged_datasets

In [None]:
# Distinct sample identifiers present in the merged object.
unique(merged_datasets@meta.data[["orig.ident"]])

## subset merged_datasets to just one dataset

In [None]:
# Keep only the cells whose orig.ident equals sampleID, then display the result.
merged_datasets <- subset(
  merged_datasets,
  subset = orig.ident == sampleID
)
merged_datasets

In [None]:
# Tag every remaining cell as MERFISH.
merged_datasets@meta.data[["technology"]] <- "MERFISH"

In [None]:
# Copy the cell names into a metadata column and count how many are distinct.
merged_datasets@meta.data[["combined_cell_names"]] <- colnames(merged_datasets)
length(unique(merged_datasets@meta.data[["combined_cell_names"]]))

In [None]:
# Rename the cells from the combined_cell_names column and peek at the result.
merged_datasets <- RenameCells(
  merged_datasets,
  new.names = merged_datasets@meta.data[["combined_cell_names"]]
)
head(Cells(merged_datasets))

## load pelka reference object

In [None]:
# Read the Pelka scRNA reference object from disk.
completeReference <- readr::read_rds(
  file = "/n/scratch3/users/m/mup728/Cell_Typing_CRC_MERFISH/Pelka_reference_cleaning/pelka_dataset_with_merfish_genes.rds"
)

In [None]:
# Tag every reference cell as scRNA.
completeReference@meta.data[["technology"]] <- "scRNA"

In [None]:
# Store the reference cell names, use biosample_id as orig.ident, and count
# the distinct cell names.
completeReference@meta.data[["combined_cell_names"]] <- colnames(completeReference)
completeReference@meta.data[["orig.ident"]] <- completeReference@meta.data[["biosample_id"]]
length(unique(completeReference@meta.data[["combined_cell_names"]]))

## merge 

In [None]:
# Combine the MERFISH sample with the scRNA reference into a single object.
merged_datasets <- merge(x = merged_datasets, y = completeReference)
merged_datasets

## rename cells

In [None]:
# Refresh combined_cell_names from the merged object's cell names and count
# how many are distinct.
merged_datasets@meta.data[["combined_cell_names"]] <- colnames(merged_datasets)
length(unique(merged_datasets@meta.data[["combined_cell_names"]]))

In [None]:
# Rename the merged cells from the combined_cell_names column and peek at them.
merged_datasets <- RenameCells(
  merged_datasets,
  new.names = merged_datasets@meta.data[["combined_cell_names"]]
)
head(Cells(merged_datasets))

## normalize, scale, pca the merged dataset

In [None]:
# Scale factor: the median nCount_RNA is computed per technology and those
# medians are then averaged (mean) across technologies.
normFactor <- merged_datasets@meta.data %>%
  group_by(technology) %>%
  summarize(medianCounts = median(nCount_RNA)) %>%
  pull(medianCounts) %>%
  mean()
normFactor

# Log-normalize with that scale factor, scale every feature, then run the
# balanced PCA weighted by orig.ident (15 components).
merged_datasets <- NormalizeData(
  merged_datasets,
  normalization.method = "LogNormalize",
  scale.factor = normFactor
)
merged_datasets <- ScaleData(merged_datasets, features = rownames(merged_datasets))
merged_datasets <- singlecellmethods::RunBalancedPCA(
  merged_datasets,
  weight.by = "orig.ident",
  npcs = 15
)
merged_datasets

## pre harmony umap 

In [None]:
# UMAP of the PCA embedding; ret_extra = 'fgraph' also returns the fuzzy graph.
U <- uwot::umap(
  Embeddings(merged_datasets, "pca"),
  min_dist = 0.05,
  spread = 0.30,
  ret_extra = "fgraph",
  fast_sgd = TRUE
)
colnames(U$embedding) <- c("PCAUMAP1", "PCAUMAP2")
# Label both dimensions of the graph with the cell names.
colnames(U$fgraph) <- Cells(merged_datasets)
rownames(U$fgraph) <- Cells(merged_datasets)

# Store the 2-D coordinates as the 'pcaumap' reduction.
merged_datasets[["pcaumap"]] <- Seurat::CreateDimReducObject(
  embeddings = U$embedding,
  assay = "RNA",
  key = "PCAUMAP_",
  global = TRUE
)

# Store the fuzzy graph alongside it, attached to the default assay.
new_graph <- Seurat::as.Graph(U$fgraph)
DefaultAssay(new_graph) <- DefaultAssay(merged_datasets)
merged_datasets[["pcaumap_fgraph"]] <- new_graph

In [None]:
# Pre-Harmony UMAP, one panel per technology.
options(repr.plot.width = 10, repr.plot.height = 10)
temp <- Embeddings(merged_datasets, "pcaumap") %>%
  as.data.frame() %>%
  mutate(combined_cell_names = rownames(.)) %>%
  left_join(
    merged_datasets@meta.data %>% select(combined_cell_names, technology),
    # Explicit key: the only shared column, and it avoids the
    # natural-join guess (and its message).
    by = "combined_cell_names"
  )
head(temp)
ggplot(temp) +
  geom_point(
    aes(x = PCAUMAP_1, y = PCAUMAP_2, color = technology),
    shape = ".",
    alpha = 0.5
  ) +
  ggthemes::scale_color_colorblind() +
  ggpubr::theme_pubr(base_size = 18) +
  facet_wrap(~technology) +
  theme(legend.position = "none")
rm(temp)

## harmonize

In [None]:
# Run Harmony on the merged object, correcting for both the sample
# (orig.ident) and the technology. early_stop is spelled FALSE rather than F
# because `F` is an ordinary variable that can be reassigned.
merged_datasets <- harmony::RunHarmony(
  merged_datasets,
  c("orig.ident", "technology"),
  plot_convergence = TRUE,
  lambda = NULL,
  max.iter = 10,
  early_stop = FALSE
)

## post harmony umap, split by technology

In [None]:
# UMAP of the Harmony embedding; ret_extra = 'fgraph' also returns the fuzzy graph.
U <- uwot::umap(
  Embeddings(merged_datasets, "harmony"),
  min_dist = 0.05,
  spread = 0.30,
  ret_extra = "fgraph",
  fast_sgd = TRUE
)
colnames(U$embedding) <- c("HUMAP1", "HUMAP2")
# Label both dimensions of the graph with the cell names.
colnames(U$fgraph) <- Cells(merged_datasets)
rownames(U$fgraph) <- Cells(merged_datasets)

# Store the 2-D coordinates as the 'humap' reduction.
merged_datasets[["humap"]] <- Seurat::CreateDimReducObject(
  embeddings = U$embedding,
  assay = "RNA",
  key = "HUMAP_",
  global = TRUE
)

# Store the fuzzy graph alongside it, attached to the default assay.
new_graph <- Seurat::as.Graph(U$fgraph)
DefaultAssay(new_graph) <- DefaultAssay(merged_datasets)
merged_datasets[["humap_fgraph"]] <- new_graph

In [None]:
# Post-Harmony UMAP, one panel per technology.
options(repr.plot.width = 10, repr.plot.height = 10)
temp <- Embeddings(merged_datasets, "humap") %>%
  as.data.frame() %>%
  mutate(combined_cell_names = rownames(.)) %>%
  left_join(
    merged_datasets@meta.data %>% select(combined_cell_names, technology),
    # Explicit key: the only shared column, and it avoids the
    # natural-join guess (and its message).
    by = "combined_cell_names"
  )
ggplot(temp) +
  geom_point(
    aes(x = HUMAP_1, y = HUMAP_2, color = technology),
    shape = ".",
    alpha = 0.5
  ) +
  ggthemes::scale_color_colorblind() +
  ggpubr::theme_pubr(base_size = 18) +
  facet_wrap(~technology) +
  theme(legend.position = "none")
rm(temp)

## cache

In [None]:
# Cache the harmonized object as coarse_<sampleID>.rds in the working directory.
readr::write_rds(merged_datasets, paste0("coarse_", sampleID, ".rds"))

## Coarse label transfer with weighted knn

#### functions for weighted knn

In [None]:
#' S3 generic for label transfer; dispatches on the class of `obj`.
TransferLabels <- function(obj, ...) {
  UseMethod("TransferLabels")
}

#' Transfer a label from reference cells to query cells with (weighted) kNN.
#'
#' Each query cell receives the label whose summed neighbour weight is largest
#' among its `k` nearest reference cells in `embedding`.
#'
#' @param embedding Matrix or data.frame of cells-by-dimensions coordinates;
#'   rows must be in the same order as `meta_data`.
#' @param meta_data Data frame with one row per cell.
#' @param group.by.varname Column of `meta_data` separating reference from query.
#' @param label.varname Column of `meta_data` holding the label to transfer.
#' @param from Value(s) of `group.by.varname` identifying reference cells.
#' @param to Value(s) of `group.by.varname` identifying query cells.
#' @param k Number of nearest reference neighbours per query cell.
#' @param weighted If TRUE, neighbour weights are `exp(-distance)` normalised to
#'   sum to 1 per query cell; if FALSE every neighbour has weight 1 (so the
#'   "confidence" is then a neighbour count, not a probability).
#' @return `meta_data` with `label.varname` overwritten for the query cells and
#'   a new `<label.varname>_confidence` column (1 for every non-query row, the
#'   winning label's weight for query rows).
TransferLabels.default <- function(
    embedding, ## low dimensional space (e.g. cells-by-PCs)
    meta_data, ## meta data table 
    group.by.varname, ## Variable that splits reference and query 
    label.varname, ## Label I want to transfer
    from, ## Reference cell group(s)
    to, ## Query cell group(s)
    k = 10, ## Number nearest neighbors for prediction
    weighted = TRUE ## Given more weight to closer neighbors
) {
    if (is.data.frame(embedding)) {
        embedding <- as.matrix(as.data.frame(embedding))
    } else if (!is.matrix(embedding)) {
        stop('Embedding must be compatible with matrix type.')
    }
    if (nrow(embedding) != nrow(meta_data)) {
        stop('Embedding and metadata must have same number of observations (rows).')
    }
    if (!group.by.varname %in% colnames(meta_data)) {
        stop(sprintf('Column named "%s" not defined in meta_data', group.by.varname))
    }
    ## This check previously re-tested group.by.varname, so a missing label
    ## column was never reported here.
    if (!label.varname %in% colnames(meta_data)) {
        stop(sprintf('Column named "%s" not defined in meta_data', label.varname))
    }

    ## NOTE(review): presumably this rescales each cell (row) to unit L2 norm,
    ## which would make the Euclidean search below rank neighbours as cosine
    ## distance does -- confirm against singlecellmethods.
    message('Use L2 distance instead of Euclidean')
    embedding <- singlecellmethods:::cosine_normalize_cpp(embedding, 1)    
    
    ## %in% (rather than ==) so that `from` / `to` may name several groups.
    group_values <- meta_data[[group.by.varname]]
    ids_from <- which(group_values %in% from)
    ids_to <- which(group_values %in% to)

    ## Reference cells without a label cannot vote; keeping them would also
    ## leave the design matrix with fewer rows than the neighbour matrix has
    ## columns.
    has_label <- !is.na(meta_data[[label.varname]][ids_from])
    if (!all(has_label)) {
        message(sprintf(
            'Ignoring %d reference cell(s) with missing "%s".',
            sum(!has_label), label.varname
        ))
        ids_from <- ids_from[has_label]
    }
    if (length(ids_from) == 0) {
        stop('No labelled reference cells found for the requested `from` group(s).')
    }
    if (length(ids_to) == 0) {
        stop('No query cells found for the requested `to` group(s).')
    }

    ## drop = FALSE keeps a one-cell selection as a matrix.
    nn <- RANN::nn2(
        data = embedding[ids_from, , drop = FALSE],
        query = embedding[ids_to, , drop = FALSE],
        k = k,
        eps = 0
    )
    if (weighted) {
        ## Closer neighbors given more weight 
        probs <- prop.table(exp(-nn$nn.dists), 1)
        xvec <- c(t(probs))
    } else {
        ## Each neighbor given equal weight
        xvec <- rep(1, length(ids_to) * k)
    }
    ## Query-by-reference weight matrix; t() flattens row by row, so entries
    ## line up with the `each = k` row indices.
    nn_mat <- Matrix::sparseMatrix(
        i = rep(seq_along(ids_to), each = k),
        j = c(t(nn$nn.idx)), 
        x = xvec,
        dims = c(length(ids_to), length(ids_from))
    )

    ## One-hot (reference cell x label) design matrix. A local variable gives
    ## the columns a fixed prefix that can be stripped exactly.
    ref_labels <- meta_data[[label.varname]][ids_from]
    type_design <- Matrix::sparse.model.matrix(~0 + ref_labels)
    colnames(type_design) <- sub('^ref_labels', '', colnames(type_design))

    ## Densify once and reuse for both the arg-max and the max.
    prob_assign <- as.matrix(nn_mat %*% type_design)
    ## max.col() breaks ties at random by default (ties.method = "random").
    hard_assign <- colnames(prob_assign)[max.col(prob_assign)]
    # confidence_assign <- apply(prob_assign, 1, max)
    # hard_assign[confidence_assign < confidence_thresh] <- NA
    meta_data[ids_to, ][[label.varname]] <- hard_assign
    meta_data[[paste0(label.varname, '_confidence')]] <- 1
    meta_data[ids_to, ][[paste0(label.varname, '_confidence')]] <- apply(prob_assign, 1, max)
    
    return(meta_data)
}


#' Seurat method: runs TransferLabels.default on the embeddings of `reduction`
#' and the object's meta.data, then stores the returned table back on the object.
#'
#' @return `obj` with its `meta.data` replaced by the transfer result.
TransferLabels.Seurat <- function(
    obj, ## Seurat object
    reduction, ## reduction name for embeddings 
    group.by.varname, ## Variable that splits reference and query 
    label.varname, ## Label I want to transfer
    from, ## Reference cell group(s)
    to, ## Query cell group(s)
    k = 10, ## Number nearest neighbors for prediction
    weighted = TRUE ## Given more weight to closer neighbors
) {
    cell_embedding <- Seurat::Embeddings(obj, reduction = reduction)
    updated_meta <- TransferLabels.default(
        embedding = cell_embedding,
        meta_data = obj@meta.data,
        group.by.varname = group.by.varname,
        label.varname = label.varname,
        from = from,
        to = to,
        k = k,
        weighted = weighted
    )
    obj@meta.data <- updated_meta
    obj
}
# Reload the cached object and store technology / ClusterTop as factors.
merged_datasets <- readr::read_rds(paste0("coarse_", sampleID, ".rds"))
merged_datasets
merged_datasets@meta.data[["technology"]] <- as.factor(merged_datasets@meta.data[["technology"]])
merged_datasets@meta.data[["ClusterTop"]] <- as.factor(merged_datasets@meta.data[["ClusterTop"]])

### transfer labels

In [None]:
# Transfer ClusterTop from the scRNA cells to the MERFISH cells using the
# 10 nearest neighbours (distance-weighted) in the Harmony embedding.
merged_datasets_weightedknn <- TransferLabels(
  merged_datasets,
  reduction = "harmony",
  group.by.varname = "technology",
  label.varname = "ClusterTop",
  from = "scRNA",
  to = "MERFISH",
  k = 10,
  weighted = TRUE
)

In [None]:
# Metadata columns present after the transfer.
colnames(merged_datasets_weightedknn@meta.data)

In [None]:
# Number of cells that carry a (non-missing) ClusterTop label.
sum(!is.na(merged_datasets_weightedknn@meta.data[["ClusterTop"]]))

In [None]:
# Continue the analysis with the label-transferred object.
merged_datasets <- merged_datasets_weightedknn

In [None]:
# Harmony UMAP coloured by ClusterTop, one panel per technology, with a label
# placed at the median UMAP position of each cluster.
options(repr.plot.width = 20, repr.plot.height = 20)
temp <- Embeddings(merged_datasets, "humap") %>%
  as.data.frame() %>%
  mutate(combined_cell_names = rownames(.)) %>%
  left_join(
    merged_datasets@meta.data %>% select(combined_cell_names, ClusterTop, technology),
    # Explicit key: the only shared column, and it avoids the
    # natural-join guess (and its message).
    by = "combined_cell_names"
  )
label.df <- temp %>%
  group_by(ClusterTop) %>%
  summarize(x = median(HUMAP_1), y = median(HUMAP_2)) %>%
  na.omit()
label.df
ggplot(temp) +
  geom_point(aes(HUMAP_1, HUMAP_2, color = ClusterTop), shape = ".") +
  facet_wrap(~technology, nrow = 2) +
  theme_pubr(base_size = 40) +
  ggthemes::scale_color_tableau("Tableau 20", name = "") +
  guides(color = guide_legend(override.aes = list(size = 10, shape = 16))) +
  theme(legend.position = "right") +
  geom_label(data = label.df, aes(x = x, y = y, label = ClusterTop, color = ClusterTop), size = 12) +
  ggtitle(paste("Coarse label transfer with KNN - ", sampleID, sep = "")) +
  gghighlight()

In [None]:
# Harmony UMAP coloured by ClusterTop, one panel per technology x cluster.
options(repr.plot.width = 30, repr.plot.height = 20)
temp <- Embeddings(merged_datasets, "humap") %>%
  as.data.frame() %>%
  mutate(combined_cell_names = rownames(.)) %>%
  left_join(
    merged_datasets@meta.data %>% select(combined_cell_names, ClusterTop, technology),
    # Explicit key: the only shared column, and it avoids the
    # natural-join guess (and its message).
    by = "combined_cell_names"
  )
ggplot(temp) +
  geom_point(aes(HUMAP_1, HUMAP_2, color = ClusterTop), shape = ".") +
  # NOTE(review): the facet_wrap() added at the end of this chain presumably
  # replaces this one in the final plot; it is kept because gghighlight() is
  # added while it is still in place -- confirm before removing.
  facet_wrap(~technology, nrow = 2) +
  theme_pubr(base_size = 30) +
  ggthemes::scale_color_tableau("Tableau 20", name = "") +
  guides(color = guide_legend(override.aes = list(size = 10, shape = 16))) +
  theme(legend.position = "right") +
  ggtitle(paste("Coarse label transfer with KNN - ", sampleID, sep = "")) +
  gghighlight() +
  facet_wrap(~technology + ClusterTop, ncol = 7)

#### cache

In [None]:
# Overwrite the cache with the label-transferred object.
readr::write_rds(merged_datasets, paste0("coarse_", sampleID, ".rds"))

## find markers

In [None]:
# Show the object and the metadata columns available for marker finding.
merged_datasets
names(merged_datasets@meta.data)

In [None]:
# Use ClusterTop as the active identity class.
merged_datasets <- SetIdent(object = merged_datasets, value = "ClusterTop")

In [None]:
# Build a MERFISH-only Seurat object from the raw counts and the matching
# metadata rows.
merfishCells <- merged_datasets@meta.data %>%
  filter(technology == "MERFISH") %>%
  pull(combined_cell_names) %>%
  as.character()
length(merfishCells)
temp <- filter(merged_datasets@meta.data, technology == "MERFISH")
# Re-attach the cell names as row names after filtering.
rownames(temp) <- temp[["combined_cell_names"]]
merfish <- CreateSeuratObject(
  GetAssayData(merged_datasets, "counts")[, merfishCells],
  meta.data = temp
)
merfish
rm(temp)

In [None]:
# Distinct coarse labels assigned to the MERFISH cells.
unique(merfish@meta.data[["ClusterTop"]])

In [None]:
# Marker statistics per ClusterTop for the MERFISH cells (kNN-transferred labels).
# NOTE(review): `merfish` was created from raw counts only; confirm that is the
# intended input for wilcoxauc()'s logFC.
library(presto)
options(repr.matrix.max.rows = 600, repr.matrix.max.cols = 200)
merfish <- SetIdent(merfish, value = "ClusterTop") # knn_results
knnMarkers <- wilcoxauc(merfish)
knnMarkers$method <- "MERFISH"
# `padj_max` spelled out in full (was `padj`, resolved only by partial matching).
top_markers(knnMarkers, auc_min = 0.6, padj_max = 0.05, n = Inf)

In [None]:
# Build an scRNA-only Seurat object from the raw counts and the matching
# metadata rows.
scRNACells <- merged_datasets@meta.data %>%
  filter(technology == "scRNA") %>%
  select(combined_cell_names) %>%
  as.matrix() %>%
  as.vector()
# Report the number of scRNA cells (previously printed length(merfishCells),
# a copy-paste slip).
length(scRNACells)
temp <- merged_datasets@meta.data %>% filter(technology == "scRNA")
# Re-attach the cell names as row names after filtering.
rownames(temp) <- temp$combined_cell_names
scRNA <- CreateSeuratObject(
  GetAssayData(merged_datasets, "counts")[, scRNACells],
  meta.data = temp
)
scRNA
rm(temp)

In [None]:
# Distinct coarse labels present in the scRNA reference cells.
unique(scRNA@meta.data[["ClusterTop"]])

In [None]:
# Marker statistics per ClusterTop for the scRNA reference cells.
# NOTE(review): `scRNA` was created from raw counts only; confirm that is the
# intended input for wilcoxauc()'s logFC.
library(presto)
options(repr.matrix.max.rows = 600, repr.matrix.max.cols = 200)
scRNA <- SetIdent(scRNA, value = "ClusterTop")
clusterTopMarkers <- wilcoxauc(scRNA)
clusterTopMarkers$method <- "scRNA"
# `padj_max` spelled out in full (was `padj`, resolved only by partial matching).
top_markers(clusterTopMarkers, auc_min = 0.6, padj_max = 0.05, n = Inf)

## umaps

In [None]:
# One FeaturePlot panel set per cluster, showing that cluster's top MERFISH
# markers on the Harmony UMAP, split by technology.
temp <- top_markers(knnMarkers, auc_min = 0.6, padj_max = 0.05)
# Columns 2..n are iterated (column 1 is presumably the rank column -- TODO
# confirm). seq_len()[-1] yields an empty loop when there are no such columns,
# whereas 2:ncol(temp) would count backwards.
for (i in seq_len(ncol(temp))[-1]) {
    genes <- temp[[i]]
    genes <- genes[!is.na(genes)]
    # Nothing to draw for a cluster without qualifying markers.
    if (length(genes) == 0) {
        next
    }
    # Plot height grows with the number of genes shown.
    options(repr.plot.width = 10, repr.plot.height = length(genes) * 5)
    print(
        FeaturePlot(
            merged_datasets,
            reduction = "humap",
            features = genes,
            split.by = "technology",
            raster = TRUE
        ) +
            plot_annotation(title = colnames(temp)[i])
    )
}

# compare approaches with logFC

In [None]:
# Stack the MERFISH and scRNA marker tables (rows distinguished by `method`).
markers_for_comparison <- rbind(knnMarkers, clusterTopMarkers)
head(x = markers_for_comparison)

In [None]:
# Heatmap of column-scaled logFC for significant markers; rows are
# <group>_<method>, columns are genes (0 where a gene did not pass the filter).
options(repr.plot.width = 40, repr.plot.height = 10)
temp <- markers_for_comparison %>%
  filter(auc > 0.6 & padj < 0.05) %>%
  pivot_wider(
    id_cols = c(group, method),
    names_from = feature,
    values_from = logFC,
    values_fill = 0
  ) %>%
  as.data.frame()
rownames(temp) <- paste(temp$group, temp$method, sep = "_")
temp <- temp %>% select(-group, -method)
temp <- scale(temp) # column scale
dim(temp)
library(circlize)
library(ComplexHeatmap)
# Argument names are spelled out in full (`palette`, `length.out`); the
# abbreviated forms only worked through partial argument matching.
colors <- rev(
  tableau_gradient_pal(
    palette = "Classic Orange-White-Blue",
    type = "ordered-diverging"
  )(seq(0, 1, length.out = 25))
)
colors[13] <- "#ffffff" # force the midpoint to pure white
# 13 breaks below zero + 13 above, sharing the zero -> 25 breaks for 25 colours.
f1 <- colorRamp2(
  unique(c(
    seq(min(temp), 0, length.out = 13),
    seq(0, max(temp), length.out = 13)
  )),
  colors
)
Heatmap(
  temp,
  col = f1,
  row_km = 7,
  column_km = 7,
  border = TRUE,
  row_gap = unit(5, "mm"),
  column_gap = unit(5, "mm"),
  clustering_distance_rows = "pearson",
  clustering_distance_columns = "pearson"
)

In [None]:
# For every (scRNA group, MERFISH group) pair, compare per-gene logFC.
# Pairs with a positive correlation are printed; same-named pairs are also
# collected in `plotlist`.
options(repr.plot.width = 10, repr.plot.height = 10)
combnVars <- expand.grid(unique(clusterTopMarkers$group), unique(knnMarkers$group))
head(combnVars)
plotlist <- list()
j <- 1
for (i in seq_len(nrow(combnVars))) {
    var1 <- as.character(combnVars$Var1[i])
    var2 <- as.character(combnVars$Var2[i])
    temp <- dplyr::left_join(
        clusterTopMarkers %>%
            filter(group == var1) %>%
            mutate(logFC_scRNA = logFC, group_scRNA = group) %>%
            select(feature, group_scRNA, logFC_scRNA),
        knnMarkers %>%
            filter(group == var2) %>%
            mutate(logFC_knn = logFC, group_knn = group) %>%
            select(feature, group_knn, logFC_knn),
        by = "feature" # explicit key instead of a natural join
    )
    temp <- temp %>% filter(!feature %in% c("CD74", "COL3A1"))

    # A correlation and a regression line need a few complete gene pairs.
    if (sum(complete.cases(temp$logFC_scRNA, temp$logFC_knn)) < 3) {
        next
    }
    # Computed once and reused for the filter and the title.
    pearson_r <- cor(temp$logFC_scRNA, temp$logFC_knn, use = "complete.obs")
    # isTRUE() also skips an NA correlation (e.g. zero variance) instead of
    # erroring inside `if`.
    if (!isTRUE(pearson_r > 0)) {
        next
    }

    # Label genes in the top quartile of logFC in either technology.
    temp <- temp %>%
        mutate(
            labelTRUE = (logFC_knn > quantile(logFC_knn, 0.75, na.rm = TRUE)) |
                (logFC_scRNA > quantile(logFC_scRNA, 0.75, na.rm = TRUE))
        ) %>%
        mutate(label = if_else(labelTRUE, feature, NA))

    # Least-squares line; label colour marks genes below (TRUE) / above (FALSE) it.
    getColors <- coef(lm(logFC_scRNA ~ logFC_knn, data = temp))
    getIntercept <- as.numeric(getColors)[1]
    getSlope <- as.numeric(getColors)[2]
    temp <- temp %>%
        mutate(ye = getIntercept + (getSlope * logFC_knn), color = logFC_scRNA < ye)

    p1 <- ggplot(temp, aes(x = logFC_knn, y = logFC_scRNA)) +
        geom_point() +
        ylab(paste0("scRNA: ", var1)) +
        xlab(paste0("MERFISH: ", var2)) +
        # The value shown is Pearson's r (not squared), so it is labelled as such.
        ggtitle(paste0(
            "MERFISH: ", var2, "\nscRNA: ", var1,
            "\nPearson r: ", round(pearson_r, 2)
        )) +
        geom_hline(aes(yintercept = 0)) +
        geom_vline(aes(xintercept = 0)) +
        ggrepel::geom_label_repel(aes(label = label, color = color)) +
        theme_minimal(base_size = 18) +
        geom_abline(intercept = getIntercept, slope = getSlope) +
        scale_color_brewer(palette = "Set1") +
        theme(legend.position = "none")
    print(p1)
    if (var1 == var2) {
        plotlist[[j]] <- p1
        j <- 1 + j
    }
}
length(plotlist)

In [None]:
# Render each collected same-group plot in turn.
options(repr.plot.width = 10, repr.plot.height = 10)
for (idx in seq_along(plotlist)) {
    print(plotlist[[idx]])
}

In [None]:
# Arrange the same-group logFC plots into one annotated figure.
library(patchwork)
length(plotlist)
options(repr.plot.width = 30, repr.plot.height = 30)
annotate_figure(
    ggarrange(plotlist = plotlist),
    # A space now separates the sample ID from "with" (the title previously
    # read "<sampleID>with weighted KNN").
    top = paste0(
        "Coarse typing of ", sampleID,
        " with weighted KNN\nComparison of logFC between scRNA & MERFISH"
    )
)

# compare approaches with auc

In [None]:
# Rebuild the MERFISH-only Seurat object from the raw counts and the matching
# metadata rows.
merfishCells <- merged_datasets@meta.data %>%
  filter(technology == "MERFISH") %>%
  pull(combined_cell_names) %>%
  as.character()
length(merfishCells)
temp <- filter(merged_datasets@meta.data, technology == "MERFISH")
# Re-attach the cell names as row names after filtering.
rownames(temp) <- temp[["combined_cell_names"]]
merfish <- CreateSeuratObject(
  GetAssayData(merged_datasets, "counts")[, merfishCells],
  meta.data = temp
)
merfish
rm(temp)

In [None]:
# Distinct coarse labels assigned to the MERFISH cells.
unique(merfish@meta.data[["ClusterTop"]])

In [None]:
# Marker statistics per ClusterTop for the MERFISH cells (kNN-transferred labels).
library(presto)
options(repr.matrix.max.rows = 600, repr.matrix.max.cols = 200)
merfish <- SetIdent(merfish, value = "ClusterTop") # knn_results
knnMarkers <- wilcoxauc(merfish)
knnMarkers$method <- "MERFISH"
# `padj_max` spelled out in full (was `padj`, resolved only by partial matching).
top_markers(knnMarkers, auc_min = 0.6, padj_max = 0.05, n = Inf)

In [None]:
# Rebuild the scRNA-only Seurat object from the raw counts and the matching
# metadata rows.
scRNACells <- merged_datasets@meta.data %>%
  filter(technology == "scRNA") %>%
  select(combined_cell_names) %>%
  as.matrix() %>%
  as.vector()
# Report the number of scRNA cells (previously printed length(merfishCells),
# a copy-paste slip).
length(scRNACells)
temp <- merged_datasets@meta.data %>% filter(technology == "scRNA")
# Re-attach the cell names as row names after filtering.
rownames(temp) <- temp$combined_cell_names
scRNA <- CreateSeuratObject(
  GetAssayData(merged_datasets, "counts")[, scRNACells],
  meta.data = temp
)
scRNA
rm(temp)

In [None]:
# Distinct coarse labels present in the scRNA reference cells.
unique(scRNA@meta.data[["ClusterTop"]])

In [None]:
# Marker statistics per ClusterTop for the scRNA reference cells.
library(presto)
options(repr.matrix.max.rows = 600, repr.matrix.max.cols = 200)
scRNA <- SetIdent(scRNA, value = "ClusterTop")
clusterTopMarkers <- wilcoxauc(scRNA)
clusterTopMarkers$method <- "scRNA"
# `padj_max` spelled out in full (was `padj`, resolved only by partial matching).
top_markers(clusterTopMarkers, auc_min = 0.6, padj_max = 0.05, n = Inf)

# compare approaches with AUC

In [None]:
# Stack the MERFISH and scRNA marker tables (rows distinguished by `method`).
markers_for_comparison <- rbind(knnMarkers, clusterTopMarkers)
head(x = markers_for_comparison)

In [None]:
# For each cluster present in both marker tables, plot per-gene AUC in
# MERFISH (x) against scRNA (y) and collect the plots in `plotlist`.
options(repr.plot.width = 10, repr.plot.height = 10)
combnVars <- expand.grid(unique(clusterTopMarkers$group), unique(knnMarkers$group))
head(combnVars)
plotlist <- list()
j <- 1
for (i in seq_len(nrow(combnVars))) {
    var1 <- as.character(combnVars$Var1[i])
    var2 <- as.character(combnVars$Var2[i])
    # Only same-named pairs are ever plotted, so skip the others before doing
    # any work.
    if (var1 != var2) {
        next
    }
    temp <- dplyr::left_join(
        clusterTopMarkers %>%
            filter(group == var1) %>%
            mutate(auc_scRNA = auc, group_scRNA = group) %>%
            select(feature, group_scRNA, auc_scRNA),
        knnMarkers %>%
            filter(group == var2) %>%
            mutate(auc_knn = auc, group_knn = group) %>%
            select(feature, group_knn, auc_knn),
        by = "feature" # explicit key instead of a natural join
    )
    if (nrow(temp) == 0) {
        next
    }
    # Vectorised classification (replaces a per-row lapply over 1:nrow()).
    temp <- temp %>%
        mutate(
            labelTRUE = (auc_knn > 0.6) | (auc_scRNA > 0.6),
            label = if_else(labelTRUE, feature, NA),
            color = as.factor(case_when(
                auc_knn > 0.6 & auc_scRNA > 0.6 ~ "auc > 0.6 in both",
                auc_knn > 0.6 ~ "auc > 0.6 in merfish",
                auc_scRNA > 0.6 ~ "auc > 0.6 in scRNA",
                TRUE ~ "auc < 0.6 in both"
            ))
        )
    pearson_r <- cor(temp$auc_scRNA, temp$auc_knn, use = "pairwise.complete.obs")

    p1 <- ggplot(temp, aes(x = auc_knn, y = auc_scRNA, color = color)) +
        geom_point() +
        ylab(paste0("scRNA: ", var1)) +
        xlab(paste0("MERFISH: ", var2)) +
        # The value shown is Pearson's r (not squared), so it is labelled as such.
        ggtitle(paste0(
            "MERFISH: ", var2, "\nscRNA: ", var1,
            "\nPearson r: ", round(pearson_r, 2)
        )) +
        geom_hline(aes(yintercept = 0)) +
        geom_vline(aes(xintercept = 0)) +
        ggrepel::geom_label_repel(aes(label = label, color = color)) +
        theme_minimal(base_size = 18) +
        geom_abline(intercept = 0, slope = 1) +
        scale_color_brewer(palette = "Set1") +
        theme(legend.position = "right") +
        xlim(0, 1) +
        ylim(0, 1) +
        # Reference lines at the AUC = 0.6 threshold used for the classes.
        geom_vline(xintercept = 0.6) +
        geom_hline(yintercept = 0.6) +
        guides(color = guide_legend(override.aes = list(size = 10, shape = 16)))
    print(p1)
    plotlist[[j]] <- p1
    j <- 1 + j
}
length(plotlist)

# tabulate number of cells of each type

In [None]:
# Cells per coarse type, one column per technology. `.groups = "drop"` returns
# an ungrouped table (and silences the grouped-output message) before pivoting.
merged_datasets@meta.data %>%
  group_by(technology, ClusterTop) %>%
  summarize(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = technology, values_from = n)

In [None]:
# Echo the sample identifier being processed.
sampleID

# plot cells in space

In [None]:
require(Seurat)
require(tidyverse)
require(readxl)
require(patchwork)
require(sf)
require(ggpubr)
require(ggthemes)
require(harmony)
require(presto)
require(ComplexHeatmap)
require(circlize)
require(glue)
require(e1071) 
require(caTools) 
require(class) 
require(gghighlight)
set.seed(1)

In [None]:
# First rows of the spatial coordinates and coarse label.
merged_datasets@meta.data %>%
  select(x, y, ClusterTop) %>%
  head()

In [None]:
# Number of cells per technology.
merged_datasets@meta.data[["technology"]] %>% table()

In [None]:
# Keep only the MERFISH cells for the spatial plots.
merfish <- subset(
  merged_datasets,
  subset = technology == "MERFISH"
)
merfish

In [None]:
# Spatial scatter of a random third of the MERFISH cells, coloured by coarse type.
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 500)
ggplot(sample_n(merfish@meta.data, ncol(merfish) / 3)) +
  geom_point(aes(x, y, color = ClusterTop), shape = ".") +
  theme_void(base_size = 18) +
  scale_color_colorblind(name = "") +
  coord_sf() +
  # Enlarge the legend keys; the plotted points themselves are single pixels.
  guides(
    color = guide_legend(override.aes = list(size = 8, shape = 16, alpha = 1))
  ) +
  ggtitle(paste0("Coarse cell types in ", sampleID))

In [None]:
# Same spatial scatter, one highlighted panel per coarse type.
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 500)
ggplot(sample_n(merfish@meta.data, ncol(merfish) / 3)) +
  geom_point(aes(x, y, color = ClusterTop), shape = ".") +
  theme_void(base_size = 18) +
  scale_color_colorblind(name = "") +
  # Enlarge the legend keys; the plotted points themselves are single pixels.
  guides(
    color = guide_legend(override.aes = list(size = 8, shape = 16, alpha = 1))
  ) +
  facet_wrap(~ClusterTop) +
  gghighlight::gghighlight() +
  coord_sf() +
  ggtitle(paste0("Coarse cell types in ", sampleID))

# compare with geneformer annotations

In [None]:
# Reload the cached label-transferred object.
merged_datasets <- readr::read_rds(paste0("coarse_", sampleID, ".rds"))
merged_datasets

In [None]:
# Rebuild the MERFISH-only Seurat object from the raw counts and the matching
# metadata rows.
merfishCells <- merged_datasets@meta.data %>%
  filter(technology == "MERFISH") %>%
  pull(combined_cell_names) %>%
  as.character()
length(merfishCells)
temp <- filter(merged_datasets@meta.data, technology == "MERFISH")
# Re-attach the cell names as row names after filtering.
rownames(temp) <- temp[["combined_cell_names"]]
merfish <- CreateSeuratObject(
  GetAssayData(merged_datasets, "counts")[, merfishCells],
  meta.data = temp
)
merfish
rm(temp)

In [None]:
# Read the geneformer label table for this sample and reduce it to
# (orig.cell.id, geneformer), dropping rows with missing values.
filename <- paste0(
  "/n/scratch3/users/m/mup728/Pelka_Baysor_segmentation/data_and_ingest/",
  sampleID, "/Broad_", sampleID, "_geneformer-processed_labels-only.csv"
)
geneformer <- read.delim(filename, sep = ",")
# The cell id is read from column `X` (presumably the unnamed first CSV
# column -- TODO confirm).
geneformer$orig.cell.id <- geneformer$X
geneformer <- select(geneformer, orig.cell.id, cl46Top) %>% na.omit()
colnames(geneformer) <- c("orig.cell.id", "geneformer")
# seq_len() also copes with an empty table, where 1:nrow() would yield c(1, 0).
rownames(geneformer) <- seq_len(nrow(geneformer))
head(geneformer)

In [None]:
# Weighted-kNN labels of the MERFISH cells as (orig.cell.id, weightedknn),
# dropping rows with missing values.
weightedknn <- merfish@meta.data %>%
  select(orig.cell.id, ClusterTop) %>%
  na.omit()
colnames(weightedknn) <- c("orig.cell.id", "weightedknn")
# seq_len() also copes with an empty table, where 1:nrow() would yield c(1, 0).
rownames(weightedknn) <- seq_len(nrow(weightedknn))
head(weightedknn)

In [None]:
# Pair each MERFISH cell's weighted-kNN label with its geneformer label.
temp <- left_join(x = weightedknn, y = geneformer, by = join_by(orig.cell.id))
# Convert every column to character so labels can be compared with `==`
# whatever their original type. Done column-wise: apply() on a data frame
# first coerces the whole table to a matrix.
temp <- temp %>% mutate(across(everything(), as.character))
dim(temp)
dim(geneformer)
dim(weightedknn)

In [None]:
# First rows of the paired weighted-kNN / geneformer label table.
head(temp)

In [None]:
# Fraction of cells (with both labels present) on which the two methods agree.
mean(temp[["weightedknn"]] == temp[["geneformer"]], na.rm = TRUE)

In [None]:
# Add the geneformer label to the MERFISH metadata.
# dplyr joins do not keep row names, and this metadata is keyed by cell name
# in its row names, so they are saved before the join and restored afterwards.
merfish@meta.data <- local({
  cell_ids <- rownames(merfish@meta.data)
  joined <- left_join(merfish@meta.data, geneformer, by = "orig.cell.id")
  # A duplicated orig.cell.id in `geneformer` would add rows and misalign cells.
  if (nrow(joined) != length(cell_ids)) {
    stop("Joining geneformer labels changed the number of cells; check for duplicated orig.cell.id values.")
  }
  rownames(joined) <- cell_ids
  joined
})

In [None]:
# Marker statistics per geneformer label; show the filtered and the default
# top-marker tables.
merfish <- SetIdent(object = merfish, value = "geneformer")
geneformer_markers <- wilcoxauc(merfish)
top_markers(
  geneformer_markers,
  auc_min = 0.6,
  padj_max = 0.05,
  n = Inf
)
top_markers(geneformer_markers)

In [None]:
# Marker statistics per weighted-kNN label (ClusterTop) on the same cells.
merfish <- SetIdent(object = merfish, value = "ClusterTop")
knnMarkers <- wilcoxauc(merfish)
top_markers(
  knnMarkers,
  auc_min = 0.6,
  padj_max = 0.05,
  n = Inf
)

In [None]:
# For each label shared by geneformer and weighted kNN, plot per-gene logFC
# (kNN on x, geneformer on y), coloured by which method gives AUC > 0.6.
options(repr.plot.width = 10, repr.plot.height = 10)
combnVars <- expand.grid(unique(geneformer_markers$group), unique(knnMarkers$group))
head(combnVars)
plotlist <- list()
cols <- c(
    "auc > 0.6 in both" = "#009E73",
    "auc > 0.6 only in weightedknn" = "red",
    "auc > 0.6 only in geneformer" = "blue",
    "auc < 0.6 in both" = "black"
)
j <- 1
for (i in seq_len(nrow(combnVars))) {
    var1 <- as.character(combnVars$Var1[i])
    var2 <- as.character(combnVars$Var2[i])
    # Only same-named pairs are ever plotted, so skip the others before doing
    # any work.
    if (var1 != var2) {
        next
    }
    temp <- dplyr::left_join(
        geneformer_markers %>%
            filter(group == var1) %>%
            mutate(
                logFC_geneformer = logFC,
                group_geneformer = group,
                auc_geneformer = auc
            ) %>%
            select(feature, group_geneformer, logFC_geneformer, auc_geneformer),
        knnMarkers %>%
            filter(group == var2) %>%
            mutate(logFC_knn = logFC, group_knn = group, auc_knn = auc) %>%
            select(feature, group_knn, logFC_knn, auc_knn),
        by = "feature" # explicit key instead of a natural join
    )
    if (nrow(temp) == 0) {
        next
    }
    # Vectorised classification (replaces a per-row lapply over 1:nrow()).
    temp <- temp %>%
        mutate(
            labelTRUE = (auc_knn > 0.6) | (auc_geneformer > 0.6),
            label = if_else(labelTRUE, feature, NA),
            color = as.factor(case_when(
                auc_knn > 0.6 & auc_geneformer > 0.6 ~ "auc > 0.6 in both",
                auc_knn > 0.6 ~ "auc > 0.6 only in weightedknn",
                auc_geneformer > 0.6 ~ "auc > 0.6 only in geneformer",
                TRUE ~ "auc < 0.6 in both"
            ))
        )
    pearson_r <- cor(temp$logFC_geneformer, temp$logFC_knn, use = "pairwise.complete.obs")

    p1 <- ggplot(temp, aes(x = logFC_knn, y = logFC_geneformer, color = color)) +
        geom_point() +
        ylab(paste0("logFC geneformer: ", var1)) +
        xlab(paste0("logFC weightedknn: ", var2)) +
        # The value shown is Pearson's r (not squared), so it is labelled as such.
        ggtitle(paste0(var1, "\nPearson r: ", round(pearson_r, 2))) +
        geom_hline(aes(yintercept = 0)) +
        geom_vline(aes(xintercept = 0)) +
        ggrepel::geom_label_repel(aes(label = label, color = color)) +
        theme_minimal(base_size = 18) +
        #geom_abline(intercept = 0, slope = 1) +
        scale_color_manual(values = cols) +
        theme(legend.position = "right") +
        guides(color = guide_legend(override.aes = list(size = 10, shape = 16)))
    print(p1)
    plotlist[[j]] <- p1
    j <- 1 + j
}
length(plotlist)

In [None]:
# Report how many comparison plots were collected, then display them.
require(patchwork)
length(x = plotlist)
options(repr.plot.width = 15, repr.plot.height = 15)
plotlist