# Ingest Pelka CRC atlas data (MGP)
## Here, we take the scRNA atlas data and clean it before **coarse typing** our MERFISH data. 
## However, before fine typing the data, we have to clean it further and select the relevant genes. This is documented in the 'process_reference_for_fine_typing.ipynb' files. 

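Data were downloaded from https://singlecell.broadinstitute.org/single_cell/study/SCP1162/human-colon-cancer-atlas-c295?cluster=Stromal%20cells%20%28tSNE%29&spatialGroups=--&annotation=ClusterFull--group--cluster&subsample=all#study-summary as Matrix Market files. Note that the cells below read the `GSE178341_crc10x_full_c295v4_submit` HDF5 and CSV files rather than Matrix Market files.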

In [None]:
# Show the working directory; the relative file paths used below resolve against it.
getwd()

In [None]:
# Attach the packages used by this notebook.
# library() is used instead of require() so that a missing package stops the
# run here with an error instead of returning FALSE and failing later.
# (ggthemes was listed twice in the original; it is attached once.)
library(ggplot2)
library(tidyverse)
library(ggthemes)
library(data.table)
library(Matrix)
library(spatula)
library(Seurat)
library(ggsci)
library(pals)
library(ggpubr)
library(presto)
library(ComplexHeatmap)
library(circlize)
library(gghighlight)
#setwd('/n/data1/bwh/medicine/korsunsky/lab/mup728/CRC_MERFISH/Pelka_reference_cleaning')

# Default figure size for plots rendered in the notebook.
options(repr.plot.width = 20, repr.plot.height = 10)

# Global ggplot2 theme: ggpubr's theme_pubr with a bottom legend, rotated
# x-axis labels and enlarged legend keys.
theme_set(
  theme_pubr(
    base_size = 18,
    base_family = "sans",
    border = FALSE,
    margin = TRUE,
    legend = 'bottom',
    x.text.angle = 90
  ) +
    theme(legend.key.size = unit(30, "point"))
)

# Fix the RNG seed for reproducibility.
set.seed(1)

In [None]:
# Read the count matrix from the HDF5 file with Seurat's Read10X_h5.
crc_counts = Read10X_h5('GSE178341_crc10x_full_c295v4_submit.h5', use.names = TRUE, unique.features = TRUE)

In [None]:
# Peek at the top-left 20 x 20 corner of the count matrix.
crc_counts[1:20, 1:20]

# Read metadata

In [None]:
# Load the per-cell metadata table (readr guesses the delimiter).
metaData <- readr::read_delim(
  'GSE178341_crc10x_full_c295v4_submit_metatables.csv.gz'
)

# Discard the first data row.
# NOTE(review): presumably a non-cell header/type row -- confirm against the
# file, otherwise a real cell is being dropped here.
metaData <- metaData[-1, ]

# Inspect the end of the table and the available columns.
tail(metaData)
colnames(metaData)

In [None]:
# Load the per-cell cluster annotations.
clusterAnnotations <- data.table::fread(
  'GSE178341_crc10x_full_c295v4_submit_cluster.csv.gz'
)

# Copy sampleID into a cellID column, which is the join key used for the
# metadata merge below.
clusterAnnotations$cellID <- clusterAnnotations$sampleID

# Inspect the first rows and the table size.
head(clusterAnnotations)
dim(clusterAnnotations)

In [None]:
# Convert the readr result to a plain data.frame (row names are assigned to it below).
metaData = as.data.frame(metaData)

In [None]:
# Size before the merge, for comparison with the sizes printed below.
dim(metaData)

# Merge the cluster annotations into the metadata on cellID. A full join keeps
# cells that appear in only one of the two tables.
metaData <- dplyr::full_join(metaData, clusterAnnotations, by = 'cellID') %>%
  as.data.frame()

# Key the rows by cellID, the join key. The original used metaData$NAME here,
# a column referenced nowhere else, and the next cell overwrote the row names
# with cellID anyway.
rownames(metaData) <- metaData$cellID

# A row count larger than either input would indicate unmatched cells.
dim(metaData)
dim(clusterAnnotations)
head(metaData)

In [None]:
# Use the cell IDs as row names of the metadata table.
rownames(metaData) = metaData$cellID

In [None]:
# Number of rows (cells) in the merged metadata.
length(rownames(metaData))

# Create Seurat object

In [None]:
# Build the Seurat object from the count matrix and the merged per-cell
# metadata, then print its summary.
obj <- Seurat::CreateSeuratObject(
  counts = crc_counts,
  meta.data = metaData
)
obj

In [None]:
# First rows of the object's per-cell metadata.
head(obj@meta.data)

In [None]:
# Save the full Seurat object; it is reloaded in the next section.
readr::write_rds(obj, 'complete_pelka_dataset.rds')

In [None]:
# Show the directory the object was written to (the path above is relative).
getwd()

# Create a reference with only MERFISH genes, and QC it

## load pelka reference object

In [None]:
# Reload the full Seurat object saved above.
completeReference = readr::read_rds('complete_pelka_dataset.rds')

In [None]:
# Tag every cell with its assay technology.
completeReference@meta.data$technology = 'scRNA'

In [None]:
# Store the current cell names as a metadata column.
completeReference@meta.data$combined_cell_names <- colnames(completeReference)

# Number of distinct cell names (should equal the number of cells).
length(
  unique(completeReference@meta.data$combined_cell_names)
)

In [None]:
# Rename the cells to the names stored in combined_cell_names, then show the
# first few.
completeReference <- RenameCells(
  completeReference,
  new.names = completeReference@meta.data$combined_cell_names
)
head(Cells(completeReference))

In [None]:
# Copy the annotation columns to the ClusterTop / ClusterMid / ClusterFull
# names used in the rest of the notebook.
completeReference@meta.data$ClusterTop <- completeReference@meta.data$clTopLevel
completeReference@meta.data$ClusterMid <- completeReference@meta.data$clMidwayPr
completeReference@meta.data$ClusterFull <- completeReference@meta.data$cl295v11SubFull

In [None]:
# Distinct ClusterTop labels.
unique(completeReference@meta.data$ClusterTop)

## qc pelka reference object

### remove non-MERFISH-panel genes

In [None]:
# Gene names of the MERFISH panel, taken from the second column of the CSV.
merfishGenes <- read.csv('merfishGenes.csv')[, 2]

# Show the panel and its size.
merfishGenes
length(merfishGenes)

# TODO: fix aliases

In [None]:
# Panel genes that are not among the reference's feature names (see the alias TODO above).
setdiff(merfishGenes, rownames(completeReference))

In [None]:
# Restrict the reference to the MERFISH panel genes and print the result.
# NOTE(review): any gene reported by the setdiff() check above is not a feature
# of the object -- confirm how the subset treats those before relying on the
# resulting gene count.
completeReference = completeReference[merfishGenes,]
completeReference

In [None]:
# Print the object summary.
completeReference

## recalculate nFeature_RNA and nCount_RNA

In [None]:
# Distribution of the stored nFeature_RNA values.
summary(completeReference@meta.data$nFeature_RNA)

In [None]:
# Recompute the total counts per cell from the counts matrix.
# `layer =` replaces the deprecated `slot =` argument and matches the
# GetAssayData(..., layer = 'counts') call used later in this notebook.
new_nCount_RNA <- data.frame(
  new_nCount_RNA = colSums(GetAssayData(completeReference, layer = 'counts'))
)
head(new_nCount_RNA)

In [None]:
# Number of cells whose recomputed total equals the nCount_RNA stored in the metadata.
sum(new_nCount_RNA[rownames(completeReference@meta.data), 'new_nCount_RNA'] == completeReference@meta.data$nCount_RNA)

In [None]:
# Dimensions of the object, for comparison with the count printed above.
dim(completeReference)

In [None]:
# Distribution of the recomputed per-cell totals.
summary(new_nCount_RNA[,1])

In [None]:
# Distribution of the nCount_RNA values stored in the metadata.
summary(completeReference@meta.data$nCount_RNA)

In [None]:
# First rows of the per-cell metadata.
head(completeReference@meta.data)

### remove low count cells

In [None]:
# Per-ClusterTop cell count, plus mean and standard deviation of nCount_RNA and
# nFeature_RNA. Each threshold column is the mean minus one standard deviation.
getSummary <- completeReference@meta.data %>%
  group_by(ClusterTop) %>%
  summarize(
    n = n(),
    mean_nCount = mean(nCount_RNA),
    sd_nCount = sd(nCount_RNA),
    nCount_threshold = mean_nCount - sd_nCount,
    mean_nFeature = mean(nFeature_RNA),
    sd_nFeature = sd(nFeature_RNA),
    nFeature_threshold = mean_nFeature - sd_nFeature
  )
getSummary

# Average of the two threshold columns across the ClusterTop groups.
getSummary %>%
  select(nCount_threshold, nFeature_threshold) %>%
  colMeans()

In [None]:
# Distribution of nCount_RNA within each ClusterTop group.
# The original call, summary(nCount_RNA, ClusterTop), ignored its second
# argument and returned a single overall summary; tapply() applies summary()
# to each group.
tapply(
  completeReference@meta.data$nCount_RNA,
  completeReference@meta.data$ClusterTop,
  summary
)

# Overall distribution of nFeature_RNA.
summary(completeReference@meta.data$nFeature_RNA)

In [None]:
# Histogram of nCount_RNA, one panel per ClusterTop group.
options(repr.plot.width = 20, repr.plot.height = 20)
ggplot(data = completeReference@meta.data) +
  geom_histogram(mapping = aes(x = nCount_RNA, fill = ClusterTop)) +
  facet_wrap(vars(ClusterTop)) +
  theme_minimal(base_size = 30) +
  ggthemes::scale_fill_colorblind()

In [None]:
# Histogram of nFeature_RNA, one panel per ClusterTop group.
options(repr.plot.width = 20, repr.plot.height = 20)
ggplot(data = completeReference@meta.data) +
  geom_histogram(mapping = aes(x = nFeature_RNA, fill = ClusterTop)) +
  facet_wrap(vars(ClusterTop)) +
  theme_minimal(base_size = 30) +
  ggthemes::scale_fill_colorblind()

In [None]:
# nCount_RNA against nFeature_RNA on log10 axes, one panel per ClusterTop group
# with gghighlight applied. Red lines mark 30 features / 60 counts and blue
# lines mark 15 features / 50 counts as candidate QC cut-offs.
options(repr.plot.width = 20, repr.plot.height = 20)
ggplot(data = completeReference@meta.data) +
  geom_point(
    mapping = aes(x = nFeature_RNA, y = nCount_RNA, color = ClusterTop),
    shape = '.',
    alpha = 0.5
  ) +
  geom_vline(xintercept = 30, color = 'red') +
  geom_hline(yintercept = 60, color = 'red') +
  geom_vline(xintercept = 15, color = 'blue') +
  geom_hline(yintercept = 50, color = 'blue') +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(vars(ClusterTop)) +
  gghighlight() +
  ggthemes::scale_color_colorblind() +
  theme_minimal(base_size = 30) +
  # Draw legend keys as large filled circles so the colours are readable.
  guides(
    color = guide_legend(override.aes = list(size = 16, shape = 16))
  )

In [None]:
# Same scatter as above with all ClusterTop groups in one panel. Red lines mark
# 30 features / 60 counts and blue lines mark 15 features / 50 counts.
options(repr.plot.width = 20, repr.plot.height = 20)
ggplot(data = completeReference@meta.data) +
  geom_point(
    mapping = aes(x = nFeature_RNA, y = nCount_RNA, color = ClusterTop),
    shape = '.',
    alpha = 0.5
  ) +
  geom_vline(xintercept = 30, color = 'red') +
  geom_hline(yintercept = 60, color = 'red') +
  geom_vline(xintercept = 15, color = 'blue') +
  geom_hline(yintercept = 50, color = 'blue') +
  scale_x_log10() +
  scale_y_log10() +
  ggthemes::scale_color_colorblind() +
  theme_minimal(base_size = 30) +
  # Draw legend keys as large filled circles so the colours are readable.
  guides(
    color = guide_legend(override.aes = list(size = 16, shape = 16))
  )

In [None]:
# Keep cells with more than 15 detected features and more than 50 counts
# (the blue lines in the scatter plots above).
completeReference <- subset(
  completeReference,
  subset = nFeature_RNA > 15 & nCount_RNA > 50
)

# Use the batch ID as the biosample identifier and as orig.ident.
completeReference@meta.data$biosample_id <- completeReference@meta.data$batchID
completeReference@meta.data$orig.ident <- completeReference@meta.data$biosample_id

completeReference

In [None]:
# Save the panel-restricted, filtered reference.
readr::write_rds(completeReference, 'pelka_dataset_with_merfish_genes.rds')

In [None]:
# Reload the saved reference (allows resuming the notebook from this point).
completeReference = readr::read_rds('pelka_dataset_with_merfish_genes.rds')

In [None]:
# First rows of the metadata.
head(completeReference@meta.data)

# Cells with a missing biosample_id or orig.ident; both tables should be empty.
filter(completeReference@meta.data, is.na(biosample_id))
filter(completeReference@meta.data, is.na(orig.ident))

In [None]:
# Recompute the total counts per cell for the filtered reference.
# `layer =` replaces the deprecated `slot =` argument and matches the
# GetAssayData(..., layer = 'counts') call used later in this notebook.
new_nCount_RNA <- data.frame(
  new_nCount_RNA = colSums(GetAssayData(completeReference, layer = 'counts'))
)
head(new_nCount_RNA)

In [None]:
# First rows of the metadata, for comparison with the recomputed totals above.
head(completeReference@meta.data)

In [None]:
# Show the working directory that the relative .rds paths resolve against.
getwd()

In [None]:
# Print the object summary after filtering.
completeReference

In [None]:
# Distribution of nCount_RNA after filtering.
summary(completeReference@meta.data$nCount_RNA)

In [None]:
# Distribution of nFeature_RNA after filtering.
summary(completeReference@meta.data$nFeature_RNA)

# Find DEGs between cell lineages in the scRNA dataset

In [None]:
# Load the panel-restricted, filtered reference under a separate name for the
# differential-expression analysis, and print its summary.
completeReference_merfishGenes = readr::read_rds('pelka_dataset_with_merfish_genes.rds')
completeReference_merfishGenes

In [None]:
# Select genes whose counts take more than three distinct values across cells.
temp <- GetAssayData(completeReference_merfishGenes, layer = 'counts')
isVarying <- apply(temp, 1, function(x) length(unique(x)) > 3)

# Index the row names directly. The original rownames(temp[mask, ]) copied the
# subset and returned NULL when exactly one gene passed, because the
# single-row subset is dropped to a vector.
varyingGenes <- rownames(temp)[isVarying]
rm(temp, isVarying)
length(varyingGenes)

In [None]:
# This installs the package from GitHub without updating other packages
# (the `glmm` branch of immunogenomics/presto).
# NOTE(review): presto is already attached in the setup cell at the top of this
# notebook; a newly installed version presumably only takes effect after the
# kernel is restarted -- confirm before running the cells below.
remotes::install_github("immunogenomics/presto@glmm", upgrade = "never")

In [None]:
# Keep only the varying genes selected above.
completeReference_merfishGenes <- completeReference_merfishGenes[varyingGenes, ]

# Collapse the single-cell counts with presto::collapse_counts, grouping by
# orig.ident, fov and ClusterTop with a minimum of 3 cells per group.
# The counts are requested with `layer = 'counts'`: a positional 'counts' binds
# to `assay` under the GetAssayData signature this notebook uses elsewhere
# (layer = 'counts').
# NOTE(review): no `fov` column is created in the visible part of this
# notebook -- confirm the reference metadata actually contains one.
pb <- presto::collapse_counts(
    GetAssayData(completeReference_merfishGenes, layer = 'counts'),
    completeReference_merfishGenes@meta.data,
    c('orig.ident', 'fov', 'ClusterTop'),
    min_cells_per_group = 3
)

# Align exprs_norm to the row and column order of counts_mat.
pb$exprs_norm <- pb$exprs_norm[rownames(pb$counts_mat), colnames(pb$counts_mat)]

In [None]:
    # Fit a presto GLMM on pseudobulked counts for one object and write the
    # fitted model plus the per-ClusterTop marginal effects to disk.
    # NOTE(review): `objects` and `i` are not defined anywhere in the visible
    # part of this notebook; this cell looks like the body of a loop over file
    # paths -- define them (or wrap this in the loop) before running.
    obj <- readr::read_rds(objects[i])

    # Keep genes whose counts take more than three distinct values.
    # `layer = 'counts'` is named explicitly: a positional 'counts' binds to
    # `assay` under the GetAssayData signature used elsewhere in this notebook.
    temp <- GetAssayData(obj, layer = 'counts')
    isVarying <- apply(temp, 1, function(x) length(unique(x)) > 3)
    # Index the row names directly so a single passing gene is not lost to
    # dimension dropping.
    varyingGenes <- rownames(temp)[isVarying]
    rm(temp, isVarying)

    obj <- obj[varyingGenes, ]

    # Collapse the counts by orig.ident, fov and ClusterTop (at least 3 cells
    # per group).
    pb <- presto::collapse_counts(
        GetAssayData(obj, layer = 'counts'),
        obj@meta.data,
        c('orig.ident', 'fov', 'ClusterTop'),
        min_cells_per_group = 3
    )
    # Align exprs_norm to the row and column order of counts_mat.
    pb$exprs_norm <- pb$exprs_norm[rownames(pb$counts_mat), colnames(pb$counts_mat)]

    # Poisson mixed model with random intercepts for ClusterTop and for
    # ClusterTop nested in fov, and logUMI as offset. system.time() reports how
    # long the fit takes.
    system.time({
    presto_res <- presto::presto.presto(
        y ~ 1 + (1|ClusterTop) +  (1|fov/ClusterTop) + offset(logUMI),
        pb$meta_data,
        pb$counts_mat,
        size_varname = "logUMI",
        effects_cov = 'ClusterTop',
        ncore = 1,
        min_sigma = .05,
        family = "poisson",
        nsim = 1000
    )})

    # Derive the output path from the input path.
    filename <- gsub(x = objects[i], pattern = "annotated_", replacement = "coarse_glmm_")
    # If the input path does not contain "annotated_", the derived name equals
    # the input path and writing would overwrite the source object.
    if (isTRUE(filename == objects[i])) {
        stop(
            "Output file name equals input path '", objects[i],
            "'; refusing to overwrite the input object.",
            call. = FALSE
        )
    }
    filename
    readr::write_rds(presto_res, filename)

    # One contrast per ClusterTop level.
    contrasts_mat <- make_contrast.presto(
        presto_res,
        var_contrast = 'ClusterTop'
    )

    effects_marginal <- contrasts.presto(
            presto_res,
            contrasts_mat,
            one_tailed = TRUE
        ) %>%
        dplyr::mutate(cluster = contrast) %>%
        dplyr::mutate(
            logFC = sign(beta) * log2(exp(abs(beta))), # convert stats to log2 for interpretability
            SD = log2(exp(sigma)),
            zscore = logFC / SD
        ) %>%
        arrange(pvalue)

    # Benjamini-Hochberg FDR. Exact zeros are replaced by the smallest non-zero
    # FDR so that -log10 stays finite.
    effects_marginal$fdr <- p.adjust(effects_marginal$pvalue, method = 'BH')
    effects_marginal$corr_fdr <- effects_marginal$fdr
    effects_marginal$corr_fdr[effects_marginal$fdr == 0] <- min(effects_marginal$fdr[effects_marginal$fdr != 0])
    effects_marginal$`-log10_fdr` <- (-1) * log10(effects_marginal$corr_fdr)

    # Replace only a trailing ".rds" extension. The original pattern ".rds" was
    # an unescaped regex that matched any character followed by "rds", anywhere
    # in the path.
    effects_marginal %>%
        fwrite(file = sub("\\.rds$", "_marginal_effects.csv", filename))