Here are my ideas:
- Run emptyDrops on the spliced matrices.
- Generate "raw" Seurat objects with spliced counts, unspliced counts, and combined counts.

Fig. 1) exploration of phenotype and UMAP

Fig. 2) Pathway analysis

Fig. 3) comparison with bulk data

Fig. 4) transdifferentiation


In [None]:
library(BUSpaRse)
library(here)
library(Matrix)
library(tidyverse)
library(Seurat)
library(ggpointdensity)
library(scico)
library(scales)
library(DropletUtils)
library(reticulate)
library(repr)
library(DoubletFinder)
library(future)
options(future.globals.maxSize = 100000 * 1024^2)

# Load the protoplasting table (columns `genes` and `logFC` are used here) and
# keep the names of genes whose absolute log fold change is greater than 4.
proto_genes <- read.csv("../data/bulk_data/protoplasting.csv")
proto_list <- as.character(proto_genes$genes[abs(proto_genes$logFC) > 4])

# Slightly modified from BUSpaRse, just to avoid installing a few dependencies not used here
#
# Read one count matrix stored as <dir>/<name>.mtx together with its label
# files <dir>/<name>.genes.txt and <dir>/<name>.barcodes.txt.
#
# Args:
#   dir:  directory containing the three files; an error is raised if it does
#         not exist.
#   name: file prefix shared by the three files (e.g. "spliced").
#
# Returns:
#   A dgCMatrix with genes as row names and barcodes as column names.
read_count_output <- function(dir, name) {
  dir <- normalizePath(dir, mustWork = TRUE)
  prefix <- paste0(dir, "/", name)
  # The matrix read has cells in rows, so transpose it to genes x cells
  m <- readMM(paste0(prefix, ".mtx"))
  m <- as(Matrix::t(m), "dgCMatrix")
  # Hand the paths directly to readLines(): it opens and closes its own
  # connection, whereas wrapping the path in file() leaves a connection object
  # behind until the garbage collector reclaims it.
  genes <- readLines(paste0(prefix, ".genes.txt"))
  barcodes <- readLines(paste0(prefix, ".barcodes.txt"))
  colnames(m) <- barcodes
  rownames(m) <- genes
  m
}

# Switch the working directory to the path returned by here(); the relative
# "../data/..." paths used after this point are resolved against it.
setwd(here())

In [3]:
# Entries directly under the scKB output folder: `files` holds the full paths,
# `files_base` the matching bare names in the same order.
files <- list.files(path = "../data/scKB_outs/", full.names = TRUE, recursive = FALSE)
files_base <- basename(files)

In [3]:
# Show the sample names found under ../data/scKB_outs/
print(files_base)

 [1] "sc_101"         "sc_102"         "sc_103"         "sc_104"        
 [5] "sc_26_combined" "sc_27_combined" "sc_67"          "sc_68"         
 [9] "sc_69"          "sc_70"         


In [None]:
# Build one filtered Seurat object per sample and save it as an .rds file.
#
# For every entry of `files` the loop:
#   1. reads the spliced and unspliced matrices and keeps rows whose gene ID
#      contains "AT",
#   2. calls cells with emptyDrops on the summed (spliced + unspliced) counts,
#   3. creates a Seurat object from the summed counts, with the spliced and
#      unspliced counts attached as extra assays,
#   4. flags doublets with DoubletFinder and keeps singlets with < 10 %
#      mitochondrial reads,
#   5. annotates sample, genotype and experiment type and writes the object.

#prevent warnings from printing
defaultW <- getOption("warn")
options(warn = -1)

seu_list <- list()

# Sample annotations, hoisted out of the loop because they never change
wt_samples <- c("sc_101", "sc_103", "sc_26_combined", "sc_67", "sc_69")
sorted_samples <- c("sc_101", "sc_102", "sc_103", "sc_104", "sc_69", "sc_70")

# saveRDS() does not create missing directories; make sure the target exists
# before any expensive per-sample work is done
out_dir <- "../data/seurat_objects/seurat_raw_3_11_21/"
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

#loop through all files and perform empty drops quantification and make seurat objects
for (i in seq_along(files)) {
    sample <- files[i]
    sample_name <- files_base[i]

    #read in spliced matrix and retain only Arabidopsis gene counts (important for species mixing experiments)
    # NOTE(review): the match is unanchored, so any gene ID containing "AT"
    # anywhere is kept - confirm this cannot let other species' genes through
    spliced <- read_count_output(sample, "spliced")
    spliced <- spliced[grepl("AT", rownames(spliced), fixed = TRUE), , drop = FALSE]

    #read in unspliced matrix and retain only Arabidopsis gene counts
    unspliced <- read_count_output(sample, "unspliced")
    unspliced <- unspliced[grepl("AT", rownames(unspliced), fixed = TRUE), , drop = FALSE]

    #restrict both matrices to the barcodes they have in common and sum them
    shared <- intersect(colnames(spliced), colnames(unspliced))
    spliced <- spliced[, shared, drop = FALSE]
    unspliced <- unspliced[, shared, drop = FALSE]
    combined <- spliced + unspliced

    #run empty drops on the combined matrix, using only genes whose ID matches AT[1-5]
    #(this leaves out the ATM/ATC IDs that are counted as mito/plastid below)
    chromosomal <- grepl(pattern = "AT[1-5]", rownames(spliced))
    empty_drops <- emptyDrops(combined[chromosomal, , drop = FALSE], ignore = 500, lower = 300)

    #keep only the barcodes that emptyDrops called as cells (FDR < 0.001; barcodes
    #that were not tested have an NA FDR and are dropped), then rebuild the sum
    is_cell <- !is.na(empty_drops$FDR) & empty_drops$FDR < .001
    shared <- intersect(shared, rownames(empty_drops)[is_cell])

    spliced <- spliced[, shared, drop = FALSE]
    unspliced <- unspliced[, shared, drop = FALSE]
    combined <- spliced + unspliced

    #create seurat object
    seu_obj <- CreateSeuratObject(combined, min.cells = 3)

    #add spliced and unspliced as assays
    seu_obj[["spliced"]] <- CreateAssayObject(counts = spliced)
    seu_obj[["unspliced"]] <- CreateAssayObject(counts = unspliced)

    #mito and plastid read percent
    seu_obj <- PercentageFeatureSet(seu_obj, pattern = "ATM", col.name = "percent.mito", assay = "RNA")
    seu_obj <- PercentageFeatureSet(seu_obj, pattern = "ATC", col.name = "percent.cp", assay = "RNA")

    #doublet finder: work on a processed copy so seu_obj keeps its raw state
    df_seu <- NormalizeData(seu_obj)
    df_seu <- FindVariableFeatures(df_seu, selection.method = "vst", nfeatures = 2000)
    df_seu <- ScaleData(df_seu)
    df_seu <- RunPCA(df_seu)
    df_seu <- RunTSNE(df_seu, dims = 1:15)

    #expected number of doublets: 7.5 % per 10,000 cells, scaled by cell count
    n_cells <- nrow(df_seu@meta.data)
    nExp_poi <- round(0.075 * (n_cells / 10000) * n_cells)

    sweep.sweep.df_seu <- paramSweep_v3(df_seu, PCs = 1:15, sct = FALSE)
    sweep.stats_df_seu <- summarizeSweep(sweep.sweep.df_seu, GT = FALSE)
    bcmvn_sweep.df_seu <- find.pK(sweep.stats_df_seu)

    #pick the pK with the highest BCmetric. The pK column must be converted via
    #as.character() to recover its numeric value; double(x) would instead
    #allocate a vector of zeros of length x.
    best_row <- which.max(bcmvn_sweep.df_seu$BCmetric)
    pK <- as.numeric(as.character(bcmvn_sweep.df_seu$pK[best_row]))
    df_seu <- doubletFinder_v3(df_seu, PCs = 1:15, pN = 0.25, pK = pK, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)

    #locate the classification column by name instead of assuming it is last,
    #and match cells by barcode rather than by position
    df_col <- grep("^DF.classifications", colnames(df_seu@meta.data), value = TRUE)
    if (length(df_col) == 0) {
        stop("No DF.classifications column found for sample ", sample_name, call. = FALSE)
    }
    df_col <- df_col[length(df_col)]
    singlets <- rownames(df_seu@meta.data)[which(df_seu@meta.data[[df_col]] == "Singlet")]
    low_mito <- rownames(seu_obj@meta.data)[which(seu_obj@meta.data$percent.mito < 10)]
    seu_obj <- subset(seu_obj, cells = intersect(low_mito, singlets))

    #original experiment
    seu_obj@meta.data$orig.ident <- sample_name

    #geno
    seu_obj@meta.data$geno <- if (sample_name %in% wt_samples) "WT" else "mutant"

    #experiment
    seu_obj@meta.data$experiment <- if (sample_name %in% sorted_samples) "sorted" else "nonsorted"

    saveRDS(seu_obj, file = paste0(out_dir, sample_name, ".rds"))
}

#restore the warning level that was saved before the loop
options(warn = defaultW)