In [1]:
#THIS SCRIPT WILL PERFORM EMPTYDROPS ANALYSIS, SEURAT OBJECT CREATION, AND MULTIPLET IDENTIFICATION

suppressMessages(library(BUSpaRse))
suppressMessages(library(Matrix))
suppressMessages(library(tidyverse))
suppressMessages(library(Seurat))
suppressMessages(library(DropletUtils))
suppressMessages(library(DoubletFinder))

# Load the bulk protoplasting table and keep the names of the genes whose
# absolute logFC is greater than 2.
proto_genes <- read.csv("../data/bulk_data/protoplasting.csv")
proto_list <- as.character(
  proto_genes$genes[abs(proto_genes$logFC) > 2]
)

# Slightly modified from BUSpaRse, just to avoid installing a few dependencies not used here
#
# Read one count matrix stored as three files inside `dir`:
#   <name>.mtx, <name>.genes.txt and <name>.barcodes.txt.
#
# Arguments:
#   dir   Directory holding the three files. It must exist; normalizePath()
#         raises an error otherwise.
#   name  Prefix shared by the three files, e.g. "spliced".
#
# Returns:
#   A dgCMatrix that is the transpose of the matrix stored on disk, with row
#   names read from the genes file and column names read from the barcodes
#   file.
read_count_output <- function(dir, name) {
  dir <- normalizePath(dir, mustWork = TRUE)
  prefix <- file.path(dir, name)

  # The matrix read has cells in rows, so transpose it to genes x cells.
  m <- Matrix::readMM(paste0(prefix, ".mtx"))
  m <- Matrix::t(m)
  m <- as(m, "dgCMatrix")

  # Pass the paths straight to readLines(): it opens and closes the file
  # itself. Wrapping them in file() leaves unclosed connection objects behind
  # that R later reports as "closing unused connection".
  genes <- readLines(paste0(prefix, ".genes.txt"))
  barcodes <- readLines(paste0(prefix, ".barcodes.txt"))

  colnames(m) <- barcodes
  rownames(m) <- genes
  m
}


In [2]:
# Print the R version, platform and attached packages of this session.
sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 20.04.2 LTS

Matrix products: default
BLAS/LAPACK: /home/robotmessenger810/anaconda3/envs/r_3/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] DoubletFinder_2.0.3         DropletUtils_1.6.1         
 [3] SingleCellExperiment_1.8.0  SummarizedExperiment_1.16.1
 [5] DelayedArray_0.12.3         BiocParallel_1.20.1        
 [7] matrixStats_0.61.0          Biobase_2.46.0             
 [9] GenomicRanges_1.38.0

In [2]:
# Entries directly inside the scKB output directory: `files` carries the
# directory prefix, `files_base` holds the bare entry names in the same order.
files <- list.files(
  path = "../data/scKB_outs/",
  full.names = TRUE,
  recursive = FALSE
)
files_base <- basename(files)

In [None]:
#prevent warnings from printing
# NOTE(review): this silences every warning raised inside the loop below; the
# previous setting is kept in defaultW and put back once the loop finishes.
defaultW <- getOption("warn")
options(warn = -1)

seu_list <- list()

#loop through all files and perform empty drops quantification and make seurat objects
# seq_along() keeps the loop from running on indices 1 and 0 when `files` is empty.
for (i in seq_along(files)) {
    sample <- files[i]

    #read in spliced matrix and retain only Arabidopsis gene counts (important for species mixing experiments)
    spliced <- read_count_output(sample, "spliced")
    spliced <- spliced[grepl("AT", rownames(spliced), fixed = TRUE), , drop = FALSE]

    #read in unspliced matrix and retain only Arabidopsis gene counts
    unspliced <- read_count_output(sample, "unspliced")
    unspliced <- unspliced[grepl("AT", rownames(unspliced), fixed = TRUE), , drop = FALSE]

    #Find barcodes identified in both spliced and unspliced count matrices
    shared <- intersect(colnames(spliced), colnames(unspliced))

    #filter to barcodes present in both spliced and unspliced matrices
    spliced <- spliced[, shared]
    unspliced <- unspliced[, shared]

    combined <- spliced + unspliced

    #run empty drops on combined count matrix
    # Only rows whose name matches "AT[1-5]" are passed to emptyDrops. The row
    # mask is built from the matrix it is applied to.
    nuclear_rows <- grepl(pattern = "AT[1-5]", rownames(combined))
    empty_drops <- emptyDrops(combined[nuclear_rows, , drop = FALSE], ignore = 500, lower = 300)

    #We take all cells from spliced and unspliced that were called by emptydrops, and then we sum them into combined matrix
    # Barcodes with an NA FDR are dropped together with those at FDR >= .001.
    called <- !is.na(empty_drops$FDR) & empty_drops$FDR < .001
    shared <- intersect(shared, rownames(empty_drops)[called])

    spliced <- spliced[, shared]
    unspliced <- unspliced[, shared]

    combined <- spliced + unspliced

    #create seurat object
    seu_obj <- CreateSeuratObject(combined, min.cells = 3)

    #add spliced and unspliced as assays
    seu_obj[["spliced"]] <- CreateAssayObject(counts = spliced)
    seu_obj[["unspliced"]] <- CreateAssayObject(counts = unspliced)

    #mito and plastid read percent
    seu_obj <- PercentageFeatureSet(seu_obj, pattern = "ATM", col.name = "percent.mito", assay = "RNA")
    seu_obj <- PercentageFeatureSet(seu_obj, pattern = "ATC", col.name = "percent.cp", assay = "RNA")

    #doublet finder
    # The preprocessing below is done on a copy (df_seu) so that seu_obj keeps
    # only the counts, the extra assays and the metadata added in this loop.
    df_seu <- NormalizeData(seu_obj)
    df_seu <- FindVariableFeatures(df_seu, selection.method = "vst", nfeatures = 2000)
    df_seu <- ScaleData(df_seu)
    df_seu <- RunPCA(df_seu)
    df_seu <- RunTSNE(df_seu, dims = 1:15)

    #simple approximate expected doublet rate based on equation from 10x data: doublet_percent = .004/500 * #_cells
    # Multiplying that rate by the number of cells gives the expected doublet count.
    n_cells <- nrow(df_seu@meta.data)
    nExp_poi <- round((0.004 / 500 * n_cells) * n_cells)

    sweep.sweep.df_seu <- paramSweep_v3(df_seu, PCs = 1:15, sct = FALSE)
    sweep.stats_df_seu <- summarizeSweep(sweep.sweep.df_seu, GT = FALSE)
    bcmvn_sweep.df_seu <- find.pK(sweep.stats_df_seu)

    # Pick the pK with the highest BCmetric. double(x) would allocate a zero
    # vector of length x instead of converting the value, so convert
    # explicitly; going through character also yields the printed value
    # rather than a level code if the column is stored as a factor.
    best_pk <- which.max(bcmvn_sweep.df_seu$BCmetric)
    pK <- as.numeric(as.character(bcmvn_sweep.df_seu$pK[best_pk]))
    df_seu <- doubletFinder_v3(df_seu, PCs = 1:15, pN = 0.25, pK = pK, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)

    # Look the classification column up by its name prefix instead of assuming
    # it is the last metadata column, then keep singlets with < 10% mito reads.
    # df_seu was derived from seu_obj in this iteration, so the two objects
    # list the same cells in the same order.
    df_class_col <- tail(grep("^DF.classifications", colnames(df_seu@meta.data), value = TRUE), 1)
    stopifnot(length(df_class_col) == 1)
    keep <- which(seu_obj@meta.data[["percent.mito"]] < 10 &
                  df_seu@meta.data[[df_class_col]] == "Singlet")
    seu_obj <- subset(seu_obj, cells = colnames(seu_obj)[keep])

    #set original experiment
    seu_obj@meta.data$orig.ident <- files_base[i]

    #set genotype
    if (files_base[i] %in% c("sc_101", "sc_103", "sc_26_combined", "sc_67", "sc_69")) {
        seu_obj@meta.data$geno <- "WT"
    } else {
        seu_obj@meta.data$geno <- "mutant"
    }

    #set experiment
    if (files_base[i] %in% c("sc_101", "sc_102", "sc_103", "sc_104", "sc_69", "sc_70")) {
        seu_obj@meta.data$experiment <- "sorted"
    } else {
        seu_obj@meta.data$experiment <- "nonsorted"
    }

    print(sample)
    print("original # cells: ")
    print(n_cells)
    print("singlet # cells: ")
    print(nrow(seu_obj@meta.data))

    saveRDS(seu_obj, file = paste0("../data/seurat_objects/seurat_raw_1_4_22/", files_base[i], ".rds"))
}

# Put the warning level back to what it was before the loop.
options(warn = defaultW)