In [None]:
# PREPROCESSING SCRIPT
# This takes in the spliced and unspliced matrices, makes a summed count matrix restricted to the cells called by emptyDrops,
# makes Seurat objects, adds the mitochondrial (MT) and chloroplast (CP) percentages, and then saves the objects in seurat1.

In [3]:
# Install any dependency that is not available yet. Packages come from three
# places: CRAN, Bioconductor (through BiocManager) and GitHub (through
# remotes).
#
# `remotes` is part of the CRAN list because the DoubletFinder branch below
# calls remotes::install_github(); without it that call fails on a machine
# where remotes was never installed. BiocManager stays first so that it is
# present before the Bioconductor installs run.
cran_pkgs <- c(
  "BiocManager", "remotes", "here", "ggplot2", "ggpointdensity",
  "scico", "reticulate", "future", "tidyverse"
)
for (pkg in cran_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

bioc_pkgs <- c("DropletUtils", "BUSpaRse")
for (pkg in bioc_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    BiocManager::install(pkg)
  }
}

if (!requireNamespace("DoubletFinder", quietly = TRUE)) {
  remotes::install_github("chris-mcginnis-ucsf/DoubletFinder")
}
#devtools::install_github(repo = 'satijalab/seurat', ref = 'develop')
# if (!requireNamespace("COPILOT", quietly = TRUE))
#     devtools::install_github('Hsu-Che-Wei/COPILOT')


In [None]:
library(BUSpaRse)
library(here)
library(Matrix)
library(tidyverse)
library(Seurat)
library(ggpointdensity)
library(scico)
library(scales)
library(DropletUtils)
library(reticulate)
library(repr)
library(DoubletFinder)
library(future)
# Raise the size limit for globals handled by the future framework
# (100000 * 1024^2 bytes).
options(future.globals.maxSize = 100000 * 1024^2)

# Read the protoplasting table and keep, as character strings, the entries of
# its `genes` column whose absolute logFC is greater than 4.
proto_genes <- read.csv("../data/bulk_data/protoplasting.csv")
proto_list <- as.character(proto_genes$genes[abs(proto_genes$logFC) > 4])

# Slightly modified from BUSpaRse, just to avoid installing a few dependencies
# not used here.
#
#' Read a count matrix together with its gene and barcode labels
#'
#' Reads `<dir>/<name>.mtx`, `<dir>/<name>.genes.txt` and
#' `<dir>/<name>.barcodes.txt`. The matrix stored on disk has cells in rows,
#' so it is transposed before the labels are attached.
#'
#' @param dir Directory that holds the three files; it must exist.
#' @param name File name stem shared by the three files, e.g. "spliced".
#' @return A `dgCMatrix` whose row names are the lines of the genes file and
#'   whose column names are the lines of the barcodes file.
read_count_output <- function(dir, name) {
  dir <- normalizePath(dir, mustWork = TRUE)
  stem <- file.path(dir, name)

  # The matrix read has cells in rows; transpose it to genes x cells.
  m <- Matrix::t(Matrix::readMM(paste0(stem, ".mtx")))
  m <- as(m, "dgCMatrix")

  # Hand the paths straight to readLines() so it opens and fully releases
  # each connection itself. readLines(file(path)) leaves a closed but
  # undestroyed connection behind, which R later reports as
  # "closing unused connection" when it is garbage collected.
  genes <- readLines(paste0(stem, ".genes.txt"))
  barcodes <- readLines(paste0(stem, ".barcodes.txt"))

  colnames(m) <- barcodes
  rownames(m) <- genes
  m
}


In [4]:
# read_count_output() is defined in an earlier cell of this notebook; run that
# cell before this one, because it is needed to read the data.
# Switch the working directory to the project root reported by here().
setwd(here())

# WT1 (sc_26): spliced and unspliced counts
wt1_spliced <- read_count_output("../data/sc_26", "spliced")
wt1_unspliced <- read_count_output("../data/sc_26", "unspliced")

# WT2 (sc_67): spliced and unspliced counts
wt2_spliced <- read_count_output("../data/sc_67", "spliced")
wt2_unspliced <- read_count_output("../data/sc_67", "unspliced")

# MUT1 (sc_27): spliced and unspliced counts
mut1_spliced <- read_count_output("../data/sc_27", "spliced")
mut1_unspliced <- read_count_output("../data/sc_27", "unspliced")

# MUT2 (sc_68): spliced and unspliced counts
mut2_spliced <- read_count_output("../data/sc_68", "spliced")
mut2_unspliced <- read_count_output("../data/sc_68", "unspliced")

In [5]:
# Pull out only Arabidopsis genes: keep the rows whose gene ID contains the
# literal string "AT", and always return a matrix (drop = FALSE).
# NOTE(review): the match is not anchored to the start of the ID; confirm that
# no non-Arabidopsis feature name contains "AT".
keep_arabidopsis <- function(counts) {
  has_at <- grepl("AT", rownames(counts), fixed = TRUE)
  counts[has_at, , drop = FALSE]
}

# WT1
wt1_spliced_arab <- keep_arabidopsis(wt1_spliced)
wt1_unspliced_arab <- keep_arabidopsis(wt1_unspliced)

# WT2
wt2_spliced_arab <- keep_arabidopsis(wt2_spliced)
wt2_unspliced_arab <- keep_arabidopsis(wt2_unspliced)

# MUT1
mut1_spliced_arab <- keep_arabidopsis(mut1_spliced)
mut1_unspliced_arab <- keep_arabidopsis(mut1_unspliced)

# MUT2
mut2_spliced_arab <- keep_arabidopsis(mut2_spliced)
mut2_unspliced_arab <- keep_arabidopsis(mut2_unspliced)

In [11]:
# Run emptyDrops here, on the spliced matrices. Doing it at this point is
# better than doing it after the next step, which is tantamount to weak
# filtering.
#
# Only rows whose gene ID matches "AT[1-5]" are passed in, which leaves out
# the "ATM" / "ATC" IDs that are summarised later as percent.mito and
# percent.cp.
#
# emptyDrops() estimates its p-values by Monte Carlo simulation, so without a
# fixed seed the set of called cells changes from run to run. The seed is set
# immediately before every call so that each sample gives the same result
# whether the cell is run as a whole or one line at a time.
empty_drops_seed <- 100
empty_drops_input <- function(counts) {
  counts[grepl(pattern = "AT[1-5]", rownames(counts)), , drop = FALSE]
}

set.seed(empty_drops_seed)
wt1_empty_drops <- emptyDrops(
  empty_drops_input(wt1_spliced_arab), ignore = 25, lower = 125
)

set.seed(empty_drops_seed)
wt2_empty_drops <- emptyDrops(
  empty_drops_input(wt2_spliced_arab), ignore = 25, lower = 125
)

set.seed(empty_drops_seed)
mut1_empty_drops <- emptyDrops(
  empty_drops_input(mut1_spliced_arab), ignore = 25, lower = 125
)

set.seed(empty_drops_seed)
mut2_empty_drops <- emptyDrops(
  empty_drops_input(mut2_spliced_arab), ignore = 25, lower = 125
)

In [13]:
# Number of barcodes per sample with an emptyDrops FDR below 0.001
# (barcodes whose FDR is NA are not counted).
length(which(wt1_empty_drops$FDR < 0.001))
length(which(wt2_empty_drops$FDR < 0.001))
length(which(mut1_empty_drops$FDR < 0.001))
length(which(mut2_empty_drops$FDR < 0.001))

In [14]:
# Build the summed spliced + unspliced matrix for every sample. Real cells are
# assumed to be present in both matrices, so the barcodes kept are those found
# in the spliced matrix, in the unspliced matrix, and among the emptyDrops
# calls with a non-missing FDR below 0.001.

# WT1
called <- rownames(wt1_empty_drops)[which(wt1_empty_drops$FDR < 0.001)]
shared <- Reduce(
  intersect,
  list(colnames(wt1_spliced_arab), colnames(wt1_unspliced_arab), called)
)
wt1_combined <- wt1_spliced_arab[, shared] + wt1_unspliced_arab[, shared]

# WT2
called <- rownames(wt2_empty_drops)[which(wt2_empty_drops$FDR < 0.001)]
shared <- Reduce(
  intersect,
  list(colnames(wt2_spliced_arab), colnames(wt2_unspliced_arab), called)
)
wt2_combined <- wt2_spliced_arab[, shared] + wt2_unspliced_arab[, shared]

# MUT1
called <- rownames(mut1_empty_drops)[which(mut1_empty_drops$FDR < 0.001)]
shared <- Reduce(
  intersect,
  list(colnames(mut1_spliced_arab), colnames(mut1_unspliced_arab), called)
)
mut1_combined <- mut1_spliced_arab[, shared] + mut1_unspliced_arab[, shared]

# MUT2
called <- rownames(mut2_empty_drops)[which(mut2_empty_drops$FDR < 0.001)]
shared <- Reduce(
  intersect,
  list(colnames(mut2_spliced_arab), colnames(mut2_unspliced_arab), called)
)
mut2_combined <- mut2_spliced_arab[, shared] + mut2_unspliced_arab[, shared]

# Plan: combined spliced/unspliced count matrices for all samples, restricted
# to the emptyDrops-called cells (done above); doublet removal comes later.

In [15]:
# Wrap each combined count matrix in a Seurat object, keeping only features
# detected in at least 3 cells (min.cells = 3).
wt_1_seu <- CreateSeuratObject(counts = wt1_combined, min.cells = 3)
wt_2_seu <- CreateSeuratObject(counts = wt2_combined, min.cells = 3)
mut_1_seu <- CreateSeuratObject(counts = mut1_combined, min.cells = 3)
mut_2_seu <- CreateSeuratObject(counts = mut2_combined, min.cells = 3)

# Show a summary of each object.
wt_1_seu
wt_2_seu
mut_1_seu
mut_2_seu

An object of class Seurat 
22966 features across 8894 samples within 1 assay 
Active assay: RNA (22966 features, 0 variable features)

An object of class Seurat 
22946 features across 8756 samples within 1 assay 
Active assay: RNA (22946 features, 0 variable features)

An object of class Seurat 
22617 features across 12319 samples within 1 assay 
Active assay: RNA (22617 features, 0 variable features)

An object of class Seurat 
22676 features across 8392 samples within 1 assay 
Active assay: RNA (22676 features, 0 variable features)

In [17]:
# Add two metadata columns to a Seurat object, computed on the RNA assay:
#   percent.mito - percentage of counts from features matching "ATM"
#   percent.cp   - percentage of counts from features matching "ATC"
add_organelle_percentages <- function(seu) {
  seu <- PercentageFeatureSet(
    seu, pattern = "ATM", col.name = "percent.mito", assay = "RNA"
  )
  PercentageFeatureSet(
    seu, pattern = "ATC", col.name = "percent.cp", assay = "RNA"
  )
}

wt_1_seu <- add_organelle_percentages(wt_1_seu)
wt_2_seu <- add_organelle_percentages(wt_2_seu)
mut_1_seu <- add_organelle_percentages(mut_1_seu)
mut_2_seu <- add_organelle_percentages(mut_2_seu)

In [18]:
# Write the four Seurat objects (emptyDrops-called cells, with percent.mito
# and percent.cp added) to ../data/seurat1. No protoplast-gene, doublet or
# high-mito filtering has been applied in this script.
saveRDS(object = wt_1_seu,
        file = "../data/seurat1/sc_26_seu_1_ED_no_min_features.rds")
saveRDS(object = wt_2_seu,
        file = "../data/seurat1/sc_67_seu_1_ED_no_min_features.rds")
saveRDS(object = mut_1_seu,
        file = "../data/seurat1/sc_27_seu_1_ED_no_min_features.rds")
saveRDS(object = mut_2_seu,
        file = "../data/seurat1/sc_68_seu_1_ED_no_min_features.rds")