Draft of scRNA-seq integration pipeline for joint analysis of GFP replicate samples 1-3. 

Resources: 
1) https://satijalab.org/seurat/articles/integration_introduction.html
2) CART_I1.R from Zhaoyang

In [None]:
library(Seurat)
library(data.table)
library(tidyverse)
library(ggplot2)
library(patchwork)
lapply(c("dplyr","Seurat","HGNChelper","openxlsx"), library, character.only = T)
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R"); source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")

In [None]:
# Load data (pre-normalized and scaled to regress out percent-MT).
# Each sample group (GFP, RBD, G1C) is loaded separately; the paths below
# currently point at the three RBD replicate objects.
replicate_paths <- c(
  "/oasis/tscc/scratch/mac008/seurat_objs/4RBD1_afterscale.rds",
  "/oasis/tscc/scratch/hsaha/jaesu-analysis-rds/5RBD2_afterscaling.rds",
  "/oasis/tscc/scratch/hsaha/jaesu-analysis-rds/6RBD3_afterscaling.rds"
)
g1 <- readRDS(file = replicate_paths[[1]])
g2 <- readRDS(file = replicate_paths[[2]])
g3 <- readRDS(file = replicate_paths[[3]])

In [None]:
# Identify variable features for each dataset independently.
# The replicates are collected with list() rather than c(): c() on Seurat (S4)
# objects only yields a list incidentally, whereas list() states the intent
# and cannot be changed by a c() method.
gfp.list <- list(g1, g2, g3)
gfp.list <- lapply(X = gfp.list, FUN = function(x) {
  # Return the updated object directly (the original relied on the invisible
  # value of an assignment as the function result).
  FindVariableFeatures(
    x,
    selection.method = "mean.var.plot",
    mean.cutoff = c(0.0125, 3),
    dispersion.cutoff = c(0.5, Inf)
  )
})

NOTE: The CART_I1 analysis uses SCTransform-normalized data and therefore skips the steps above.

In [None]:
# Pick the features that are repeatedly variable across the replicate
# datasets; these are passed on as the anchor features for integration.
features <- SelectIntegrationFeatures(gfp.list)

In [None]:
# Find anchors between the replicates on the selected feature set, then use
# them to build the integrated object. No other parameters are overridden.
gfp.anchors <- FindIntegrationAnchors(
  object.list = gfp.list,
  anchor.features = features
)
gfp.integrated <- IntegrateData(gfp.anchors)

NOTE: The CART_I1 analysis scales and runs PCA on each dataset, then finds integration
anchors with the reduction set to "rpca". It also sets additional parameters when creating the integrated object. This draft currently uses the default parameters from the Seurat integration tutorial.

In [None]:
# Checkpoint the integrated object to disk; the next cell reads this file back.
saveRDS(
  object = gfp.integrated,
  file = "/oasis/tscc/scratch/mac008/seurat_objs/RBD4-6_afterintegrate.rds"
)

In [None]:
# Resume from the integration checkpoint.
gfp.integrated <- readRDS("/oasis/tscc/scratch/mac008/seurat_objs/RBD4-6_afterintegrate.rds")

# Run standard workflow: scale -> PCA -> tSNE, saving a checkpoint after the
# scaling step and after the tSNE step.
gfp.integrated <- ScaleData(gfp.integrated, verbose = FALSE)

saveRDS(gfp.integrated, "/oasis/tscc/scratch/mac008/seurat_objs/RBD4-6_afterscale.rds")

gfp.integrated <- RunPCA(gfp.integrated, seed.use = 42)

# NOTE: the original call passed `dims.use = 1:16` and `do.fast = T`, which are
# Seurat v2 argument names. The integration functions used in this notebook
# require Seurat >= 3, where the argument is `dims` and `do.fast` no longer
# exists; the old names fall into `...`, so the requested 16 PCs were not
# applied and the default `dims` was used instead. Using `dims` makes the
# embedding actually use PCs 1-16, so it will differ from embeddings produced
# by the old call.
gfp.integrated <- RunTSNE(
  gfp.integrated,
  dims = 1:16,
  perplexity = 15,
  seed.use = 7777
)

saveRDS(gfp.integrated, "/oasis/tscc/scratch/mac008/seurat_objs/RBD4-6_aftertsne.rds")

In [None]:
# Reload a tSNE checkpoint and draw its tSNE embedding.
seurat <- readRDS(file = "/oasis/tscc/scratch/mac008/seurat_objs/GFP1-3_aftertsne.rds")

DimPlot(object = seurat, reduction = "tsne")

In [None]:
# ----------------------- LABEL W/ NEW MARKER GENES -----------------------
# Clusters the object loaded below, scores every cluster against a custom
# marker-gene sheet with ScType, stores the winning label per cell in the
# `customclassif` metadata column, and writes a labeled tSNE PDF plus the
# labeled object.

seurat <- readRDS("/oasis/tscc/scratch/mac008/seurat_objs/G1C10-12_aftertsne.rds")

seurat <- FindNeighbors(seurat, dims = 1:35)
seurat <- FindClusters(seurat, resolution = 5, random.seed = 42)

# Load in required files for SCType. SCType is used for labeling clusters.

# load gene set preparation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")

# load cell type annotation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")

# Set the file path for db_ to the excel file for marker genes. This file
# should be formatted correctly. For guidance on formatting, see the ScType
# reference workbook:
#   https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_short.xlsx
# (The original cell also parsed that workbook into gs_list, but the result was
# overwritten below before being used, so that extra download was removed.)

# DB file
db_ <- "/home/mac008/scrna_project/nov15-W7F/jaesudata_markergenes_nov15.xlsx"

# Set the tissue type you are working with
tissue <- "Bone marrow" # e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus

# Prepares the marker gene data for application to your results.
gs_list <- gene_sets_prepare(db_, tissue)

# Performs calculation of distinguishing cell types from marker genes. May take a minute or two.
# NOTE: scRNAseqData parameter should correspond to your input scRNA-seq matrix.
# In case Seurat is used, it is either pbmc[["RNA"]]@scale.data (default), pbmc[["SCT"]]@scale.data, in case sctransform is used for normalization,
# or pbmc[["integrated"]]@scale.data, in case a joint analysis of multiple single-cell datasets is performed.
es.max <- sctype_score(
  scRNAseqData = seurat[["integrated"]]@scale.data,
  scaled = TRUE,
  gs = gs_list$gs_positive,
  gs2 = gs_list$gs_negative
)

# Merge by cluster: sum the per-cell scores of each cell type over the cells of
# a cluster and keep the ten highest-scoring types for that cluster.
cluster_ids <- seurat@meta.data$seurat_clusters
cL_results <- do.call("rbind", lapply(unique(cluster_ids), function(cl) {
  cells_in_cl <- rownames(seurat@meta.data)[cluster_ids == cl]
  # drop = FALSE keeps a one-column matrix when a cluster holds a single cell;
  # without it the subset collapses to a vector and rowSums() errors.
  es.max.cl <- sort(
    rowSums(es.max[, cells_in_cl, drop = FALSE]),
    decreasing = TRUE
  )
  head(
    data.frame(
      cluster = cl,
      type = names(es.max.cl),
      scores = es.max.cl,
      ncells = length(cells_in_cl),
      # Keep `type` as character so "Unknown" can be assigned below on any R
      # version.
      stringsAsFactors = FALSE
    ),
    10
  )
}))

# Keep the top-scoring type per cluster (ties can yield several rows for one
# cluster; only the first is used when labels are assigned below).
sctype_scores <- cL_results %>%
  group_by(cluster) %>%
  top_n(n = 1, wt = scores) %>%
  ungroup()

# Performs quality check by setting clusters that were not easily
# distinguishable as "Unknown": a cluster whose best score is below a quarter
# of its cell count is treated as low-confidence.
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells / 4] <- "Unknown"

print(sctype_scores[, 1:3])

# Changes the metadata of the seurat object to include the cell types
# discovered in the previous steps.
seurat@meta.data$customclassif <- ""
for (j in unique(sctype_scores$cluster)) {
  cl_type <- sctype_scores[sctype_scores$cluster == j, ]
  seurat@meta.data$customclassif[seurat@meta.data$seurat_clusters == j] <- as.character(cl_type$type[1])
}

# Outputs final tSNE plot as pdf with clusters labeled
# Change name based on resolution

pdf("nov15-W7F/G1C10-12_markergenes_nov15.pdf", width = 10, height = 10)
label_plot <- DimPlot(
  seurat,
  reduction = "tsne",
  label = TRUE,
  label.size = 4,
  repel = TRUE,
  group.by = "customclassif"
)
print(label_plot + ggtitle("G1C Integrated"))
dev.off()

saveRDS(seurat, "/oasis/tscc/scratch/mac008/seurat_objs/G1C10-12_afterlabels_markergenes_nov15.rds")

In [None]:
# ----------------------- OBTAIN CELL COUNTS PER CLUSTER -----------------------
# Builds a table with one row per orig.ident and one column per ScType label
# (customclassif) holding the number of cells, adds a per-row total, and
# writes the table to CSV.

sample <- "RBD4-6"
markergenes <- "markergenes_nov15"

seurat <- readRDS(paste0(
  "/oasis/tscc/scratch/mac008/seurat_objs/",
  sample, "_afterlabels_", markergenes, ".rds"
))

# Overwrite clusters from FindClusters with SC-Type labeled clusters in metadata
Idents(seurat) <- "customclassif"

md_df <- seurat@meta.data

# Create a data.table from the metadata_df
md_dt <- as.data.table(md_df)

# Calculate the count of rows by orig.ident and customclassif
md_count <- md_dt[, .N, by = c("orig.ident", "customclassif")]

# Reshape the data to wide form: one column per label. Combinations that do
# not occur are left as NA by dcast().
reshaped_table <- dcast(md_count, orig.ident ~ customclassif, value.var = "N")

# Select the count columns once (the original computed this selection twice,
# the second time as an assignment embedded in the rowSums() argument).
numeric_columns <- reshaped_table %>%
  select_if(is.numeric)

# Calculate row sums for each row in reshaped_table; na.rm = TRUE skips the NA
# cells of label/sample combinations that do not occur.
row_sums <- rowSums(numeric_columns, na.rm = TRUE)

# Add row sums as a new column to the table
reshaped_table$TotalCells <- row_sums

# Print or view the table
reshaped_table

write.csv(reshaped_table, "nov15-W7F/rbd4-6_counts.csv", row.names = FALSE)

In [None]:
# # Below is the pipeline using UMAP over tSNE (per Luca's request)
# # Continues in place of 'run standard workflow' steps above
# # ---------------------------------------------------------------

# gfp.integrated <- readRDS("/oasis/tscc/scratch/mac008/seurat_objs/GFP1-3_afterscale.rds")

# gfp.integrated <- RunPCA(gfp.integrated, seed.use = 42)
# gfp.integrated <- RunUMAP(gfp.integrated, dims = 1:16)

# saveRDS(gfp.integrated, "/oasis/tscc/scratch/mac008/seurat_objs/GFP1-3_afterumap.rds")

# DimPlot(gfp.integrated, reduction = "umap")

# # ---------------------------------------------------------------

# seurat <- readRDS("/oasis/tscc/scratch/mac008/seurat_objs/GFP1-3_afterumap.rds")

# seurat <- FindNeighbors(seurat, dims = 1:35)
# seurat <- FindClusters(seurat, resolution = 5, random.seed = 42)

# # Load in required files for SCType. SCType is used for labeling clusters.

# # load gene set preparation function
# source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")

# # load cell type annotation function
# source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")

# gs_list = gene_sets_prepare("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_short.xlsx", "Immune system")

# # Set the file path for db_ to the excel file for marker genes. This file should be formatted correctly. For guidance
# # on formatting, reference link in cell above in gs_list object.

# # DB file
# db_ = "/home/mac008/scrna_project/nov15-W7F/markergenes_nov15.xlsx";

# # Set the tissue type you are working with
# tissue = "Bone marrow" # e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus

# # Prepares the marker gene data for application to your results.
# gs_list = gene_sets_prepare(db_, tissue)

# # Performs calculation of distinguishing cell types from marker genes. May take a minute or two.
# es.max = sctype_score(scRNAseqData = seurat[["integrated"]]@scale.data, scaled = TRUE, 
#                       gs = gs_list$gs_positive, gs2 = gs_list$gs_negative) 

# # Changes the metadata of the seurat object to include the cell types you discovered in previous steps

# # NOTE: scRNAseqData parameter should correspond to your input scRNA-seq matrix. 
# # In case Seurat is used, it is either pbmc[["RNA"]]@scale.data (default), pbmc[["SCT"]]@scale.data, in case sctransform is used for normalization,
# # or pbmc[["integrated"]]@scale.data, in case a joint analysis of multiple single-cell datasets is performed.

# # merge by cluster
# cL_results = do.call("rbind", lapply(unique(seurat@meta.data$seurat_clusters), function(cl){
#   es.max.cl = sort(rowSums(es.max[ ,rownames(seurat@meta.data[seurat@meta.data$seurat_clusters==cl, ])]), decreasing = !0)
#   head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(seurat@meta.data$seurat_clusters==cl)), 10)
# }))

# sctype_scores = cL_results %>% group_by(cluster) %>% top_n(n = 1, wt = scores)  

# # Performs quality check by setting clusters that were not easily distinguishable as "Unknown"

# # set low-confident (low ScType score) clusters to "unknown"
# sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "Unknown"

# print(sctype_scores[,1:3])

# seurat@meta.data$customclassif = ""
# for(j in unique(sctype_scores$cluster)){
#   cl_type = sctype_scores[sctype_scores$cluster==j,]; 
#   seurat@meta.data$customclassif[seurat@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
# }

# saveRDS(seurat, "/oasis/tscc/scratch/mac008/seurat_objs/GFP_UMAP_afterlabels_markergenes_nov15.rds")

# seurat <- subset(x = seurat, subset = customclassif != "Unknown")

# # Outputs final UMAP plot as pdf with clusters labeled
# # Change name based on resolution
# pdf("nov15-W7F/GFP_UMAP_markergenes_nov15.pdf", width = 10, height = 10)
# plot <- DimPlot(seurat, reduction = "umap", label = TRUE, label.size = 4, repel = TRUE, group.by = 'customclassif') 
# print(plot+ ggtitle("GFP UMAP"))
# dev.off()

# # ---------------------------------------------------------------

