# Visualizing embeddings obtained from LDVAE and scETM

In [1]:
source("/root/data/DBP_sa_bc/preprocess/utils.R")
setwd("/root/data/DBP_sa_bc/")
library(RColorBrewer)

# Command-line options selecting which run to visualize.
#   --task        key into configs/data.toml; also a component of the input,
#                 preprocessed-data and output paths
#   --method      name of the compared method; used in the output directory
#                 and as the prefix of the saved figure names
#   --exp, --model, --init_model
#                 components of the prediction input directory
#   --K           number of embedding dimensions passed to UMAP; also a
#                 component of the output directory
parser <- ArgumentParser()
parser$add_argument("--task", type = "character", default = "wnn_rna")
parser$add_argument("--method", type = "character", default = "LDVAE")
parser$add_argument("--exp", type = "character", default = "e1")
parser$add_argument("--model", type = "character", default = "default")
parser$add_argument("--init_model", type = "character", default = "sp_00001899")
# Integer default (previously the string "20") so it agrees with the declared type.
parser$add_argument("--K", type = "integer", default = 20L)
# parse_known_args() returns the parsed options first; the remaining element
# holds arguments the parser did not recognize, which are discarded here.
o <- parser$parse_known_args()[[1]]

# Dataset configuration for the selected task.
config <- parseTOML("configs/data.toml")[[o$task]]
# One subset per raw data directory, named after the directory itself.
subset_names <- basename(config$raw_data_dirs)
# Zero-based subset indices as strings ("0", "1", ...), used in "subset_<id>"
# directory names. as.character() always yields a character vector, whereas
# sapply() changes its return type on empty input.
subset_ids <- as.character(seq_along(subset_names) - 1L)
input_dirs <- pj("result", o$task, o$exp, o$model, "predict", o$init_model, paste0("subset_", subset_ids))
pp_dir <- pj("data", "processed", o$task)
output_dir <- pj("result", "comparison", o$task, o$method, o$K)
mkdir(output_dir, remove_old = FALSE)
# label_paths <- pj(config$raw_data_dirs, "label_seurat", "l1.csv")
label_paths <- pj(config$raw_data_dirs, "label", "meta.csv")

# K <- parseTOML("configs/model.toml")[["default"]]$dim_c
K <- o$K
l <- 7.5  # figure size
L <- 10   # figure size
m <- 0.5  # legend margin

Attaching SeuratObject

Registered S3 method overwritten by 'SeuratDisk':
  method            from  
  as.sparse.H5Group Seurat

Loading required package: ensembldb

Loading required package: BiocGenerics


Attaching package: 'BiocGenerics'


The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs


The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: 'S4Vectors'


The following object is masked from 'package:future':

    values


The fol

## Load preprocessed data

In [2]:
# Names of the two annotation columns in each subset's label table, chosen by
# task. NULL when the task has no known label columns.
label_cols <- switch(o$task,
    lung_ts = c("Celltypes1", "Celltypes_updated_July_2020"),
    wnn_rna = c("celltype.l1", "celltype.l2"),
    NULL
)
if (is.null(label_cols)) {
    warning("No label columns defined for task '", o$task,
        "'; label_list1 and label_list2 will stay empty.", call. = FALSE)
}

# Per-subset containers, all keyed by subset name.
rna_list <- list()
cell_name_list <- list()
label_list1 <- list()
label_list2 <- list()
subset_name_list <- list()
S <- length(subset_names)
for (i in seq_along(subset_names)) {
    subset_name <- subset_names[i]
    rna_dir <- pj(input_dirs[i], "x", "rna")
    # Escape the dot so that only names ending in a literal ".csv" match.
    fnames <- dir(path = rna_dir, pattern = "\\.csv$")
    # NOTE(review): plain string sort; this assumes the file names sort into
    # the intended row order (e.g. zero-padded indices) -- confirm.
    fnames <- str_sort(fnames, decreasing = FALSE)

    # Read every chunk file of this subset, then stack the chunks row-wise.
    N <- length(fnames)
    rna_subset_list <- lapply(seq_along(fnames), function(n) {
        message(paste0("Loading Subset ", i, "/", S, ", File ", n, "/", N))
        read.csv(file.path(rna_dir, fnames[n]), header = FALSE)
    })
    rna_list[[subset_name]] <- bind_rows(rna_subset_list)

    # Cell names are taken from the second column of cell_names.csv.
    cell_name_list[[subset_name]] <- read.csv(pj(pp_dir, paste0("subset_", subset_ids[i]),
        "cell_names.csv"), header = TRUE)[, 2]

    # Read the label table once and pull both annotation columns from it.
    if (!is.null(label_cols)) {
        labels <- read.csv(label_paths[i], header = TRUE)
        label_list1[[subset_name]] <- labels[, label_cols[1]]
        label_list2[[subset_name]] <- labels[, label_cols[2]]
    }
    # One batch label per cell of this subset.
    subset_name_list[[subset_name]] <- rep(subset_name, length(cell_name_list[[subset_name]]))
}

Loading Subset 1/8, File 1/25

Loading Subset 1/8, File 2/25

Loading Subset 1/8, File 3/25

Loading Subset 1/8, File 4/25

Loading Subset 1/8, File 5/25

Loading Subset 1/8, File 6/25

Loading Subset 1/8, File 7/25

Loading Subset 1/8, File 8/25

Loading Subset 1/8, File 9/25

Loading Subset 1/8, File 10/25

Loading Subset 1/8, File 11/25

Loading Subset 1/8, File 12/25

Loading Subset 1/8, File 13/25

Loading Subset 1/8, File 14/25

Loading Subset 1/8, File 15/25

Loading Subset 1/8, File 16/25

Loading Subset 1/8, File 17/25

Loading Subset 1/8, File 18/25

Loading Subset 1/8, File 19/25

Loading Subset 1/8, File 20/25

Loading Subset 1/8, File 21/25

Loading Subset 1/8, File 22/25

Loading Subset 1/8, File 23/25

Loading Subset 1/8, File 24/25

Loading Subset 1/8, File 25/25

Loading Subset 2/8, File 1/24

Loading Subset 2/8, File 2/24

Loading Subset 2/8, File 3/24

Loading Subset 2/8, File 4/24

Loading Subset 2/8, File 5/24

Loading Subset 2/8, File 6/24

Loading Subset 2/8, Fil

## Create Seurat object

In [3]:
# Concatenate the per-subset cell names in subset order.
cell_name <- do.call("c", unname(cell_name_list))

# Stack all subsets row-wise, then transpose so that rows are features and
# columns are cells, which is the layout passed to CreateSeuratObject() below.
rna <- t(data.matrix(bind_rows(rna_list)))
colnames(rna) <- cell_name
rownames(rna) <- read.csv(pj(pp_dir, "feat", "feat_names_rna.csv"), header = TRUE)[, 2]


# remove missing features
rna_mask_list <- list()
for (i in seq_along(subset_names)) {
    subset_name <- subset_names[i]
    # Drop the first column; drop = FALSE keeps a data frame even when only a
    # single mask column remains.
    rna_mask_list[[subset_name]] <- read.csv(pj(pp_dir, paste0("subset_", subset_ids[i]),
        "mask", "rna.csv"), header = TRUE)[, -1, drop = FALSE]
}
# A feature is kept only if the product of its mask values over all subsets is
# non-zero, i.e. no subset masks it out.
rna_mask <- as.logical(apply(data.matrix(bind_rows(rna_mask_list)), 2, prod))
# drop = FALSE keeps `rna` a matrix even if a single feature survives.
rna <- rna[rna_mask, , drop = FALSE]


obj <- CreateSeuratObject(counts = rna, assay = "rna")
# obj@meta.data$celltype <- do.call("c", unname(label_list))

# Attach the two annotation levels and the batch (subset) of every cell.
obj@meta.data$celltype1 <- do.call("c", unname(label_list1))
obj@meta.data$celltype2 <- do.call("c", unname(label_list2))
obj@meta.data$batch <- factor(x = do.call("c", unname(subset_name_list)), levels = subset_names)
# Cells per batch, in order of first appearance. Index by name: indexing a
# table with a factor would use the factor's integer codes instead.
table(obj@meta.data$batch)[as.character(unique(obj@meta.data$batch))]

# Discard cells whose total RNA count is zero.
obj <- subset(obj, subset = nCount_rna > 0)
obj


p1_0 p2_0 p3_0 p4_0 p5_0 p6_0 p7_0 p8_0 
6378 5899 4628 5285 6952 6060 8854 8908 

An object of class Seurat 
3613 features across 52964 samples within 1 assay 
Active assay: rna (3613 features, 0 variable features)

In [4]:
# Load the embeddings written for this method and register them on the Seurat
# object as the dimensional reduction "c". The matrix is held in `embeddings`
# rather than `c` so that base::c is not shadowed.
embeddings <- data.matrix(read.csv(pj(output_dir, "embeddings.csv"), header = FALSE))
# Rows are matched to cells purely by position, so the counts must agree.
if (nrow(embeddings) != ncol(obj)) {
    stop("embeddings.csv has ", nrow(embeddings), " rows but the Seurat object has ",
        ncol(obj), " cells.", call. = FALSE)
}
colnames(embeddings) <- paste0("c_", seq_len(ncol(embeddings)))
rownames(embeddings) <- colnames(obj)
obj[["c"]] <- CreateDimReducObject(embeddings = embeddings, key = "c_", assay = "rna")

# obj <- subset(obj, subset = nCount_rna_bc > 0)
obj

An object of class Seurat 
3613 features across 52964 samples within 1 assay 
Active assay: rna (3613 features, 0 variable features)
 1 dimensional reduction calculated: c

## Visualization

In [5]:
# Compute a UMAP from the first K dimensions of the "c" reduction and store it
# under the name "umap". seq_len(K) avoids the c(1, 0) that 1:K yields for K = 0.
obj <- RunUMAP(obj, reduction = "c", dims = seq_len(K), reduction.name = "umap")
# SaveH5Seurat(obj, pj(output_dir, "obj.h5seurat"), overwrite = TRUE)

"The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session"
09:38:24 UMAP embedding parameters a = 0.9922 b = 1.112

09:38:24 Read 52964 rows and found 20 numeric columns

09:38:24 Using Annoy for neighbor search, n_neighbors = 30

09:38:24 Building Annoy index with metric = cosine, n_trees = 50

0%   10   20   30   40   50   60   70   80   90   100%

[----|----|----|----|----|----|----|----|----|----|

*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
|

09:38:30 Writing NN index file to temp file /tmp/RtmplAYB4M/filef5a1232a1e22

09:38:30 Searching Annoy index using 64 threads, search_k = 3000

09:38:31 Annoy recall = 100%

09:38:32 Commencing smooth kNN distance calibration using 64 threads

09:38:35 Initializing from normalized

In [6]:
# Colour palettes for the batch and the two cell-type annotation levels,
# chosen by task. The col_* vectors are not defined in this notebook;
# presumably they come from the sourced utils.R -- confirm.
if ("wnn_rna" %in% o$task) {
    batch_cols <- col_8
    celltype1_cols <- col_8
    celltype2_cols <- col_31
} else if ("lung_ts" %in% o$task) {
    batch_cols <- col_5
    celltype1_cols <- col_16
    celltype2_cols <- col_28
} else {
    # Fail with a clear message instead of an "object not found" error, or
    # silently reusing palettes left over from an earlier run in this session.
    stop("No colour palettes defined for task '", o$task, "'.", call. = FALSE)
}

# Draw one UMAP through dim_plot() using the settings shared by all figures.
#   group_by  metadata column used to colour the cells
#   cols      colour palette for that column
#   suffix    appended to the method name to form the saved file name
#   split_by  optional metadata column to split the plot by (NULL = no split)
#   width     figure width; the height is always L
# The dim_plot() result is returned as-is, so calling this helper at top level
# displays whatever a direct dim_plot() call would display.
plot_umap <- function(group_by, cols, suffix, split_by = NULL, width = L) {
    dim_plot(obj, w = width, h = L, reduction = "umap", no_axes = TRUE,
        split.by = split_by, group.by = group_by, label = FALSE, repel = TRUE,
        label.size = 4, pt.size = 0.1, cols = cols, legend = FALSE,
        save_path = pj(output_dir, paste(o$method, suffix, sep = "_")))
}

# All batches merged, coloured by batch and by each annotation level.
plot_umap("batch", batch_cols, "merged_batch")

plot_umap("celltype1", celltype1_cols, "merged_label1")

plot_umap("celltype2", celltype2_cols, "merged_label2")

# One panel per batch, coloured by each annotation level.
plot_umap("celltype1", celltype1_cols, "batch_split1", split_by = "batch", width = L * 6)

plot_umap("celltype2", celltype2_cols, "batch_split2", split_by = "batch", width = L * 6)

In [None]:
# # convert the notebook to html
# system(paste0("jupyter nbconvert --to html comparison/", o$method, ".ipynb"))
# system(paste0("mv comparison/", o$method, ".html comparison/", o$task, "_", o$method, ".html"))