In [None]:
source("/root/data/DBP_sa_bc/preprocess/utils.R")
setwd("/root/data/DBP_sa_bc/")
library(mclust) 
library(RColorBrewer)

# ---- Command-line options ----------------------------------------------------
parser <- ArgumentParser()
parser$add_argument("--task", type = "character", default = "ga_tumor")
parser$add_argument("--method", type = "character", default = "DBP_sa_bc")
parser$add_argument("--exp", type = "character", default = "e4")
parser$add_argument("--init_model", type = "character", default = "sp_latest")
# Integer default so that the default value agrees with the declared type.
parser$add_argument("--K", type = "integer", default = 30L)
# Only the first element of the parse_known_args() result (the parsed options) is kept.
o <- parser$parse_known_args()[[1]]

# ---- Paths derived from the options -------------------------------------------
K <- o$K  # number of factors kept after re-ordering (used when building `emc`)
config <- parseTOML("configs/data.toml")[[o$task]]
subset_names <- basename(config$raw_data_dirs)
# Zero-based subset ids as strings ("0", "1", ...), used in "subset_<id>" directory names.
subset_ids <- as.character(seq_along(subset_names) - 1)
pp_dir <- pj("data", "processed", o$task)
rnk_dir <- pj("result", "analysis", o$task, o$method, o$exp)
input_dirs <- pj("result", o$task, o$exp, "default", "predict", o$init_model, paste0("subset_", subset_ids))

# Methods whose embeddings are read from a pre-computed "embeddings.csv".
comparison_methods <- c("mofa", "liger", "LDVAE", "scETM")
if ("DBP_sa_bc" %in% o$method) {
    output_dir <- pj("result", "comparison", o$task, o$method, o$exp, o$init_model, "fa", "figs")
} else if (any(comparison_methods %in% o$method)) {
    emb_dir <- pj("result", "comparison", o$task, o$method)
    output_dir <- pj("result", "comparison", o$task, o$method, "fa", "figs")
} else {
    # Fail here with a clear message instead of a later "object 'output_dir' not found".
    stop(
        "Unsupported --method '", paste(o$method, collapse = ", "), "'; expected one of: ",
        paste(c("DBP_sa_bc", comparison_methods), collapse = ", "),
        call. = FALSE
    )
}
break_index_dir <- pj("result", o$task, o$exp, "default", "predict", o$init_model)
mkdir(output_dir, remove_old = FALSE)
label_paths <- pj(config$raw_data_dirs, "label", "meta.csv")

# ---- Model and plotting constants ----------------------------------------------
dim_c <- parseTOML("configs/model.toml")[["default"]]$dim_c
# Concatenation of all qualitative RColorBrewer palettes.
qual_col_pals <- brewer.pal.info[brewer.pal.info$category == "qual", ]
dcols <- unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
l <- 5  # figure size
L <- 8   # figure size
m <- 0.5  # legend margin

In [None]:
# Per-subset containers, keyed by subset name.
z_list <- list()
w_list <- list()
rna_bc_list <- list()
cell_name_list <- list()
label_list1 <- list()
label_list2 <- list()
label_list3 <- list()
subset_name_list <- list()
S <- length(subset_names)

# Columns of meta.csv used as label levels 1..3 for each supported task.
# "lung_ts" defines only two levels, so label_list3 stays empty for it.
label_col_map <- list(
    "lung_ts"   = c("Celltypes_updated_July_2020", "Celltypes1"),
    "wnn_rna"   = c("celltype.l1", "celltype.l2", "celltype.l3"),
    "scsim1-10" = rep("celltypes", 3),
    "ga"        = rep("celltype", 3),
    "ga_tumor"  = rep("c1_5", 3)
)
# NULL when the task has no entry; in that case no labels are loaded.
label_cols <- label_col_map[[o$task]]

for (i in seq_along(subset_names)) {
    subset_name <- subset_names[i]
    z_dir       <- pj(input_dirs[i], "z", "joint")
    w_dir       <- pj(input_dirs[i], "w", "joint")
    rna_bc_dir  <- pj(input_dirs[i], "x_bc", "rna")
    # The file names found under z/joint are reused for w/joint and x_bc/rna.
    # The dot is escaped so that only a literal ".csv" suffix matches.
    fnames <- dir(path = z_dir, pattern = "\\.csv$")
    fnames <- str_sort(fnames, decreasing = FALSE)

    N <- length(fnames)
    # Preallocate instead of growing the lists inside the loop.
    z_subset_list <- vector("list", N)
    w_subset_list <- vector("list", N)
    rna_bc_subset_list <- vector("list", N)

    for (n in seq_along(fnames)) {
        message(paste0("Loading Subset ", i, "/", S, ", File ", n, "/", N))
        z_subset_list[[n]] <- read.csv(file.path(z_dir, fnames[n]), header = FALSE)
        w_subset_list[[n]] <- read.csv(file.path(w_dir, fnames[n]), header = FALSE)
        rna_bc_subset_list[[n]] <- read.csv(file.path(rna_bc_dir, fnames[n]), header = FALSE)
    }
    z_list[[subset_name]] <- bind_rows(z_subset_list)
    w_list[[subset_name]] <- bind_rows(w_subset_list)
    rna_bc_list[[subset_name]] <- bind_rows(rna_bc_subset_list)

    # Cell names are taken from the second column of cell_names.csv.
    cell_name_list[[subset_name]] <- read.csv(
        pj(pp_dir, paste0("subset_", subset_ids[i]), "cell_names.csv"),
        header = TRUE
    )[, 2]

    if (!is.null(label_cols)) {
        # Read the metadata once per subset and pick the label columns from it.
        meta <- read.csv(label_paths[i], header = TRUE)
        label_list1[[subset_name]] <- meta[, label_cols[1]]
        label_list2[[subset_name]] <- meta[, label_cols[2]]
        if (length(label_cols) >= 3) {
            label_list3[[subset_name]] <- meta[, label_cols[3]]
        }
    }

    # One batch label (the subset name) per cell.
    subset_name_list[[subset_name]] <- rep(subset_name, length(cell_name_list[[subset_name]]))
}

## Create Seurat object

In [None]:
# Batch-corrected RNA matrix, transposed so that rows are features and columns are cells.
rna_bc <- t(data.matrix(bind_rows(rna_bc_list)))
colnames(rna_bc) <- do.call("c", unname(cell_name_list))
rownames(rna_bc) <- read.csv(pj(pp_dir, "feat", "feat_names_rna.csv"), header = TRUE)[, 2]
obj <- CreateSeuratObject(counts = rna_bc, assay = "rna_bc")

annotation <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
# seqlevelsStyle(annotation) <- "UCSC"
genome(annotation) <- "hg38"

# Build `emc`, the cells x factors embedding matrix used for UMAP and factor plots.
if ("DBP_sa_bc" %in% o$method) {
    z <- data.matrix(bind_rows(z_list))
    w <- data.matrix(bind_rows(w_list))
    # Element-wise product of the first dim_c columns of z with w.
    # NOTE: the variable `c` shadows base::c by name; calls such as c(...) still
    # resolve to the function because R skips non-function bindings when calling.
    c <- z[, seq_len(dim_c)] * w
    # break
    index <- read.csv(pj(break_index_dir, "break_index.csv"), header = FALSE)
    index <- index + 1  # presumably the file stores 0-based indices -- TODO confirm
    names(index) <- "id"
    # One row per column of w, with `id` holding that column's position.
    tc <- data.frame(id = seq_len(ncol(w)), y = t(c))
    loc <- match(index$id, tc$id)
    c_ord <- tc[loc, ]
    if (K > nrow(c_ord)) {
        # Indexing past the end would silently add all-NA rows to the embedding.
        stop("--K (", K, ") exceeds the number of ordered factors (", nrow(c_ord), ")", call. = FALSE)
    }
    # Keep the first K rows in break_index order and drop the helper id column.
    c_bre <- c_ord[seq_len(K), colnames(c_ord) != "id"]
    emc <- data.matrix(t(c_bre))
    # c_bre <- abs(c_bre)
} else if ("mofa" %in% o$method || "liger" %in% o$method) {
    c <- read.csv(pj(emb_dir, "embeddings.csv"), header = TRUE, row.names = 1)
    emc <- data.matrix(c)
    # c_bre <- abs(c_bre)
} else if ("LDVAE" %in% o$method || "scETM" %in% o$method) {
    c <- read.csv(pj(emb_dir, "embeddings.csv"), header = FALSE)
    emc <- data.matrix(c)
    # c_bre <- abs(c_bre)
} else {
    # Without this, `emc` would be undefined and the failure would surface further down.
    stop("Unsupported --method '", paste(o$method, collapse = ", "), "'", call. = FALSE)
}

# for umap
colnames(emc) <- paste0("F_", seq_len(ncol(emc)))
rownames(emc) <- colnames(obj)
obj[["emc"]] <- CreateDimReducObject(embeddings = emc, key = "F_", assay = "rna_bc")

# Per-cell metadata, concatenated in the same subset order as the expression matrix.
obj@meta.data$celltype1 <- do.call("c", unname(label_list1))
obj@meta.data$celltype2 <- do.call("c", unname(label_list2))
obj@meta.data$celltype3 <- do.call("c", unname(label_list3))
obj@meta.data$batch <- do.call("c", unname(subset_name_list))
# Cell counts per batch, shown in order of first appearance.
table(obj@meta.data$batch)[unique(obj@meta.data$batch)]

# obj <- subset(obj, subset = nCount_rna_c > 0)
obj

In [None]:
# Compute a UMAP from every column of the "emc" reduction and store it as "umap".
# seq_len() avoids the descending 1:0 sequence that 1:dim(emc)[2] yields for an empty embedding.
obj <- RunUMAP(
    obj,
    reduction = "emc",
    dims = seq_len(ncol(emc)),
    reduction.name = "umap"
)
# SaveH5Seurat(obj, pj(output_dir, "obj_break.h5seurat"), overwrite = TRUE)

In [None]:
# obj <- LoadH5Seurat(pj(output_dir, "obj.h5seurat"), assays = "rna", reductions = "umap")
# obj

# Choose the colour palettes for the current task. switch() evaluates only the
# matching arm and yields NULL for any other task, in which case nothing is assigned.
task_palettes <- switch(o$task,
    "wnn_rna" = list(batch = col_8, celltype1 = col_8, celltype2 = col_31),
    "lung_ts" = list(batch = col_5, celltype1 = col_16, celltype2 = col_28),
    "ga" = list(batch = col_14, celltype1 = col_14, celltype2 = col_14),
    "ga_tumor" = list(batch = col_5, celltype1 = col_8, celltype2 = col_8)
)
if (!is.null(task_palettes)) {
    batch_cols <- task_palettes[["batch"]]
    celltype1_cols <- task_palettes[["celltype1"]]
    celltype2_cols <- task_palettes[["celltype2"]]
}


# dim_plot(obj, w = L, h = L, reduction = 'umap', no_axes = T, border = T,
#     split.by = NULL, group.by = "batch", label = F, repel = T, 
#     label.size = 4, pt.size = 0.1, cols = batch_cols, legend = F,
#     save_path = pj(output_dir, paste(o$method, "merged_batch", sep = "_")))
     
# dim_plot(obj, w = L, h = L, reduction = 'umap', no_axes = T, border = T,
#     split.by = NULL, group.by = "celltype1", label = F, repel = T, 
#     label.size = 4, pt.size = 0.1, cols = celltype1_cols, legend = F,
#     save_path = pj(output_dir, paste(o$method, "merged_label1", sep = "_")))

# dim_plot(obj, w = L, h = L, reduction = 'umap', no_axes = T, border = T,
#     split.by = NULL, group.by = "celltype2", label = F, repel = T, 
#     label.size = 4, pt.size = 0.1, cols = celltype2_cols, legend = F,
#     save_path = pj(output_dir, paste(o$method, "merged_label2", sep = "_")))

# dim_plot(obj, w = L*6, h = L, reduction = 'umap', no_axes = T, border = T,
#     split.by = "batch", group.by = "celltype1", label = F, repel = T, 
#     label.size = 4, pt.size = 0.1, cols = celltype1_cols, legend = F,
#     save_path = pj(output_dir, paste(o$method, "batch_split1", sep = "_"))) 

# dim_plot(obj, w = L*6, h = L, reduction = 'umap', no_axes = T, border = T,
#     split.by = "batch", group.by = "celltype2", label = F, repel = T, 
#     label.size = 4, pt.size = 0.1, cols = celltype2_cols, legend = F,
#     save_path = pj(output_dir, paste(o$method, "batch_split2", sep = "_")))

# UMAP of all cells coloured by celltype1, drawn with a black frame and without
# axes, title or legend; saved as both PNG and PDF.
merged_label1_plot <- DimPlot(
    obj,
    reduction = "umap",
    group.by = "celltype1",
    cols = celltype1_cols,
    label = FALSE,
    pt.size = 1,
    raster = TRUE,
    raster.dpi = c(512, 512)
) +
    theme(
        panel.border = element_rect(color = "black", linewidth = 0.8),
        axis.ticks.length = unit(0, "pt"),
        plot.title = element_blank()
    ) +
    NoAxes() +
    NoLegend()
merged_label1_plot
ggsave(
    filename = pj(output_dir, paste(o$method, "merged_label1.png", sep = "_")),
    plot = merged_label1_plot, width = L, height = L
)
ggsave(
    filename = pj(output_dir, paste(o$method, "merged_label1.pdf", sep = "_")),
    plot = merged_label1_plot, width = L, height = L
)


## Plot all factors

In [None]:
# Use celltype1 as the active identity so that plot labels refer to it.
obj$seurat_clusters <- obj$celltype1
Idents(object = obj) <- obj$celltype1
factors <- colnames(emc)

# Task-specific palettes, kept in line with the palette selection used for the
# merged UMAP above ("ga" uses col_14 throughout; "ga_tumor" has its own branch).
if ("wnn_rna" %in% o$task) {
    batch_cols <- col_8
    celltype1_cols <- col_8
    celltype2_cols <- col_31
} else if ("lung_ts" %in% o$task) {
    batch_cols <- col_5
    celltype1_cols <- col_16
    celltype2_cols <- col_28
} else if ("ga" %in% o$task) {
    batch_cols <- col_14
    celltype1_cols <- col_14
    celltype2_cols <- col_14
} else if ("ga_tumor" %in% o$task) {
    batch_cols <- col_5
    celltype1_cols <- col_8
    celltype2_cols <- col_8
}

# One UMAP feature plot per factor, with a diverging colour scale (white midpoint).
# The plot is passed to ggsave() explicitly rather than relying on last_plot().
# FeaturePlot(obj, features = factors, label = T)
for (f in factors) {
    fea_plot <- FeaturePlot(obj, features = f, label = TRUE) +
        scale_colour_gradient2(low = "#f19007", high = "#990033", mid = "white")
    ggsave(
        filename = pj(output_dir, paste(f, "fea1.png", sep = "_")),
        plot = fea_plot, width = 8, height = 8
    )
}

# One violin plot per factor, grouped by celltype1, with the y axis on the right
# and all x-axis decoration removed.
for (f in factors) {
    vln_plot <- VlnPlot(obj, features = f, cols = celltype1_cols, group.by = "celltype1", pt.size = 0) +
        NoLegend() +
        scale_y_continuous(position = "right") +
        labs(title = "") +
        theme(
            axis.title = element_blank(),
            axis.text.x = element_blank(),
            axis.line = element_blank(),
            # axis.text.y = element_text(angle = 90, hjust = 1),
            axis.ticks.x = element_blank(),
            # `linewidth` replaces the deprecated `size` argument of element_rect().
            panel.border = element_rect(color = "black", fill = NA, linewidth = 0.8)
        )
    ggsave(
        filename = pj(output_dir, paste(f, "vln1.png", sep = "_")),
        plot = vln_plot, width = 10, height = 3
    )
    # ggsave(file = pj(output_dir, paste(f, "vln2.png", sep = "_")), width = 15, height = 5)
}

## Plot single factor

In [None]:
# Set colour
# celltype1 becomes the active identity; `factors` lists the embedding columns.
obj$seurat_clusters <- obj$celltype1
Idents(object = obj) <- obj$celltype1
factors <- colnames(emc)

# Pick the palettes for the current task; unknown tasks leave them untouched.
# NOTE(review): for "lung_ts" the celltype1/celltype2 palettes (col_28/col_16)
# are the reverse of the earlier palette cells (col_16/col_28) -- confirm which
# order is intended.
single_factor_palettes <- switch(o$task,
    "wnn_rna" = list(batch = col_8, celltype1 = col_8, celltype2 = col_31),
    "lung_ts" = list(batch = col_5, celltype1 = col_28, celltype2 = col_16),
    "ga" = list(batch = col_14, celltype1 = col_14, celltype2 = col_14),
    "ga_tumor" = list(batch = col_5, celltype1 = col_8, celltype2 = col_8)
)
if (!is.null(single_factor_palettes)) {
    batch_cols <- single_factor_palettes[["batch"]]
    celltype1_cols <- single_factor_palettes[["celltype1"]]
    celltype2_cols <- single_factor_palettes[["celltype2"]]
}


### FeaturePlot-factors

In [None]:
# Factor to display on the UMAP.
f <- "F_21"
# obj$seurat_clusters <- obj$celltype1
# Idents(object=obj) <- obj$celltype1
# factors <- colnames(emc)

# Rasterised UMAP coloured by the factor value on a diverging scale
# (light grey midpoint), framed in black, without axes, title or legend.
factor_umap <- FeaturePlot(
    obj,
    features = f,
    label = FALSE,
    pt.size = 1,
    raster = TRUE,
    raster.dpi = c(512, 512)
) +
    scale_colour_gradient2(low = "#f19007", mid = "lightgrey", high = "#990033") +
    theme(
        panel.border = element_rect(color = "black", linewidth = 0.8),
        axis.ticks.length = unit(0, "pt"),
        plot.title = element_blank()
    ) +
    NoAxes() +
    NoLegend()
factor_umap
ggsave(filename = pj(output_dir, paste(f, "fea1.png", sep = "_")), plot = factor_umap, width = 8, height = 8)
ggsave(filename = pj(output_dir, paste(f, "fea1.pdf", sep = "_")), plot = factor_umap, width = 8, height = 8)

### FeaturePlot-genes

In [None]:
# Rasterised UMAP of LIPF expression (light grey to red), framed in black,
# without axes, title or legend; written as PNG and PDF.
lipf_plot <- FeaturePlot(
    obj,
    features = "LIPF",
    cols = c("lightgrey", "#C23D38"),
    label = FALSE,
    pt.size = 0.1,
    raster = TRUE,
    raster.dpi = c(512, 512)
) +
    theme(
        panel.border = element_rect(color = "black", linewidth = 1),
        axis.ticks.length = unit(0, "pt"),
        plot.title = element_blank()
    ) +
    NoAxes() +
    NoLegend()
lipf_plot
for (out_name in c("LIPF_F18_nonmali.png", "LIPF_F18_nonmali.pdf")) {
    ggsave(filename = pj(output_dir, out_name), plot = lipf_plot, width = 8, height = 8)
}

In [None]:
# Rasterised UMAP of PI3 expression (light grey to red), framed in black,
# without axes, title or legend; written as PNG and PDF.
pi3_plot <- FeaturePlot(
    obj,
    features = "PI3",
    cols = c("lightgrey", "#C23D38"),
    label = FALSE,
    pt.size = 0.1,
    raster = TRUE,
    raster.dpi = c(512, 512)
) +
    theme(
        panel.border = element_rect(color = "black", linewidth = 1),
        axis.ticks.length = unit(0, "pt"),
        plot.title = element_blank()
    ) +
    NoAxes() +
    NoLegend()
pi3_plot
for (out_name in c("PI3_F28_mali.png", "PI3_F28_mali.pdf")) {
    ggsave(filename = pj(output_dir, out_name), plot = pi3_plot, width = 8, height = 8)
}

In [None]:
# Rasterised UMAP of FABP1 expression (light grey to red), framed in black,
# without axes, title or legend; written as PNG and PDF.
fabp1_plot <- FeaturePlot(
    obj,
    features = "FABP1",
    cols = c("lightgrey", "#C23D38"),
    label = FALSE,
    pt.size = 0.1,
    raster = TRUE,
    raster.dpi = c(512, 512)
) +
    theme(
        panel.border = element_rect(color = "black", linewidth = 1),
        axis.ticks.length = unit(0, "pt"),
        plot.title = element_blank()
    ) +
    NoAxes() +
    NoLegend()
fabp1_plot
for (out_name in c("FABP1_F28_mali.png", "FABP1_F28_mali.pdf")) {
    ggsave(filename = pj(output_dir, out_name), plot = fabp1_plot, width = 8, height = 8)
}

In [None]:
# Rasterised UMAP of TM4SF20 expression (light grey to red), framed in black,
# without axes, title or legend; written as PNG and PDF.
tm4sf20_plot <- FeaturePlot(
    obj,
    features = "TM4SF20",
    cols = c("lightgrey", "#C23D38"),
    label = FALSE,
    pt.size = 0.1,
    raster = TRUE,
    raster.dpi = c(512, 512)
) +
    theme(
        panel.border = element_rect(color = "black", linewidth = 1),
        axis.ticks.length = unit(0, "pt"),
        plot.title = element_blank()
    ) +
    NoAxes() +
    NoLegend()
tm4sf20_plot
for (out_name in c("TM4SF20_F28_mali.png", "TM4SF20_F28_mali.pdf")) {
    ggsave(filename = pj(output_dir, out_name), plot = tm4sf20_plot, width = 8, height = 8)
}

### VlnPlot-factors

In [None]:
# Factor to display as a violin plot.
f <- "F_6"
# Violin plot of the factor grouped by celltype2: no points, no legend,
# y axis on the right, and all x-axis decoration removed.
factor_vln <- VlnPlot(obj, features = f, cols = celltype2_cols, group.by = "celltype2", pt.size = 0) +
    NoLegend() +
    scale_y_continuous(position = "right") +
    # labs(title = f) +
    theme(
        axis.title = element_blank(),
        axis.text.x = element_blank(),
        axis.line = element_blank(),
        # axis.text.y = element_text(angle = 90, hjust = 1),
        axis.ticks.x = element_blank(),
        # `linewidth` replaces the deprecated `size` argument of element_rect().
        panel.border = element_rect(color = "black", fill = NA, linewidth = 0.8)
    )
factor_vln

ggsave(filename = pj(output_dir, paste(f, "vln1.png", sep = "_")), plot = factor_vln, width = 10, height = 3)
ggsave(filename = pj(output_dir, paste(f, "vln1.pdf", sep = "_")), plot = factor_vln, width = 10, height = 3)

### DotPlot-genes

In [None]:
# top
# Dot plot of the first 20 genes of a factor's rank file, grouped by celltype1.
f <- "F35"
genes_dir <- pj(rnk_dir, "fa", "all_rnks", f)
# Fields are split on whitespace and only the first column is kept.
# Quoting is disabled outright: the previous quote = "\t" could never act as a
# quote character, because tabs are consumed as field separators first.
genes <- read.table(pj(genes_dir, paste0(f, ".rnk")), quote = "", header = FALSE)[1]
# First 20 entries as a plain character vector, reversed so that the first
# entry ends up at the top of the axis after coord_flip().
tg <- rev(as.character(head(genes[[1]], 20)))
p1 <- DotPlot(obj, features = tg, group.by = "celltype1",
            cols = c("#4684BD", "#C23D38")) +
    # RotatedAxis() +
    coord_flip() +
    # NoLegend() +
    # scale_y_discrete(limits = c("Neuroendocrine (uncertain)", "C5-EBV","C4-CDA","C3-Mixed","C2-Intestinal", "C1-Diffuse","Abandoned"), position = "left") +
    theme(
        # panel.grid.major = element_line(colour = "lightgrey", linetype = "dashed", linewidth = 0.1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.line.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank()
        # axis.text.x = element_text(angle = 90, hjust = 1)
    )
p1
ggsave(filename = pj(output_dir, paste(f, "top_fea1.png", sep = "_")), plot = p1, width = 10, height = 8)
ggsave(filename = pj(output_dir, paste(f, "top_fea1.pdf", sep = "_")), plot = p1, width = 10, height = 8)

In [None]:
# tail
# Dot plot of the last 20 genes of a factor's rank file, grouped by celltype1.
f <- "F35"
genes_dir <- pj(rnk_dir, "fa", "all_rnks", f)
# Fields are split on whitespace and only the first column is kept.
# Quoting is disabled outright: the previous quote = "\t" could never act as a
# quote character, because tabs are consumed as field separators first.
genes <- read.table(pj(genes_dir, paste0(f, ".rnk")), quote = "", header = FALSE)[1]
# Last 20 entries as a plain character vector, in file order.
lg <- as.character(tail(genes[[1]], 20))
p1 <- DotPlot(obj, features = lg, group.by = "celltype1",
            cols = c("#4684BD", "#C23D38")) +
    # RotatedAxis() +
    coord_flip() +
    # scale_y_discrete(limits = c("Neuroendocrine (uncertain)", "C5-EBV","C4-CDA","C3-Mixed","C2-Intestinal", "C1-Diffuse","Abandoned"), position = "left") +
    # NoLegend() +
    theme(
        # panel.grid.major = element_line(colour = "lightgrey", linetype = "dashed", linewidth = 0.1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.line.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank()
        # axis.text.x = element_text(angle = 90, hjust = 1)
    )
p1
ggsave(filename = pj(output_dir, paste(f, "tail_fea1.png", sep = "_")), plot = p1, width = 10, height = 8)
ggsave(filename = pj(output_dir, paste(f, "tail_fea1.pdf", sep = "_")), plot = p1, width = 10, height = 8)