In [3]:
library(data.table)
library(Seurat)
#library(SeuratData)
library(ggplot2)
source("../src/process_pdac.R")
source("../src/pdac_plots.R")
#library(parallelDist)
library(grid)
library(dplyr)
library(pheatmap)
library(DoubletFinder)
library(ggpubr)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(BSgenome.Hsapiens.UCSC.hg19)
library(robustbase)
library(gridExtra)
library(RColorBrewer)
library(stringr)
library(monocle)
library(fgsea)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: AnnotationDbi

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basena

# PDAC

In [4]:
# Locations of the PDAC expression matrix and cell-type annotation.
base_path <- file.path("..", "data")
pdac_matrix_file_name <- "CRA001160.matrix"
pdac_anno_file_name <- "CRA001160.celltypes.tsv"
pdac_matrix_path <- file.path(base_path, pdac_matrix_file_name)
pdac_annotation_path <- file.path(base_path, pdac_anno_file_name)

# Build the Seurat object from the expression matrix and the annotations.
pdac_anno_dt <- process_annotations(pdac_annotation_path)
seurat_obj <- create_full_seurat_object(
    read_gene_exp_mat(pdac_matrix_path),
    pdac_anno_dt
)
meta_data_dt <- data.table(seurat_obj@meta.data, keep.rownames = TRUE)
setnames(meta_data_dt, "rn", "cell.name")

# Cell names listed in the doublet file (single header-less column).
doublet_cells <- fread(file.path(base_path, "cell_doublets.tsv"), header = FALSE)$V1

# Drop the listed doublets from both the annotation table and the Seurat object.
filtered_cells <- meta_data_dt[!(cell.name %in% doublet_cells), cell.name]
pdac_anno_dt <- pdac_anno_dt[cell.name %in% filtered_cells, ]

seurat_obj <- NormalizeData(subset(seurat_obj, cells = filtered_cells))

# Refresh the metadata table so it matches the filtered object.
meta_data_dt <- data.table(seurat_obj@meta.data, keep.rownames = TRUE)
setnames(meta_data_dt, "rn", "cell.name")

# Previously saved edge/center annotation.
edge_info_all <- readRDS(file.path(base_path, "edge_info_all_doublet_filtered_celltypes.rds"))


“Detected 57530 column names but the data has 57531 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.”
“Feature names cannot have underscores ('_'), replacing with dashes ('-')”


## Comparison with trajectory analysis

In [None]:
# Load the saved CellDataSet and tag each cell of `cell_type` as Edge / Non-Edge.
cds_obj <- readRDS(file.path(base_path, "ductal_malignant_cds_obj.rds"))
cell_type <- "Ductal cell type 1"
cds_meta_data_dt <- data.table(pData(cds_obj))
cells <- cds_meta_data_dt[!(cell.name %in% doublet_cells), cell.name]
# cds_obj <- cds_obj[,cds_meta_data_dt$cell.name]

# Start from the original cluster label, then overwrite it for cells of `cell_type`.
cds_meta_data_dt[, sub_cluster := cluster]
cds_meta_data_dt[
    cell.name %in% edge_info_all$edge_center_dt[normal_cell_type==cell_type & cell_category == "edge",cell.name],
    sub_cluster := paste("Edge", cell_type)
]
cds_meta_data_dt[
    !grepl("Edge", sub_cluster) & cluster == cell_type,
    sub_cluster := paste("Non-Edge", cell_type)
]

# Write the augmented phenotype data back, keyed by cell name.
df <- as.data.frame(cds_meta_data_dt[, !c("cell.name")])
rownames(df) <- cds_meta_data_dt$cell.name
pData(cds_obj) <- df

In [None]:
# Side-by-side figure: trajectory coloured by sub_cluster (p1) and pseudotime boxplots per State (p2).
options(repr.plot.width=14,repr.plot.height=6)
# p1: trajectory plot; the legend entries are relabelled to Malignant / Outlier / Non-Outlier Ductal.
p1 <- plot_cell_trajectory( cds_obj, color_by="sub_cluster" ) + labs(color="Cell Type") + theme_classic(base_size=15) +
theme(legend.position=c(1,0.8),legend.justification = "right",legend.text=element_text(size=15)) +
scale_color_discrete(name="Cell Type",labels=c("Ductal cell type 2"="Malignant Ductal",
                                              "Edge Ductal cell type 1"="Outlier Ductal",
                                              "Non-Edge Ductal cell type 1"="Non-Outlier Ductal"))

# p2: pseudotime per State, coloured by sub_cluster.
# NOTE(review): the filter and the comparison below name "Acinar cell" groups, whereas the
# sub_cluster labels built for this object are "Edge/Non-Edge Ductal cell type 1" -- this looks
# like a leftover from an acinar analysis; confirm the intended groups.
# NOTE(review): x is "State", so the comparison pair does not name x-axis groups -- confirm.
p2 <- ggboxplot( cds_meta_data_dt[sub_cluster!="Acinar cell",], x="State",y="Pseudotime",color="sub_cluster" ) + 
theme_classic(base_size=15) + xlab("State") +
stat_compare_means(comparisons=list(c("Non-Edge Acinar cell","Edge Acinar cell")),label.y=7) +
theme(legend.position="none")
final <- ggarrange(p1,p2,nrow=1)
print(final)

In [None]:
options(repr.plot.width = 14, repr.plot.height = 6)

# Left panel: trajectory coloured by sub_cluster, legend placed inside the panel.
p1 <- plot_cell_trajectory(cds_obj, color_by = "sub_cluster") +
    labs(color = "") +
    theme_classic(base_size = 15) +
    theme(
        legend.position = c(1, 0.8),
        legend.justification = "right",
        legend.text = element_text(size = 15)
    )
# Alternative legend relabelling, kept disabled:
# scale_color_manual(breaks=c("Ductal cell type 2"="Malignant Ductal","Edge Ductal Cell Type 1"="Outlier Ductal",
#                            "Non-Edge Ductal cell type 1"="Non-Outlier Ductal"))

# Right panel: pseudotime per sub_cluster (rows still labelled with the bare
# `cell_type` are excluded), with an Edge vs Non-Edge comparison.
p2 <- ggboxplot(cds_meta_data_dt[sub_cluster != cell_type, ], x = "sub_cluster", y = "Pseudotime") +
    theme_classic(base_size = 15) +
    xlab("") +
    stat_compare_means(
        comparisons = list(c("Non-Edge Ductal cell type 1", "Edge Ductal cell type 1")),
        label.y = 20
    )

final <- ggarrange(p1, p2, nrow = 1)
print(final)

In [None]:
# Pseudotime boxplots per trajectory State, coloured by sub_cluster.
options(repr.plot.width=10,repr.plot.height=6)
# Disabled: faceted trajectory plot (sub_cluster x State).
# p1 <- plot_cell_trajectory( cds_obj, color_by="sub_cluster" ) + labs(color="") + theme_classic(base_size=15) + 
# facet_grid(rows=vars(sub_cluster),cols=vars(State)) + theme(legend.position="bottom",legend.justification = "right",legend.text=element_text(size=15)) #+ 
# # scale_color_manual(breaks=c("Ductal cell type 2"="Malignant Ductal","Edge Ductal Cell Type 1"="Outlier Ductal",
# #                            "Non-Edge Ductal cell type 1"="Non-Outlier Ductal"))


# NOTE(review): the comparison pair names sub_cluster values, but the x aesthetic is "State";
# confirm that stat_compare_means produces the intended Edge vs Non-Edge test here.
p2 <- ggboxplot( cds_meta_data_dt, color="sub_cluster",y="Pseudotime",x="State" ) + 
theme_classic(base_size=20) + labs(color="Cell Type") +
stat_compare_means(comparisons=list(c("Non-Edge Ductal cell type 1","Edge Ductal cell type 1"))) + 
theme(legend.position="bottom",legend.text=element_text(size=15))
# Auto-print the plot as the cell output.
p2
# final <- ggarrange(p1,p2,ncol=1)
# print(final)

## Sub-sampling reads and genes from edge cells

In [None]:
# Re-source the helper scripts.
# NOTE(review): the first cell sources these from "../src/"; confirm which location is intended.
source("process_pdac.R")
source("pdac_plots.R")
norm_type <- "Acinar cell"

options(repr.plot.width = 8, repr.plot.height = 3)
#theme_set(theme_classic(base_size = 15))

# Resample acinar edge cells in "reads" mode against malignant ductal cells.
resample_info_list <- resample_edge_cells(
    seurat_obj, pdac_anno_dt, edge_info_all$edge_center_dt,
    c("Acinar cell"), c("Ductal cell type 2"), "reads",
    no_tumour_adjacent = FALSE
)

# Panel a: log library size per cell category before resampling.
p1 <- ggplot(resample_info_list$resampled_meta_data_dt) +
    geom_boxplot(aes(x = cell_category_before_resampling, y = log(nCount_RNA_before_resampling))) +
    stat_compare_means(
        aes(x = cell_category_before_resampling, y = log(nCount_RNA_before_resampling)),
        label = "p.format", label.x = 1.5
    ) +
    theme_classic() +
    xlab("Acinar cell category") +
    ylab("Log(Library size)") +
    scale_x_discrete(labels = c("center" = "Non-edge", "edge" = "Edge"))

# Panel b: the same quantity after resampling.
p2 <- ggplot(resample_info_list$resampled_meta_data_dt) +
    geom_boxplot(aes(x = cell_category_after_resampling, y = log(nCount_RNA_after_resampling))) +
    stat_compare_means(
        aes(x = cell_category_after_resampling, y = log(nCount_RNA_after_resampling)),
        label = "p.format", label.x = 1.5
    ) +
    theme_classic() +
    ylab("") +
    xlab("Acinar cell category") +
    scale_x_discrete(labels = c("center" = "Non-edge", "edge" = "Edge"))

# Panels c and d come from the plotting helpers.
p3 <- plot_skewness(resample_info_list$edge_info$control_dist_dt) + theme_classic()
p4 <- plot_edge_distance_ratio(resample_info_list$edge_info$edge_malignant_dist_dt) + theme_classic()

final <- ggarrange(p1, p2, p3, p4, nrow = 1, ncol = 4, labels = "auto")
print(final)

# Count the cells categorised as edge both before and after resampling.
num_retained_edge_cells <- nrow(
    resample_info_list$resampled_meta_data_dt[
        cell_category_before_resampling == "edge" & cell_category_after_resampling == "edge",
    ]
)
print(paste(
    "# edge cells =",
    nrow(resample_info_list$resampled_meta_data_dt[cell_category_before_resampling == "edge", ])
))
print(paste("# common edge cells before and after resampling =", num_retained_edge_cells))
ggsave("supp_library_resampling.png", final, height = 3, width = 8)

In [None]:
# Re-source the helper scripts.
# NOTE(review): the first cell sources these from "../src/"; confirm which location is intended.
source("process_pdac.R")
source("pdac_plots.R")
norm_type <- "Acinar cell"

options(repr.plot.width = 8, repr.plot.height = 3)
#theme_set(theme_classic(base_size = 15))

# Resample acinar edge cells in "genes" mode against malignant ductal cells.
resample_info_list <- resample_edge_cells(
    seurat_obj, pdac_anno_dt, edge_info_all$edge_center_dt,
    c("Acinar cell"), c("Ductal cell type 2"), "genes",
    no_tumour_adjacent = FALSE
)

# Panel a: log number of detected features per cell category before resampling.
# The y-axis label used to read "Log(Library size)" (copied from the reads
# figure) although the plotted quantity is nFeature_RNA.
p1 <- ggplot(resample_info_list$resampled_meta_data_dt) +
    geom_boxplot(aes(x = cell_category_before_resampling, y = log(nFeature_RNA_before_resampling))) +
    stat_compare_means(
        aes(x = cell_category_before_resampling, y = log(nFeature_RNA_before_resampling)),
        label = "p.format", label.x = 1.5
    ) +
    theme_classic() +
    xlab("Acinar cell category") +
    ylab("Log(Number of genes)") +
    scale_x_discrete(labels = c("center" = "Non-edge", "edge" = "Edge"))

# Panel b: the same quantity after resampling.
p2 <- ggplot(resample_info_list$resampled_meta_data_dt) +
    geom_boxplot(aes(x = cell_category_after_resampling, y = log(nFeature_RNA_after_resampling))) +
    stat_compare_means(
        aes(x = cell_category_after_resampling, y = log(nFeature_RNA_after_resampling)),
        label = "p.format", label.x = 1.5
    ) +
    theme_classic() +
    ylab("") +
    xlab("Acinar cell category") +
    scale_x_discrete(labels = c("center" = "Non-edge", "edge" = "Edge"))

# Panels c and d come from the plotting helpers.
p3 <- plot_skewness(resample_info_list$edge_info$control_dist_dt) + theme_classic()
p4 <- plot_edge_distance_ratio(resample_info_list$edge_info$edge_malignant_dist_dt) + theme_classic()

final <- ggarrange(p1, p2, p3, p4, nrow = 1, ncol = 4, labels = "auto")
print(final)

# Count the cells categorised as edge both before and after resampling.
num_retained_edge_cells <- nrow(
    resample_info_list$resampled_meta_data_dt[
        cell_category_before_resampling == "edge" & cell_category_after_resampling == "edge",
    ]
)
print(paste(
    "# edge cells =",
    nrow(resample_info_list$resampled_meta_data_dt[cell_category_before_resampling == "edge", ])
))
print(paste("# common edge cells before and after resampling =", num_retained_edge_cells))
ggsave("supp_genes_resampling.png", final, height = 3, width = 8)

## Justifying merging normal and tumour-adjacent acinar cells

In [None]:
norm_type <- "Acinar cell"
# Acinar cells that come from tumour samples.
tumour_adjacent_cells <- meta_data_dt[cluster == norm_type & sample_type == "tumour", cell.name]

# `acinar_edge_info` is needed below, but the call that creates it had been
# commented out, so a fresh session failed with "object not found". Compute it
# only when it is not already available, using the arguments of the disabled call.
if (!exists("acinar_edge_info")) {
    acinar_edge_info <- add_edge_center_annotation(
        seurat_obj, pdac_anno_dt,
        malignant_cell_types = c("Ductal cell type 2"),
        normal_cell_types = c(norm_type),
        # Falls back to 50 (the value assigned to num_pcs elsewhere in this notebook).
        num_pcs = if (exists("num_pcs")) num_pcs else 50,
        pairwise_pca = TRUE, perform_control = FALSE, no_tumour_adjacent = TRUE
    )
}

cell_type_acinar_edge_center_dt <- acinar_edge_info$edge_center_dt

# Differential expression, keeping genes with adjusted p-value below 0.01.
diff_exp_dt <- get_edge_de_genes_dt(
    seurat_obj, cell_type_acinar_edge_center_dt,
    tumour_adjacent_cells = tumour_adjacent_cells
)
diff_exp_dt <- diff_exp_dt[p_val_adj < 0.01, ]

acinar_edge_cells <- cell_type_acinar_edge_center_dt[cell_category == "edge", cell.name]
non_acinar_edge_cells <- cell_type_acinar_edge_center_dt[cell_category != "edge", cell.name]



In [None]:
options(repr.plot.width = 8, repr.plot.height = 3)
theme_set(theme_classic())

# Genes that are DE in both comparisons, with one set of columns per comparison.
merged_diff_exp_dt <- merge(
    diff_exp_dt[de_set == "tumour_adjacent_vs_center"],
    diff_exp_dt[de_set == "edge_vs_center"],
    by = "gene_name",
    suffixes = c(".adj_vs_center", ".edge_vs_center")
)

# p1: log fold change in the two comparisons; the gray line is the y = x diagonal.
p1 <- ggplot(merged_diff_exp_dt) +
    geom_point(aes(x = avg_logFC.edge_vs_center, y = avg_logFC.adj_vs_center)) +
    geom_line(aes(x = avg_logFC.edge_vs_center, y = avg_logFC.edge_vs_center), color = "gray") +
    geom_hline(aes(yintercept = 0)) +
    geom_vline(aes(xintercept = 0)) +
    xlab("Average Log FC\n(Edge vs Non-Edge)") +
    ylab("Average Log FC\n(Tumor-adjacent vs Non-Edge)")

# Report the squared correlation and its p-value. The previous code pasted the
# whole cor.test() result object, which printed a deparsed list instead of R^2.
logfc_cor_test <- cor.test(
    merged_diff_exp_dt$avg_logFC.edge_vs_center,
    merged_diff_exp_dt$avg_logFC.adj_vs_center
)
print(paste(
    "R^2 =", signif(unname(logfc_cor_test$estimate)^2, 3),
    "p-value =", signif(logfc_cor_test$p.value, 3)
))

# p2: distribution of the log fold changes for each comparison.
plot_dt <- melt(
    merged_diff_exp_dt[, .(gene_name, avg_logFC.edge_vs_center, avg_logFC.adj_vs_center)],
    id.vars = "gene_name", value.name = "Average Log FC"
)
p2 <- ggplot(plot_dt) +
    geom_point(aes(x = variable, y = `Average Log FC`), position = "jitter", color = "gray") +
    geom_boxplot(aes(x = variable, y = `Average Log FC`), outlier.shape = NA, fill = NA, size = 1.0) +
    scale_x_discrete("", labels = c("avg_logFC.edge_vs_center" = "Edge vs\n Non-Edge",
                                    "avg_logFC.adj_vs_center" = "Tumor-Adjacent vs\n Non-Edge")) +
    stat_compare_means(aes(x = variable, y = `Average Log FC`), label.x = 1.5, label = "p.signif")

# p3: for acinar cells from tumour samples, distance to the normal medoid vs the malignant medoid.
acinar_edge_center_dt <- edge_info_all$edge_center_dt[
    normal_cell_type == "Acinar cell",
    .(cell.name, dist_from_normal_medoid, dist_from_malignant_medoid)
]
acinar_edge_center_dt <- merge(acinar_edge_center_dt, meta_data_dt[, .(cell.name, sample_type)])
melted_dt <- melt(
    acinar_edge_center_dt[sample_type == "tumour", ][, !c("sample_type")],
    id.vars = c("cell.name")
)
p3 <- ggplot(melted_dt) +
    stat_compare_means(aes(x = variable, y = value), label = "..p.format..",
                       method.args = list("alternative" = "greater"), label.x = 1.5) +
    geom_boxplot(aes(x = variable, y = value)) +
    scale_x_discrete("", labels = c("dist_from_normal_medoid" = "From\nacinar",
                                    "dist_from_malignant_medoid" = "From\nmalignant ductal")) +
    ylab("Distance in PC space")

# Fixed: a missing comma turned `p3, ncol=3` into a bogus named argument `p3ncol=3`.
ggarrange(p1, p2, p3, ncol = 3, nrow = 1)

In [None]:
# Report how many genes are DE in each comparison and how many are shared.
print(paste0(
    "# of DE genes (edge vs center) ",
    diff_exp_dt[de_set == "edge_vs_center", .N]
))
print(paste0(
    "# of DE genes (tumour-adjacent vs center) ",
    diff_exp_dt[de_set == "tumour_adjacent_vs_center", .N]
))
print(paste0(
    "# of common DE genes ",
    nrow(merged_diff_exp_dt)
))

## Edge-ness fingerprint plots

In [None]:
all_cell_types <- unique(pdac_anno_dt$cluster)
across_refs_edge_center_dt <- data.table()
num_pcs <- 50

# Every cluster other than the malignant one is treated as a normal cell type.
malignant_cell_types <- c("Ductal cell type 2")
normal_cell_types <- setdiff(unique(pdac_anno_dt$cluster), malignant_cell_types)

edge_info_all <- add_edge_center_annotation(
    seurat_obj, pdac_anno_dt,
    malignant_cell_types = malignant_cell_types,
    normal_cell_types = normal_cell_types,
    num_pcs = num_pcs,
    pairwise_pca = TRUE, perform_control = TRUE, no_tumour_adjacent = FALSE
)

# NOTE(review): this writes to the working directory, whereas the loading cell
# reads the same file name from `base_path` -- confirm the intended location.
saveRDS(edge_info_all, "edge_info_all_doublet_filtered_celltypes.rds")


## Comparison with embryonic progenitor genes

In [5]:
# Per-cell-type results, filled in by the loop below.
diff_exp_dt_list <- list()

edge_non_edge_zscores_list <- list()
edge_mean_exp_list <- list()
non_edge_mean_exp_list <- list()

for (cell_type in c("Acinar cell", "Ductal cell type 1")) {
    edge_center_dt <- edge_info_all$edge_center_dt[normal_cell_type == cell_type, ]

    # DE genes for this cell type, keeping adjusted p-value below 0.01.
    diff_exp_dt_list[[cell_type]] <- get_edge_de_genes_dt(seurat_obj, edge_center_dt)[p_val_adj < 0.01, ]

    # Scale expression within this cell type only.
    subset_seurat_obj <- ScaleData(subset(seurat_obj, cells = edge_center_dt$cell.name))

    # Mean of the `data` slot for edge and for center cells.
    edge_mean_exp_list[[cell_type]] <- rowMeans(
        subset_seurat_obj[["RNA"]]@data[, edge_center_dt[cell_category == "edge", cell.name]]
    )
    non_edge_mean_exp_list[[cell_type]] <- rowMeans(
        subset_seurat_obj[["RNA"]]@data[, edge_center_dt[cell_category == "center", cell.name]]
    )

    # Difference of mean scaled expression: edge minus center.
    edge_non_edge_zscores <-
        rowMeans(subset_seurat_obj[["RNA"]]@scale.data[, edge_center_dt[cell_category == "edge", cell.name]]) -
        rowMeans(subset_seurat_obj[["RNA"]]@scale.data[, edge_center_dt[cell_category == "center", cell.name]])

    edge_non_edge_zscores_list[[cell_type]] <- edge_non_edge_zscores
}
rm(subset_seurat_obj)

derived_signatures <- list()

# Set algebra over the two DE gene lists.
common_edge_genes <- intersect(
    diff_exp_dt_list[["Acinar cell"]]$gene_name,
    diff_exp_dt_list[["Ductal cell type 1"]]$gene_name
)
acinar_only_edge_genes <- setdiff(diff_exp_dt_list[["Acinar cell"]]$gene_name, common_edge_genes)
ductal_only_edge_genes <- setdiff(diff_exp_dt_list[["Ductal cell type 1"]]$gene_name, common_edge_genes)
all_edge_genes <- union(
    diff_exp_dt_list[["Acinar cell"]]$gene_name,
    diff_exp_dt_list[["Ductal cell type 1"]]$gene_name
)

derived_signatures[["Edge Acinar All"]] <- diff_exp_dt_list[["Acinar cell"]]$gene_name
derived_signatures[["Edge Acinar Unique"]] <- acinar_only_edge_genes
derived_signatures[["Outlier Ductal All"]] <- diff_exp_dt_list[["Ductal cell type 1"]]$gene_name
derived_signatures[["Outlier Ductal Unique"]] <- ductal_only_edge_genes
derived_signatures[["Outlier Universe"]] <- all_edge_genes
derived_signatures[["Edge Outlier Common"]] <- common_edge_genes


Centering and scaling data matrix

Centering and scaling data matrix



In [None]:
# Load the marker table and drop its references column.
marker_table <- fread("progenitor_marker_table.tsv")[, !c("Stainings\ and\ References")]

# Combine the per-cell-type columns with the progenitor column into "<from> to <to>" strings.
set(marker_table, i = NULL, j = "Acinar -> Progenitor",
    value = paste(marker_table$`Acinar cell`, marker_table$`Embryonic progenitor`, sep = " to "))
set(marker_table, i = NULL, j = "Ductal -> Progenitor",
    value = paste(marker_table$`Duct cell`, marker_table$`Embryonic progenitor`, sep = " to "))
marker_table <- marker_table[, !c("Acinar cell", "Duct cell", "Embryonic progenitor", "")]
marker_genes <- marker_table$Marker

# Add extra markers, then keep only genes that have a z-score entry.
new_markers <- c("STAT3", "SEL1L", "CBL", "KLF4", "CTNND1", "LKB1", "ICAM1", "DCLK1", "CDKN1A")
marker_genes <- unique(c(marker_genes, new_markers))
marker_genes <- marker_genes[marker_genes %in% names(edge_non_edge_zscores_list[["Acinar cell"]])]

# Append a row for every retained gene that is not yet in the table.
for (gene in marker_genes) {
    if (!(gene %in% marker_table$Marker)) {
        marker_table <- rbind(marker_table, list(gene, "- to +", "- to +"))
    }
}

rows_to_update <- seq_len(nrow(marker_table))
for (cell_type in c("Acinar cell", "Ductal cell type 1")) {
    # "*" flags markers that are in this cell type's DE gene list.
    de_status_vec <- rep("", length(marker_genes))
    names(de_status_vec) <- marker_genes
    edge_column_values <- rep("N/A", length(marker_genes))
    names(edge_column_values) <- marker_genes

    de_marker_genes <- marker_genes[marker_genes %in% diff_exp_dt_list[[cell_type]]$gene_name]
    de_status_vec[de_marker_genes] <- "*"

    # Rounded z-score difference followed by the DE flag.
    edge_column_values[marker_genes] <- paste(
        round(edge_non_edge_zscores_list[[cell_type]][marker_genes], 2),
        de_status_vec
    )
    #edge_column_values[is.na(edge_column_values)] <- "N/A"

    # Column name uses the first word of the cell type.
    edge_column_name <- paste(gsub("\ .*", "", cell_type), "Non-Edge -> Edge")

    set(marker_table,
        i = rows_to_update[marker_table$Marker %in% marker_genes],
        j = edge_column_name,
        value = edge_column_values)
}

In [None]:
progenitor_count_mat <- read_gene_exp_mat("/Users/sreenivasagopv2/Data/sc-funnel/GSM4194789_TMM_counts_CPM.csv.gz")

if (!file.exists("progenitor_annotation.tsv")) {
    # No cached annotation: filter, cluster and annotate the progenitor data from scratch.
    progenitor_seurat_obj <- CreateSeuratObject( progenitor_count_mat )
    progenitor_seurat_obj[["percent.mt"]] <- PercentageFeatureSet(progenitor_seurat_obj, pattern = "^MT-")
    dying_cells <- WhichCells( progenitor_seurat_obj, expression='percent.mt > 10' )

    progenitor_seurat_obj <- NormalizeData(progenitor_seurat_obj)
    # RunPCA takes the number of components via `npcs`; the previous `dims=50`
    # is not a RunPCA argument.
    progenitor_seurat_obj <- ScaleData(progenitor_seurat_obj) %>%
        FindVariableFeatures(., nfeatures = 1000) %>%
        RunPCA(., npcs = 50) %>%
        RunUMAP(., dims = 1:50)

    # NOTE(review): no `cluster` column has been created at this point and
    # homotypic.prop is only used by the disabled adjustment below -- confirm it is still needed.
    homotypic.prop <- modelHomotypic(progenitor_seurat_obj@meta.data$cluster)
    # Expected number of doublets: 5% of the cells.
    nExp_poi <- round(0.05*nrow(progenitor_seurat_obj@meta.data))
    #nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))

    # Use the pK with the highest BCmetric from the parameter sweep.
    sample_param_sweep <- paramSweep_v3(progenitor_seurat_obj, PCs = 1:50, sct = FALSE)
    sample_param_sweep_summary <- summarizeSweep(sample_param_sweep, GT = FALSE)
    sweep_dt <- data.table( find.pK( sample_param_sweep_summary ) )
    optimal_pK <- as.double(as.vector(sweep_dt[order(-BCmetric),][1]$pK))
    progenitor_seurat_obj <- doubletFinder_v3(progenitor_seurat_obj,
                               PCs = 1:50, pN = 0.25, pK = optimal_pK, nExp = nExp_poi,
                                          reuse.pANN = FALSE, sct = FALSE)

    sample_meta_data_dt <- data.table( progenitor_seurat_obj@meta.data, keep.rownames = TRUE ) %>% setnames(.,"rn","cell.name")
    doublet_class_col <- paste("DF.classifications_0.25",optimal_pK,nExp_poi,sep="_")
    doublet_info_dt <- sample_meta_data_dt[,c("cell.name",doublet_class_col),with=FALSE] %>% setnames(.,doublet_class_col,"doublet_class")

    # Fixed: this used to index `progenitor_meta_data_dt`, which is not defined
    # yet in this branch; the doublet table already holds the cell names.
    cell_doublets <- doublet_info_dt[doublet_class == "Doublet", cell.name]
    #fwrite( data.table(cell.name=cell_doublets), "progenitor_cell_doublets.tsv", sep="\t", col.names=F, quote=F)

    # Fixed: remove the progenitor doublets found above (`cell_doublets`), not
    # `doublet_cells`, which holds the PDAC doublets loaded at the start of the notebook.
    cells_to_retain <- setdiff( Cells(progenitor_seurat_obj), c(dying_cells, cell_doublets) )
    progenitor_seurat_obj <- subset( progenitor_seurat_obj, cells=cells_to_retain )

    # Re-run the pipeline on the retained cells and cluster them.
    progenitor_seurat_obj <- NormalizeData(progenitor_seurat_obj) %>% ScaleData(.) %>%
        FindVariableFeatures(., nfeatures = 1000) %>%
        RunPCA(., npcs = 50) %>%
        FindNeighbors(., reduction = "pca", k.param = 30) %>%
        FindClusters(.) %>%
        RunUMAP(., dims = 1:50)
    progenitor_meta_data_dt <- data.table( progenitor_seurat_obj@meta.data, keep.rownames=TRUE ) %>% setnames(.,"rn","cell.name")
    fwrite( progenitor_meta_data_dt, "progenitor_annotation.tsv", sep="\t", row.names=FALSE, quote=FALSE)
} else {
    # Cached annotation available: rebuild the object from it.
    progenitor_meta_data_dt <- fread("progenitor_annotation.tsv")
    progenitor_seurat_obj <- create_full_seurat_object( progenitor_count_mat, progenitor_meta_data_dt )
    progenitor_seurat_obj <- NormalizeData( progenitor_seurat_obj )
}

#Cluster 7 contains Sox9+,PTF1a+ cells
# Defined after the if/else so it exists whichever branch ran.
# NOTE(review): cluster numbering may differ after a fresh clustering run --
# confirm that cluster 7 is still the Sox9+/PTF1a+ cluster in that case.
sox9_ptf1a_cells <- progenitor_meta_data_dt[seurat_clusters == 7, cell.name]

progenitor_seurat_obj <- SetIdent( progenitor_seurat_obj, value="seurat_clusters" )
progenitor_markers_df <- FindMarkers( progenitor_seurat_obj, ident.1="7")
progenitor_markers_dt <- data.table( progenitor_markers_df, keep.rownames=TRUE ) %>% setnames(.,"rn","gene_name")
progenitor_genes <- progenitor_markers_dt[p_val < 0.1 & avg_logFC > 0,gene_name]
# Keep only genes present in the PDAC object. The gene universe is read directly
# from seurat_obj because `all_genes` is only assigned in a later cell.
progenitor_genes <- progenitor_genes[progenitor_genes %in% rownames(seurat_obj[["RNA"]]@data)]


In [None]:
# Pairwise cosine similarity between the columns of X and the columns of Y.
#
# X, Y: numeric matrices (or objects coercible with as.matrix) with the same
#       number of rows; each column is treated as one vector.
# Returns a dense ncol(X) x ncol(Y) matrix whose [i, j] entry is the cosine of
# the angle between X[, i] and Y[, j]. Row names are colnames(X) and column
# names are colnames(Y). A column with zero norm yields NaN entries.
#
# The normalisation is done on whole matrices, so inputs with a single row or
# with zero columns keep their matrix shape (the earlier per-column sapply()
# collapsed a one-row input to a vector and then failed when restoring dimnames).
get_cosine_matrix <- function(X,Y) {
    # Divide every column of `mat` by its Euclidean norm.
    normalise_columns <- function(mat) {
        mat <- as.matrix(mat)
        col_norms <- sqrt(colSums(mat^2))
        # t(mat) has one row per original column, so the norm vector is
        # recycled down the rows, matching each column with its own norm.
        t(t(mat) / col_norms)
    }

    dist_mat <- t(normalise_columns(X)) %*% normalise_columns(Y)
    return(dist_mat)
}

# Cosine similarity of every query cell to a set of reference cells.
#
# query_mat, reference_mat: gene x cell matrices.
# genes_to_use: optional vector of row names; when given, both matrices are
#               restricted to these rows first.
# dist_from: "medoid" -- similarity of each query cell to the reference medoid
#                        (the reference cell with the largest summed similarity
#                        to all reference cells);
#            "all"    -- mean similarity of each query cell to all reference
#                        cells, plus a standard deviation.
# return_sd: currently not used by the function body.
# Returns a list with element `dist` and, for dist_from = "all", also `sd`.
compute_cell_wise_cosine_similarity <- function( query_mat, reference_mat, genes_to_use=NULL,
                                                dist_from="medoid",return_sd=FALSE ) {
    # Fail with a clear message on an unknown mode; previously this fell
    # through both branches and ended in "object 'to_return' not found".
    dist_from <- match.arg(dist_from, c("medoid", "all"))

    if (!is.null(genes_to_use)) {
        # drop = FALSE keeps matrices as matrices when only one gene or one
        # cell is selected.
        query_mat <- query_mat[genes_to_use, , drop = FALSE]
        reference_mat <- reference_mat[genes_to_use, , drop = FALSE]
    }

    query_ref_dist_mat <- get_cosine_matrix( query_mat, reference_mat )

    if (dist_from == "medoid") {
        ref_dist_mat <- get_cosine_matrix( reference_mat, reference_mat )
        medoid <- which.max(rowSums(ref_dist_mat))
        distances <- query_ref_dist_mat[, medoid]
        to_return <- list("dist"=distances)
    } else {
        distances <- Matrix::rowMeans(query_ref_dist_mat)
        num_ref_cells <- ncol(reference_mat)
        # Sample variance from the first two moments; clamp at zero because
        # rounding can make the difference slightly negative.
        variance <- (num_ref_cells/(num_ref_cells-1)) * (rowMeans(query_ref_dist_mat^2) - distances^2)
        sd_distances <- sqrt(pmax(variance, 0))
        to_return <- list("dist"=distances,"sd"=sd_distances)
    }

    return(to_return)
}

# Scatter plot of two similarity summaries against each other.
#
# dist_list_x, dist_list_y: lists with elements `dist` and `sd`.
# Each point is (x dist, y dist); gray bars extend 1.96 * sd either side along
# each axis, and a slope-1 line through the origin is drawn for reference.
# Both axes are limited to [0, 1]. Returns the ggplot object.
plot_similarity_scatter <- function( dist_list_x, dist_list_y ) {
    dt <- data.table(
        x_mean = dist_list_x$dist,
        y_mean = dist_list_y$dist,
        x_sd = dist_list_x$sd,
        y_sd = dist_list_y$sd
    )

    # Horizontal interval around each point.
    horizontal_bars <- geom_pointrange(
        aes(x = x_mean, y = y_mean, xmin = x_mean - 1.96 * x_sd, xmax = x_mean + 1.96 * x_sd),
        fatten = 1, color = "gray", size = 0.5
    )
    # Vertical interval around each point.
    vertical_bars <- geom_pointrange(
        aes(x = x_mean, y = y_mean, ymin = y_mean - 1.96 * y_sd, ymax = y_mean + 1.96 * y_sd),
        fatten = 1, color = "gray", size = 0.5
    )

    p <- ggplot(dt) +
        horizontal_bars +
        vertical_bars +
        geom_abline(slope = 1, intercept = 0) +
        geom_point(aes(x = x_mean, y = y_mean), color = "black", size = 1) +
        xlim(0, 1) +
        ylim(0, 1)

    return(p)
}

In [None]:
# Expression of the selected progenitor cells (all genes).
progenitor_mat <- progenitor_seurat_obj[["RNA"]]@data[, sox9_ptf1a_cells]

# Cell names per group, taken from the edge/center annotation.
acinar_edge_cells <- edge_info_all$edge_center_dt[
    normal_cell_type == "Acinar cell" & cell_category == "edge", cell.name
]
acinar_non_edge_cells <- edge_info_all$edge_center_dt[
    normal_cell_type == "Acinar cell" & cell_category != "edge", cell.name
]

ductal_outlier_cells <- edge_info_all$edge_center_dt[
    normal_cell_type == "Ductal cell type 1" & cell_category == "edge", cell.name
]
ductal_non_outlier_cells <- edge_info_all$edge_center_dt[
    normal_cell_type == "Ductal cell type 1" & cell_category != "edge", cell.name
]

malignant_ductal_cells <- meta_data_dt[cluster == "Ductal cell type 2", cell.name]

# acinar_seurat_obj <- 
# PDAC expression restricted to the progenitor genes, one matrix per group.
ductal_outlier_data_mat <- seurat_obj[["RNA"]]@data[progenitor_genes, ductal_outlier_cells]
acinar_edge_data_mat <- seurat_obj[["RNA"]]@data[progenitor_genes, acinar_edge_cells]
ductal_non_outlier_data_mat <- seurat_obj[["RNA"]]@data[progenitor_genes, ductal_non_outlier_cells]
acinar_non_edge_data_mat <- seurat_obj[["RNA"]]@data[progenitor_genes, acinar_non_edge_cells]
malignant_data_mat <- seurat_obj[["RNA"]]@data[progenitor_genes, malignant_ductal_cells]


In [None]:
# Cosine similarity of each PDAC cell group to the progenitor medoid.
ret_edge_acinar <- compute_cell_wise_cosine_similarity( acinar_edge_data_mat, progenitor_mat,
                                           genes_to_use = progenitor_genes, dist_from="medoid")
ret_outlier_ductal <- compute_cell_wise_cosine_similarity( ductal_outlier_data_mat, progenitor_mat,
                                           genes_to_use = progenitor_genes, dist_from="medoid")

ret_non_edge_acinar <- compute_cell_wise_cosine_similarity( acinar_non_edge_data_mat,progenitor_mat,
                                           genes_to_use = progenitor_genes, dist_from="medoid")
ret_non_outlier_ductal <- compute_cell_wise_cosine_similarity( ductal_non_outlier_data_mat, progenitor_mat,
                                           genes_to_use = progenitor_genes, dist_from="medoid")

# Fixed: the malignant matrix is named `malignant_data_mat`; the previous
# `malignant_ductal_mat` is never defined.
ret_malignant <- compute_cell_wise_cosine_similarity( malignant_data_mat, progenitor_mat,
                                            genes_to_use = progenitor_genes, dist_from="medoid")

edge_acinar_dist_dt <- data.table(value=ret_edge_acinar$dist,variable="Edge Acinar")
outlier_ductal_dist_dt <- data.table(value=ret_outlier_ductal$dist,variable="Outlier Ductal")
non_edge_acinar_dist_dt <- data.table(value=ret_non_edge_acinar$dist,variable="Non-Edge Acinar")
non_outlier_ductal_dist_dt <- data.table(value=ret_non_outlier_ductal$dist,variable="Non-Outlier Ductal")
malignant_dist_dt <- data.table(value=ret_malignant$dist,variable="Malignant Ductal")


options(repr.plot.width=4,repr.plot.height=6)
plot_dt <- rbind(edge_acinar_dist_dt,non_edge_acinar_dist_dt,outlier_ductal_dist_dt,
                 non_outlier_ductal_dist_dt,malignant_dist_dt)

# Colour groups. The previous patterns ("Ductal cell type 1" / "Ductal cell
# type 2") never occur in `variable`, so the ductal groups were left unmerged.
# In this figure all ductal groups, malignant included, share the "Ductal" colour.
plot_dt[, cell_type := variable]
plot_dt[grepl("Acinar", variable), cell_type := "Acinar"]
plot_dt[grepl("Ductal", variable), cell_type := "Ductal"]

ggboxplot(plot_dt,x="variable",y="value",color="cell_type") +
theme_classic(base_size=15) + ylab("Cosine similarity") + xlab("") +
# theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5)) +
stat_compare_means(comparisons=list(c("Edge Acinar","Outlier Ductal"),
                                    c("Edge Acinar","Non-Edge Acinar"),
                                   c("Outlier Ductal","Non-Outlier Ductal"),
                                   c("Malignant Ductal","Edge Acinar")),step.increase=0.2,label.x=1.5,label="p.signif",size=8) +
scale_x_discrete(labels=c("Edge Acinar"="E","Outlier Ductal"="O","Non-Edge Acinar"="NE","Non-Outlier Ductal"="NO")) +
labs(color="Cell Type") + theme(legend.position="bottom") + ylim(0,1.1)

In [None]:
options(repr.plot.width=6,repr.plot.height=8)
plot_dt <- rbind(edge_acinar_dist_dt,non_edge_acinar_dist_dt,outlier_ductal_dist_dt,
                 non_outlier_ductal_dist_dt,malignant_dist_dt)

# Colour groups. The previous patterns ("Ductal cell type 1" / "Ductal cell
# type 2") never occur in `variable`, so no ductal group was ever relabelled.
# "Outlier Ductal" also matches "Non-Outlier Ductal"; malignant cells get their own group.
plot_dt[, cell_type := variable]
plot_dt[grepl("Acinar", variable), cell_type := "Acinar"]
plot_dt[grepl("Outlier Ductal", variable), cell_type := "Ductal"]
plot_dt[grepl("Malignant Ductal", variable), cell_type := "Malignant"]

ggboxplot(plot_dt,x="variable",y="value",color="cell_type") +
theme_classic(base_size=15) + ylab("Cosine similarity") + xlab("") +
# theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5)) +
stat_compare_means(comparisons=list(c("Edge Acinar","Outlier Ductal"),
                                    c("Edge Acinar","Non-Edge Acinar"),
                                   c("Outlier Ductal","Non-Outlier Ductal"),
                                   c("Malignant Ductal","Edge Acinar")),label.x=1.5,label="p.signif",size=8) +
scale_x_discrete(labels=c("Edge Acinar"="E","Outlier Ductal"="O","Non-Edge Acinar"="NE",
                          "Non-Outlier Ductal"="NO","Malignant Ductal"="Mal")) +
labs(color="Cell Type") + theme(legend.position="bottom")

## FGSEA enrichment

In [None]:
# Hallmark gene sets.
pathways_h <- gmtPathways(file.path(base_path,"h.all.v7.0.symbols.gmt"))

# Gene universe: all genes in the PDAC object.
all_genes <- rownames(seurat_obj[["RNA"]]@data)

# One CancerSEA gene set per file, named "CancerSEA_<file name up to the first dot>".
cancersea_gene_sets <- list()
signature_file_paths <- list.files(file.path(base_path,"cancer-sea"),full.names=TRUE)
# seq_along() makes the loop a no-op when the directory is empty.
for (idx in seq_along(signature_file_paths)) {
    signature_genes_dt <- fread(signature_file_paths[idx])
    # Take the name from the file's base name. Splitting the full path on "."
    # returned an empty string for paths under "../data", so every set was
    # stored under the same key "CancerSEA_" and overwrote the previous one.
    temp <- sub("\\..*$", "", basename(signature_file_paths[idx]))
    signature_name <- paste( "CancerSEA", temp, sep="_" )
    signature_gene_names <- signature_genes_dt$GeneName
    cancersea_gene_sets[[signature_name]] <- signature_gene_names
}
pathways_all <- c(pathways_h,cancersea_gene_sets)

num_pathways <- length(pathways_all)

# Restrict every pathway to genes in the universe and drop pathways left empty.
pathways_all <- lapply( pathways_all, function(pathway_genes) {
    pathway_genes[pathway_genes %in% all_genes]
})
pathways_all <- pathways_all[lengths(pathways_all) > 0]


### Conventional GSEA

In [None]:
normal_cell_types <- c("Acinar cell", "Ductal cell type 1")
malignant_cell_type <- "Ductal cell type 2"
malignant_cells <- meta_data_dt[cluster == malignant_cell_type, cell.name]

# Result table, one row per pathway; comparison columns are merged in later.
fgsea_dt <- data.table(pathway = names(pathways_all))

# Each comparison is a pair (group 1, group 2). A leading "edge " or "center "
# selects that category within the named normal cell type.
comparison_list <- list(
    "Malignant v All Acinar" = c("Ductal cell type 2", "Acinar cell"),
    "Edge v Non-Edge Acinar" = c("edge Acinar cell", "center Acinar cell"),
    "Malignant v All Ductal" = c("Ductal cell type 2", "Ductal cell type 1"),
    "Outlier v Non-Outlier Ductal" = c("edge Ductal cell type 1", "center Ductal cell type 1")
)

# Resolve a group label to cell names.
#
# cluster_name: either a plain cluster label (looked up in meta_data_dt), or a
#               label containing "edge" / "center", in which case that word plus
#               the following space is removed and the remainder is looked up as
#               a normal cell type of that category in edge_info_all$edge_center_dt.
# Returns the matching cell names.
get_cells <- function(cluster_name) {
    has_edge <- grepl("edge", cluster_name)

    # Plain cluster label: no category word present.
    if (!has_edge && !grepl("center", cluster_name)) {
        return(meta_data_dt[cluster == cluster_name, cell.name])
    }

    cell_category_ <- if (has_edge) "edge" else "center"
    # Strip "<category> " from the label to recover the cell type name.
    cluster_name <- gsub(paste(cell_category_, ""), "", cluster_name)
    cells <- edge_info_all$edge_center_dt[
        normal_cell_type == cluster_name & cell_category == cell_category_,
        cell.name
    ]
    return(cells)
}


# For every comparison: scale the union of both groups, rank genes by the
# difference of mean scaled expression, and append padj/NES columns to fgsea_dt.
for (comparison in names(comparison_list)) {
    print(comparison)
    flush.console()

    column_pair <- comparison_list[[comparison]]
    cells_1 <- get_cells(column_pair[1])
    cells_2 <- get_cells(column_pair[2])
    cells <- union(cells_1, cells_2)

    subset_seurat_obj <- ScaleData(subset(seurat_obj, cells = cells))
    scaled_data <- subset_seurat_obj[["RNA"]]@scale.data

    # Group 1 minus group 2.
    zscore_diff <- rowMeans(scaled_data[, cells_1]) - rowMeans(scaled_data[, cells_2])

    dt <- run_fgsea(pathways_all, zscore_diff)
    fgsea_dt <- merge(fgsea_dt, dt[, .(pathway, padj, NES)], by = "pathway")
    # Suffix the new columns with the comparison name.
    setnames(fgsea_dt, "padj", paste("padj", comparison))
    setnames(fgsea_dt, "NES", paste("NES", comparison))

    # Free the large intermediates before the next iteration.
    rm(scaled_data)
    rm(subset_seurat_obj)
}


In [None]:
# Hierarchically cluster the rows of `disp_mat` and label each row with its cluster.
#
# disp_mat: numeric matrix with one row per pathway; rows are z-scored before
#   distances are computed.
# num_clusters: number of clusters to cut the dendrogram into.
# Returns a one-column data.table (`Gene Set` = "Cluster <k>") whose row names
# are set to the row names of `disp_mat`.
#
# The palette that used to be built here was never returned or used, and it made
# the function fail for more than 8 clusters ("Accent" has 8 colours) and warn
# for fewer than 3, so it has been removed.
make_pathway_annotation <- function(disp_mat, num_clusters) {
    # scale() works column-wise, hence the double transpose for row-wise z-scores.
    row_scaled_mat <- t(scale(t(disp_mat)))
    pathway_hclust_obj <- hclust(dist(row_scaled_mat))
#     pathway_hclust_obj <- hclust( dist(disp_mat), method="complete" )

    pathway_clusters <- cutree(tree = pathway_hclust_obj, k = num_clusters)

    pathway_cluster_dt <- data.table(`Gene Set` = paste("Cluster", pathway_clusters))
    rownames(pathway_cluster_dt) <- names(pathway_clusters)

    return(pathway_cluster_dt)
}

In [None]:
# Logical mask over the rows of `fgsea_dt`: TRUE where the adjusted p-value of
# comparison `col` (column "padj <col>") is below 0.05. NA p-values stay NA.
get_sig <- function(col) {
    padj_column <- paste("padj", col)
    fgsea_dt[[padj_column]] < 0.05
}
# Assign every pathway to exactly one "Enriched in" category. All pathways start
# in the catch-all "C8:Misc."; the statements below overwrite that label, and
# LATER statements overwrite EARLIER ones, so their order defines the priority.
cluster_dt <- data.table(pathway=fgsea_dt$pathway,`Enriched in`="C8:Misc.")

# Single-comparison categories. A pathway significant in several of these keeps
# the label of the last matching line (e.g. C3/C4 win over C6/C7).
cluster_dt[get_sig("Edge v Non-Edge Acinar"),`Enriched in`:="C6:EA-NEA"]
cluster_dt[get_sig("Outlier v Non-Outlier Ductal"),`Enriched in`:="C7:OD-NOD"]
cluster_dt[get_sig("Malignant v All Acinar"),`Enriched in`:="C3:Mal-Aci"]
cluster_dt[get_sig("Malignant v All Ductal"),`Enriched in`:="C4:Mal-Duc"]


# Pairwise intersections override the single-comparison labels; C5 is applied
# after C2, so a pathway meeting both conditions (but not all four) ends as C5.
cluster_dt[get_sig("Malignant v All Ductal") & get_sig("Malignant v All Acinar"),
                   `Enriched in`:="C2:Mal-Duc & Mal-Aci"]
cluster_dt[get_sig("Edge v Non-Edge Acinar") & get_sig("Outlier v Non-Outlier Ductal"),
           `Enriched in`:="C5:EA-NEA & OD-NOD"]
# Significant in all four comparisons: highest priority, applied last.
cluster_dt[get_sig("Malignant v All Ductal") & get_sig("Malignant v All Acinar") &
                get_sig("Edge v Non-Edge Acinar") & get_sig("Outlier v Non-Outlier Ductal"),
                   `Enriched in`:="C1:All"]

# Sort rows by category label (C1 ... C8).
cluster_dt <- cluster_dt[order(`Enriched in`),]


In [None]:
# Build the row annotation for the enrichment heatmap: pathways grouped by
# category in `row_order`, each category label suffixed with its size, plus the
# row indices after which the heatmap gets a gap.
row_order <- c("C1:All","C2:Mal-Duc & Mal-Aci",
               "C3:Mal-Aci",
               "C4:Mal-Duc",
               "C5:EA-NEA & OD-NOD",
               "C6:EA-NEA",
               "C7:OD-NOD")
               #"C8:Misc.")
category_tables <- list()
for (row in row_order) {
    # The catch-all category is never displayed. The label must be compared in
    # full ("C8:Misc."); the previous check against "Misc." could never match.
    if (row == "C8:Misc.")
        next
    dt <- cluster_dt[`Enriched in` == row,]
    # Skip empty categories: assigning a label to a zero-row table fails, and an
    # empty category would otherwise add a duplicate gap position.
    if (nrow(dt) == 0)
        next
    dt$`Enriched in` <- paste(row, paste0("(",nrow(dt),")"))
    category_tables[[row]] <- dt
}
if (length(category_tables) == 0) {
    stop("No pathway is significant in any displayed category; nothing to annotate")
}

# Bind once instead of growing the table inside the loop.
pathway_cluster_dt <- rbindlist(category_tables)
# Cumulative category sizes = row index of the last pathway of each category.
gap_rows <- unname(cumsum(vapply(category_tables, nrow, integer(1))))
idx <- nrow(pathway_cluster_dt)
new_row_entries <- pathway_cluster_dt$`Enriched in`

# Factor levels follow the display order rather than alphabetical order.
pathway_cluster_dt$`Enriched in` <- factor(pathway_cluster_dt$`Enriched in`,levels=unique(new_row_entries))
pathway_order <- pathway_cluster_dt$pathway
pathway_cluster_dt <- pathway_cluster_dt[,!c("pathway")]

In [None]:
# Notebook plot size and default ggplot theme for this section.
options(repr.plot.width = 20, repr.plot.height = 20)
theme_set(theme_gray(base_size = 8))

# Split the fgsea result columns into NES and adjusted-p-value columns by name.
all_col_names <- names(fgsea_dt)
nes_cols <- grep("NES", all_col_names, value = TRUE)
padj_cols <- grep("padj", all_col_names, value = TRUE)

# sig_pathways <- melt(fgsea_dt[,c("pathway",padj_cols),with=F],id.vars="pathway")[value < 0.05,pathway] %>% unique(.)
# Put pathways in annotation order; pathways absent from `pathway_order` sort last.
fgsea_dt <- fgsea_dt[order(match(pathway, pathway_order)), ]

# NES matrix: rows = pathways, columns = comparison names ("NES " prefix removed).
pathway_mat <- as.matrix(fgsea_dt[, nes_cols, with = FALSE])
dimnames(pathway_mat) <- list(fgsea_dt$pathway, gsub("NES ", "", nes_cols))

# Return the NES column of comparison `x` from `pathway_mat`, with every
# non-significant entry replaced by NA.
#
# x: comparison name; must be a column of `pathway_mat` and have a matching
#   "padj <x>" column in `fgsea_dt` (rows in the same order).
# Returns a numeric vector named like the rows of `pathway_mat`.
#
# Significance uses the same rule as get_sig() (padj < 0.05). The previous
# `> 0.05` test left padj == 0.05 and NA p-values unmasked although get_sig()
# does not treat them as significant.
flip_to_NA <- function(x) {
    nes_scores <- pathway_mat[,x]
    p_vals <- fgsea_dt[[paste("padj",x)]]
    is_significant <- !is.na(p_vals) & p_vals < 0.05
    nes_scores[!is_significant] <- NA
#      nes_scores[nes_scores > 0] <- 1
#      nes_scores[nes_scores < 0] <- -1
    return(nes_scores)
}

# NES matrix with non-significant cells masked, one column per comparison.
na_pathway_mat <- sapply(names(comparison_list), flip_to_NA)
colnames(na_pathway_mat) <- colnames(pathway_mat)

# Align annotation rows and matrix rows on the displayed pathways.
rownames(pathway_cluster_dt) <- pathway_order
na_pathway_mat <- na_pathway_mat[pathway_order, ]

# Four colour bins over [-2, 2]; masked (NA) cells are drawn in gray. The
# heatmap is written to "enrichment_heatmap.png".
p <- pheatmap(
    na_pathway_mat,
    color = colorRampPalette(rev(brewer.pal(n = 8, name = "RdYlBu")))(4),
    breaks = c(-2, -1, 0, 1, 2),
    fontsize = 15,
    cluster_cols = FALSE,
    cluster_rows = FALSE,
    show_rownames = TRUE,
    annotation_row = pathway_cluster_dt,
    na_col = "gray",
    border_color = "black",
    gaps_row = gap_rows,
    cellheight = 15,
    cellwidth = 15,
    filename = "enrichment_heatmap.png",
    width = 20,
    height = 20,
    dpi = 300
)
             

## Tumour-wise scoring of ductal and acinar

In [None]:
# Cell names of the four normal reference populations (edge / non-edge acinar,
# outlier / non-outlier ductal), taken from the edge/center classification.
acinar_edge_cells <- edge_info_all$edge_center_dt[normal_cell_type == "Acinar cell" & cell_category=="edge",cell.name]
acinar_non_edge_cells <- edge_info_all$edge_center_dt[normal_cell_type == "Acinar cell" & cell_category!="edge",cell.name]

# Outlier ductal cells are the "edge" category of ductal type 1. This used to
# filter on `!= "edge"`, which made it identical to the non-outlier set below.
ductal_outlier_cells <- edge_info_all$edge_center_dt[normal_cell_type == "Ductal cell type 1" &
                                                     cell_category == "edge",cell.name]
ductal_non_outlier_cells <- edge_info_all$edge_center_dt[normal_cell_type == "Ductal cell type 1" &
                                                      cell_category != "edge",cell.name]

ductal_non_outlier_data_mat <- seurat_obj[["RNA"]]@data[,ductal_non_outlier_cells]
acinar_non_edge_data_mat <- seurat_obj[["RNA"]]@data[,acinar_non_edge_cells]

# NOTE(review): `malignant_ductal_mat`, `acinar_edge_data_mat` and
# `ductal_outlier_data_mat` are not created in this cell and must already exist;
# confirm they were built from the same cell sets as above.
# Cosine similarity of each malignant cell to the medoid of every reference
# population, restricted to the matching derived signature genes.
ret_edge_acinar <- compute_cell_wise_cosine_similarity( malignant_ductal_mat, acinar_edge_data_mat,
                                           genes_to_use=derived_signatures[["Edge Acinar All"]], dist_from="medoid")
ret_outlier_ductal <- compute_cell_wise_cosine_similarity( malignant_ductal_mat, ductal_outlier_data_mat,
                                           genes_to_use=derived_signatures[["Outlier Ductal All"]], dist_from="medoid")
ret_non_edge_acinar <- compute_cell_wise_cosine_similarity( malignant_ductal_mat, acinar_non_edge_data_mat,
                                           genes_to_use=derived_signatures[["Edge Acinar All"]], dist_from="medoid")
ret_non_outlier_ductal <- compute_cell_wise_cosine_similarity( malignant_ductal_mat, ductal_non_outlier_data_mat,
                                           genes_to_use=derived_signatures[["Outlier Ductal All"]], dist_from="medoid")


In [None]:
options(repr.plot.width=16,repr.plot.height=6)
# Long table of per-cell similarities: one row per (malignant cell, reference
# population). "Outlier Ductal" must come from `ret_outlier_ductal`; it used to
# be filled from `ret_non_outlier_ductal`, duplicating the "Non-Outlier Ductal"
# column.
melted_dt <- melt(data.table(`Edge Acinar`=ret_edge_acinar$dist,
                        `Outlier Ductal`=ret_outlier_ductal$dist,
                        `Non-Edge Acinar`=ret_non_edge_acinar$dist,
                        `Non-Outlier Ductal`=ret_non_outlier_ductal$dist,
                       cell.name=names(ret_edge_acinar$dist)),id.vars=c("cell.name")
                 ) #%>% setnames(.,"variable","Cell Type")
# Attach the tumour sample of every cell and order samples T1..T24 for plotting.
melted_dt <- merge( melted_dt, meta_data_dt[,.(sample,cell.name)], by="cell.name")
melted_dt$sample <- factor(melted_dt$sample,levels=paste0("T",1:24))

# Per-tumour boxplots restricted to the rows whose `variable` label does not
# contain "Non-" (i.e. "Edge Acinar" and "Outlier Ductal").
p_edge_outlier <- ggplot(melted_dt[!grepl("Non-", variable), ]) +
    geom_boxplot(
        aes(y = value, x = sample, color = variable),
        size = 1.0, fill = NA, width = 0.5
    ) +
    # stat_compare_means(aes(x=variable,y=value),label.x=1.5,label="p.signif",size=8) +
    xlab("") +
    ylab("") +
    theme_classic(base_size = 20) +
    labs(color = "") +
    theme(legend.position = c(0.5, 0.1), legend.direction = "horizontal")

x_labels <- setNames(1:24, paste0("T", 1:24))

# Per-tumour boxplots for all four reference populations.
p_all <- ggplot(melted_dt) +
    geom_boxplot(
        aes(y = value, x = sample, color = variable),
        size = 1.0, fill = NA, width = 0.5
    ) +
    theme_classic(base_size = 20) +
    ylab("Cosine similarity") +
    xlab("") +
    labs(color = "Cell Type") +
    ylim(0.3, 1) +
    theme(legend.direction = "horizontal", legend.position = c(0.5, 0.1))


# Coarse lineage label (Acinar / Ductal) derived from the population label.
melted_dt$cell_type <- melted_dt$variable
melted_dt[grepl("Acinar",variable),]$cell_type <- "Acinar"
melted_dt[grepl("Ductal",variable),]$cell_type <- "Ductal"
# ggplot( melted_dt ) +
# geom_boxplot(aes(y=value,group=variable),size=1.0,fill=NA,width=0.5) +
# Pan-tumour comparison of the four populations with pairwise significance marks.
p_pan_tumor <- ggboxplot(melted_dt, x = "variable", y = "value", color = "cell_type") +
    theme_classic(base_size = 20) +
    ylab("Cosine similarity") +
    xlab("") +
    # theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.5)) +
    stat_compare_means(
        comparisons = list(
            c("Edge Acinar", "Outlier Ductal"),
            c("Edge Acinar", "Non-Edge Acinar"),
            c("Outlier Ductal", "Non-Outlier Ductal")
        ),
        label.x = 1.5, label = "p.signif", size = 5
    ) +
    scale_x_discrete(labels = c(
        "Edge Acinar" = "E", "Outlier Ductal" = "O",
        "Non-Edge Acinar" = "NE", "Non-Outlier Ductal" = "NO"
    )) +
    scale_fill_discrete(name = c("cell_type" = "Cell Type")) +
    labs(color = "") +
    theme(legend.position = "none") #+ ylim(0.3,0.9)

print("# of tumors where edge acinar is closer to malignant than outlier ductal")
# One two-sided Wilcoxon rank-sum test per tumour sample, comparing the
# "Edge Acinar" and "Outlier Ductal" similarity values of that sample.
p_values <- vapply(
    as.character(unique(melted_dt$sample)),
    function(sample_num) {
        dt <- melted_dt[sample == sample_num, ]
        wilcox.test(
            dt[variable == "Edge Acinar", value],
            dt[variable == "Outlier Ductal", value]
        )$p.value
    },
    numeric(1),
    USE.NAMES = FALSE
)
# Bonferroni correction across tumours; report how many pass the 0.1 threshold.
adj_p <- p.adjust(p_values, method = "bonferroni")
print(sum(adj_p < 0.1))

# Release the large intermediates; the plot objects keep their own data.
rm(melted_dt)
rm(ductal_outlier_data_mat, ductal_non_outlier_data_mat)
rm(acinar_edge_data_mat, acinar_non_edge_data_mat)

# Side-by-side panel (pan-tumour | per-tumour), followed by the full per-tumour plot.
final <- ggarrange(p_pan_tumor, p_edge_outlier, nrow = 1, ncol = 2, widths = c(0.3, 1), labels = "auto")
print(final)
print(p_all)

## Mechanisms of edge-ness

In [59]:
# Motif-predicted binding sites inside footprints: drop columns V6/V7, name the
# remaining five columns and remove duplicate rows.
motif_footprints_dt <- fread(file.path(base_path, "match", "footprints_mpbs.bed"))
motif_footprints_dt <- motif_footprints_dt[, !c("V6", "V7")]
setnames(motif_footprints_dt, paste0("V", 1:5), c("chr", "start", "end", "motif", "motif_score"))
motif_footprints_dt <- unique(motif_footprints_dt)

# NOTE(review): the input is a .bed file; if its starts are 0-based, consider
# `starts.in.df.are.0based = TRUE` - confirm against the file's producer.
motif_granges <- makeGRangesFromDataFrame(motif_footprints_dt, keep.extra.columns = TRUE)

In [132]:
# all_genes <- rownames(seurat_obj[["RNA"]]@data)
# entrez_dt <- get_entrez_dt( all_genes )
entrez_ids <- entrez_dt$final_entrez_id
# Genes up-regulated in edge acinar cells (positive log fold change).
acinar_edge_up_genes <- diff_exp_dt_list[["Acinar cell"]][avg_logFC > 0,gene_name]
# Label every gene in one step. The previous two-step assignment failed when no
# gene matched (the `gene_type` column was never created) and kept stale "Edge"
# labels when the cell was re-run with a different gene list.
entrez_dt[,gene_type:=ifelse(SYMBOL %in% acinar_edge_up_genes,"Edge","Not Edge")]

# Gene -> transcript mapping from the hg19 knownGene annotation. The generic is
# called through its namespace so dplyr::select can never shadow it.
cds_tx_dt <- AnnotationDbi::select(TxDb.Hsapiens.UCSC.hg19.knownGene,columns=c("GENEID","TXNAME"),
               keytype=c("GENEID"), keys=entrez_ids ) %>% na.omit(.)
cds_tx_dt <- merge( cds_tx_dt, entrez_dt[,.(SYMBOL,final_entrez_id,gene_type)],
                   by.x="GENEID", by.y="final_entrez_id") %>% data.table(.)

# Promoter windows: 2000 bp upstream to 400 bp downstream of each transcript
# start, trimmed to the sequence bounds.
promoters_txdb <- promoters(TxDb.Hsapiens.UCSC.hg19.knownGene,upstream=2000,downstream=400) %>% trim(.)

# Tag each promoter as "Edge" when its transcript belongs to an edge gene.
# ifelse() also works when no transcript matches, unlike `df[mask,]$col <- value`,
# which errors on a zero-row selection.
mcol_df <- as.data.frame(mcols(promoters_txdb))
mcol_df$gene_type <- ifelse(mcol_df$tx_name %in% cds_tx_dt[gene_type == "Edge",TXNAME],
                            "Edge","Not Edge")
mcols(promoters_txdb) <- mcol_df
rm(mcol_df)

# Motif sites overlapping promoters; hit indices refer to the rows of
# `motif_footprints_dt` (source of `motif_granges`) and of `promoters_txdb`.
overlap_dt <- data.table( as.data.frame(findOverlaps( motif_granges, promoters_txdb ) ) )
promoter_dt <- data.table( as.data.frame(promoters_txdb))

overlap_motif_dt <- cbind( motif_footprints_dt[overlap_dt$queryHits,.(motif,motif_score)],
                          promoter_dt[overlap_dt$subjectHits,.(tx_name,gene_type)] )
# Replace the transcript name by the gene symbol.
overlap_motif_dt <- merge( overlap_motif_dt, cds_tx_dt[,.(TXNAME,SYMBOL)], by.x="tx_name",
                         by.y="TXNAME" )[,!c("tx_name")]
rm(promoter_dt)
            

'select()' returned many:many mapping between keys and columns

“GRanges object contains 1 out-of-bound range located on sequence
  chrUn_gl000223. Note that ranges located on a sequence whose length is
  unknown (NA) or on a circular sequence are not considered out-of-bound
  (use seqlengths() and isCircular() to get the lengths and circularity
  flags of the underlying sequences). You can use trim() to trim these
  ranges. See ?`trim,GenomicRanges-method` for more information.”


In [262]:
motifs <- unique(overlap_motif_dt$motif)
# 2x2 contingency table tested for every motif (counts of unique gene symbols):
#                        is edge             is not edge
# has motif              num_occ_edge        num_occ_not_edge
# does not have motif    num_not_occ_edge    num_not_occ_not_edge
num_total_genes <- length(all_genes)
p_value_dt <- data.table( motif=motifs, p_value_fisher=1, p_value_chi_sq=1 )
frac_occ_dt <- data.table( motif=motifs, edge_frac=-1, non_edge_frac=-1 )
num_edge_genes <- length(unique(overlap_motif_dt[gene_type == "Edge",SYMBOL]))
num_non_edge_genes <- length(unique(overlap_motif_dt[gene_type != "Edge",SYMBOL]))

for (motif_ in motifs) {
    motif_dt <- overlap_motif_dt[motif == motif_,.(motif,gene_type,SYMBOL)] %>% unique(.)
    num_occ_edge <- nrow( motif_dt[gene_type == "Edge",] )
    num_occ_not_edge <- nrow( motif_dt[gene_type != "Edge",] )
    num_not_occ_edge <- num_edge_genes - num_occ_edge
    # Non-edge genes lacking this motif = all non-edge genes minus those having
    # it. The previous code counted non-edge genes carrying ANY other motif
    # (which includes genes that also carry this one) and subtracted an
    # edge-gene count from it, so the fourth cell of the table was wrong.
    num_not_occ_not_edge <- num_non_edge_genes - num_occ_not_edge

    fisher_mat <- matrix(c(num_occ_edge,num_occ_not_edge,num_not_occ_edge,num_not_occ_not_edge),
                        nrow=2,ncol=2,byrow=TRUE)
    # One-sided test: is the motif over-represented among edge genes?
    fisher_p_value <- fisher.test( fisher_mat, alternative = "greater" )$p.value

    # Goodness of fit of the edge/non-edge split among motif-carrying genes
    # against the split among genes lacking the motif. Undefined when every gene
    # carries the motif, in which case the p-value is reported as NA.
    num_null <- c(num_not_occ_edge,num_not_occ_not_edge)
    if (sum(num_null) > 0) {
        chisq_p_value <- chisq.test( fisher_mat[1,], p=num_null/sum(num_null) )$p.value
    } else {
        chisq_p_value <- NA_real_
    }

    p_value_dt[motif==motif_,`:=`(p_value_fisher=fisher_p_value,p_value_chi_sq=chisq_p_value)]

    frac_occ_dt[motif==motif_,`:=`(edge_frac=num_occ_edge/num_edge_genes,
                                   non_edge_frac=num_occ_not_edge/num_non_edge_genes)]
}
# p_value_dt[,p_adj:=p.adjust(p_value)]
# Keep motifs relatively more frequent among edge genes, then adjust their
# chi-squared p-values (p.adjust default method).
frac_occ_dt[,ratio:=edge_frac/non_edge_frac]
frac_occ_dt <- merge( frac_occ_dt, p_value_dt, by="motif" )[ratio > 1,]
frac_occ_dt[,p_adj_chi_sq:=p.adjust(p_value_chi_sq)]

“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squar

“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squar

“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squar

“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squar

“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squar

“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squared approximation may be incorrect”
“Chi-squar

## Edge cells in other datasets

In [None]:
# Assemble the per-sample count files of GSE81547 into one gene x sample table
# (first column V1 = gene identifier, one column per GSM accession) and write it
# out as a tab-separated file.
gene_exp_dir <- "/Users/sreenivasagopv2/Data/sc-funnel/GSE81547/"
out_path <- "/Users/sreenivasagopv2/Data/sc-funnel/GSE81547/GSE81547_mat.tsv.gz"

file_paths <- list.files(gene_exp_dir,full.names=TRUE)
gsm_nums <- str_match( basename(file_paths), "GSM[0-9][0-9]*" )[,1]
# Keep only per-sample files: a name ending in ".gz" that carries a GSM
# accession. The combined matrix is written into this same directory, so the
# previous unanchored ".gz" test picked it up as a sample when re-run.
is_sample_file <- grepl("\\.gz$",file_paths) & !is.na(gsm_nums)
file_paths <- file_paths[is_sample_file]
gsm_nums <- gsm_nums[is_sample_file]

# Collect one count vector per sample and bind once at the end instead of
# growing the table with cbind() on every iteration.
sample_columns <- vector("list", length(file_paths))
gene_ids <- NULL
for (col_idx in seq_along(file_paths)) {
    print(col_idx)
    flush.console()
    dt <- fread( file_paths[col_idx] )
    if (is.null(gene_ids)) {
        gene_ids <- dt$V1
    } else if (!identical(dt$V1, gene_ids)) {
        # Columns are combined positionally, so every file must list the same
        # genes in the same order.
        stop("Gene rows of ", file_paths[col_idx], " do not match the first sample file")
    }
    sample_columns[[col_idx]] <- dt$V2
}
names(sample_columns) <- gsm_nums

if (length(sample_columns) > 0) {
    gene_exp_dt <- as.data.table(c(list(V1=gene_ids), sample_columns))
} else {
    gene_exp_dt <- data.table()
}
fwrite(gene_exp_dt,out_path,quote=FALSE,row.names=FALSE,sep="\t")

In [None]:
# Build a Seurat object from the combined GSE81547 matrix and keep only the
# features that are also present in the counts of the PDAC object.
gene_exp_mat <- read_gene_exp_mat(out_path)
aging_obj <- create_full_seurat_object( gene_exp_mat )
aging_obj <- subset( aging_obj, features=rownames(seurat_obj[["RNA"]]@counts) )

# The raw matrix is no longer needed once the object exists.
rm(gene_exp_mat)

# Fetch the sample annotation of the series; the donor age is parsed as the
# leading digits of each sample title.
geo_info <- getGEO("GSE81547",getGPL=F)
geo_info_df <- pData(geo_info$GSE81547_series_matrix.txt.gz)
donor_age <- str_match(geo_info_df[,"title"],"^[0-9][0-9]*")
# NOTE(review): ages are attached to cells purely by position (row i of the GEO
# table -> i-th cell of `aging_obj`); this assumes both are in the same sample
# order and of the same length - confirm, or match on the GSM accession.
info_to_add_df <- data.frame( age=as.integer(donor_age), row.names=Cells(aging_obj))
aging_obj <- AddMetaData( aging_obj, info_to_add_df )


In [None]:
# Finding acinar cells: cluster the aging dataset, then pick the clusters with
# the highest mean PRSS1 expression.
aging_obj <- NormalizeData(aging_obj) %>%
    ScaleData(.) %>%
    FindVariableFeatures(.) %>%
    RunPCA(., npcs = 50) %>%
    FindNeighbors(.) %>%
    FindClusters(.) %>%
    RunUMAP(., dims = 1:50)

# Cell-level metadata with the cell name as an explicit column.
aging_meta_data_dt <- data.table(aging_obj@meta.data, keep.rownames = TRUE)
setnames(aging_meta_data_dt, "rn", "cell.name")

# Mean PRSS1 expression per cluster, highest first.
prss1_exp_dt <- data.table(FetchData(aging_obj, vars = c("PRSS1", "seurat_clusters")))
print(prss1_exp_dt[, mean(PRSS1), by = seurat_clusters][order(-V1), ]) #This gave clusters 1,13,15,16

# Cluster ids are hard-coded from the printed ranking above.
aging_acinar_cells <- aging_meta_data_dt[seurat_clusters %in% c(1, 15, 13, 16), cell.name]

### Signature-based evaluation

In [None]:
# Edge signature: genes up-regulated in edge acinar cells, restricted to genes
# measured in the aging dataset.
edge_acinar_up_genes <- diff_exp_dt_list[["Acinar cell"]][avg_logFC > 0,gene_name]
length(edge_acinar_up_genes)
aging_acinar_data_mat <- aging_obj[["RNA"]]@data[,aging_acinar_cells]
edge_acinar_up_genes <- edge_acinar_up_genes[edge_acinar_up_genes %in% rownames(aging_acinar_data_mat)]


# Per-cell edge score = mean expression over the signature genes
# (drop = FALSE keeps a matrix even when a single gene is left).
aging_cell_edge_scores <- Matrix::colMeans(aging_acinar_data_mat[edge_acinar_up_genes,,drop=FALSE])
aging_edge_dt <- data.table( cell_name=names(aging_cell_edge_scores), edge_score=aging_cell_edge_scores )
# Donor age of each scored cell, looked up by cell name so the result always
# lines up with `aging_cell_edge_scores`. This used to filter on `acinar_cells`,
# which is not the acinar-cell set of the aging dataset (`aging_acinar_cells`).
cells_donor_age <- aging_meta_data_dt[match(names(aging_cell_edge_scores),cell.name),age]
aging_edge_dt[,donor_age:=cells_donor_age]

# One density panel per donor age, youngest first.
options(repr.plot.width=5, repr.plot.height=20)
aging_edge_dt$donor_age <- factor(aging_edge_dt$donor_age,levels=sort(unique(cells_donor_age)))
ggplot( aging_edge_dt ) + geom_density(aes(x=edge_score)) +
facet_wrap(~donor_age,ncol=1) + theme_gray(base_size=20)

In [None]:
# Edge score by donor age, with a global comparison-of-means annotation.
options(repr.plot.width = 8, repr.plot.height = 5)
ggplot(aging_edge_dt) +
    geom_boxplot(aes(x = donor_age, y = edge_score)) +
    theme_gray(base_size = 20) +
    stat_compare_means(aes(x = donor_age, y = edge_score))

### Cosine similarity based evaluation

In [None]:
# Genes differentially expressed in edge acinar cells (both directions),
# restricted to genes available in the aging acinar matrix.
all_edge_acinar_genes <- diff_exp_dt_list[["Acinar cell"]]$gene_name

# all_edge_acinar_genes <- sample(rownames(aging_obj[["RNA"]]@counts),size=1000)
all_edge_acinar_genes <- all_edge_acinar_genes[
    all_edge_acinar_genes %in% rownames(aging_acinar_data_mat)
]

# Edge and center ("non-edge") acinar cells of the original dataset.
original_edge_acinar_cells <- edge_info_all$edge_center_dt[
    normal_cell_type == "Acinar cell" & cell_category == "edge",
    cell.name
]
original_non_edge_acinar_cells <- edge_info_all$edge_center_dt[
    normal_cell_type == "Acinar cell" & cell_category == "center",
    cell.name
]
# Edge cells first, then non-edge cells; this fixes the column order below.
original_acinar_cells <- c(original_edge_acinar_cells, original_non_edge_acinar_cells)

# Dense gene x cell matrix of the original acinar cells over the selected genes.
original_acinar_obj <- subset(seurat_obj, idents = "Acinar cell")# %>% ScaleData(features=all_edge_acinar_genes)
acinar_data_mat <- as.matrix(
    original_acinar_obj[["RNA"]]@data[all_edge_acinar_genes, original_acinar_cells]
)

# Same gene selection for the aging acinar cells.
aging_acinar_obj <- subset(aging_obj, cells = aging_acinar_cells)# %>% ScaleData(features=all_edge_acinar_genes)
aging_acinar_data_mat <- aging_acinar_obj[["RNA"]]@data[all_edge_acinar_genes, aging_acinar_cells]

rm(aging_acinar_obj, original_acinar_obj)

In [None]:
# Cosine similarity between every column of X and every column of Y.
#
# X, Y: numeric matrices (or objects coercible with as.matrix()) with the same
#   rows (features); columns are the vectors being compared.
# Returns an ncol(X) x ncol(Y) matrix whose row and column names are the column
# names of X and Y. A column with zero norm produces NaN entries.
#
# Columns are normalised with one vectorised sweep() instead of a per-column
# sapply(); besides being faster, this keeps single-row inputs as matrices
# (sapply() collapsed them to a vector, which made `dimnames<-` fail).
get_cosine_matrix <- function(X,Y) {
    normalise_columns <- function(mat) {
        mat <- as.matrix(mat)
        col_norms <- sqrt(colSums(mat^2))
        sweep(mat, 2, col_norms, "/")
    }

    # crossprod(A, B) is t(A) %*% B: dot products of the unit-length columns.
    dist_mat <- crossprod(normalise_columns(X), normalise_columns(Y))
    return(dist_mat)
}

In [None]:
# Pairwise cosine similarity among the original acinar cells.
original_cos_sim_mat <- get_cosine_matrix(acinar_data_mat,acinar_data_mat)

# Medoid of a group = the cell with the largest summed similarity to its group.
# Both medoids are stored as cell NAMES. The non-edge medoid used to be stored
# as the integer returned by which.max(), i.e. a position within the non-edge
# subset; used as a column index into matrices whose columns cover all acinar
# cells (edge cells first), it selected the wrong cell.
original_edge_medoid <- names(which.max(rowSums(original_cos_sim_mat[original_edge_acinar_cells,original_edge_acinar_cells])))
original_non_edge_medoid <- names(which.max(rowSums(original_cos_sim_mat[original_non_edge_acinar_cells,
                                                            original_non_edge_acinar_cells])))

# Similarity of every original cell to the edge medoid, split by group.
average_non_edge_cos_sim <- original_cos_sim_mat[original_non_edge_acinar_cells,
                                                          original_edge_medoid]
num_original_edge_cells <- length(original_edge_acinar_cells)
average_edge_cos_sim <- original_cos_sim_mat[original_edge_acinar_cells,
                                                          original_edge_medoid]# - (1/num_original_edge_cells)
#average_edge_cos_sim <- ((num_original_edge_cells-1)/num_original_edge_cells) * average_edge_cos_sim

average_cos_sim <- c(average_non_edge_cos_sim,average_edge_cos_sim)
original_sim_stats_dt <- data.table( average_cos_sim=average_cos_sim,
                                    cell_name=names(average_cos_sim), cell_category="edge" )
original_sim_stats_dt[cell_name %in% original_non_edge_acinar_cells,cell_category:="non-edge"]

# Similarity of every aging acinar cell to the two medoids of the original data.
aging_edge_cos_sim_mat <- get_cosine_matrix( aging_acinar_data_mat, acinar_data_mat )
average_aging_sim_with_edge <- aging_edge_cos_sim_mat[aging_acinar_cells,original_edge_medoid]#original_edge_acinar_cells])
average_aging_sim_with_non_edge <- aging_edge_cos_sim_mat[aging_acinar_cells,original_non_edge_medoid]

aging_sim_stats_dt <- data.table( `Similarity with Edge`=average_aging_sim_with_edge,
                                 `Similarity with Non-Edge`=average_aging_sim_with_non_edge,
                                 cell_name=names(average_aging_sim_with_edge), age=-1 )
# Donor age of each aging cell, in the order of the similarity vectors.
cells_donor_age <- aging_meta_data_dt[cell.name %in% aging_acinar_cells,][order(match(cell.name,names(average_aging_sim_with_edge))),age]
aging_sim_stats_dt[,age:=cells_donor_age]


# Density of "similarity to the edge medoid" among original edge / non-edge
# cells, evaluated at the aging cells, and the resulting likelihood ratio.
# NOTE(review): the non-edge density is estimated from similarities to the EDGE
# medoid but evaluated at similarities to the NON-EDGE medoid; confirm this
# mismatch is intended (a likelihood ratio would evaluate both densities at
# `average_aging_sim_with_edge`).
edge_kde <- kde( average_edge_cos_sim, eval.points = average_aging_sim_with_edge )
non_edge_kde <- kde( average_non_edge_cos_sim, eval.points = average_aging_sim_with_non_edge )
edge_probability <- edge_kde$estimate
non_edge_probability <- non_edge_kde$estimate
likelihood_ratio <- edge_probability/non_edge_probability
likelihood_dt <- data.table( likelihood_ratio=likelihood_ratio, cell_name=names(average_aging_sim_with_edge))



In [None]:
options(repr.plot.width=15, repr.plot.height=8)

# Long format: one row per (aging cell, similarity type).
melted_dt <- melt(aging_sim_stats_dt,id.vars=c("cell_name","age"))

# Distribution of the similarity with the edge state, one panel per donor age.
ggplot( melted_dt[!grepl("Non-Edge",variable),] ) + geom_density(aes(x=value,color=variable)) + facet_wrap(~age,ncol=4) +
theme_classic(base_size=18) + theme(legend.position="none",panel.grid.major=element_line()) +
xlab("Cosine Similarity with Edge Acinar state")

ggboxplot( melted_dt[!grepl("Non-Edge",variable),], x="age", y="value" ) +
theme_classic(base_size=18) + theme(legend.position="none",panel.grid.major=element_line()) +
ylab("Cosine Similarity with Edge Acinar state") + xlab("Donor Age")

# The cell is deliberately halted here (this used to be the cryptic `4 + ""`
# type error). An explicit stop() keeps that behaviour but says why.
stop("Intentional halt: the likelihood-ratio summary below is disabled; remove this stop() to run it")

# Call a cell "Edge" when its likelihood ratio exceeds 1.2.
merged_stats_dt <- merge( aging_sim_stats_dt, likelihood_dt, by="cell_name" )
merged_stats_dt[,cell_call:="Non-Edge"]
merged_stats_dt[likelihood_ratio > 1.2,cell_call:="Edge"]

total_N_dt <- merged_stats_dt[,.N,by="age"]
merged_stats_dt <- merge( merged_stats_dt, total_N_dt, by="age" ) %>% setnames(.,"N","N_total")

# Left join so that ages without any "Edge" call are kept with N_edge = 0; the
# previous inner join silently dropped every cell of such ages.
N_edge_dt <- merged_stats_dt[cell_call == "Edge",.N,by="age"]  %>% setnames(.,"N","N_edge")
merged_stats_dt <- merge( merged_stats_dt, N_edge_dt, by="age", all.x=TRUE)
merged_stats_dt[is.na(N_edge),N_edge:=0L]

# merged_stats_dt[,facet_title:=paste("Age:",age,"f=",round(N_edge/N_total,2),paste0("(",paste(N_edge,N_total,sep="/"),")"))]
# melted_dt <- melt(merged_stats_dt,id.vars=c("cell_name","age","likelihood_ratio",
#                                                "cell_call","N_total","N_edge","facet_title"))

# ggplot( melted_dt ) + geom_density(aes(x=value,color=variable)) + facet_wrap(~facet_title,ncol=4) +
# theme_classic(base_size=18) + theme(legend.position="bottom") + xlab("Cosine similarity")
#annotate("text",x=0.3,y=20,label=facet_title)#+ xlim(-1,1)

In [None]:
# Compact canvas for the single box plot.
options(repr.plot.width=9, repr.plot.height=6)

# Box plot (with jittered points) of the similarity with the Edge state per
# donor age, coloured by age; significance marks compare each age against
# the reference group "1".
ggboxplot(
    melted_dt[!grepl("Non-Edge",variable),],
    x="age",
    y="value",
    color="age",
    add="jitter",
    size=1.0
) +
    theme_classic(base_size=18) +
    theme(legend.position="none", panel.grid.major=element_line()) +
    ylab("Cosine Similarity with Edge Acinar state") +
    xlab("Donor Age") +
    stat_compare_means(ref.group="1", label="p.signif", size=8, label.y=0.9)

In [None]:
# Square canvas.
options(repr.plot.width=8, repr.plot.height=8)

# Density of average_cos_sim, one coloured curve per cell_category.
ggplot(original_sim_stats_dt) +
    geom_density(aes(x=average_cos_sim, color=cell_category)) +
    theme_gray(base_size=15)

In [None]:
# Per-cell scatter of similarity with Non-Edge (x) against similarity with
# Edge (y), one panel per donor age. The line layer plots the Non-Edge
# similarity against itself, i.e. the y = x diagonal.
ggplot(aging_sim_stats_dt) +
    geom_point(
        aes(x=`Similarity with Non-Edge`, y=`Similarity with Edge`)
    ) +
    geom_line(
        aes(x=`Similarity with Non-Edge`, y=`Similarity with Non-Edge`)
    ) +
    facet_wrap(~age, ncol=4) +
    theme_gray(base_size=18) #+ xlim(0,1) + ylim(0,1)

In [None]:
# AUCell scoring of the edge gene signature in the aging acinar cells.
# Build the per-cell rankings from the expression matrix (no summary plot).
cells_rankings <- AUCell_buildRankings(aging_acinar_data_mat, nCores=6, plotStats=FALSE)

# A single gene set, named "Edgeness", made of edge_acinar_up_genes.
edge_gene_set <- list("Edgeness"=edge_acinar_up_genes)
cells_AUC <- AUCell_calcAUC(edge_gene_set, cells_rankings, nCores=6)
# AUC scores extracted as a matrix; the next cell transposes it so that
# cells become rows.
pathway_scores_mat <- getAUC(cells_AUC)# %>% setnames(.,"rn","gene_set") %>% melt(.,id.vars="gene_set")


In [None]:
# One row per cell with its AUCell score(s); row names become `cell.name`.
aucell_aging_dt <- data.table( t(pathway_scores_mat), keep.rownames=TRUE ) %>% setnames(.,"rn","cell.name")

# Look each scored cell up in the metadata directly. This keeps the ages aligned
# with aucell_aging_dt row by row and fails loudly when a cell has no metadata,
# instead of relying on the filtered metadata happening to have the same length.
age_lookup_idx <- match( aucell_aging_dt$cell.name, aging_meta_data_dt$cell.name )
stopifnot( !anyNA(age_lookup_idx) )
cells_donor_age <- aging_meta_data_dt$age[age_lookup_idx]

aucell_aging_dt[,age:=cells_donor_age]
# Sort by donor age.
aucell_aging_dt <- aucell_aging_dt[order(age),]

### Checking which samples the edge cells predominantly come from

In [None]:
# Per cell type, test whether each sample contributes more edge cells than
# expected (one-sided Fisher's exact test), then Bonferroni-correct across all
# tests and show the hits with adjusted p < 0.1.

# Attach the sample of origin to every cell in edge_center_dt; samples are
# ordered by decreasing total cell count and used as factor levels.
edge_info_all_merged <- merge( edge_info_all$edge_center_dt, meta_data_dt[,.(cell.name,sample)] )
ordered_samples <- meta_data_dt[,.N,by=sample][order(-N)]$sample
edge_info_all_merged$sample <- factor( edge_info_all_merged$sample, levels = ordered_samples)

cell_types_present <- unique(edge_info_all$edge_center_dt$normal_cell_type)
fisher_dt_list <- list()
for (cell_type_ in cell_types_present) {
    temp_dt <- edge_info_all_merged[normal_cell_type == cell_type_,]

    # Cells of this type per sample (N) and edge cells per sample (N_edge).
    # The left join keeps samples without any edge cell (N_edge = 0) so that
    # their cells still count towards the background totals below.
    N_per_sample_dt <- meta_data_dt[cluster == cell_type_,.N,by=sample]
    N_edge_per_sample_dt <- temp_dt[cell_category=="edge",.N,by=sample][,.(sample,N_edge=N)]
    all_samples_dt <- merge( N_per_sample_dt, N_edge_per_sample_dt, by="sample", all.x=TRUE )
    all_samples_dt[is.na(N_edge),N_edge:=0L]

    n_edge_all_samples <- sum(all_samples_dt$N_edge)
    n_cells_all_samples <- sum(all_samples_dt$N)

    # Only samples holding at least one edge cell are tested: with
    # alternative="greater" a sample without edge cells cannot be enriched,
    # and leaving it out keeps the number of corrected tests unchanged.
    ordered_N_dt <- all_samples_dt[N_edge > 0,][order(-N),]
    if (nrow(ordered_N_dt) == 0) {
        # No edge cells for this cell type: nothing to test.
        next
    }

    # One 2x2 table per sample: rows = this sample / all other samples,
    # columns = edge / non-edge.
    fisher_results <- lapply( seq_len(nrow(ordered_N_dt)), function(row_idx) {
        N_edge_from_sample <- ordered_N_dt$N_edge[row_idx]
        N_non_edge_from_sample <- ordered_N_dt$N[row_idx] - N_edge_from_sample
        N_edge_from_other_samples <- n_edge_all_samples - N_edge_from_sample
        N_non_edge_from_other_samples <- n_cells_all_samples - N_edge_from_sample -
            N_non_edge_from_sample - N_edge_from_other_samples
        odds_mat <- matrix(c(N_edge_from_sample,N_non_edge_from_sample,
                             N_edge_from_other_samples,N_non_edge_from_other_samples),
                           byrow=TRUE,nrow=2,ncol=2)
        fisher.test(odds_mat,alternative="greater")
    })
    p_values <- vapply( fisher_results, function(res) res$p.value, numeric(1) )
    odds_values <- vapply( fisher_results, function(res) unname(res$estimate), numeric(1) )

    ordered_N_dt[,`:=`(p_value=p_values,normal_cell_type=cell_type_,odds_ratio=odds_values)]
    fisher_dt_list[[length(fisher_dt_list) + 1L]] <- ordered_N_dt
}
combined_fisher_dt <- rbindlist(fisher_dt_list)
combined_fisher_dt$adj_p_value <- p.adjust(combined_fisher_dt$p_value,method="bonferroni")
combined_fisher_dt[adj_p_value < 0.1,]

## Finding doublets

In [None]:
# Per-sample doublet calling with DoubletFinder; the names of all cells
# classified as "Doublet" are written to cell_doublets.tsv.

# Settings shared by the DoubletFinder calls and by the name of the
# classification column read back afterwards.
doublet_pN <- 0.25             # pN passed to doubletFinder_v3
expected_doublet_rate <- 0.05  # fraction of each sample's cells passed as nExp
doublet_PCs <- 1:50            # principal components used throughout

doublet_info_list <- list()
samples <- unique( meta_data_dt$sample )
for (sample_id in samples) {
    # Re-process the cells of this sample on their own.
    sample_seurat_obj <- subset( seurat_obj, cells=meta_data_dt[sample == sample_id,cell.name])
    sample_seurat_obj <- NormalizeData(sample_seurat_obj) %>%
        FindVariableFeatures(.,nfeatures=1000) %>%
        ScaleData(.) %>%
        RunPCA(.,npcs=max(doublet_PCs),verbose=FALSE) %>%
        RunUMAP(.,dims=doublet_PCs,verbose=FALSE)
    
    # homotypic.prop is only needed by the commented-out adjustment below.
    homotypic.prop <- modelHomotypic(sample_seurat_obj@meta.data$cluster)
    nExp_poi <- round(expected_doublet_rate*nrow(sample_seurat_obj@meta.data))
    #nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop)) 
    
    # pK sweep: keep the pK with the highest BCmetric.
    sample_param_sweep <- paramSweep_v3(sample_seurat_obj, PCs = doublet_PCs, sct = FALSE)
    sample_param_sweep_summary <- summarizeSweep(sample_param_sweep, GT = FALSE)
    sweep_dt <- data.table( find.pK( sample_param_sweep_summary ) )
    optimal_pK <- as.double(as.vector(sweep_dt[order(-BCmetric),][1]$pK))
    sample_seurat_obj <- doubletFinder_v3(sample_seurat_obj, 
                               PCs = doublet_PCs, pN = doublet_pN, pK = optimal_pK, nExp = nExp_poi, 
                                          reuse.pANN = FALSE, sct = FALSE)
    
    # Read the classification column back; its name is rebuilt from the same
    # pN / pK / nExp values that were passed to doubletFinder_v3 above.
    sample_meta_data_dt <- data.table( sample_seurat_obj@meta.data, keep.rownames = TRUE ) %>% setnames(.,"rn","cell.name")
    doublet_class_col <- paste("DF.classifications",doublet_pN,optimal_pK,nExp_poi,sep="_")
    doublet_info_list[[sample_id]] <- sample_meta_data_dt[,c("cell.name",doublet_class_col),with=FALSE] %>%  setnames(.,doublet_class_col,"doublet_class")
    #4 + ""
    #meta_data_dt <- data.table( subset_seurat_obj@meta.data, keep.rownames = T ) %>% setnames(.,"rn","cell.name")
}

# Pool all samples and export the doublet cell names, one per line, no header.
merged_doublet_dt <- rbindlist(doublet_info_list)
cell_doublets <- meta_data_dt[cell.name %in% merged_doublet_dt[doublet_class == "Doublet",cell.name],cell.name]
fwrite( data.table(cell.name=cell_doublets), "cell_doublets.tsv", sep="\t", col.names=FALSE, quote=FALSE)