# figure 1e: significantly enriched cell states in MMRd/MMRp

In [None]:
require(tidyverse)
require(ComplexHeatmap)
require(circlize)

## load metadata

In [None]:
# Read the per-cell metadata table and harmonise its label columns.
# All steps run inside one mutate(); expressions are evaluated in order, so
# each step sees the result of the one before it.
metadata <- readr::read_rds('/n/data1/bwh/medicine/korsunsky/lab/mup728/CRC_MERFISH_niches/labeled_seurat_objects/renamed_cell_states/metadata_complete.rds') %>%
    mutate(
        # coerce both label columns to plain vectors (e.g. factor -> character)
        KNN_group = as.vector(KNN_group),
        knn_renamed_cell_states = as.vector(knn_renamed_cell_states),
        # cells whose coarse lineage is 'B' take their cell-state label as group label
        KNN_group = ifelse(knn_coarse == 'B', knn_renamed_cell_states, KNN_group),
        # pool the two fibroblast states under a single label
        knn_renamed_cell_states = ifelse(
            knn_renamed_cell_states %in% c('Fibro-CXCL14', 'Fibro-BMP'),
            'Fibro-CXCL14-BMP',
            knn_renamed_cell_states
        )
    )
# eyeball ten random rows
sample_n(metadata, 10)

In [None]:
# Print every distinct cell-state label, one per line.
writeLines(unique(metadata$knn_renamed_cell_states))

## load pathology regions

In [None]:
# Load the per-cell pathology annotations (dropping the `V1` column) and
# harmonise sample names and cell identifiers.
pathology_regions = data.table::fread('/n/data1/bwh/medicine/korsunsky/lab/mup728/CRC_MERFISH/compare_CXCL10_niches_and_tessera_20240930/adata_all_obs.csv') %>%
    select(!V1)

# Relabel (not drop) non-neoplastic submucosa cells as 'Unannotated'.
pathology_regions$pathology_region[pathology_regions$pathology_region == 'non_neoplastic_submucosa'] = 'Unannotated'

# Expand the short sample names of the two G4659 entries to their full names.
pathology_regions$sample_name[pathology_regions$sample_name == 'G4659'] = 'G4659-CP-MET_VMSC04701'
pathology_regions$sample_name[pathology_regions$sample_name == 'G4659_Beta8'] = 'G4659-CP-MET_Beta8'

# Rebuild both identifiers as <sample_name>_<cell_id> for every row, after the
# renames above. The earlier per-sample writes to `orig_cell_id` were always
# overwritten by this whole-column assignment, so they are no longer performed.
pathology_regions$sample_cell = paste0(pathology_regions$sample_name, '_', pathology_regions$cell_id)
pathology_regions$orig_cell_id = pathology_regions$sample_cell

In [None]:
# Which MSstatus value(s) are recorded for sample C107?
unique(pathology_regions$MSstatus[pathology_regions$sample_name == 'C107'])

In [None]:
# Derive the analysis columns from the pathology table and prettify the
# region labels. Steps inside each mutate() are evaluated top to bottom.
pathology_regions <- pathology_regions %>%
    # aliases used by the rest of the notebook
    mutate(
        condition = MSstatus,
        knn_renamed_cell_states = KNN_celltype_v2,
        orig.ident = sample_name,
        knn_coarse = KNN_Top
    ) %>%
    mutate(
        # lineage: Mast -> Myeloid; B and Plasma -> Bplasma; everything else unchanged
        knn_coarse = ifelse(
            knn_coarse == 'Mast',
            'Myeloid',
            ifelse(knn_coarse %in% c('B', 'Plasma'), 'Bplasma', knn_coarse)
        ),
        # all epithelial cells share the single state label 'Epi'
        knn_renamed_cell_states = ifelse(knn_coarse == 'Epi', 'Epi', knn_renamed_cell_states)
    ) %>%
    mutate(
        # underscores -> spaces, then Title Case, then shorten the long names
        pathology_region = gsub(pathology_region, pattern = '_', replacement = ' '),
        pathology_region = str_to_title(pathology_region),
        pathology_region = gsub(pathology_region, pattern = 'Muscularis Propria', replacement = 'M. propria'),
        pathology_region = gsub(pathology_region, pattern = 'Non Neoplastic Mucosa', replacement = 'Non-neo. muc.'),
        # submucosa is folded into 'Unannotated' (previously labelled 'Non-neo. submuc.')
        pathology_region = gsub(pathology_region, pattern = 'Non Neoplastic Submucosa|non_neoplastic_submucosa', replacement = 'Unannotated'),
        pathology_region = gsub(pathology_region, pattern = 'Tumor Invasive Margin', replacement = 'Tumor inv-border'),
        pathology_region = gsub(pathology_region, pattern = 'Tumor Luminal Margin', replacement = 'Tumor lum-border')
    ) %>%
    # pool the two fibroblast states under a single label
    mutate(
        knn_renamed_cell_states = ifelse(
            knn_renamed_cell_states %in% c('Fibro-CXCL14', 'Fibro-BMP'),
            'Fibro-CXCL14-BMP',
            knn_renamed_cell_states
        )
    )
    

In [None]:
# Show ten random rows of the relabelled pathology table.
pathology_regions %>% sample_n(10)

## frequencies of region per sample show that some regions are extremely small and must be removed

In [None]:
# Attach pathology region and PatientID to the metadata. The join uses the
# columns the two tables share (sample_cell is renamed to `cell` for that).
.x <- left_join(
    metadata,
    pathology_regions %>% select(cell = sample_cell, pathology_region, PatientID)
)
.x <- .x %>% mutate(pathology_region = gsub(pathology_region, pattern = '_', replacement = ' '))

# Number of cells per (region, sample); empty combinations are hidden.
as.data.frame(table(.x[['pathology_region']], .x[['orig.ident']])) %>%
    filter(Freq > 0)

In [None]:
# Same counts in wide form: one row per sample (Var2), one column per region (Var1).
as.data.frame(table(.x[['pathology_region']], .x[['orig.ident']])) %>%
    filter(Freq > 0) %>%
    pivot_wider(names_from = Var1, values_from = Freq, values_fill = 0)

In [None]:
# Cells per (region, patient), keeping only combinations with more than 200
# cells; wide form with patients (Var2) as row names and regions as columns.
as.data.frame(table(.x[['pathology_region']], .x[['PatientID']])) %>%
    filter(Freq > 200) %>%
    pivot_wider(names_from = Var1, values_from = Freq, values_fill = 0) %>%
    tibble::column_to_rownames('Var2')

## QC: keep only regions with more than 200 cells per patient

In [None]:
# A (region, patient) pair is kept only if it contains more than this many cells.
# The preview and the selection below share this single threshold, so the
# printed table is exactly the set of pairs that is kept.
min_cells_per_region = 200

# One row per retained (region, patient) pair, with the combined key
# '<pathology_region>_<PatientID>' used for filtering.
region_patient_kept = table(.x$pathology_region,
      .x$PatientID) %>%
    as.data.frame %>%
    filter(Freq > min_cells_per_region) %>%
    select(Var1, Var2) %>%
    rename(pathology_region = Var1, PatientID = Var2) %>%
    mutate(pathology_region_PatientID = paste0(pathology_region, '_', PatientID))
region_patient_kept

# Character vector of retained '<pathology_region>_<PatientID>' keys.
selected_pathology_regions = region_patient_kept %>%
    pull(pathology_region_PatientID)

In [None]:
# Size before QC.
dim(pathology_regions)
# Keep an unfiltered copy, then drop every cell whose
# '<pathology_region>_<PatientID>' key is not in selected_pathology_regions.
pathology_regions_preQC <- pathology_regions
pathology_regions <- pathology_regions_preQC %>%
    mutate(pathology_region_PatientID = paste0(pathology_region, '_', PatientID)) %>%
    filter(pathology_region_PatientID %in% selected_pathology_regions)
# Size after QC.
dim(pathology_regions)

In [None]:
# Per-sample abundance table: one row per
# (orig.ident, cell state, condition, KNN_Top, pathology_region) with cell
# counts and summed cell area, plus region-level and sample-level totals.
nCells_pathology = pathology_regions %>%
    # bring in `area` from the metadata; joins on the columns both tables share
    left_join(., metadata %>% select(cell_id, orig.ident, area)) %>%
    # strip everything from the first underscore onwards in the sample name
    mutate(orig.ident = gsub(orig.ident, pattern = '_.*', replacement = '')) %>%
    select(condition, knn_renamed_cell_states, orig.ident, KNN_Top, pathology_region, area) %>%
    group_by(orig.ident, knn_renamed_cell_states, condition, KNN_Top, pathology_region) %>%
    summarize(total_cell_state = n(), cell_state_area = sum(area), .groups = 'drop') %>%
    # totals per region within a sample
    group_by(orig.ident, condition, pathology_region, .drop = FALSE) %>%
    mutate(total_region = sum(total_cell_state), total_region_area = sum(cell_state_area)) %>%
    ungroup %>%
    # totals per sample
    group_by(orig.ident, .drop = FALSE) %>%
    mutate(
        total_cells = sum(total_cell_state),
        # Sum the per-row areas, not total_region_area: the latter is repeated
        # on every cell-state row of a region, so summing it would count each
        # region once per cell state.
        total_area = sum(cell_state_area)
    ) %>%
    mutate(
        cell_state_as_percent_of_total = 100*total_cell_state/total_cells,
        cell_state_as_percent_of_region = 100*total_cell_state/total_region,
        cell_state_density_area = cell_state_area/total_region_area
    ) %>%
    ungroup

nCells_pathology %>%
    sample_n(20)

In [None]:
# Rows x columns of the abundance table.
nCells_pathology %>% dim

## figure 2c

# cell states enriched in pathology regions, split by sample

## plot frequencies    

In [None]:
# List the (KNN_Grouped, cell state) pairs that actually occur in the data.
as.data.frame(table(pathology_regions[['KNN_Grouped']], pathology_regions[['knn_renamed_cell_states']])) %>%
    filter(Freq > 0) %>%
    select(!Freq)

In [None]:
# Tumor-only abundance: for each patient, the share of every cell state
# within its KNN_Grouped group (percent = state count / group count * 100).
nCells_pathology <- pathology_regions %>%
    # drop rows with an NA in any column
    na.omit %>%
    # collapse every region whose label contains 'tumor'/'Tumor' into 'Tumor'
    mutate(pathology_region = ifelse(grepl('tumor|Tumor', pathology_region), 'Tumor', pathology_region)) %>%
    filter(pathology_region == 'Tumor') %>%
    # cells per state
    group_by(condition, PatientID, pathology_region, KNN_Grouped, knn_renamed_cell_states, .drop = FALSE) %>%
    summarize(cell_state_in_region = n(), .groups = 'drop') %>%
    # cells per group, and each state's percentage of its group
    group_by(condition, PatientID, pathology_region, KNN_Grouped, .drop = FALSE) %>%
    mutate(
        cell_group_in_region = sum(cell_state_in_region),
        percent = 100*cell_state_in_region/cell_group_in_region
    ) %>%
    ungroup() %>%
    mutate(sample_region = paste0(PatientID, '_', pathology_region))

slice_sample(nCells_pathology, n = 10)

In [None]:
# How many distinct cell states does each KNN_Grouped group contain?
nCells_pathology %>%
    distinct(KNN_Grouped, knn_renamed_cell_states) %>%
    pull(KNN_Grouped) %>%
    table() %>%
    as.data.frame %>%
    filter(Freq > 0)

In [None]:
options(repr.plot.width = 12, repr.plot.height = 4, repr.plot.res = 200)
require(patchwork)

# Boxplot of `percent` per cell state for one KNN_Grouped value, coloured by condition.
.plot_group_percent <- function(groupName) {
    group_rows <- filter(nCells_pathology, KNN_Grouped == groupName)
    ggplot(group_rows) +
        geom_boxplot(aes(x = knn_renamed_cell_states, y = percent, color = condition)) +
        facet_wrap(~KNN_Grouped, scales = 'free') +
        theme_bw() +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5)) +
        ggtitle(groupName) +
        ylab('') +
        xlab('')
}

plotList <- lapply(c('Endothelial', 'Fibroblast', 'Immune'), .plot_group_percent)

# Place the three panels side by side with a shared legend; the y-axis label
# is drawn once as a rotated tag on the left.
.panels <- patchwork::wrap_plots(plotList, widths = c(7, 6, 50)) + plot_layout(guides = 'collect')
wrap_elements(.panels) +
  labs(tag = "Percent of group") +
  theme(
    plot.tag = element_text(size = rel(1), angle = 90),
    plot.tag.position = "left"
  )

In [None]:
# Display the current nCells_pathology table.
nCells_pathology

In [None]:
library(presto)

# For each cell group, build a (cell state x sample_region) matrix of
# `percent` values and run wilcoxauc() against the samples' condition.
# Results for all groups are stacked and `fdr` is computed from `pval`
# across the stacked table.
percent_of_group = lapply(c('Immune', 'Fibroblast', 'Endothelial'), function(groupName){
    .temp2 = nCells_pathology %>% filter(KNN_Grouped == groupName) %>% na.omit
    # features (cell states) in rows, samples in columns; states absent from a sample get 0
    .counts = .temp2 %>%
        ungroup %>%
        select(sample_region, knn_renamed_cell_states, percent) %>%
        pivot_wider(names_from = knn_renamed_cell_states, values_from = percent, values_fill = 0) %>%
        tibble::column_to_rownames('sample_region') %>%
        t
    # one row per sample, reordered to match the columns of .counts
    .metadata = .temp2 %>%
        ungroup %>%
        select(condition, PatientID, sample_region) %>%
        distinct %>%
        as.data.frame()
    rownames(.metadata) = .metadata$sample_region
    .metadata = .metadata[colnames(.counts),]
    wilcoxauc(X = .counts, y = .metadata$condition) %>% mutate(KNN_group = groupName)
}) %>%
    # namespace-qualified, matching the data.table::fread call used earlier,
    # so this does not depend on data.table being attached
    data.table::rbindlist() %>%
    mutate(fdr = p.adjust(p = pval, method = 'fdr'))
percent_of_group

In [None]:
options(repr.plot.width = 11, repr.plot.height = 4, repr.plot.res = 500)
require(ggrepel)
logFC_threshold <- 1
fdr_threshold <- 0.05

# NOTE(review): this replaces the `fdr` column with the raw `pval` in the
# global table, so every later `fdr` cut-off (here and in later cells)
# is applied to unadjusted p-values. The y-axis label below says
# '-log10(pvalue)', so this looks intentional — confirm.
percent_of_group <- mutate(percent_of_group, fdr = pval)

# Rows for the 'MSI' group, classified by the two thresholds and annotated
# with KNN_Grouped and knn_coarse (both joined on `feature`).
.data <- percent_of_group %>%
    filter(group == 'MSI') %>%
    mutate(
        `Significance` = ifelse(
            logFC > logFC_threshold & fdr < fdr_threshold,
            'Up in MSI',
            ifelse(
                logFC < -logFC_threshold & fdr < fdr_threshold,
                'Down in MSI',
                'Not significant'
            )
        )
    ) %>%
    left_join(., nCells_pathology %>% select(KNN_Grouped, feature = knn_renamed_cell_states) %>% distinct) %>%
    left_join(., metadata %>% select(feature = knn_renamed_cell_states, knn_coarse) %>% distinct)

# Points that pass both thresholds; these get a lineage fill and a label.
.sig <- .data %>% filter(Significance != 'Not significant')

# Fill colours per lineage.
.lineage_cols <- c(
    'Epi' = '#CA49FC',
    'Strom' = '#00D2D0',
    'Myeloid' = '#FFB946',
    'Mast' = '#F4ED57',
    'Plasma' = '#61BDFC',
    'B' = '#0022FA',
    'TNKILC' = '#FF3420'
)

ggplot() +
    # all states in grey
    geom_point(data = .data, aes(x = logFC, y = -log10(fdr)),
               color = 'grey', size = 2, shape = 16) +
    # significant states on top, filled by lineage
    geom_point(data = .sig, aes(x = logFC, y = -log10(fdr), fill = knn_coarse),
               size = 2, shape = 21, color = 'black') +
    geom_text_repel(data = .sig, aes(x = logFC, y = -log10(fdr), label = feature),
                    size = 3, max.overlaps = Inf, color = 'black',
                    force = 15, hjust = 1) +
    # threshold guides
    geom_vline(xintercept = logFC_threshold, linetype = 'dotted') +
    geom_vline(xintercept = -logFC_threshold, linetype = 'dotted') +
    geom_hline(yintercept = -log10(fdr_threshold), linetype = 'dotted') +
    ggpubr::theme_pubr() +
    scale_fill_manual(name = 'Lineage', values = .lineage_cols) +
    facet_grid(~KNN_group, scales = 'fixed', space = 'fixed') +
    guides(fill = guide_legend(override.aes = list(size=6))) +
    ylab('-log10(pvalue)')

In [None]:
# Display the current percent_of_group table.
percent_of_group

In [None]:
# Read the cell-state label table from the working directory and merge it
# into the metadata (joined on the columns the two tables share).
# fread is namespace-qualified, matching the data.table::fread call used
# earlier, so this cell does not depend on data.table being attached.
knn_mid_renamed = data.table::fread('knn_cell_state_labels.csv')
knn_mid_renamed
metadata = left_join(metadata, knn_mid_renamed)

In [None]:
# Heatmap of logFC for the 'Immune' group: one row per wilcoxauc `group`,
# one column per cell state, with '*' on cells passing the cut-off below.
options(repr.plot.res = 500, repr.plot.width = 12, repr.plot.height = 5)
set.seed(1)
# logFC matrix (rows = group, columns = feature)
.temp = percent_of_group %>%
    filter(KNN_group == 'Immune') %>%
    select(group, feature, logFC) %>%
    pivot_wider(names_from = feature, values_from = logFC) %>%
    column_to_rownames('group') %>%
    as.matrix
dim(.temp)

# one row per cell state: its lineage (knn_coarse) and its cell count n
.rowAnno = metadata %>%
    select(knn_renamed_cell_states, knn_coarse) %>%
    group_by(knn_renamed_cell_states, .drop = FALSE) %>%
    mutate(n = n()) %>%
    ungroup %>%
    distinct %>%
    na.omit %>%
    as.data.frame
rownames(.rowAnno) = .rowAnno$knn_renamed_cell_states

# matrix of '*' / '' with the same layout as .temp:
# '*' where the `fdr` column is < 0.05 and logFC > 2
.pval = percent_of_group %>%
    filter(KNN_group == 'Immune') %>%
    select(group, feature, fdr, logFC) %>%
    mutate(fdr = ifelse(fdr < 0.05 & logFC > 2, yes = '*', no = '')) %>%
    select(!logFC) %>%
    pivot_wider(names_from = feature, values_from = fdr) %>%
    column_to_rownames('group') %>%
    as.matrix
# cell_fun indexes .pval with the positions of .temp, so align them explicitly
.pval = .pval[rownames(.temp), colnames(.temp), drop = FALSE]

# lineage of each heatmap column, in column order
.knn_coarse = .rowAnno$knn_coarse
names(.knn_coarse) = .rowAnno$knn_renamed_cell_states
.knn_coarse = .knn_coarse[colnames(.temp)]
ha1 = HeatmapAnnotation(
    which = 'column',
    Lineage = .knn_coarse,
    col = list(Lineage = c('Epi' = '#CA49FC',
        'Strom' = '#00D2D0',
        'Myeloid' = '#FFB946',
        'Mast' = '#F4ED57',
        'Plasma' = '#61BDFC',
        'B' = '#0022FA',
        'TNKILC' = '#FF3420'
        )),
    annotation_legend_param = list(Lineage = list(nrow = 3, direction = 'horizontal'))
    )
# Currently unused (right_annotation is disabled below). It has one bar per
# row of .rowAnno, not per heatmap row, so it would need re-aligning before use.
ha2 = HeatmapAnnotation(
    `log10(Count)` = anno_barplot(log10(.rowAnno$n)),
    annotation_name_rot = 90,
    which = 'row'
    )
# white from min(.temp) up to 0, ramping to muted navy at 5 and above
col_fun = colorRamp2(c(min(.temp), 0, 5, max(.temp)), c('white', 'white',  scales::muted('navyblue'), scales::muted('navyblue')))
h1 = ComplexHeatmap::Heatmap(
                        heatmap_legend_param = list(direction = 'horizontal'),
                        col = col_fun,
                        cluster_rows = TRUE,
                        cluster_columns = TRUE,
                        top_annotation = ha1,
                        #right_annotation = ha2,
                        # text colour in grid is `col`; `fontcolor` is not a gpar
                        # parameter, so the stars were not drawn in red
                        cell_fun = function(j, i, x, y, width, height, fill) {grid.text(.pval[i, j], x, y, gp = gpar(col = 'red', fontsize = 10))},
                        name = 'logFC',
                        column_names_side = 'top',
                        show_column_dend = FALSE,
                        show_row_dend = FALSE,
                        matrix = .temp,
                        row_names_side = 'left')
draw( h1,
     merge_legend = TRUE,
     heatmap_legend_side = "bottom",
     annotation_legend_side = "bottom")

In [None]:
# Heatmap of logFC for the 'Endothelial' group: one row per wilcoxauc `group`,
# one column per cell state, with '*' on cells passing the cut-off below.
options(repr.plot.res = 500, repr.plot.width = 12, repr.plot.height = 5)
set.seed(1)
# logFC matrix (rows = group, columns = feature)
.temp = percent_of_group %>%
    filter(KNN_group == 'Endothelial') %>%
    select(group, feature, logFC) %>%
    pivot_wider(names_from = feature, values_from = logFC) %>%
    column_to_rownames('group') %>%
    as.matrix
dim(.temp)

# one row per cell state: its lineage (knn_coarse) and its cell count n
.rowAnno = metadata %>%
    select(knn_renamed_cell_states, knn_coarse) %>%
    group_by(knn_renamed_cell_states, .drop = FALSE) %>%
    mutate(n = n()) %>%
    ungroup %>%
    distinct %>%
    na.omit %>%
    as.data.frame
rownames(.rowAnno) = .rowAnno$knn_renamed_cell_states

# matrix of '*' / '' with the same layout as .temp:
# '*' where the `fdr` column is < 0.05 and logFC > 2
.pval = percent_of_group %>%
    filter(KNN_group == 'Endothelial') %>%
    select(group, feature, fdr, logFC) %>%
    mutate(fdr = ifelse(fdr < 0.05 & logFC > 2, yes = '*', no = '')) %>%
    select(!logFC) %>%
    pivot_wider(names_from = feature, values_from = fdr) %>%
    column_to_rownames('group') %>%
    as.matrix
# cell_fun indexes .pval with the positions of .temp, so align them explicitly
.pval = .pval[rownames(.temp), colnames(.temp), drop = FALSE]

# lineage of each heatmap column, in column order
.knn_coarse = .rowAnno$knn_coarse
names(.knn_coarse) = .rowAnno$knn_renamed_cell_states
.knn_coarse = .knn_coarse[colnames(.temp)]
ha1 = HeatmapAnnotation(
    which = 'column',
    Lineage = .knn_coarse,
    col = list(Lineage = c('Epi' = '#CA49FC',
        'Strom' = '#00D2D0',
        'Myeloid' = '#FFB946',
        'Mast' = '#F4ED57',
        'Plasma' = '#61BDFC',
        'B' = '#0022FA',
        'TNKILC' = '#FF3420'
        )),
    annotation_legend_param = list(Lineage = list(nrow = 3, direction = 'horizontal'))
    )
# Currently unused (right_annotation is disabled below). It has one bar per
# row of .rowAnno, not per heatmap row, so it would need re-aligning before use.
ha2 = HeatmapAnnotation(
    `log10(Count)` = anno_barplot(log10(.rowAnno$n)),
    annotation_name_rot = 90,
    which = 'row'
    )
# white from min(.temp) up to 0, ramping to muted navy at 5 and above
col_fun = colorRamp2(c(min(.temp), 0, 5, max(.temp)), c('white', 'white',  scales::muted('navyblue'), scales::muted('navyblue')))
h1 = ComplexHeatmap::Heatmap(
                        heatmap_legend_param = list(direction = 'horizontal'),
                        col = col_fun,
                        cluster_rows = TRUE,
                        cluster_columns = TRUE,
                        top_annotation = ha1,
                        #right_annotation = ha2,
                        # text colour in grid is `col`; `fontcolor` is not a gpar
                        # parameter, so the stars were not drawn in red
                        cell_fun = function(j, i, x, y, width, height, fill) {grid.text(.pval[i, j], x, y, gp = gpar(col = 'red', fontsize = 10))},
                        name = 'logFC',
                        column_names_side = 'top',
                        show_column_dend = FALSE,
                        show_row_dend = FALSE,
                        matrix = .temp,
                        row_names_side = 'left')
draw( h1,
     merge_legend = TRUE,
     heatmap_legend_side = "bottom",
     annotation_legend_side = "bottom")

In [None]:
# Heatmap of logFC for the 'Fibroblast' group: one row per wilcoxauc `group`,
# one column per cell state, with '*' on cells passing the cut-off below.
options(repr.plot.res = 500, repr.plot.width = 12, repr.plot.height = 5)
set.seed(1)
# logFC matrix (rows = group, columns = feature)
.temp = percent_of_group %>%
    filter(KNN_group == 'Fibroblast') %>%
    select(group, feature, logFC) %>%
    pivot_wider(names_from = feature, values_from = logFC) %>%
    column_to_rownames('group') %>%
    as.matrix
dim(.temp)

# one row per cell state: its lineage (knn_coarse) and its cell count n
.rowAnno = metadata %>%
    select(knn_renamed_cell_states, knn_coarse) %>%
    group_by(knn_renamed_cell_states, .drop = FALSE) %>%
    mutate(n = n()) %>%
    ungroup %>%
    distinct %>%
    na.omit %>%
    as.data.frame
rownames(.rowAnno) = .rowAnno$knn_renamed_cell_states

# matrix of '*' / '' with the same layout as .temp:
# '*' where the `fdr` column is < 0.05 and logFC > 2
.pval = percent_of_group %>%
    filter(KNN_group == 'Fibroblast') %>%
    select(group, feature, fdr, logFC) %>%
    mutate(fdr = ifelse(fdr < 0.05 & logFC > 2, yes = '*', no = '')) %>%
    select(!logFC) %>%
    pivot_wider(names_from = feature, values_from = fdr) %>%
    column_to_rownames('group') %>%
    as.matrix
# cell_fun indexes .pval with the positions of .temp, so align them explicitly
.pval = .pval[rownames(.temp), colnames(.temp), drop = FALSE]

# lineage of each heatmap column, in column order
.knn_coarse = .rowAnno$knn_coarse
names(.knn_coarse) = .rowAnno$knn_renamed_cell_states
.knn_coarse = .knn_coarse[colnames(.temp)]
ha1 = HeatmapAnnotation(
    which = 'column',
    Lineage = .knn_coarse,
    col = list(Lineage = c('Epi' = '#CA49FC',
        'Strom' = '#00D2D0',
        'Myeloid' = '#FFB946',
        'Mast' = '#F4ED57',
        'Plasma' = '#61BDFC',
        'B' = '#0022FA',
        'TNKILC' = '#FF3420'
        )),
    annotation_legend_param = list(Lineage = list(nrow = 3, direction = 'horizontal'))
    )
# Currently unused (right_annotation is disabled below). It has one bar per
# row of .rowAnno, not per heatmap row, so it would need re-aligning before use.
ha2 = HeatmapAnnotation(
    `log10(Count)` = anno_barplot(log10(.rowAnno$n)),
    annotation_name_rot = 90,
    which = 'row'
    )
# white from min(.temp) up to 0, ramping to muted navy at 5 and above
col_fun = colorRamp2(c(min(.temp), 0, 5, max(.temp)), c('white', 'white',  scales::muted('navyblue'), scales::muted('navyblue')))
h1 = ComplexHeatmap::Heatmap(
                        heatmap_legend_param = list(direction = 'horizontal'),
                        col = col_fun,
                        cluster_rows = TRUE,
                        cluster_columns = TRUE,
                        top_annotation = ha1,
                        #right_annotation = ha2,
                        # text colour in grid is `col`; `fontcolor` is not a gpar
                        # parameter, so the stars were not drawn in red
                        cell_fun = function(j, i, x, y, width, height, fill) {grid.text(.pval[i, j], x, y, gp = gpar(col = 'red', fontsize = 10))},
                        name = 'logFC',
                        column_names_side = 'top',
                        show_column_dend = FALSE,
                        show_row_dend = FALSE,
                        matrix = .temp,
                        row_names_side = 'left')
draw( h1,
     merge_legend = TRUE,
     heatmap_legend_side = "bottom",
     annotation_legend_side = "bottom")