In [None]:
# ---- CONFIGURATION ----
# Input file locations read further down with readr::read_rds().
# The values below look like placeholders; point them at the real .rds files.
AGG_METADATA_PATH <- "path/to/agg_metadata.rds"
TILE_METADATA_PATH <- "path/to/tile_metadata.rds"
HARMONIZED_MERFISH_PATH <- "path/to/harmonized_merfish.rds"
# -----------------------


# Gene enrichment around MSI and MSS interfaces

In [None]:
require(tidyverse)
require(sf)
require(ggthemes)
require(ggpubr)
require(scattermore)
require(data.table)
require(future)
require(furrr)
require(nngeo)
require(patchwork)
require(mgcv)
require(marginaleffects)
require(Seurat)
require(slider)
require(rstatix)
require(ggpubr)
require(ggnewscale)

In [None]:
# Set the size and resolution of subsequently rendered notebook plots by
# writing the three `repr.plot.*` options.
#   height, width: stored in repr.plot.height / repr.plot.width
#   res:           stored in repr.plot.res (defaults to 300)
# Returns whatever options() returns (the previous values, invisibly).
fig.size <- function(height, width, res = 300) {
  options(
    repr.plot.width = width,
    repr.plot.height = height,
    repr.plot.res = res
  )
}
# Shared theme: ggpubr's publication theme (Helvetica, base size 12) with the
# x-axis tick labels rotated 90 degrees and centred.
fixTheme <- ggpubr::theme_pubr(base_size = 12, base_family = "Helvetica") +
  theme(axis.text.x = element_text(hjust = 0.5, vjust = 0.5, angle = 90))

# Shared legend override: fill and colour legends both get large (size 12)
# keys drawn with point shape 16.
fixGuides <- guides(
  fill = guide_legend(override.aes = list(shape = 16, size = 12)),
  color = guide_legend(override.aes = list(shape = 16, size = 12))
)
# Named colour vectors used as manual scales; names are the category labels.
tumor_palette <- c(
  "CXCL+ stroma" = "#1e3888",
  "CXCL- stroma" = "#47a8bd",
  "CXCL+ tumor" = "#f5e663",
  "CXCL- tumor" = "#ffad69",
  "CXCL+ tumor-stromal interface" = 'red' # alternative considered: "#9c3848"
)

lineage_palette <- c(
  'Epi' = '#CA49FC',
  'Strom' = '#00D2D0',
  'Myeloid' = '#FFB946',
  'Mast' = '#F4ED57',
  'Plasma' = '#61BDFC',
  'B' = '#0022FA',
  'TNKILC' = '#FF3420'
)

## Load data

### Tessera tile metadata and cell-level metadata

In [None]:
# Load the two metadata tables from the configured .rds paths.
# (Earlier runs used 'agg_metadata_2025-03-27.rds' and 'tile_metadata_2025-03-27.rds'.)
agg_metadata <- readr::read_rds(AGG_METADATA_PATH)
tile_metadata <- readr::read_rds(TILE_METADATA_PATH)

# Promote mast cells to their own level-1 category: wherever the level-2 label
# is 'Mast', overwrite the level-1 label with 'Mast' as well.
tile_metadata$type_lvl1[tile_metadata$type_lvl2 == 'Mast'] <- 'Mast'


In [None]:
# Quick look at agg_metadata: 10 random rows, dimensions, column names.
agg_metadata %>% sample_n(10)
agg_metadata %>% dim()
agg_metadata %>% colnames()

In [None]:
# Quick look at tile_metadata: 10 random rows, dimensions, column names.
tile_metadata %>% sample_n(10)
tile_metadata %>% dim()
tile_metadata %>% colnames()

### Table of distances from each cell to the nearest interface

In [None]:
# --- Data Import and Processing ---
# Read the per-cell distance table, drop rows whose tile is flagged as
# disconnected from the main tumor, and make measurement_bin a factor whose
# levels are ordered by the mean bin_numeric of each bin.
# NOTE(review): rows where the exclusion column is NA are also dropped by
# filter(); confirm that is intended.
distances <- fread('distances_of_tumor_cells_to_MSI_and_MSS_interfaces_20250723.csv') %>%
    filter(tiles_to_exclude_from_interface_analysis != "This tile is disconnected from the main tumor. Exclude it from the interface analysis.") %>%
    mutate(measurement_bin = fct_reorder(factor(measurement_bin), bin_numeric, .fun = mean))

# Same relabelling as for the tile metadata: level-2 'Mast' becomes level-1 'Mast'.
distances$type_lvl1[distances$type_lvl2 == 'Mast'] <- 'Mast'

# --- Inspect the Result ---

distances %>% glimpse()

In [None]:
# Keep only rows whose distance lies within [-100, 100] (units as stored in `dist`).
distances_trimmed <- distances %>%
    filter(dist >= -100 & dist <= 100)
distances_trimmed %>% dim()
distances_trimmed %>% glimpse()

# Read in harmonized data

In [None]:
# Load the harmonized object from the configured path and display it.
merged_merfish <- readr::read_rds(HARMONIZED_MERFISH_PATH)
merged_merfish

In [None]:
# Relabel the two 'G4659-CP-MET*' identifiers as the single ID 'G4659'
# (presumably two samples from one patient -- confirm).
# NOTE(review): if orig.ident is a factor without a 'G4659' level, this
# assignment produces NA with a warning; check the unique() output below.
.g4659_rows <- merged_merfish@meta.data$orig.ident %in% c('G4659-CP-MET', 'G4659-CP-MET_VMSC04701')
merged_merfish@meta.data$orig.ident[.g4659_rows] <- 'G4659'

In [None]:
# List the distinct identifiers after relabelling.
merged_merfish@meta.data$orig.ident %>% unique()

In [None]:
# Preview the first rows of the cell-level metadata.
merged_merfish@meta.data %>% head()

In [None]:
# Pull the 'counts' layer out of the Seurat object and report its dimensions.
.counts <- merged_merfish %>% GetAssayData(layer = 'counts')
.counts %>% dim()

In [None]:
# Column sums of the count matrix as a two-column data frame:
# `cell_id` (taken from the row names) and `total_RNA`.
.total_counts <- tibble::rownames_to_column(
    data.frame(total_RNA = colSums(.counts)),
    'cell_id'
)

In [None]:
# Dimensions of the per-cell totals table.
.total_counts %>% dim()

# Show donor-level traces and a median line for multiple genes at the same time: MSI vs MSS interfaces

In [None]:
# Build a long-format table with one row per (cell, gene) for a curated gene
# panel, annotated with cell metadata and distance-to-interface columns.
#
# Reads from the session: .counts, merged_merfish, distances_trimmed.
# Creates: gene_dt, all_gene_names, total_counts_dt, meta_dt, distances_dt,
#          temp_dt (wide, one row per cell) and temp2_dt (long).
# Side effect: merged_merfish and temp_counts_dt are removed at the end to
#              free memory, so earlier cells must be re-run to get them back.
library(data.table)

# --- Gene panel ---
# One row per gene with a free-text annotation; `location` is added below.
gene_dt <- data.table(
  gene_name = c(
    "CXCL9", "CXCL10", "CXCL11", "CXCL13", "CCL5", "CCL18", "CXCL16",
    "CCL19", "CCL21", "ZNF683", "ITGAE", "PDCD1", "TGFB1", "CD274",
    "IDO1", "CCL3", "CCL4", "CCL17", "CCL22", "IFNG", "PDCD1LG2",
    "TGFB2", "TGFB3", "IL10", "ENTPD1", "NT5E", "CD38", "PTGS2",
    "PTGES2", "CCR4", "CCR5", "CCR7", "CCR8", "CXCR3", "CXCR5",
    "CXCR6", "LAG3", "PTGES", "PTGS1"
  ),
  comment = c(
    "ligand - good", "ligand - good", "ligand - good", "ligand - good",
    "(CD8 T cells) - ligand - good", "(myeloid ISG) - ligand - good",
    "ligand - good", "ligand - good", "ligand - good",
    "transcription factor - good", "receptor for CDH1 - good.", NA,
    "ligand - good", "(PDL1)- ligand - good", "ligand - good",
    "ligand - good", "ligand - good", "(mregDC) - ligand - bad",
    "(mregDC) - ligand - bad", "ligand - bad", NA, "ligand - medium",
    "ligand - good - supp", NA, "(CD39) ligand - bad",
    "(CD73) - ligand - medium", NA, "ligand - medium", "ligand - medium",
    "(Tregs) - receptor - bad", "receptor - bad", "receptor - bad",
    "(Tregs) - receptor - bad", "receptor - bad", "receptor - medium",
    "receptor - bad", "receptor", "enzyme", "enzyme"
  )
)

# Add `location` by reference: default 'supplement', first 15 genes 'main',
# then PDCD1 (gene #12) back to 'supplement'.
gene_dt[, location := "supplement"]
gene_dt[1:15, location := "main"]
gene_dt[gene_name == "PDCD1", location := "supplement"]


# --- Data Preparation ---
all_gene_names <- gene_dt[, gene_name]

# Fail early, with the offending names, if any panel gene is absent from the
# count matrix (otherwise the subset below fails with a subscript error).
missing_genes <- setdiff(all_gene_names, rownames(.counts))
if (length(missing_genes) > 0) {
  stop(
    "Genes missing from the counts matrix: ",
    paste(missing_genes, collapse = ", "),
    call. = FALSE
  )
}

# Subset to the panel genes, transpose to cells x genes, and convert to a
# data.table with the cell IDs in a `cell` column. as.matrix() makes the
# conversion explicit in case .counts is a sparse Matrix object.
temp_counts_dt <- as.data.table(
  as.matrix(t(.counts[all_gene_names, , drop = FALSE])),
  keep.rownames = "cell"
)

# Total transcript count per cell (column sums computed once).
cell_totals <- colSums(.counts)
total_counts_dt <- data.table(
  cell = names(cell_totals),
  total_counts = cell_totals
)

## --- Merging Data Tables ---
# Cell metadata as a data.table. as.data.table() drops row names, and the
# joins below key on `cell`, so take the cell IDs from the row names when the
# metadata does not already carry a `cell` column.
meta_dt <- as.data.table(merged_merfish@meta.data)
if (!"cell" %in% names(meta_dt)) {
  meta_dt[, cell := rownames(merged_merfish@meta.data)]
}

# Rename 'orig.ident' to 'PatientID' by reference.
setnames(meta_dt, "orig.ident", "PatientID")

# Distance table restricted to the columns needed downstream; `cell_id` is
# renamed to `cell` so it matches the join key.
distances_dt <- as.data.table(distances_trimmed)
distances_dt <- distances_dt[, .(
  cell = cell_id, dist, bin_numeric, measurement_bin,
  SampleID, PatientID, tessera_annotation, interface_type
)]

# Sequential left joins onto the metadata.
temp_dt <- merge(meta_dt, temp_counts_dt, by = "cell", all.x = TRUE)
temp_dt <- merge(temp_dt, total_counts_dt, by = "cell", all.x = TRUE)

# Join the distances on the two shared key columns only.
temp_dt <- merge(temp_dt, distances_dt, by = c("cell", "PatientID"), all.x = TRUE)

# Keep only cells that have a distance (i.e. were present in distances_trimmed).
temp_dt <- temp_dt[!is.na(dist)]

# Free memory held by intermediates and by the Seurat object.
rm(temp_counts_dt, merged_merfish, cell_totals, missing_genes)
gc()


# --- Data Reshaping and Final Calculations ---
# Wide -> long: one row per (cell, gene); every non-gene column is kept as an id.
temp2_dt <- melt(
  temp_dt,
  id.vars = setdiff(names(temp_dt), all_gene_names),
  measure.vars = all_gene_names,
  variable.name = "gene",
  value.name = "counts"
)

# Parse the numeric bounds once per distinct bin label ('(lower,upper]' style)
# and map them back to rows, instead of running the regex on every row.
bin_factor <- as.factor(temp2_dt$measurement_bin)
bin_labels <- levels(bin_factor)
upper_by_label <- as.numeric(sub("(\\]|\\))$", "", sub("^.*,", "", bin_labels)))
lower_by_label <- as.numeric(sub(",.*$", "", sub("^(\\[|\\()", "", bin_labels)))
bin_index <- as.integer(bin_factor)

# Add the derived columns by reference.
temp2_dt[, `:=`(
  # Fraction of the cell's transcripts that belong to this gene.
  propTx = counts / total_counts,
  upper_bound = upper_by_label[bin_index],
  lower_bound = lower_by_label[bin_index],
  # The bin midpoint is taken directly from bin_numeric.
  midpoint = bin_numeric
)]
rm(bin_factor, bin_labels, upper_by_label, lower_by_label, bin_index)

# View the first few rows of the final data.table
head(temp2_dt)

# For each gene in the panel, this step computes summary statistics (mean, SD, and SEM across patients) of its proportional expression for every interface type and spatial bin.


In [None]:
# Spatial expression profile per gene: mean +/- SEM across patients of the
# pooled transcript proportion, for every interface type and distance bin.
#
# Reads: temp2_dt (long per-cell table). Creates: per_patient_props,
# summary_stats (data.table). Writes summary_stats to a CSV file.
#
# The table is aggregated in a single grouped pass rather than by re-filtering
# it once per gene; `gene` leads the grouping so rows keep the gene-by-gene
# order of the panel.

# --- 1. Pooled proportion per patient, gene, interface type and bin ---
# Counts are summed over all cells of a patient in a bin before dividing, so
# prop_Tx is the gene's share of all transcripts in that patient/bin.
per_patient_props <- temp2_dt %>%
  mutate(measurement_bin = as.factor(measurement_bin)) %>%
  group_by(gene, PatientID, interface_type, measurement_bin, midpoint) %>%
  summarise(
    total_gene_counts = sum(counts, na.rm = TRUE),
    total_counts = sum(total_counts, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(prop_Tx = total_gene_counts / total_counts)

# --- 2. Mean, SD and SEM across patients ---
summary_stats <- per_patient_props %>%
  group_by(gene, interface_type, measurement_bin, midpoint) %>%
  summarise(
    mean_prop = mean(prop_Tx, na.rm = TRUE),
    sd_prop = sd(prop_Tx, na.rm = TRUE),
    # Count only patients with a defined proportion, so the SEM denominator
    # matches the values that mean() and sd() actually used.
    n_patients = sum(!is.na(prop_Tx)),
    sem_prop = sd_prop / sqrt(n_patients),
    .groups = "drop"
  ) %>%
  # Ribbon bounds for plotting: mean +/- one SEM.
  mutate(
    ymin = mean_prop - sem_prop,
    ymax = mean_prop + sem_prop
  ) %>%
  # Same column order as before: interface_type first, then gene.
  select(interface_type, gene, everything()) %>%
  as.data.table()

slice_sample(summary_stats, n = 10)
fwrite(summary_stats, 'gene_expression_spatial_patterns_MSI_vs_MSS.csv')

In [None]:
# Show 20 random rows of the summary table.
sample_n(summary_stats, 20)

# Test for spatial gene patterning with the Wilcoxon rank-sum test

In [None]:

# Per-patient pooled proportions, one table per gene (same order as
# all_gene_names). No cross-patient statistics are computed here and nothing
# is joined in; each element is the patient-level table only.
# Note that this reuses the name summary_stats_list.
summary_stats_list <- lapply(all_gene_names, function(.gene, temp2_dt) {

  # Progress message for the gene being processed.
  message("Processing gene: ", .gene)

  # Rows of the long table that belong to this gene.
  gene_rows <- filter(temp2_dt, gene == .gene)

  # Sum the gene's counts and the total counts over all cells of a patient in
  # each interface type / bin, then take their ratio.
  pooled <- gene_rows %>%
    mutate(measurement_bin = as.factor(measurement_bin)) %>%
    group_by(PatientID, interface_type, measurement_bin, midpoint, gene) %>%
    summarise(
      total_gene_counts = sum(counts, na.rm = TRUE),
      total_counts = sum(total_counts, na.rm = TRUE),
      .groups = 'drop'
    )

  mutate(pooled, prop_Tx = total_gene_counts / total_counts)

}, temp2_dt = temp2_dt)

In [None]:
# Stack the per-gene tables into one data.table, save it, and preview it.
complete_gene_data <- rbindlist(summary_stats_list)
complete_gene_data %>% fwrite('complete_gene_data_MSI_vs_MSS.csv')
complete_gene_data %>% head()

In [None]:
# Which interface types are present?
complete_gene_data$interface_type %>% unique()

In [None]:
# Keep only the two interface types being compared ('MMRd' and 'MMRp').
paired_data <- complete_gene_data %>%
  filter(interface_type %in% c("MMRd", "MMRp"))

# Within each interface type, complete the PatientID x interface_type grid
# inside every (bin, gene, midpoint) group, filling new rows with zeros.
# NOTE(review): complete() on a grouped data frame only expands values found
# inside each group (or, for factors, all factor levels). If PatientID is a
# character column this adds no rows; if it is a factor it adds every level,
# including patients of the other interface type. Confirm which applies and
# whether zero-filling patients without cells in a bin is intended.
zero_fill <- list(total_gene_counts = 0, total_counts = 0, prop_Tx = 0)

paired_data_MMRd <- paired_data %>%
  filter(interface_type == 'MMRd') %>%
  group_by(measurement_bin, gene, midpoint) %>%
  complete(PatientID, interface_type, fill = zero_fill) %>%
  ungroup()

paired_data_MMRp <- paired_data %>%
  filter(interface_type == 'MMRp') %>%
  group_by(measurement_bin, gene, midpoint) %>%
  complete(PatientID, interface_type, fill = zero_fill) %>%
  ungroup()

# Recombine and fix the factor order: MMRp first (reference), then MMRd.
paired_data <- rbind(paired_data_MMRd, paired_data_MMRp) %>%
  mutate(interface_type = factor(interface_type, levels = c("MMRp", "MMRd")))

# Display the first few rows of the prepared data
print("Prepared Data Head:")
paired_data %>% head()

## Perform Wilcoxon Test, Calculate Log2FC, and Apply BH Correction

In [None]:

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Step 3: Wilcoxon rank-sum test (unpaired), log2 fold change, BH correction
#
# For every gene x distance bin, compare the per-patient proportions between
# MMRd and MMRp interfaces. The test is run with paired = FALSE, i.e. it is
# the two-sample rank-sum (Mann-Whitney) test, not the paired signed-rank test.
# P-values are then Benjamini-Hochberg adjusted across all gene x bin tests.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stat_test_results_raw <- paired_data %>%
  group_by(gene, measurement_bin) %>%
  group_modify(function(current_data, group_key) {

    # The test errors when, for example, only one interface type is present
    # in this bin; report it and record NA so the row is dropped below.
    p_value <- tryCatch(
      wilcox.test(prop_Tx ~ interface_type, data = current_data, paired = FALSE)$p.value,
      error = function(e) {
        message(
          "Wilcoxon test skipped for ", as.character(group_key$gene),
          " / ", as.character(group_key$measurement_bin),
          ": ", conditionMessage(e)
        )
        NA_real_
      }
    )

    # Log2 fold change of the group means (MMRd over MMRp); the 1e-6
    # pseudocount keeps the ratio finite when a mean is zero.
    mean_mmrd <- mean(current_data$prop_Tx[current_data$interface_type == "MMRd"], na.rm = TRUE) + 1e-6
    mean_mmrp <- mean(current_data$prop_Tx[current_data$interface_type == "MMRp"], na.rm = TRUE) + 1e-6

    data.frame(p = p_value, log2fc = log2(mean_mmrd / mean_mmrp))
  }) %>%
  ungroup() %>%
  # Remove rows where the test failed or returned no p-value
  filter(!is.na(p))

# Benjamini-Hochberg correction over all remaining tests
# ("BH" is the same method that p.adjust() also accepts as "fdr").
stat_test_results_wilcox <- stat_test_results_raw %>%
  mutate(p.adj = p.adjust(p, method = "BH")) %>%
  # Group labels used for plotting
  mutate(group1 = "MMRp", group2 = "MMRd")

# Display the statistical test results
print("Statistical Test Results with BH-Adjusted P-values and Log2FC:")
print(stat_test_results_wilcox)


## Display the statistical test results

In [None]:
# Preview of the test results table.
print("Statistical Test Results with BH-Adjusted P-values and Log2FC:")
stat_test_results_wilcox %>% head()

## Plot all genes

### Prepare data for the shaded significance rectangles (geom_rect)

In [None]:
# Display the full results table (one row per gene x bin with p, log2fc, p.adj).
stat_test_results_wilcox

In [None]:
# Build the x-ranges of the shaded "significant" rectangles: for each gene,
# one row per run of consecutive bins that pass FDR < 0.1 and |log2FC| > 1.
#
# Runs are identified on the full, midpoint-ordered table BEFORE dropping the
# non-significant bins. Numbering runs after filtering would put every
# significant bin of a gene into one block and stretch a single rectangle
# across the non-significant bins in between.
#
# Result columns: gene, block, is_significant, min_midpoint, max_midpoint
# (zero rows when nothing is significant).
# NOTE(review): a run consisting of a single bin has min_midpoint equal to
# max_midpoint, so its rectangle has zero width and shows only as its outline.

# Lookup from bin label to numeric midpoint.
bin_midpoints <- complete_gene_data %>%
  select(measurement_bin, midpoint) %>%
  distinct()

pvalues_for_rects <- stat_test_results_wilcox %>%
  left_join(bin_midpoints, by = "measurement_bin") %>%
  mutate(
    midpoint = as.numeric(as.vector(midpoint)),
    # p.adj already holds the BH/FDR-adjusted p-values; %in% TRUE maps a
    # missing comparison result to FALSE.
    is_significant = (p.adj < 0.1 & abs(log2fc) > 1) %in% TRUE
  ) %>%
  arrange(gene, midpoint) %>%
  group_by(gene) %>%
  # Increment the run ID whenever the significance flag flips along the x-axis.
  mutate(block = cumsum(
    is_significant != dplyr::lag(is_significant, default = dplyr::first(is_significant))
  )) %>%
  ungroup() %>%
  filter(is_significant) %>%
  group_by(gene, block, is_significant) %>%
  summarise(
    min_midpoint = min(midpoint),
    max_midpoint = max(midpoint),
    .groups = 'drop'
  )

pvalues_for_rects #%>% sample_n(10)

In [None]:
# Number of distinct patients in the per-patient table.
length(unique(complete_gene_data$PatientID))

In [None]:
fig.size(height = 5, width = 7, res = 400)

# Overview of every gene: mean proportion per distance bin (line) with the
# ymin/ymax ribbon from summary_stats, coloured by interface type, one
# free-scaled facet per gene.
gene_plot <- ggplot(
  data = summary_stats,
  aes(x = midpoint, y = mean_prop, color = interface_type)
) +
  geom_line(alpha = 1, linewidth = 0.25) +
  geom_ribbon(
    aes(ymin = ymin, ymax = ymax, fill = interface_type),
    color = NA, alpha = 0.25, inherit.aes = TRUE
  ) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_manual(
    name = "Interface type:",
    values = c("MMRd" = "red", "MMRp" = "blue")
  ) +
  scale_fill_manual(
    name = "Interface type:",
    values = c("MMRd" = "red", "MMRp" = "blue")
  ) +
  # Axis labels (same wording as the receptor supplement panel)
  labs(
    x = expression(paste("Distance from interface (", mu, "m)")),
    y = "Percent of all transcripts"
  ) +
  # Dotted reference line at distance 0
  geom_vline(xintercept = 0, color = 'red', linetype = 'dotted') +
  # Final theme adjustments
  cowplot::theme_half_open(7) +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 7)
  ) +
  guides(color = guide_legend(override.aes = list(shape = 16, size = 4, alpha = 1))) +
  # One panel per gene
  facet_wrap(~gene, scales = 'free')

# Shade the significant distance ranges, if there are any.
if (nrow(pvalues_for_rects) > 0) {
  gene_plot <- gene_plot +
    geom_rect(
      data = pvalues_for_rects,
      aes(xmin = min_midpoint, xmax = max_midpoint, ymin = -Inf, ymax = Inf),
      fill = "#EEE8AA",
      color = "#EEE8AA",
      alpha = 0.5,
      inherit.aes = FALSE
    )
}

# The test upstream is run with paired = FALSE, so label it as unpaired.
gene_plot +
  ggtitle('Wilcoxon rank-sum test (unpaired)') +
  labs(subtitle = 'Yellow: fdr < 0.1 & abs(log2fc) > 1')

# Main figure

In [None]:
fig.size(height = 5, width = 6, res = 400)

# Genes shown in the main figure; the order here is the facet order.
main_fig_genes <- c(
  "CXCL9",
  "CXCL10",
  "CXCL11",
  "CXCL13",
  "CCL5",
  "CCL18",
  "CXCL16",
  "CCL19",
  "CCL21",
  "ZNF683",
  "ITGAE",
  "TGFB1",
  "IDO1",
  "CD274",
  "PTGS2",
  "PTGES2"
) #gene_dt %>% filter(location == 'main') %>% pull(gene_name)
# Second name for the same vector, kept because this cell has always defined it.
gene_vector <- main_fig_genes
main_fig_genes

# Main-figure panel: same layout as the all-gene overview, restricted to
# main_fig_genes. `gene` is turned into a factor with the panel order as its
# levels so the facets follow that order.
gene_plot <- ggplot(
  data = summary_stats %>%
    filter(gene %in% main_fig_genes) %>%
    mutate(gene = factor(gene, levels = main_fig_genes)),
  aes(x = midpoint, y = mean_prop, color = interface_type)
) +
  geom_line(alpha = 1, linewidth = 0.25) +
  geom_ribbon(
    aes(ymin = ymin, ymax = ymax, fill = interface_type),
    color = NA, alpha = 0.25, inherit.aes = TRUE
  ) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_manual(
    name = "Interface type:",
    values = c("MMRd" = "red", "MMRp" = "blue")
  ) +
  scale_fill_manual(
    name = "Interface type:",
    values = c("MMRd" = "red", "MMRp" = "blue")
  ) +
  labs(
    x = expression(paste("Distance from interface (", mu, "m)")),
    y = "Percent of all transcripts"
  ) +
  geom_vline(xintercept = 0, color = 'red', linetype = 'dotted') +
  # Final theme adjustments
  cowplot::theme_half_open(7) +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 7)
  ) +
  guides(color = guide_legend(override.aes = list(shape = 16, size = 4, alpha = 1))) +
  # One panel per gene
  facet_wrap(~gene, scales = 'free')

if (nrow(pvalues_for_rects) > 0) {
  gene_plot <- gene_plot +
    # Shaded rectangles over the significant distance ranges
    geom_rect(
      data = pvalues_for_rects %>%
        filter(gene %in% main_fig_genes) %>%
        mutate(gene = factor(gene, levels = main_fig_genes)),
      aes(xmin = min_midpoint, xmax = max_midpoint, ymin = -Inf, ymax = Inf),
      fill = "#EEE8AA",
      color = "#EEE8AA",
      alpha = 0.5,
      inherit.aes = FALSE
    ) +
    # Start a second, independent fill scale for the significance legend entry.
    new_scale_fill() +
    # Size-0 point at (0, 0) whose only purpose is to create a legend key.
    # It has no `gene` column, so it is added to every facet, which also
    # makes each facet's free axes include 0.
    geom_point(
      data = data.frame(fill = "FDR < 0.1 & absolute log2 FC > 1"), # legend text
      aes(fill = fill, x = 0, y = 0),
      shape = '.',
      size = 0,
      inherit.aes = FALSE
    ) +
    scale_fill_manual(
      name = "", # untitled legend
      values = c("FDR < 0.1 & absolute log2 FC > 1" = "#EEE8AA")
    ) +
    # Draw the legend key as a filled square
    guides(
      fill = guide_legend(override.aes = list(shape = 22, size = 5))
    ) +
    # Grey dotted reference lines at 0 and +/- 50
    geom_vline(xintercept = 0, color = 'grey', linetype = 'dotted') +
    geom_vline(xintercept = -50, color = 'grey', linetype = 'dotted') +
    geom_vline(xintercept = 50, color = 'grey', linetype = 'dotted')
}

gene_plot

# Write the PDF with an explicit print() so the file is also produced when the
# code is run non-interactively (e.g. via source()).
pdf('spatial_gene_patterning_main_figure_MSI_vs_MSS.pdf', width = 6, height = 5)
print(gene_plot)
dev.off()

# Supplement - ligands

In [None]:
# Genes currently assigned to the supplement.
gene_dt[location == 'supplement']

In [None]:
# Manual annotation fixes: fill in missing comments for four genes and move
# PTGS2 / PTGES2 to the main figure.
gene_dt[gene_name == 'CD38', comment := 'receptor']
gene_dt[gene_name == 'IL10', comment := 'ligand']
gene_dt[gene_name == 'PDCD1LG2', comment := 'ligand']
gene_dt[gene_name == 'PDCD1', comment := 'receptor']
gene_dt[gene_name == 'PTGS2', location := 'main']
gene_dt[gene_name == 'PTGES2', location := 'main']

In [None]:
fig.size(height = 4, width = 6, res = 400)

# Ligand / enzyme genes for the supplement; the order here is the facet order.
supplementary_ligands <- c(
  "CCL3",
  "CCL4",
  "CCL17",
  "CCL22",
  "IFNG",
  "PDCD1LG2",
  "TGFB2",
  "TGFB3",
  "PTGES",
  "PTGS1",
  "IL10",
  "ENTPD1",
  "NT5E"
)#gene_dt %>% filter(location == 'supplement' & grepl(comment, pattern = 'ligand|enzyme')) %>% pull(gene_name)
supplementary_ligands

# Supplement panel (ligands): mean proportion per distance bin with the
# ymin/ymax ribbon, coloured by interface type, one free-scaled facet per gene.
gene_plot_supplementary_ligands <- ggplot(
  data = summary_stats %>%
    filter(gene %in% supplementary_ligands) %>%
    mutate(gene = factor(gene, levels = supplementary_ligands)),
  aes(x = midpoint, y = mean_prop, color = interface_type)
) +
  geom_line(alpha = 1, linewidth = 0.25) +
  geom_ribbon(
    aes(ymin = ymin, ymax = ymax, fill = interface_type),
    color = NA, alpha = 0.25, inherit.aes = TRUE
  ) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_manual(
    name = "Interface type:",
    values = c("MMRp" = "blue", "MMRd" = "red")
  ) +
  scale_fill_manual(
    name = "Interface type:",
    values = c("MMRp" = "blue", "MMRd" = "red")
  ) +
  labs(
    x = expression(paste("Distance from interface (", mu, "m)")),
    y = "Percent of all transcripts"
  ) +
  geom_vline(xintercept = 0, color = 'red', linetype = 'dotted') +
  # Final theme adjustments
  cowplot::theme_half_open(7) +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 7)
  ) +
  guides(color = guide_legend(override.aes = list(shape = 16, size = 4, alpha = 1))) +
  # One panel per gene
  facet_wrap(~gene, scales = 'free')

if (nrow(pvalues_for_rects) > 0) {
  gene_plot_supplementary_ligands <- gene_plot_supplementary_ligands +
    # Shaded rectangles over the significant distance ranges
    geom_rect(
      data = pvalues_for_rects %>%
        filter(gene %in% supplementary_ligands) %>%
        mutate(gene = factor(gene, levels = supplementary_ligands)),
      aes(xmin = min_midpoint, xmax = max_midpoint, ymin = -Inf, ymax = Inf),
      fill = "#EEE8AA",
      color = "#EEE8AA",
      alpha = 0.5,
      inherit.aes = FALSE
    ) +
    # Start a second, independent fill scale for the significance legend entry.
    new_scale_fill() +
    # Size-0 point at (0, 0) whose only purpose is to create a legend key.
    # It has no `gene` column, so it is added to every facet, which also
    # makes each facet's free axes include 0.
    geom_point(
      data = data.frame(fill = "FDR < 0.1 & absolute log2 FC > 1"), # legend text
      aes(fill = fill, x = 0, y = 0),
      shape = '.',
      size = 0,
      inherit.aes = FALSE
    ) +
    scale_fill_manual(
      name = "", # untitled legend
      values = c("FDR < 0.1 & absolute log2 FC > 1" = "#EEE8AA")
    ) +
    # Draw the legend key as a filled square
    guides(
      fill = guide_legend(override.aes = list(shape = 22, size = 5))
    ) +
    # Grey dotted reference lines at 0 and +/- 50
    geom_vline(xintercept = 0, color = 'grey', linetype = 'dotted') +
    geom_vline(xintercept = -50, color = 'grey', linetype = 'dotted') +
    geom_vline(xintercept = 50, color = 'grey', linetype = 'dotted')
}

gene_plot_supplementary_ligands

# Explicit print() so the PDF is also written when run non-interactively.
pdf('spatial_gene_patterning_supplement_ligands_MSI_vs_MSS.pdf', width = 6, height = 4)
print(gene_plot_supplementary_ligands)
dev.off()

## Supplement - receptors

In [None]:
# Genes assigned to the supplement after the manual annotation fixes.
gene_dt[location == 'supplement']

In [None]:
fig.size(height = 4, width = 6, res = 400)

# Receptor genes for the supplement; the order here is the facet order.
supplementary_receptors <- c(
  "CD38",
  "PDCD1",
  "CCR4",
  "CCR5",
  "CCR7",
  "CCR8",
  "CXCR3",
  "CXCR5",
  "CXCR6",
  "LAG3"
) #gene_dt %>% filter(location == 'supplement' & grepl(comment, pattern = 'receptor')) %>% pull(gene_name)
supplementary_receptors

# Supplement panel (receptors): mean proportion per distance bin with the
# ymin/ymax ribbon, coloured by interface type, one free-scaled facet per gene.
gene_plot_supplementary_receptors <- ggplot(
  data = summary_stats %>%
    filter(gene %in% supplementary_receptors) %>%
    mutate(gene = factor(gene, levels = supplementary_receptors)),
  aes(x = midpoint, y = mean_prop, color = interface_type)
) +
  geom_line(alpha = 1, linewidth = 0.25) +
  geom_ribbon(
    aes(ymin = ymin, ymax = ymax, fill = interface_type),
    color = NA, alpha = 0.25, inherit.aes = TRUE
  ) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_manual(
    name = "Interface type:",
    values = c("MMRp" = "blue", "MMRd" = "red")
  ) +
  scale_fill_manual(
    name = "Interface type:",
    values = c("MMRp" = "blue", "MMRd" = "red")
  ) +
  labs(
    x = expression(paste("Distance from interface (", mu, "m)")),
    y = "Percent of all transcripts"
  ) +
  geom_vline(xintercept = 0, color = 'red', linetype = 'dotted') +
  # Final theme adjustments
  cowplot::theme_half_open(7) +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 7)
  ) +
  guides(color = guide_legend(override.aes = list(shape = 16, size = 4, alpha = 1))) +
  # One panel per gene
  facet_wrap(~gene, scales = 'free')

# Shade the significant distance ranges, if there are any.
# NOTE(review): unlike the ligand panel, this panel adds neither the
# significance legend entry nor the grey reference lines at 0 and +/- 50;
# confirm whether the difference is intended for the combined figure.
if (nrow(pvalues_for_rects) > 0) {
  gene_plot_supplementary_receptors <- gene_plot_supplementary_receptors +
    geom_rect(
      data = pvalues_for_rects %>%
        filter(gene %in% supplementary_receptors) %>%
        mutate(gene = factor(gene, levels = supplementary_receptors)),
      aes(xmin = min_midpoint, xmax = max_midpoint, ymin = -Inf, ymax = Inf),
      fill = "#EEE8AA",
      color = "#EEE8AA",
      alpha = 0.5,
      inherit.aes = FALSE
    )
}

gene_plot_supplementary_receptors

# Explicit print() so the PDF is also written when run non-interactively.
pdf('spatial_gene_patterning_supplement_receptors_MSI_vs_MSS.pdf', width = 6, height = 4)
print(gene_plot_supplementary_receptors)
dev.off()

In [None]:
fig.size(height = 9, width = 6, res = 400)

# Combined supplementary figure: ligand panel above receptor panel, tagged
# 'A' and 'B'. Built once so the on-screen and PDF versions are the same object.
supplementary_figure <- gene_plot_supplementary_ligands +
  gene_plot_supplementary_receptors +
  plot_annotation(tag_levels = 'A') +
  plot_layout(nrow = 2)
supplementary_figure

# Explicit print() so the PDF is also written when run non-interactively.
pdf('supplementary_figure_genes_spatial_pattern_MSI_vs_MSS.pdf', width = 6, height = 9)
print(supplementary_figure)
dev.off()