# Construct interfaces 
## Construct interfaces in MSI and MSS. 
## Construct interfaces between hub+ and hub- tumor/stroma. 
## Visualize interfaces. 

# Construct all types of interfaces & cache for sharing

In [None]:
require(tidyverse)
require(sf)
require(ggthemes)
require(ggpubr)
require(scattermore)
require(data.table)
require(future)
require(furrr)

In [None]:
# Configure the notebook plot canvas by setting the three repr.plot.* options.
# `height` and `width` are required; `res` defaults to 300.
# Returns (invisibly) the previous values of the options, as options() does.
fig.size <- function(height, width, res = 300) {
    repr_opts <- list(
        repr.plot.width = width,
        repr.plot.height = height,
        repr.plot.res = res
    )
    options(repr_opts)
}
# Shared plot styling: a ggpubr publication theme with vertical x-axis labels.
fixTheme <- ggpubr::theme_pubr(base_family = "Helvetica", base_size = 12) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5))

# Legend keys drawn as large filled circles for both fill and color legends.
fixGuides <- guides(
    fill = guide_legend(override.aes = list(shape = 16, size = 12)),
    color = guide_legend(override.aes = list(shape = 16, size = 12))
)

# Named colors for tumor/stroma compartments and interface categories.
# (An alternative color, "#9c3848", was noted by the author next to this palette.)
tumor_palette <- c(
    "CXCL+ stroma" = "#17527D",
    "CXCL- stroma" = "#47a8bd",
    "CXCL+ tumor" = "#A97F2F",
    "CXCL- tumor" = "#ffad69",
    "CXCL+ tumor-stromal interface" = "red",
    "All Tumor-Stromal\ninterface" = "black"
)

# Named colors for the level-1 cell lineages.
lineage_palette <- c(
    "Epi" = "#CA49FC",
    "Strom" = "#00D2D0",
    "Myeloid" = "#FFB946",
    "Mast" = "#F4ED57",
    "Plasma" = "#61BDFC",
    "B" = "#0022FA",
    "TNKILC" = "#FF3420"
)

## load data

In [None]:
# Read the cached Tessera metadata tables (dated 2025-07-22) from disk.
agg_metadata <- readr::read_rds(
    '../Tessera tiles/Tessera processed results/agg_metadata_2025-07-22.rds'
)
tile_metadata <- readr::read_rds(
    '../Tessera tiles/Tessera processed results/tile_metadata_2025-07-22.rds'
)
# Rows whose level-2 type is 'Mast' get 'Mast' as their level-1 type as well.
tile_metadata$type_lvl1[tile_metadata$type_lvl2 == 'Mast'] <- 'Mast'

In [None]:
# Distinct sample IDs for each Status value (MSI first, then MSS).
unique(tile_metadata$SampleID[tile_metadata$Status == 'MSI'])
unique(tile_metadata$SampleID[tile_metadata$Status == 'MSS'])

## draw boundaries and cache

### MSS samples

### MSI samples

## table of distances of each cell to the nearest interface

In [None]:
# Load necessary libraries
# Ensure you have these installed: install.packages(c("data.table", "Matrix", "spatula", "presto", "furrr", "dplyr", "metafor"))
library(data.table)
library(Matrix)
library(spatula)
library(presto)
library(furrr)
library(dplyr)
library(metafor)

#' Calculate Cell Type Spatial Enrichment
#'
#' Analyzes spatial relationships between cell types on a per-sample basis.
#' For each sample, a spatial neighbor graph is built, neighbor-type counts are
#' accumulated around every cell, and a Poisson mixed-effects model (via presto)
#' is fit to test whether neighbor types are enriched around anchor types.
#'
#' @param cells A data.table or data.frame of cells. Must include the columns
#'   'SampleID', spatial coordinates 'X' and 'Y', and cell type 'type_lvl3'.
#'   The input is coerced to a data.table internally; the caller's object is
#'   not modified.
#' @param anchor_types A character vector of cell types to be treated as the
#'   central "anchor" cells in the analysis.
#' @param neighbor_types A character vector of cell types to be counted as
#'   "neighbors". Types absent from a sample are silently dropped for that sample.
#' @param max_dist Maximum distance (same units as X/Y) for two cells to be
#'   considered neighbors. Longer graph edges are pruned. Defaults to 30.
#' @param nsteps Number of times the cell-type indicator matrix is multiplied
#'   by the binary adjacency matrix. With nsteps = 1 the counts are those of
#'   the immediate neighbors. For larger values the counts are numbers of
#'   length-`nsteps` walks ending in each type (they are not a cumulative union
#'   of the 1..nsteps-step neighborhoods). Defaults to 3.
#' @param use_multicore If TRUE, samples are processed in parallel with the
#'   `future` multicore plan; otherwise sequentially. The previously active
#'   plan is restored when the function exits. Defaults to TRUE.
#'
#' @return A table with one row per sample / anchor / neighbor combination,
#'   with columns `SampleID`, `anchor`, `neighbor`, `logFC` and `logSE`
#'   (both converted to log base 2), plus any other columns returned by
#'   `contrasts.presto`.
#'
calculate_spatial_enrichment <- function(cells,
                                         anchor_types,
                                         neighbor_types,
                                         max_dist = 30,
                                         nsteps = 3,
                                         use_multicore = TRUE) {

  # The per-sample code below uses data.table syntax (`.cells[, .(X, Y)]`),
  # so coerce here to honor the documented data.frame input.
  cells <- data.table::as.data.table(cells)

  # Select the execution plan for this call only: plan() returns the plan that
  # was active before, which is put back on exit so the caller's session is
  # not left with a different parallel backend.
  old_plan <- if (use_multicore) {
    future::plan(future::multicore)
  } else {
    future::plan(future::sequential)
  }
  on.exit(future::plan(old_plan), add = TRUE)

  # Split by sample; drop = TRUE avoids empty chunks for unused factor levels.
  cells_by_sample <- split(cells, cells$SampleID, drop = TRUE)

  res_all <- cells_by_sample %>%
    future_imap(function(.cells, .name) {

      # Progress tracking.
      message("Processing sample: ", .name)

      # --- 1. Define Spatial Neighbors ---

      # Extract X and Y coordinates into a matrix.
      coords <- as.matrix(.cells[, .(X, Y)])

      # Center the coordinates without changing their scale
      #coords <- scale(coords, center = TRUE, scale = FALSE)
      # 'getSpatialNeighbors' is used here as returning a sparse adjacency
      # matrix whose non-zero entries are the distances between neighbors.
      adj <- spatula::getSpatialNeighbors(coords, return_weights = TRUE)

      # Prune long-distance connections, then drop the explicit zeros.
      adj@x[adj@x > max_dist] <- 0
      adj <- Matrix::drop0(adj)

      # Make the adjacency matrix unweighted (1 for neighbor, 0 otherwise).
      adj@x <- rep(1, length(adj@x))

      # Sanity check: no cell may be its own neighbor.
      stopifnot(all(diag(adj) == 0))

      # --- 2. Get Neighborhood Compositions ---

      # One-hot matrix: rows are cells, columns are cell types.
      counts <- Matrix::sparse.model.matrix(~ 0 + type_lvl3, .cells)

      # Propagate the type indicators through the graph 'nsteps' times.
      for (iter in seq_len(nsteps)) {
        counts <- adj %*% counts
      }

      # Strip the model-matrix prefix from the column names.
      colnames(counts) <- gsub('type_lvl3', '', colnames(counts))

      # --- 3. Filter Cells and Neighbors for Modeling ---

      # Only consider the requested neighbor types that occur in this sample.
      valid_neighbor_types <- intersect(neighbor_types, colnames(counts))
      counts <- counts[, valid_neighbor_types, drop = FALSE]

      # Log of the total relevant neighbor count, used as the model offset.
      .cells$log_n_neighbors <- log(rowSums(counts))

      # Drop cells with zero neighbors (log(0) is -Inf).
      i <- which(!is.infinite(.cells$log_n_neighbors))
      M <- .cells[i, ]
      counts <- counts[i, , drop = FALSE]

      # Restrict to the requested anchor cell types.
      i <- which(M$type_lvl3 %in% anchor_types)
      M <- M[i, ]
      counts <- counts[i, , drop = FALSE]

      # --- 4. Run Statistical Model (presto) ---

      # Warnings raised during model fitting are suppressed.
      suppressWarnings({
        # Formula: y ~ 1 + (1|type_lvl3) + offset(log_n_neighbors)
        # - (1|type_lvl3): a random intercept for each anchor cell type.
        # - offset(log_n_neighbors): accounts for the total neighbor count.
        presto_res <- presto::presto.presto(
          y ~ 1 + (1 | type_lvl3) + offset(log_n_neighbors),
          M,
          t(counts), # Features (neighbor types) in rows, cells in columns.
          'log_n_neighbors',
          effects_cov = c('type_lvl3'),
          nsim = 1000,
          ncore = 1, # Parallelism is handled by furrr at the sample level.
          family = 'poisson',
          min_sigma = .05,
          verbose = FALSE
        )
      })

      # --- 5. Calculate and Format Contrasts ---

      # Contrast matrix over the anchor types; NA entries are set to 0.
      contrasts_mat <- make_contrast.presto(
        object = presto_res,
        var_contrast = 'type_lvl3'
      )
      contrasts_mat[is.na(contrasts_mat)] <- 0

      # Effects per contrast, renamed and converted from ln to log2.
      eff <- contrasts.presto(presto_res, contrasts_mat, one_tailed = FALSE) %>%
        dplyr::rename(anchor = contrast, neighbor = feature, logFC = beta, logSE = sigma) %>%
        dplyr::mutate(
          logFC = logFC / log(2),
          logSE = logSE / log(2)
        ) %>%
        data.table()

      eff

    }, .options = furrr::furrr_options(seed = TRUE)) %>% # Parallel-safe RNG.
    bind_rows(.id = 'SampleID') # Stack the per-sample results.

  res_all
}


#' Summarize Enrichment Results with a Meta-Analysis
#'
#' Combines the per-sample enrichment results into one row per anchor-neighbor
#' pair by calling `combine_with_REML_prior` on the pair's logFC and logSE
#' values, then adds Benjamini-Hochberg adjusted p-values.
#'
#' @param enrichment_results A data.table, data.frame or tibble (typically the
#'   output of `calculate_spatial_enrichment`) with the columns `anchor`,
#'   `neighbor`, `logFC` and `logSE`. It is copied/coerced to a data.table;
#'   the caller's object is not modified.
#' @param n_total The total number of samples, passed on to
#'   `combine_with_REML_prior`. Defaults to 8.
#' @param c_prior The prior scaling factor passed on to
#'   `combine_with_REML_prior`. Defaults to 0.1.
#'
#' @return A data.table ordered by p-value, without rows whose combined logFC
#'   is NA, holding the meta-analysis columns plus `padj`.
#'
summarize_enrichment_results <- function(enrichment_results, n_total = 8, c_prior = 0.1) {
  # library() instead of require(): stop right here if data.table is missing
  # rather than continuing and failing later with an obscure error.
  library(data.table)

  # bind_rows() output is not guaranteed to be a data.table, so coerce.
  dt <- as.data.table(enrichment_results)

  required_cols <- c("anchor", "neighbor", "logFC", "logSE")
  missing_cols <- setdiff(required_cols, names(dt))
  if (length(missing_cols) > 0) {
    stop(
      "enrichment_results is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # One meta-analysis per anchor-neighbor pair across all samples.
  res <- dt[
    ,
    combine_with_REML_prior(logFC, logSE, n_total = n_total, c_prior = c_prior),
    by = .(anchor, neighbor)
  ]

  # Order by p-value, then drop pairs for which no estimate could be computed.
  res <- res[order(pvalue)][!is.na(logFC)]

  # Benjamini-Hochberg adjustment over the remaining pairs.
  res[, padj := p.adjust(pvalue, 'BH')]

  res[]
}


# --- Meta-Analysis Helper Functions ---

#' Between-Study Standard Deviation (Tau) from a Random-Effects Fit
#'
#' Fits `metafor::rma.uni` to the supplied effect sizes and standard errors
#' and returns the square root of the fitted `tau2` component.
#'
#' @param est Numeric vector of effect size estimates (e.g., log-fold changes).
#' @param se Numeric vector of standard errors, same length as `est`.
#' @param method Estimation method handed to `metafor::rma.uni`.
#'   Defaults to "REML".
#' @return A single numeric value: sqrt(tau2) of the fitted model.
#'
tau_reml <- function(est, se, method = "REML") {
  # Estimates and standard errors must pair up one-to-one.
  stopifnot(length(est) == length(se))

  # 'yi' takes the effect sizes, 'sei' their standard errors.
  tau_squared <- metafor::rma.uni(yi = est, sei = se, method = method)$tau2

  # Variance -> standard deviation, as a plain numeric.
  as.numeric(sqrt(tau_squared))
}

#' Combine Estimates with a Zero-Centered Prior for Missing Data
#'
#' Random-effects, inverse-variance pooling of the observed estimates, with
#' every "missing" study (n_total - length(est)) added as a pseudo-study at
#' effect 0 with standard deviation `tau0`. This shrinks the pooled estimate
#' towards zero when an effect is observed in only a few of the possible
#' datasets.
#'
#' @param est A numeric vector of the *observed* effect size estimates.
#' @param se A numeric vector of the *observed* standard errors.
#' @param n_total An integer for the total number of datasets that *could* have
#'   contributed data (observed + missing). Must be >= length(est).
#' @param tau_hat A numeric value for the between-study heterogeneity (tau),
#'   typically from `tau_reml`.
#' @param c_prior A numeric scaling factor for the prior's standard deviation:
#'   `tau0 = c_prior * tau_hat`. Defaults to 1.
#' @param se_floor An optional lower bound for the prior's SD; when supplied
#'   the prior SD is `max(tau0, se_floor)`. Note: if this is NULL and `tau0`
#'   evaluates to 0 while studies are missing, the pseudo-study weight is
#'   infinite, giving logFC = 0, logSE = 0 and a NaN z-score.
#' @return A one-row data.table with columns `logFC`, `logSE`, `zscore`,
#'   `pvalue`, `tau_hat` and `tau0`.
#'
meta_with_zero_prior_RE <- function(est, se, n_total, tau_hat,
                                      c_prior = 1, se_floor = NULL) {
  # Sanity checks on the inputs.
  stopifnot(length(est) == length(se), n_total >= length(est))

  # Number of observed and missing studies.
  k_obs  <- length(est)
  k_miss <- n_total - k_obs

  # Random-effects weights: inverse of (se^2 + tau_hat^2).
  w_obs <- 1 / (se^2 + tau_hat^2)
  w_obs_sum <- sum(w_obs)

  # Prior SD for the pseudo-studies, optionally floored so that the prior
  # cannot become arbitrarily narrow when tau_hat is near zero.
  tau0 <- c_prior * tau_hat
  if (!is.null(se_floor)) {
    tau0 <- max(tau0, se_floor)
  }

  # Weight of each "missing" pseudo-study (none if nothing is missing).
  w0 <- if (k_miss > 0) 1 / (tau0^2) else 0

  # Pooled estimate: the pseudo-studies sit at effect 0, so they contribute
  # to the total weight but not to the weighted sum.
  w_tot <- w_obs_sum + k_miss * w0
  mu    <- if (w_tot > 0) sum(w_obs * est) / w_tot else NA_real_
  se_mu <- if (w_tot > 0) sqrt(1 / w_tot)       else NA_real_

  # Two-sided p-value taken from the upper tail directly. The form
  # 2 * (1 - pnorm(|z|)) rounds to exactly 0 once |z| exceeds roughly 8.3,
  # which creates artificial ties before multiple-testing adjustment.
  z <- mu / se_mu
  p <- 2 * pnorm(abs(z), lower.tail = FALSE)

  data.table(
    logFC = mu, logSE = se_mu, zscore = z, pvalue = p,
    tau_hat = tau_hat, tau0 = tau0
  )
}

#' Wrapper to Estimate Tau and Perform Meta-Analysis with Prior
#'
#' Estimates heterogeneity (tau) with `tau_reml` and then calls
#' `meta_with_zero_prior_RE`. With fewer than two observations tau cannot be
#' estimated and is taken to be 0. Any error in either step, for any number of
#' observations, is reported with `message()` and turned into a row of NAs so
#' that one bad group cannot abort a grouped aggregation.
#'
#' @param est A numeric vector of observed effect size estimates.
#' @param se A numeric vector of observed standard errors.
#' @param n_total An integer for the total number of potential studies/samples.
#' @param c_prior A numeric scaling factor for the prior's standard deviation. Defaults to 1.
#' @param se_floor An optional minimum value for the prior's SD. If NULL (the default),
#'   it is set to the median of the observed standard errors (NAs removed).
#' @param method A character string for the method used by `tau_reml` to estimate tau-squared.
#' @return A one-row data.table with columns `logFC`, `logSE`, `zscore`,
#'   `pvalue`, `tau_hat` and `tau0`; all NA if the calculation failed.
#'
combine_with_REML_prior <- function(est, se, n_total,
                                      c_prior = 1, se_floor = NULL,
                                      method = "REML") {

  tryCatch({

    # Between-study variance needs at least two observations; otherwise it is
    # assumed to be zero.
    tau_hat <- if (length(est) < 2) {
      0
    } else {
      tau_reml(est, se, method = method)
    }

    # Default floor: the median observed SE, a heuristic that keeps the prior
    # from becoming overly strong when tau_hat is near zero.
    if (is.null(se_floor)) se_floor <- stats::median(se, na.rm = TRUE)

    meta_with_zero_prior_RE(est, se, n_total, tau_hat, c_prior = c_prior, se_floor = se_floor)

  }, error = function(e) {
    # The failure may come from the REML fit or from the pooling step, so the
    # message does not blame a specific stage.
    message("Meta-analysis failed for a group; returning NA. Details: ", conditionMessage(e))
    data.table(logFC=NA_real_, logSE=NA_real_, zscore=NA_real_, pvalue=NA_real_, tau_hat=NA_real_, tau0=NA_real_)
  })
}

#' @title Annotate Cells with Signed Distance to the Nearest Interface
#' @description For one sample, finds the interface geometry nearest to each
#'   cell, records that interface's type and the distance to it, signs the
#'   distance by the cell's region annotation, and assigns the signed distance
#'   to a 5-unit-wide bin between -100 and 100 (the original author's comments
#'   refer to these units as microns).
#' @param cells A data.table with one row per cell, including 'X'/'Y'
#'   coordinates and a region annotation ('tessera_annotation'). The caller's
#'   table is not modified by reference.
#' @param interfaces An sf object whose geometry column is named `x` and which
#'   has a `Type_of_Interface` column.
#' @return `cells` with four added columns: `closest_interface_type`,
#'   `dist_interface` (unsigned), `dist_interface_signed` (negative when
#'   `tessera_annotation == 'Stromal-enriched'`, positive otherwise) and
#'   `dist_bin` (NA for cells farther than 100 from any interface). All cells
#'   are returned, including those with an NA bin. If no interface is supplied
#'   or no cell falls inside the binned range, a warning is raised and an empty
#'   list is returned.
get_bins_per_cell <- function(cells, interfaces) {
    # Nothing to measure against: nearest-neighbor lookup would only yield NAs.
    if (nrow(interfaces) == 0) {
        warning("No interface geometries supplied.")
        return(list())
    }

    ## Get distances and closest interface type
    pts <- st_as_sf(cells[, .(X, Y)], coords = c('X', 'Y'))
    geos_pts <- geos::as_geos_geometry(pts$geometry)
    geos_lines <- geos::as_geos_geometry(interfaces$x)

    nearest_interfaces_idx <- geos::geos_nearest(geos_pts, geos_lines)

    cells$closest_interface_type <- interfaces$Type_of_Interface[nearest_interfaces_idx]
    cells$dist_interface <- geos::geos_distance(geos_pts, geos_lines[nearest_interfaces_idx])

    ## Assign sign to distances: stromal side negative, everything else positive
    cells$dist_interface_signed <- fifelse(
        cells$tessera_annotation == 'Stromal-enriched',
        -cells$dist_interface,
        cells$dist_interface
    )

    ## Assign cells to 5um bins; values outside [-100, 100] become NA
    dist_breaks <- seq(-100, 100, by = 5)
    cells$dist_bin <- cut(cells$dist_interface_signed, breaks = dist_breaks, include.lowest = TRUE)

    # cells = cells %>% filter(
    #     (closest_interface_type == 'CXCLpos tumor & CXCLpos stroma' & cxcl_pos_tile == 'CXCL_pos') | (closest_interface_type == 'CXCLneg tumor & CXCLneg stroma' & cxcl_pos_tile == 'CXCL_neg')        
    # )

    if (all(is.na(cells$dist_bin))) {
        warning("No cells found within the -100 to 100\u00b5m distance range.")
        return(list())
    }

    # Disabled per-bin count summarization kept for reference. It operated on
    # the in-range subset with `closest_interface_type` converted to a factor
    # over all observed interface types:
    #
    # cells_in_range = cells[!is.na(dist_bin)]
    # all_interface_types = unique(cells$closest_interface_type)
    # cells_in_range[, closest_interface_type := factor(closest_interface_type, levels = all_interface_types)]
    #
    # counts_long = cells_in_range[, .N, by = .(closest_interface_type, dist_bin, type_lvl3)]

    # counts_wide = dcast(counts_long,
    #                     closest_interface_type + dist_bin ~ type_lvl3,
    #                     value.var = "N",
    #                     fill = 0,
    #                     drop = FALSE)

    # result_list = split(counts_wide, by = "closest_interface_type")

    # result_list = lapply(result_list, function(dt) {
    #     row_names = dt$dist_bin
    #     count_cols = setdiff(names(dt), c("closest_interface_type", "dist_bin"))
    #     mat = as.matrix(dt[, ..count_cols])
    #     rownames(mat) = row_names
    #     return(mat)
    # })

    cells
}

In [None]:
# Load the interface geometry file for every sample into a list named by
# sample ID. The directory is listed once, each sample ID is matched literally
# against the file names, and exactly one matching file per sample is required.
#ids = unique(tile_metadata$SampleID[tile_metadata$Status == 'MSI'])
interfaces <- local({
    interface_dir <- '../Tessera tiles/Spatial objects for tumor-stromal interfaces in all MERFISH samples/'
    interface_files <- list.files(
        path = interface_dir,
        pattern = '_tumor_stromal_interfaces.rds',
        full.names = TRUE
    )
    sample_ids <- as.character(unique(tile_metadata$SampleID))

    loaded <- map(sample_ids, function(.id) {
        # Match on the file name only, and literally, so that directory names
        # and regex metacharacters in an ID cannot produce false hits.
        hits <- interface_files[grepl(.id, basename(interface_files), fixed = TRUE)]
        if (length(hits) != 1) {
            stop(
                "Expected exactly one interface file for sample '", .id,
                "' but found ", length(hits), ".",
                call. = FALSE
            )
        }
        readRDS(normalizePath(hits))
    })
    names(loaded) <- sample_ids
    loaded
})

glimpse(interfaces[[1]])

In [None]:
# Show the distinct sample IDs present in the tile metadata.
tile_metadata$SampleID %>% unique()

In [None]:
# Reset the future plan, switch to the multicore plan, and raise the size
# limit for globals shipped to workers.
plan(sequential)
plan(multicore)
options(future.globals.maxSize = 1e10)

# Annotate the cells of every sample with interface distances, one future per
# sample, and time the whole step.
system.time({
    distances <- setNames(
        future_map(
            unique(tile_metadata$SampleID),
            function(.id) {
                get_bins_per_cell(tile_metadata[SampleID == .id], interfaces[[.id]])
            },
            .options = furrr::furrr_options(seed = TRUE)
        ),
        unique(tile_metadata$SampleID)
    )
})

# Stack the per-sample tables and drop every row that has an NA in any column.
distances <- na.omit(rbindlist(distances))
head(distances)

# Back to sequential execution for the rest of the notebook.
plan(sequential)

In [None]:
# IDs (agg_id) of all rows in the exclusion CSV whose
# `tiles_to_exclude_from_interface_analysis` field is non-empty.
tiles_to_omit <- pull(
    filter(
        read.csv('../Tessera tiles/Tessera processed results/tiles_to_exclude_from_interface_analysis.csv'),
        tiles_to_exclude_from_interface_analysis != ''
    ),
    agg_id
)
length(tiles_to_omit)
head(tiles_to_omit)

In [None]:
# Peek at the first signed distances.
head(distances$dist_interface_signed)

In [None]:
# --- Clean and relabel the per-cell distance table ---
# Drop cells in excluded tiles, then derive the plotting columns:
#   measurement_bin           distance bin as a factor, levels ordered by the
#                             mean signed distance of their cells
#   dist                      alias of the signed distance
#   closest_interface_subtype the original interface type label
#   closest_interface_type    overwritten with the MMRstatus column
distances <- distances %>%
    filter(!(agg_id %in% tiles_to_omit)) %>%
    mutate(
        measurement_bin = fct_reorder(factor(dist_bin), dist_interface_signed, .fun = mean),
        dist = dist_interface_signed,
        closest_interface_subtype = closest_interface_type,
        closest_interface_type = MMRstatus
    )

# Rows whose level-2 type is 'Mast' get 'Mast' as their level-1 type as well.
distances$type_lvl1[distances$type_lvl2 == 'Mast'] <- 'Mast'

# --- Inspect the result ---

glimpse(distances)

# Plot an example

In [None]:
# Draw the interface geometries of a single example sample.
# NOTE(review): this reassigns `interfaces`, which earlier held the list of
# per-sample objects; re-running the distance cell afterwards would need that
# list to be rebuilt first.
sample_of_interest <- 'G4423'
interfaces <- readr::read_rds(
    file = glue::glue(
        '../Tessera tiles/Spatial objects for tumor-stromal interfaces in all MERFISH samples/MSI_',
        sample_of_interest,
        '_tumor_stromal_interfaces.rds'
    )
)
fig.size(10, 10, 500)
ggplot(interfaces) + geom_sf()

## Distribution of cells around the tumor-stromal interface

In [None]:
# Work on an explicit copy: plain assignment of a data.table only creates a
# second name for the same object, so the `:=` updates below could otherwise
# add columns to `distances` as well. (This duplicates the table in memory.)
distances_by_status <- data.table::setDT(data.table::copy(distances))

# Parse the bin labels, e.g. '(lower,upper]' or '[lower,upper]', into numeric
# bounds: everything up to the comma plus the closing bracket is stripped for
# the upper bound; the opening bracket and everything from the comma on is
# stripped for the lower bound.
distances_by_status[, `:=`(
  upper_bound = as.numeric(gsub(measurement_bin, pattern = ".*,|)|]", replacement = "")),
  lower_bound = as.numeric(gsub(measurement_bin, pattern = "\\[|\\(|\\)|,.*", replacement = ""))
)]

# Bin midpoint, and the same value under the name used on the plot x-axes.
distances_by_status[, midpoint := (upper_bound + lower_bound) / 2]
distances_by_status[, bin_numeric := midpoint]
head(distances_by_status)

In [None]:
# Is any signed distance missing, and if so how many?
any(is.na(distances_by_status$dist))
sum(is.na(distances_by_status$dist))

In [None]:
# Number of missing distances per sample.
summarize(
    group_by(distances_by_status, SampleID),
    n = sum(is.na(dist))
)

#### Total cell distribution relative to epithelial-stromal interfaces in MSI and MSS samples

In [None]:
# Re-read the tile exclusion list: agg_id of every row in the CSV whose
# `tiles_to_exclude_from_interface_analysis` field is non-empty.
tiles_to_omit <- pull(
    filter(
        read.csv('../Tessera tiles/Tessera processed results/tiles_to_exclude_from_interface_analysis.csv'),
        tiles_to_exclude_from_interface_analysis != ''
    ),
    agg_id
)
length(tiles_to_omit)
head(tiles_to_omit)

In [None]:
# Canvas for this figure.
fig.size(height = 3, width = 7, res = 500)

# --- Per-patient total cell counts per distance bin ---

# One row per (bin, interface type, patient) with the number of cells in it,
# counted over all cell types.
temp <- distances_by_status %>%
  # Exclude cells that belong to tiles on the exclusion list.
  filter(!(agg_id %in% tiles_to_omit)) %>%
  # Drop rows with an NA in any column.
  na.omit() %>%
  group_by(measurement_bin, closest_interface_type, PatientID) %>%
  mutate(total_cells = n()) %>%
  ungroup() %>%
  # Keep only the plotting columns and collapse to one row per group.
  select(measurement_bin, closest_interface_type, PatientID, total_cells, bin_numeric) %>%
  distinct()

# --- Plot 'p2': faint per-patient curves plus the across-patient median ---

p2 <- ggplot(
  data = distinct(
    select(temp, measurement_bin, closest_interface_type, PatientID, bin_numeric, total_cells)
  )
) +
  # Dotted reference lines at +/-100 on the distance axis.
  geom_vline(xintercept = 100, color = 'black', linetype = 'dotted') +
  geom_vline(xintercept = -100, color = 'black', linetype = 'dotted') +
  # One semi-transparent line per patient.
  geom_line(
    aes(x = bin_numeric, y = total_cells, color = closest_interface_type, group = PatientID),
    alpha = 0.25
  ) +
  # Solid line: median across patients per bin and interface type.
  geom_line(
    data = temp %>%
      select(measurement_bin, closest_interface_type, PatientID, bin_numeric, total_cells) %>%
      distinct() %>%
      group_by(measurement_bin, closest_interface_type, bin_numeric) %>%
      summarize(total_cells = median(total_cells)) %>%
      na.omit(),
    aes(
      x = bin_numeric, y = total_cells,
      color = closest_interface_type, group = closest_interface_type
    )
  ) +
  # One panel per interface type, each with free axes.
  facet_wrap(~closest_interface_type, scales = 'free', nrow = 1) +
  scale_x_continuous(n.breaks = 6) +
  scale_color_manual(
    name = "Interface type:",
    values = c("MMRd" = 'red', "MMRp" = 'blue')
  ) +
  labs(
    x = 'Microns from interface',
    y = 'Total number of cells',
    title = 'Total cell distribution relative to epithelial-stromal interfaces in MSI and MSS samples'
  ) +
  # Base theme first, then the local overrides.
  cowplot::theme_cowplot(7) +
  theme(
    legend.position = 'top',
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5),
    plot.title = element_text(size = 7)
  )

# Display the plot.
p2

#### Cell Lineage Distribution Near the Tumor Interface in MSI and MSS 

In [None]:
# First rows of the annotated distance table.
distances_by_status %>% head()

In [None]:
# Distinct values of the exclusion flag that remain in the table.
distances_by_status$tiles_to_exclude_from_interface_analysis %>% unique()

In [None]:
# Canvas for this figure.
fig.size(height = 3, width = 7, res = 500)

# --- Per-patient cell counts per distance bin and lineage ---

# One row per (bin, interface type, patient, level-1 lineage) with the number
# of cells in it.
temp <- distances_by_status %>%
  # Exclude cells that belong to tiles on the exclusion list.
  filter(!(agg_id %in% tiles_to_omit)) %>%
  # Drop rows with an NA in any column.
  na.omit() %>%
  group_by(measurement_bin, closest_interface_type, PatientID, type_lvl1, bin_numeric) %>%
  mutate(total_cells = n()) %>%
  ungroup() %>%
  # Keep only the plotting columns and collapse to one row per group.
  select(measurement_bin, closest_interface_type, PatientID, total_cells, bin_numeric, type_lvl1) %>%
  distinct()
dim(temp)

# --- Plot 'p1': faint per-patient curves plus the across-patient median ---

p1 <- ggplot(
  data = distinct(
    select(temp, measurement_bin, closest_interface_type, PatientID, bin_numeric, total_cells, type_lvl1)
  )
) +
  # Dotted reference lines at +/-100 on the distance axis.
  geom_vline(xintercept = 100, color = 'black', linetype = 'dotted') +
  geom_vline(xintercept = -100, color = 'black', linetype = 'dotted') +
  # One semi-transparent line per patient.
  geom_line(
    aes(x = bin_numeric, y = total_cells, color = closest_interface_type, group = PatientID),
    alpha = 0.25
  ) +
  # Solid line: median across patients per bin, interface type and lineage.
  geom_line(
    data = temp %>%
      select(measurement_bin, closest_interface_type, PatientID, bin_numeric, total_cells, type_lvl1) %>%
      distinct() %>%
      group_by(measurement_bin, closest_interface_type, bin_numeric, type_lvl1) %>%
      summarize(total_cells = median(total_cells)) %>%
      na.omit(),
    aes(
      x = bin_numeric, y = total_cells,
      color = closest_interface_type, group = closest_interface_type
    )
  ) +
  # One panel per lineage x interface type combination, laid out in two rows.
  facet_wrap(type_lvl1~closest_interface_type, scales = 'free', nrow = 2) +
  scale_x_continuous(n.breaks = 6) +
  scale_color_manual(
    name = "Interface type:",
    values = c("MMRd" = 'red', "MMRp" = 'blue')
  ) +
  labs(
    x = 'Microns from interface',
    y = 'Total number of cells',
    title = 'Total cell distribution relative to epithelial-stromal interfaces in MSI and MSS samples'
  ) +
  # Base theme first, then the local overrides.
  cowplot::theme_cowplot(7) +
  theme(
    legend.position = 'top',
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5),
    plot.title = element_text(size = 7)
  )

# Display the plot.
p1

In [None]:
# Side-by-side panel: p2 takes one layout column, p1 three ('ABBB'); legends
# are collected and placed on top for every sub-plot.
fig.size(height = 3, width = 7)
require(patchwork)
(
  (p2 + ggtitle("")) +
    (p1 + labs(title = '', subtitle = '')) +
    plot_layout(design = 'ABBB', guides = 'collect')
) & theme(legend.position = 'top')

In [None]:
# Write p2 (with its title blanked) to a 3 x 3 inch PDF in the working
# directory. The plot expression is left as a bare top-level statement so the
# notebook's auto-printing draws it while the pdf device is open.
pdf('distribution_of_cells_around_MMRd_MMRp_interfaces.pdf', width = 3, height = 3)
p2 + ggtitle("")
dev.off()

In [None]:
# Write p1 (with title and subtitle blanked) to a 4 x 3 inch PDF in the working
# directory. The plot expression is left as a bare top-level statement so the
# notebook's auto-printing draws it while the pdf device is open.
pdf('distribution_of_lineages_around_MMRd_MMRp_interfaces.pdf', width = 4, height = 3)
p1 + labs(title = '', subtitle = '')
dev.off()

### Plot the number of cells assigned to the hub around hub+ and hub- interfaces

In [None]:
# Rebuild `distances` for the hub analysis: keep cells within 100 of an
# interface, relabel `closest_interface_type` from the interface subtype
# (Hub+ / Hub-; every other subtype is 'Heterotypic'), and drop the
# heterotypic interfaces.
distances <- distances_by_status %>%
    filter(abs(dist) <= 100) %>%
    mutate(
        closest_interface_type = case_when(
            closest_interface_subtype == 'CXCLpos tumor & CXCLpos stroma' ~ 'Hub+',
            closest_interface_subtype == 'CXCLneg tumor & CXCLneg stroma' ~ 'Hub-',
            .default = 'Heterotypic'
        )
    ) %>%
    filter(closest_interface_type != 'Heterotypic')
head(distances)

In [None]:
# Interface labels that remain after filtering.
unique(distances$closest_interface_type)

In [None]:
fig.size(height = 3, width = 7, res = 500)

# Cells from samples with Status 'MSI' that lie within 100 of an interface.
temp <- distances %>%
  filter(abs(dist) <= 100) %>%
  filter(Status == 'MSI')

# --- Axis label preparation ---

# Bin labels in factor-level order.
all_x_labels <- sort(unique(temp$measurement_bin))

# Pick 6 evenly spaced bin labels to print on the x-axis, so the axis stays
# readable while still spanning the full range.
x_axis_breaks <- all_x_labels[
  round(seq(from = 1, to = length(all_x_labels), length.out = 6))
]

# --- Plot 'p3': per-bin proportion of cells by `cxcl_pos_tile` ---
# Panels are interface type x level-1 lineage, in two rows with shared axes.

p3 <- ggplot(temp) +
  geom_bar(
    aes(x = measurement_bin, fill = cxcl_pos_tile),
    alpha = 1, position = 'fill', width = 1
  ) +
  # Only the selected bin labels are shown.
  scale_x_discrete(breaks = x_axis_breaks) +
  facet_wrap(closest_interface_type ~ type_lvl1, scales = 'fixed', nrow = 2) +
  scale_color_tableau(palette = 'Classic Blue-Red 6', name = 'Which type of tiles are cells assigned to?') +
  scale_fill_tableau(palette = 'Classic Blue-Red 6', name = 'Which type of tiles are cells assigned to?') +
  labs(
    x = 'Spatial bins',
    y = 'Proportion of cells',
    title = ''
  ) +
  # Base theme first, then the local overrides.
  cowplot::theme_cowplot(7) +
  theme(
    legend.position = 'top',
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5),
    plot.title = element_text(size = 7)
  )

p3

In [None]:
fig.size(height = 3, width = 5, res = 500)

# Cells from MSI samples with |dist| <= 100.
temp <- distances %>%
  filter(abs(dist) <= 100, Status == 'MSI')

# --- X-axis breaks ---

# Sorted unique bins; sorting keeps the tick labels in axis order.
all_x_labels <- sort(unique(temp$measurement_bin))

# Thin the labels to six evenly spaced entries (first and last included) so
# the axis stays readable while still spanning the whole range.
x_axis_breaks <- all_x_labels[
  round(seq(from = 1, to = length(all_x_labels), length.out = 6))
]

# --- Plot ---

# Per bin, the proportion of cells by cxcl_pos_tile, pooled over lineages;
# one panel per interface type.
p4 <- ggplot(temp) +
  geom_bar(
    aes(x = as.factor(measurement_bin), fill = cxcl_pos_tile),
    position = 'fill',
    width = 1,
    alpha = 1
  ) +
  facet_wrap(~closest_interface_type, nrow = 1, scales = 'fixed') +
  # Only the pre-selected bins get a tick label.
  scale_x_discrete(breaks = x_axis_breaks) +
  scale_fill_tableau(palette = 'Classic Blue-Red 6', name = 'Which type of tiles are cells assigned to?') +
  scale_color_tableau(palette = 'Classic Blue-Red 6', name = 'Which type of tiles are cells assigned to?') +
  labs(
    title = '',
    x = 'Spatial bins',
    y = 'Proportion of cells'
  ) +
  cowplot::theme_cowplot(7) +
  theme(
    plot.title = element_text(size = 7),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5),
    legend.position = 'top'
  ) +
  NULL

p4

In [None]:
# Combined figure: p4 fills the first slot ('A') and p3 the remaining three
# ('BBB'); legends are collected and placed on top.
fig.size(height = 3, width = 7)
(p4 + ggtitle("")) +
    (p3 + labs(title = '', subtitle = '')) +
    plot_layout(design = 'ABBB', guides = 'collect') &
    theme(legend.position = 'top')

In [None]:
# Write the combined p4 | p3 figure to a 7 x 3 inch PDF. The plot is drawn by
# the top-level auto-print while the PDF device is open.
pdf(file = "cells_in_hub_tiles_around_interface.pdf", width = 7, height = 3)
(p4 + ggtitle("")) +
    (p3 + labs(title = '', subtitle = '')) +
    plot_layout(design = 'ABBB', guides = 'collect') &
    theme(legend.position = 'top')
dev.off()

# TO BE ADDED: CODE FOR VISUALIZING INTERFACES

In [None]:
selected_sample = 'C110'

# Choose the aggregate to centre the view on: the row of top_hub_aggregates
# for this sample that ranks first by nTcd8_CXCL13.
roi = top_hub_aggregates %>% 
    filter(SampleID == selected_sample) %>%
    slice_max(n = 1, order_by = nTcd8_CXCL13) %>%
    pull(agg_id) 
# slice_max() keeps ties by default, so take the first id explicitly.
roi = roi[[1]]
roi

# Look up that aggregate's geometry in agg_metadata.
central_agg = agg_metadata %>%
    filter(id == roi) %>%
    st_as_sf()
central_agg %>% as.data.frame

# Bounding box of the central aggregate ...
bbox_central_agg = central_agg %>%
    st_bbox()
bbox_central_agg

# ... padded by `expand` on every side and stored as a plain named vector.
expand = 20
expanded_bbox =  c(bbox_central_agg$xmin - expand, 
                   bbox_central_agg$xmax + expand, 
                   bbox_central_agg$ymin - expand, 
                   bbox_central_agg$ymax + expand) %>%
    as.vector()
names(expanded_bbox) = c('xmin', 'xmax', 'ymin', 'ymax')
expanded_bbox

# Aggregates of this sample, cropped to the padded box.
aggs_in_expanded_region = agg_metadata %>%
    filter(SampleID == selected_sample) %>%
    st_as_sf() %>%
    st_crop(., expanded_bbox) 

# Cells of this sample (as points at X/Y) that fall on those cropped aggregates.
cells_in_expanded_region = tile_metadata %>%
    filter(SampleID == selected_sample) %>%
    st_as_sf(coords = c('X', 'Y'), remove = FALSE) %>%
    st_filter(., aggs_in_expanded_region) 

# Interfaces = geometric intersection between leiden_0.1 == 2 and
# leiden_0.1 == 1 aggregates.
interfaces = st_intersection(
        aggs_in_expanded_region %>% filter(leiden_0.1 == 2) %>% pull(shape), 
        aggs_in_expanded_region %>% filter(leiden_0.1 == 1) %>% pull(shape))

# Spatial bins: buffer the combined interfaces at radii 5, 10, ..., 100
# (radius 0 is dropped), largest first, cropped to the displayed aggregates.
# An earlier per-interface version of this computation was removed: its result
# was overwritten here before ever being used.
boundary_buffer = lapply(seq(0, 100, by = 5), 
    function(radius, interfaces){
    buffered = interfaces %>%
        st_as_sf() %>%
        #st_cast('MULTILINESTRING') %>%
        #st_cast('MULTIPOLYGON') %>% 
        #st_cast('POLYGON') %>% 
        st_combine %>%
        st_buffer(., dist = radius, singleSide = FALSE) 
    ring = st_sf(geometry = st_sfc(buffered)) %>% 
         mutate("radius" = radius) %>% 
         as.data.frame()
    return(ring)
    }, 
interfaces = interfaces) %>%
    rbindlist(ignore.attr = TRUE) %>%
    filter(radius != 0) %>% 
    st_as_sf %>% 
    arrange(desc(radius)) %>%
    st_crop(., aggs_in_expanded_region)    
    

require(ggnewscale)
tessera_palette = c('Epithelial-enriched' = '#A97F2F', 'Stromal-enriched' = '#17527D')
selected_states_palette = c('Tcd8-CXCL13' = 'magenta', 'Tcd8-HOBIT' = 'darkgreen', 'Myeloid-ISG' = 'brown', 'Tcd4-Treg' = 'orange')
borders_palette = c('Spatial bins' = 'black', 'MMRd interface' = 'red', 'MMRp interface' = 'blue')
# Layers, bottom to top: aggregates, all cells by lineage, selected states as
# large '*', the interface (buffered by 1 so it is visible), spatial bins.
# new_scale_*() lets each group of layers use its own colour/fill scale.
ggplot() +
    geom_sf(data = aggs_in_expanded_region, aes(fill = tessera_annotation), color = 'white', alpha = 1) +
    scale_fill_manual(name = 'Tessera_annotations', values = tessera_palette) +
    #scale_color_manual(name = 'Tessera_annotations', values = tessera_palette) +
    new_scale_fill() +
    new_scale_color() +
    geom_sf(data = cells_in_expanded_region, aes(color = type_lvl1)) +
    scale_color_manual(name = 'Lineage', values = lineage_palette) +
    new_scale_color() +
    geom_sf(data = cells_in_expanded_region %>% filter(type_lvl3 %in% c('Tcd8-HOBIT', 'Tcd8-CXCL13', 'Tcd4-Treg', 'Myeloid-ISG')), 
            aes(color = type_lvl3), shape = '*', size = 12) +
    scale_color_manual(name = 'Selected states', values = selected_states_palette) + 
    new_scale_color() +
    new_scale_fill() +
    geom_sf(data = interfaces %>% st_buffer(dist = 1) %>% st_crop(., aggs_in_expanded_region) %>% st_as_sf %>% mutate(type = 'MMRd interface'), aes(color = type), fill = 'red') +
    geom_sf(data = boundary_buffer %>% mutate(type = 'Spatial bins'), aes(color = type), fill = NA) +
    scale_color_manual(name = 'Lines', values = borders_palette) + 
    #scale_fill_manual(name = 'Lines', values = borders_palette) + 
    guides(fill = guide_legend(override.aes = list(shape = 16)), 
            color = guide_legend(override.aes = list(shape = 16))) +
    theme_minimal() +
    ggtitle(roi) +
    NULL
print(dim(cells_in_expanded_region))

In [None]:
# # find cells in this aggregate from the tile_metadata table, which contains cell level metadata
# cells_in_central_agg = tile_metadata %>%
#     filter(agg_id == roi) 

# # find cells in aggregates near this central aggregate from the tile_metadata table, which contains cell level metadata
# bbox_central_agg = cells_in_central_agg %>%
#     st_as_sf(coords = c('X', 'Y')) %>%
#     st_bbox()
# bbox_central_agg

# expand = 20
# expanded_bbox =  c(bbox_central_agg$xmin - expand, 
#                    bbox_central_agg$xmax + expand, 
#                    bbox_central_agg$ymin - expand, 
#                    bbox_central_agg$ymax + expand) %>%
#     as.vector()
# names(expanded_bbox) = c('xmin', 'xmax', 'ymin', 'ymax')
# expanded_bbox

# cells_in_expanded_region = tile_metadata %>%
#     filter(SampleID == selected_sample) %>%
#     filter(X < expanded_bbox[['xmax']] & X > expanded_bbox[['xmin']] & Y < expanded_bbox[['ymax']] & Y > expanded_bbox[['ymin']]) 
# cells_in_expanded_region %>% glimpse

# # get tiles into which these cells fall
# aggs_in_expanded_region = agg_metadata %>%
#     filter(id %in% unique(cells_in_expanded_region$agg_id)) %>%
#     distinct() %>%
#     st_as_sf() %>%
#     st_make_valid()
# aggs_in_expanded_region %>% glimpse

# # Find cells which fall into these tiles
# cells_in_expanded_region = tile_metadata %>%
#     filter(SampleID == selected_sample) %>%
#     filter(agg_id %in% unique(aggs_in_expanded_region$id)) 
# cells_in_expanded_region %>% glimpse

# interfaces = st_intersection(
#         aggs_in_expanded_region %>% filter(leiden_0.1 == 2) %>% pull(shape), 
#         aggs_in_expanded_region %>% filter(leiden_0.1 == 1) %>% pull(shape))

# fig.size(10,10,200)
# ggplot() +
#     geom_sf(data = aggs_in_expanded_region, aes(fill = tessera_annotation), color = 'white', alpha = 0.5) +
#     geom_sf(data = interfaces, color = 'red') + 
#     ggtitle(roi) +
#     geom_point(data = cells_in_expanded_region, aes(x = X, y = Y, color = type_lvl1), shape = 16) +
#     theme_minimal() +
#     NULL

### Functions for plotting interfaces

In [None]:
sample = 'C110'

# Label(s) to mark with '*'. Defined here so this cell runs on its own in a
# fresh session; the value mirrors the default of plotInterface()'s
# `highlight` argument.
highlight = 'Tcd8-CXCL13'

# Per-lineage plotting symbol and point size; highlighted labels get '*'.
lineage_shapes = c('Epi' = 'E', 'Strom' = 'S', 'Myeloid' = 'M', 'Plasma' = 'P', 'B' = 'B', 'TNKILC' = 'T', 'Other' = '.')
lineage_shapes[highlight] = '*'
size_palette = c('Epi' = 1, 'Strom' = 1, 'Myeloid' = 1, 'Plasma' = 1, 'B' = 1, 'TNKILC' = 1, 'Other' = 0.5)
size_palette[highlight] = 1

# Region colours. The CXCL- variants reuse the CXCL+ hue at 10% opacity.
tumor_palette =  c("CXCL+ Stroma" = "#17527D",
           "CXCL- Stroma" = scales::alpha("#17527D", 0.1),
           "CXCL+ Tumor" = "#A97F2F",
           "CXCL- Tumor" = scales::alpha("#A97F2F", 0.1),
           "CXCL+ Tumor-Stromal\ninterface" = 'red',
                  "All Tumor-Stromal\ninterface" = 'black',
                   
            "MSS\ninterface" = 'red',
            "Epithelial-enriched" = scales::alpha("#A97F2F", 1),
            "Stromal-enriched" = scales::alpha("#17527D", 1)
            ) #"#9c3848"

# Aggregates of the selected sample as an sf object, with a `type` label
# derived from leiden_0.1 (1 = tumor, 2 = stroma, 3 = granulocyte cap) and
# cxcl_pos_tile. Rows matching none of the cases get type = NA.
.metadata = agg_metadata %>% filter(SampleID == sample) %>% st_as_sf(sf_column_name = 'shape') %>%
    mutate(type = 
        case_when(
            leiden_0.1 == 1 & cxcl_pos_tile == 'CXCL_pos' ~ 'CXCL+ Tumor',
            leiden_0.1 == 1 & cxcl_pos_tile == 'CXCL_neg' ~ 'CXCL- Tumor',
            leiden_0.1 == 2 & cxcl_pos_tile == 'CXCL_pos' ~ 'CXCL+ Stroma', 
            leiden_0.1 == 2 & cxcl_pos_tile == 'CXCL_neg' ~ 'CXCL- Stroma', 
            leiden_0.1 == 3 ~ 'Granulocyte cap'
        )
) 


In [None]:
# Number of aggregates per leiden_0.1 cluster in the selected sample.
.metadata %>% pull(leiden_0.1) %>% table()

In [None]:
# The five aggregates of this sample holding the most cells whose type_lvl2
# matches 'Tcd8-CXCL13' (ties are kept).
tile_metadata %>%
    filter(SampleID == sample, grepl('Tcd8-CXCL13', type_lvl2)) %>%
    group_by(agg_id) %>%
    summarise(n = n()) %>%
    slice_max(n, n = 5)

In [None]:
# Focal aggregate: the one in this sample holding the most cells whose
# type_lvl2 matches 'Tcd8-CXCL13' (ties broken by taking the first).
index = tile_metadata %>%
    filter(SampleID == sample) %>%
    filter(grepl(type_lvl2, pattern = 'Tcd8-CXCL13')) %>%
    group_by(agg_id) %>%
    summarize(n = n()) %>%
    slice_max(order_by = n, n = 1, with_ties = FALSE) %>%
    pull(agg_id) 


index

bbox = st_bbox(.metadata %>% filter(id == index))


expand = 100

# Bounding box of the focal aggregate padded by `expand` on every side. Names
# are set explicitly so the lookups below cannot silently return NA.
bounds = c(xmin = bbox[['xmin']] - expand,
           xmax = bbox[['xmax']] + expand,
           ymin = bbox[['ymin']] - expand,
           ymax = bbox[['ymax']] + expand)

# Cells of this sample strictly inside the padded box.
# (Fixes a stray backtick in the 'ymax' key, which made the upper Y bound NA
# and therefore dropped every cell.)
.cells = tile_metadata %>%
    filter(SampleID == sample) %>%
    filter(X < bounds['xmax'] & X > bounds['xmin'] & Y < bounds['ymax'] & Y > bounds['ymin']) 

# Keep only aggregates containing at least one of those cells ...
.metadata = .metadata %>% filter(id %in% .cells$agg_id)

# ... then take every cell of those aggregates (including cells outside the
# box) and collapse type_lvl2 to a handful of states of interest plus 'Other'.
.cells = tile_metadata %>% 
    filter(agg_id %in% .metadata$id) %>%
    mutate(type_lvl3 = gsub(type_lvl2, pattern = '-prolif|high|low', replacement = '')) %>%
    mutate(type_lvl3 = gsub(type_lvl3, pattern = 'Epi.*', replacement = 'Epi')) %>%
    mutate(type_lvl2 = case_when(
        grepl(type_lvl3, pattern = 'Tcd8-CXCL13')~'Tcd8-CXCL13',
        grepl(type_lvl3, pattern = 'Myeloid-ISG')~'Myeloid-ISG',
        grepl(type_lvl2, pattern = 'Tcd4-Treg')~'Tcd4-Treg',
        grepl(type_lvl2, pattern = 'Tcd8-HOBIT')~'Tcd8-HOBIT',
        grepl(type_lvl2, pattern = 'Tcd8-gdlike')~'Tcd8-gdlike',
        #grepl(type_lvl2, pattern = 'Tcd4-IL7R')~'Tcd4-IL7R',
        #grepl(type_lvl3, pattern = '^B')~'B',
        .default = 'Other'
    ))
head(.cells)

In [None]:
# Geometries of the CXCL+ stromal aggregates (leiden_0.1 == 2) that contain cells.
.metadata %>%
    filter(id %in% .cells$agg_id, leiden_0.1 == 2, cxcl_pos_tile == 'CXCL_pos') %>%
    pull(shape)

In [None]:
# Which leiden_0.1 clusters are present among the retained aggregates.
unique(.metadata$leiden_0.1)

In [None]:
# Hub+ interface: where CXCL+ stromal aggregates (leiden_0.1 == 2) meet CXCL+
# tumor aggregates (leiden_0.1 == 1). The intersection is buffered by 1e-6 so
# it can be cast to polygons, then merged into a single feature.
.hub_pos_boundary = st_intersection(
        .metadata %>%
            filter(id %in% .cells$agg_id) %>%
            filter(leiden_0.1 == 2 & cxcl_pos_tile == 'CXCL_pos') %>%
            pull(shape),
        .metadata %>%
            filter(leiden_0.1 == 1 & cxcl_pos_tile == 'CXCL_pos') %>%
            pull(shape)
    ) %>%
    st_buffer(dist = 1e-6, singleSide = FALSE) %>%
    st_cast('MULTIPOLYGON') %>%
    st_cast('POLYGON') %>%
    st_combine() %>%
    st_as_sf() %>%
    mutate(type = 'Hub+ interface')
.hub_pos_boundary

In [None]:
# Build one class of tumor-stroma interface as a single-feature sf object.
#   tiles            sf of aggregates with id, leiden_0.1, cxcl_pos_tile, shape
#   cells            cell table; stromal tiles are limited to ids in cells$agg_id
#   stroma_state     cxcl_pos_tile value required of stromal tiles (leiden_0.1 == 2)
#   tumor_state      cxcl_pos_tile value required of tumor tiles (leiden_0.1 == 1)
#   interface_label  value written to the `type` column of the result
# The intersection is buffered by 1e-6 so it can be cast to polygons, then
# merged into one feature.
.make_interface = function(tiles, cells, stroma_state, tumor_state, interface_label){
    ids_with_cells = cells$agg_id
    stroma_shapes = tiles %>%
        filter(id %in% .env$ids_with_cells) %>%
        filter(leiden_0.1 == 2 & cxcl_pos_tile == .env$stroma_state) %>%
        pull(shape)
    tumor_shapes = tiles %>%
        filter(leiden_0.1 == 1 & cxcl_pos_tile == .env$tumor_state) %>%
        pull(shape)
    st_intersection(stroma_shapes, tumor_shapes) %>%
        st_buffer(x = ., dist = 1e-6, singleSide = FALSE)  %>% 
        st_cast('MULTIPOLYGON') %>% 
        st_cast('POLYGON') %>% 
        st_combine %>% 
        st_as_sf() %>% 
        mutate(type = .env$interface_label)
}

# Homotypic interfaces: both sides CXCL+ (Hub+) or both CXCL- (Hub-).
.hub_pos_boundary = .make_interface(.metadata, .cells,
    stroma_state = 'CXCL_pos', tumor_state = 'CXCL_pos', interface_label = 'Hub+ interface')

.hub_neg_boundary = .make_interface(.metadata, .cells,
    stroma_state = 'CXCL_neg', tumor_state = 'CXCL_neg', interface_label = 'Hub- interface')

# Heterotypic interfaces: the two sides differ in CXCL status.
.heterotypic_boundary1 = .make_interface(.metadata, .cells,
    stroma_state = 'CXCL_neg', tumor_state = 'CXCL_pos', interface_label = 'Heterotypic interface')

.heterotypic_boundary2 = .make_interface(.metadata, .cells,
    stroma_state = 'CXCL_pos', tumor_state = 'CXCL_neg', interface_label = 'Heterotypic interface')

# One row per interface class (four rows in total).
.shared_boundary = rbind(.hub_pos_boundary, .hub_neg_boundary, .heterotypic_boundary1, .heterotypic_boundary2)

.shared_boundary %>% dim

In [None]:
# Inspect the stacked interfaces as a plain data frame.
.shared_boundary %>% as.data.frame()

In [None]:
# Preview all interface polygons merged into a single geometry.
st_combine(
    st_cast(st_cast(.shared_boundary, 'MULTIPOLYGON'), 'POLYGON')
)

In [None]:
# Overview map: aggregates dissolved by region type, cells as faint grey dots,
# and the interface classes drawn on top (sharing the fill/colour scale).
p1 = ggplot(
        # One merged geometry per region type.
        .metadata %>%
            mutate(type = as.factor(type)) %>%
            group_by(type) %>%
            summarize() %>%
            ungroup()
    ) +
    geom_sf(aes(fill = type, color = type), show.legend = TRUE) +
    # scale_color_manual(name = 'Region', values = tumor_palette, drop = FALSE) +
    # scale_fill_manual(name = 'Region', values = tumor_palette, drop = FALSE) +
    # The constant colour below takes precedence over the mapped one.
    geom_point(
        data = .cells,
        aes(X, Y, color = type_lvl2),
        shape = '.',
        color = '#e9e9e8'
    ) +
    geom_sf(
        data = .shared_boundary,
        aes(fill = type, color = type),
        show.legend = FALSE
    ) +
    labs(x = 'X', y = 'Y') +
    cowplot::theme_half_open(7) +
    theme_void(base_size = 7) +
    theme(legend.position = 'right') +
    #guides(fill = guide_legend(nrow = 1, override.aes = list(shape = 16))) +
    NULL

p1

In [None]:
# Plotting symbol, point size and colour per collapsed cell state:
# the five states of interest are drawn as larger '*', everything else as '.'.
shape_palette = c(
    'Tcd8-CXCL13' = '*', 'Tcd4-Treg' = '*', 'Tcd8-HOBIT' = '*',
    'Tcd8-gdlike' = '*', 'Myeloid-ISG' = '*', 'Other' = '.'
)

size_palette = c(
    'Tcd8-CXCL13' = 2, 'Tcd4-Treg' = 2, 'Tcd8-HOBIT' = 2,
    'Tcd8-gdlike' = 2, 'Myeloid-ISG' = 2, 'Other' = 1
)

# First five 'Tableau 10' colours for the states, light grey for 'Other'.
type_palette = setNames(
    c(ggthemes::tableau_color_pal(palette = 'Tableau 10')(5), '#E9E9E8'),
    names(size_palette)
)

#type_palette = c('B' = '#4E79A7', 'Tcd8-CXCL13' = '#76B7B2', 'Myeloid-ISG' = '#E15759', 'Other' = '#E9E9E8')

# All interface polygons merged into one geometry ...
.shared_boundary_combined = st_combine(
    st_cast(st_cast(.shared_boundary, 'MULTIPOLYGON'), 'POLYGON')
)
# ... buffered at radii 0, 5, ..., 100. Each radius yields one row.
boundary_buffer = lapply(seq(0, 100, by = 5), function(r){
    st_buffer(x = .shared_boundary_combined, dist = r, singleSide = FALSE) %>%
        st_cast('MULTIPOLYGON') %>%
        st_cast('POLYGON') %>%
        st_combine() %>%
        st_as_sf() %>%
        mutate(radius = r, side = sign(r)) %>%
        as.data.frame()
}) %>% rbindlist()
# Drop the zero-radius row and order largest first.
boundary_buffer = boundary_buffer %>%
    filter(radius != 0) %>%
    st_as_sf() %>%
    arrange(desc(radius))

In [None]:
# Overview map with the distance bins: dissolved regions, buffer outlines in
# translucent green, cells as faint grey dots, and all interfaces under one label.
ggplot(
        # One merged geometry per region type.
        .metadata %>%
            mutate(type = as.factor(type)) %>%
            group_by(type) %>%
            summarize() %>%
            ungroup()
    ) +
    geom_sf(aes(fill = type, color = type), show.legend = TRUE) +
    #scale_color_manual(name = 'Region', values = tumor_palette, drop = FALSE) +
    #scale_fill_manual(name = 'Region', values = tumor_palette, drop = FALSE) +
    geom_sf(data = boundary_buffer, fill = NA, color = alpha('green', 0.25)) +
    # The constant colour below takes precedence over the mapped one.
    geom_point(
        data = .cells,
        aes(X, Y, color = type_lvl2),
        shape = '.',
        color = '#e9e9e8'
    ) +
    geom_sf(
        data = .shared_boundary %>%
            st_as_sf() %>%
            mutate(type = as.factor("All Tumor-Stromal\ninterface")),
        aes(fill = type, color = type),
        show.legend = FALSE
    ) +
    labs(x = 'X', y = 'Y') +
    cowplot::theme_half_open(7) +
    theme_void(base_size = 7) +
    theme(legend.position = 'right') +
    #guides(fill = guide_legend(nrow = 1, override.aes = list(shape = 16))) +
    NULL

In [None]:
require(ggnewscale)
# Zoomed map: region outlines on white, distance bins in translucent green,
# cells styled by collapsed state, and the interfaces in black on top.
p2 = ggplot(
        # One merged geometry per region type.
        .metadata %>%
            mutate(type = as.factor(type)) %>%
            group_by(type) %>%
            summarize() %>%
            ungroup()
    ) +
    geom_sf(fill = 'white', color = 'black') +
    geom_sf(data = boundary_buffer, fill = NA, color = alpha('green', 0.25)) +
    geom_point(
        data = .cells,
        aes(X, Y, color = type_lvl2, shape = type_lvl2, size = type_lvl2),
        show.legend = TRUE
    ) +
    # Only the colour scale gets a legend; shape and size legends are hidden.
    scale_color_manual(values = type_palette, name = 'Selected cell\nstates: ', na.value = '#e9e9e8', drop = FALSE) +
    scale_shape_manual(values = shape_palette, guide = 'none') +
    scale_size_manual(values = size_palette, guide = 'none') +
    # Start fresh colour/fill scales before the interface layer.
    new_scale_color() +
    new_scale_fill() +
    geom_sf(
        data = .shared_boundary %>%
            st_as_sf() %>%
            mutate(type = as.factor("All Tumor-Stromal\ninterface")),
        color = 'black',
        fill = 'black',
        show.legend = FALSE
    ) +
    theme_minimal(base_size = 7) +
    #labs(x = 'X', y = 'Y') +
    theme(legend.position = 'right') +
    #theme(axis.text = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
    guides(
        #shape = element_blank(),
        #size = element_blank(),
        color = guide_legend(ncol = 1, override.aes = list(size = 6, shape = 16))
    ) +
    scale_y_continuous(n.breaks = 4) +
    scale_x_continuous(n.breaks = 4) +
    NULL
p2

In [None]:

# Draw two zoomed maps of the CXCL+ tumor-stroma interface around one aggregate.
#
# Arguments:
#   index          id of the focal aggregate (matched against agg_metadata$id).
#   agg_metadata   aggregate-level table with SampleID, id, leiden_0.1,
#                  cxcl_pos_tile and a `shape` geometry column.
#   sample         SampleID used to subset both tables.
#   tile_metadata  cell-level table with SampleID, X, Y, agg_id and type_lvl2.
#   expand         padding added on every side of the focal aggregate's
#                  bounding box when selecting cells.
#   highlight      currently unused (it previously only altered vectors that
#                  were never read); kept so existing calls keep working.
#
# Returns list(p1, p2, bbox):
#   p1    regions coloured by type, cells as grey dots, interface on top;
#   p2    region outlines, buffer rings (5..50) around the interface, cells
#         styled by collapsed state, interface in red;
#   bbox  bounding box of the focal aggregate (before padding).
#
# Stops if `index` is not an aggregate of `sample`; warns if no cells fall
# inside the padded bounding box.
plotInterface = function(index, agg_metadata, sample, tile_metadata, expand = 100, highlight = 'Tcd8-CXCL13'){
    
    message(index)
    
    # Region colours. The CXCL- variants reuse the CXCL+ hue at 10% opacity.
    tumor_palette =  c("CXCL+ Stroma" = "#17527D",
                   "CXCL- Stroma" = scales::alpha("#17527D", 0.1),
                   "CXCL+ Tumor" = "#A97F2F",
                   "CXCL- Tumor" = scales::alpha("#A97F2F", 0.1),
                   "CXCL+ Tumor-Stromal\ninterface" = 'red',
                    "MSS\ninterface" = 'red',
                    "Tumor" = scales::alpha("#A97F2F", 1),
                    "Stroma" = scales::alpha("#17527D", 1)
                    ) #"#9c3848"
    
    # Aggregates of this sample with a region label; rows matching none of the
    # cases get type = NA.
    .metadata = agg_metadata %>% filter(SampleID == sample) %>% st_as_sf(sf_column_name = 'shape') %>%
        mutate(type = 
            case_when(
                leiden_0.1 == 1 & cxcl_pos_tile == 'CXCL_pos' ~ 'CXCL+ Tumor',
                leiden_0.1 == 1 & cxcl_pos_tile == 'CXCL_neg' ~ 'CXCL- Tumor',
                leiden_0.1 == 2 & cxcl_pos_tile == 'CXCL_pos' ~ 'CXCL+ Stroma', 
                leiden_0.1 == 2 & cxcl_pos_tile == 'CXCL_neg' ~ 'CXCL- Stroma', 
                leiden_0.1 == 3 ~ 'Granulocyte cap'
            )
    ) 
    
    # Fail early on an unknown aggregate: its bounding box would be all NA and
    # every later step would silently operate on nothing.
    .focal = .metadata %>% filter(id == index)
    if (nrow(.focal) == 0) {
        stop("Aggregate '", index, "' was not found among the aggregates of sample '",
             sample, "'.", call. = FALSE)
    }
    bbox = st_bbox(.focal)
    
    bounds = c(bbox$xmin - expand, bbox$xmax + expand, bbox$ymin - expand, bbox$ymax + expand)
    
    # Cells of this sample strictly inside the padded bounding box.
    .cells = tile_metadata %>%
        filter(SampleID == sample) %>%
        filter(X < bounds['xmax'] & X > bounds['xmin'] & Y < bounds['ymax'] & Y > bounds['ymin']) 
    
    if (nrow(.cells) == 0) {
        warning("No cells of sample '", sample, "' fall within the padded bounding box of '",
                index, "'; the plots will be empty.", call. = FALSE)
    }
    
    # Keep only aggregates containing at least one of those cells ...
    .metadata = .metadata %>% filter(id %in% .cells$agg_id)
    
    # ... then take every cell of those aggregates and collapse type_lvl2 to a
    # handful of states of interest plus 'Other'.
    .cells = tile_metadata %>% 
        filter(agg_id %in% .metadata$id) %>%
        mutate(type_lvl3 = gsub(type_lvl2, pattern = '-prolif|high|low', replacement = '')) %>%
        mutate(type_lvl3 = gsub(type_lvl3, pattern = 'Epi.*', replacement = 'Epi')) %>%
        mutate(type_lvl2 = case_when(
            grepl(type_lvl3, pattern = 'Tcd8-CXCL13')~'Tcd8-CXCL13',
            grepl(type_lvl3, pattern = 'Myeloid-ISG')~'Myeloid-ISG',
            grepl(type_lvl2, pattern = 'Tcd4-Treg')~'Tcd4-Treg',
            grepl(type_lvl2, pattern = 'Tcd8-HOBIT')~'Tcd8-HOBIT',
            grepl(type_lvl2, pattern = 'Tcd8-gdlike')~'Tcd8-gdlike',
            #grepl(type_lvl2, pattern = 'Tcd4-IL7R')~'Tcd4-IL7R',
            #grepl(type_lvl3, pattern = '^B')~'B',
            .default = 'Other'
        ))
    
    # Interface between CXCL+ stroma (leiden_0.1 == 2) and CXCL+ tumor
    # (leiden_0.1 == 1), buffered by 0.25 so it renders as a visible band and
    # merged into one feature.
    .shared_boundary = st_intersection(
        .metadata %>% filter(id %in% .cells$agg_id) %>% filter(leiden_0.1 == 2 & cxcl_pos_tile == 'CXCL_pos') %>% pull(shape), 
        .metadata %>% filter(leiden_0.1 == 1 & cxcl_pos_tile == 'CXCL_pos') %>% pull(shape))

    .shared_boundary = .shared_boundary %>%
        st_buffer(x = ., dist = 0.25, singleSide = FALSE)  %>% 
        st_cast('MULTIPOLYGON') %>% 
        st_cast('POLYGON') %>% 
        st_combine %>% 
        st_as_sf()
    
    # Labelled copy of the interface, shared by both plots.
    .interface_layer = .shared_boundary %>% 
        st_as_sf() %>% 
        mutate(type = as.factor("CXCL+ Tumor-Stromal\ninterface"))
    
    # One merged geometry per region type, shared by both plots.
    .regions = .metadata %>%
        mutate(type = as.factor(type)) %>%
        group_by(type) %>%
        summarize %>%
        ungroup
    
    p1 = ggplot(.regions) +
        geom_sf(aes(fill = type, 
                    color = type), 
                show.legend = TRUE) +   
        scale_color_manual(name = 'Region', values = tumor_palette, drop = FALSE) +
        scale_fill_manual(name = 'Region', values = tumor_palette, drop = FALSE) +
        cowplot::theme_half_open(7) +
        # The constant colour below takes precedence over the mapped one.
        geom_point(data = .cells,
                   aes(X, 
                       Y, 
                       color = type_lvl2),
                  shape = '.',
                  color = '#e9e9e8'
                  ) +
        labs(x = 'X', y = 'Y') +
        geom_sf(data = .interface_layer, 
                aes(fill = type, 
                    color = type), 
                show.legend = FALSE) + 
        theme_void(base_size = 7) +
        theme(legend.position = 'right') +
        guides(fill = guide_legend(nrow = 1, override.aes = list(shape = 16))) +
        NULL
    
    # Symbol, size and colour per collapsed cell state.
    shape_palette = c('*', '*', '*', '*', '*',  '.')
    names(shape_palette) = c('Tcd8-CXCL13', 'Tcd4-Treg', 'Tcd8-HOBIT', 'Tcd8-gdlike', 'Myeloid-ISG', 'Other')

    size_palette = c(2, 2, 2, 2, 2, 1)
    names(size_palette) = c('Tcd8-CXCL13', 'Tcd4-Treg', 'Tcd8-HOBIT', 'Tcd8-gdlike', 'Myeloid-ISG', 'Other')

    type_palette = c(ggthemes::tableau_color_pal(palette = 'Tableau 10')(5), '#E9E9E8')
    names(type_palette) = names(size_palette)
    
    #type_palette = c('B' = '#4E79A7', 'Tcd8-CXCL13' = '#76B7B2', 'Myeloid-ISG' = '#E15759', 'Other' = '#E9E9E8')

    # Buffer rings around the interface at radii 5, 10, ..., 50 (radius 0 is
    # dropped), largest first.
    boundary_buffer = lapply(c(seq(0, 50, by = 5)), function(r){
        ring = st_buffer(x = .shared_boundary[1], dist = r, singleSide = FALSE)  %>% st_cast('MULTIPOLYGON') %>% st_cast('POLYGON') %>% st_combine %>% st_as_sf %>% mutate(radius = r, side = sign(r)) %>% as.data.frame 
        return(ring)
    }) %>% rbindlist()
    boundary_buffer = boundary_buffer %>% filter(radius != 0) %>% st_as_sf %>% arrange(desc(radius))

    p2 = ggplot(.regions) +
        geom_sf(color = 'black', fill = 'white') +   
        geom_sf(data = boundary_buffer, color = alpha('green', 0.25), fill = NA) +
        geom_point(data = .cells,
                   aes(X, 
                       Y, 
                       color = type_lvl2,
                        shape = type_lvl2,
                       size = type_lvl2
                      ),
                  show.legend = TRUE
                  ) +
            geom_sf(data = .interface_layer,
                color = 'red',
                fill = 'red',
                show.legend = FALSE) + 
        scale_color_manual(values = type_palette, name = 'Selected cell\nstates: ', na.value = '#e9e9e8', drop = FALSE) +
        scale_shape_manual(values = shape_palette, guide = 'none') + 
        scale_size_manual(values = size_palette, guide = 'none') +
        theme_void(base_size = 7) +
        #labs(x = 'X', y = 'Y') +
        theme(legend.position = 'right') +
        #theme(axis.text = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
        guides(#shape = element_blank(),
               #size = element_blank(), 
               color = guide_legend(ncol = 1, override.aes = list(size = 6, shape = 16))) +
        scale_y_continuous(n.breaks = 4) +
        scale_x_continuous(n.breaks = 4) +
        NULL 
    
    return(list(p1 , p2, bbox))
}
fig.size(3,3.5,500)
index = 'G4595_9774'
sample = 'G4595'
message(index)
expand = 100
highlight= TRUE

# Zoom on aggregate G4595_9774 with a 200-unit margin.
# The full cell table is passed here: the `.cells` object built in the cells
# above holds only the C110 subset, so filtering it for sample G4595 inside
# plotInterface() would leave no cells to plot.
agg_id = "G4595_9774"
.zoomed_interface = plotInterface(agg_id, 
    agg_metadata = agg_metadata, 
    sample = 'G4595', 
    tile_metadata = tile_metadata, 
    highlight = 'Tcd8-CXCL13|Myeloid-ISG|Tcd8-HOBIT|Tcd8-CXCL13-prolif|',
    expand = 200)
