# Spatial Analysis of Cell Proximity to Interfaces

This notebook performs a comprehensive spatial analysis to determine the distribution of specific cell types relative to tumor-stromal interfaces. It calculates distances from cells to interfaces, bins them, and applies statistical smoothing to generate robust prevalence estimates.

### Workflow Overview:
1.  **Setup**: Load necessary R libraries for spatial analysis (`sf`, `geos`), parallel processing (`furrr`), and plotting (`ggplot2`).
2.  **Data Ingestion**: Load single-cell metadata and pre-computed interface geometries.
3.  **Spatial Calculation**: Determine the signed distance of every cell to the nearest interface (Negative = Stroma, Positive = Tumor).
4.  **Statistical Modeling**: Use Empirical Bayes shrinkage to stabilize proportion estimates in spatial bins with few cells.
5.  **Meta-Analysis**: Aggregate results across multiple patient samples using adaptive shrinkage (`ashr`) followed by inverse-variance weighting of the shrunken per-sample estimates.
6.  **Visualization**: Generate line traces and summary statistics.

### 1. Library Loading and Configuration
Here we load the required packages. `plan(multicore)` sets up parallel processing to speed up the heavy geometric calculations later. The `fig.size` function is a helper to control plot dimensions in the notebook.

In [None]:
suppressPackageStartupMessages({
    library(data.table)
    library(sf)
    library(purrr)
    library(ggplot2)
    library(ggthemes)
    library(geos)
    library(glue)
    library(furrr)
    library(future)
    library(dplyr)
    library(patchwork)
})

# Configure parallel processing plan
# "multicore" uses forking (efficient on Linux/Mac). Per the `future` documentation,
# where forking is not supported (e.g. Windows) it falls back to sequential processing,
# not to "multisession"; call plan(multisession) explicitly if parallelism is needed there.
plan(multicore)

#' @title Set Plot Dimensions
#' @description Sets the `repr.plot.height` and `repr.plot.width` options, which control
#'   the size of plots rendered in the Jupyter notebook.
#' @param h Numeric. The height of the plot in inches.
#' @param w Numeric. The width of the plot in inches.
#' @return The previous values of the two options, invisibly (as returned by `options()`).
fig.size <- function(h, w) {
    plot_dims <- list(repr.plot.height = h, repr.plot.width = w)
    options(plot_dims)
}

### 2. Utility Functions
`find_midpoint` is a utility to parse the string representation of a bin interval (e.g., `"(5, 10]"`) and return its numeric midpoint (e.g., `7.5`), which is essential for plotting the x-axis.

In [None]:
#' Numeric midpoint of interval labels such as "(5,10]"
#'
#' Vectorised: accepts one or many interval labels (character or factor) and
#' returns one midpoint per label. A single label still yields a single number.
#'
#' @param interval_string Character (or factor) vector of interval labels as
#'   produced by `cut()`, e.g. `"(5,10]"` or `"[-100,-95]"`.
#' @return Numeric vector, the same length as `interval_string`, holding the
#'   mean of the bounds of each interval (`NA` for `NA` input).
find_midpoint <- function(interval_string) {
  # 1. Strip the interval delimiters. The pattern is an alternation of the four
  #    escaped literals "(", "[", ")" and "]".
  cleaned_string <- gsub("\\(|\\[|\\)|\\]", "", as.character(interval_string))

  # 2. Split every label on the comma; strsplit returns one character vector
  #    of bounds per input element.
  bounds <- strsplit(cleaned_string, ",", fixed = TRUE)

  # 3. Convert each pair of bounds to numbers and average them.
  vapply(bounds, function(b) mean(as.numeric(b)), numeric(1), USE.NAMES = FALSE)
}

# Read the tile-level metadata table and preview its first rows to verify structure.
# (An earlier snapshot of this file was 'tile_metadata_2025-03-27.rds'.)
tile_metadata <- readr::read_rds('../Tessera tiles/Tessera processed results/tile_metadata_2025-07-22.rds')
head(tile_metadata)

### 3. Core Distance Calculation Function (Version 2)
This function calculates the Euclidean distance from every cell to the nearest tumor/stroma interface. 

**Key Steps:**
1.  **Geometry Conversion**: Converts cell X/Y coordinates into GEOS points for fast spatial querying.
2.  **Nearest Neighbor**: Finds the specific interface line segment closest to each cell.
3.  **Signed Distance**: If a cell is in the Stroma, the distance is made negative; if in the Tumor/Epithelium, it remains positive.
4.  **Binning**: Cells are grouped into 5µm bins ranging from -100µm to +100µm.
5.  **Aggregation**: The function returns a list of matrices (one for each interface type) containing cell counts per bin.

In [None]:
#' Count cells per signed-distance bin around their nearest interface (unfiltered variant)
#'
#' @param cells data.table with (at least) columns `X`, `Y`, `tessera_annotation`
#'   and `type_lvl3`. It is indexed with data.table syntax, so a plain data.frame will not work.
#' @param interfaces Object with a geometry column `x` and a column `Type_of_Interface`,
#'   one row per interface geometry.
#' @return A list named by nearest-interface type. Each element is a count matrix with one
#'   row per distance bin (row names are the `cut()` labels) and one column per `type_lvl3`
#'   value. Returns an empty `list()` (with a warning) when no cell falls inside the binned range.
summarize_cells_by_interface_proximity_2 = function(cells, interfaces) {
    # Unlike summarize_cells_by_interface_proximity, no interface-type filter is applied here
    # (the filter further down is commented out), so cells near every interface type are kept.
    
    ## Get distances and closest interface type
    pts = st_as_sf(cells[, .(X, Y)], coords = c('X', 'Y'))
    geos_pts = geos::as_geos_geometry(pts$geometry)
    # NOTE(review): `1:nrow(interfaces)` selects every row when nrow > 0; with zero rows it
    # would evaluate to c(1, 0) — confirm that empty interface tables never reach this function.
    geos_lines = geos::as_geos_geometry(interfaces$x[1:nrow(interfaces)])
    
    # Index (into geos_lines) of the interface geometry nearest to each cell
    nearest_interfaces_idx = geos::geos_nearest(geos_pts, geos_lines)
    
    cells$closest_interface_type = interfaces$Type_of_Interface[nearest_interfaces_idx]
    
    # Unsigned distance from each cell to its nearest interface geometry
    cells$dist_interface = geos::geos_distance(geos_pts, geos_lines[nearest_interfaces_idx])
    
    ## Assign sign to distances
    # Cells annotated 'Stromal-enriched' get a negative distance; every other annotation stays positive.
    cells$dist_interface_signed = fifelse(
        cells$tessera_annotation == 'Stromal-enriched',
        -cells$dist_interface,
        cells$dist_interface
    )
    
    ## Assign cells to 5um bins
    # Breaks every 5 units from -100 to 100; cells outside that range get an NA bin.
    # (Units are presumably µm, matching the warning text below — confirm against the coordinate system.)
    dist_breaks = seq(-100, 100, by = 5)
    cells$dist_bin = cut(cells$dist_interface_signed, breaks = dist_breaks, include.lowest = TRUE)

    # --- ROBUST SUMMARIZATION --

    # cells = cells %>% filter(
    #     (closest_interface_type == 'CXCLpos tumor & CXCLpos stroma' & cxcl_pos_tile == 'CXCL_pos') | (closest_interface_type == 'CXCLneg tumor & CXCLneg stroma' & cxcl_pos_tile == 'CXCL_neg')        
    # )
    
    # Drop cells that fell outside the binned range
    cells_in_range = cells[!is.na(dist_bin)]
    
    if (nrow(cells_in_range) == 0) {
        warning("No cells found within the -100 to 100µm distance range.")
        return(list())
    }

    # Use the interface types seen among ALL cells as factor levels, so that a type with no
    # in-range cells still appears in the output (see drop = FALSE below).
    all_interface_types = unique(cells$closest_interface_type)
    cells_in_range[, closest_interface_type := factor(closest_interface_type, levels = all_interface_types)]

    counts_long = cells_in_range[, .N, by = .(closest_interface_type, dist_bin, type_lvl3)]

    # Wide table: one row per (interface type, bin), one column per cell type.
    # drop = FALSE keeps combinations with no cells; fill = 0 gives them a zero count.
    counts_wide = dcast(counts_long,
                        closest_interface_type + dist_bin ~ type_lvl3,
                        value.var = "N",
                        fill = 0,
                        drop = FALSE)

    result_list = split(counts_wide, by = "closest_interface_type")

    # Convert each per-interface table into a numeric matrix with bins as row names
    result_list = lapply(result_list, function(dt) {
        row_names = dt$dist_bin
        count_cols = setdiff(names(dt), c("closest_interface_type", "dist_bin"))
        mat = as.matrix(dt[, ..count_cols])
        rownames(mat) = row_names
        return(mat)
    })

    return(result_list)
}


### 4. Alternative Summarization Function
This is a variation of the previous function. It includes a specific filtering step (currently active) that restricts analysis to cells where the local tile annotation (CXCL positive/negative) matches the interface type.

In [None]:
#' Count cells per signed-distance bin around their nearest interface (filtered variant)
#'
#' Keeps only cells whose nearest interface is 'CXCLpos tumor & CXCLpos stroma' and whose
#' `cxcl_pos_tile` is 'CXCL_pos', or whose nearest interface is
#' 'CXCLneg tumor & CXCLneg stroma' and whose `cxcl_pos_tile` is 'CXCL_neg'.
#'
#' @param cells data.table with (at least) columns `X`, `Y`, `tessera_annotation`,
#'   `type_lvl3` and `cxcl_pos_tile`.
#' @param interfaces Object with a geometry column `x` and a column `Type_of_Interface`,
#'   one row per interface geometry.
#' @return A list named by nearest-interface type. Each element is a count matrix with one
#'   row per distance bin (row names are the `cut()` labels) and one column per `type_lvl3`
#'   value. Returns an empty `list()` (with a warning) when no cell survives the filter and binning.
summarize_cells_by_interface_proximity = function(cells, interfaces) {
    ## Get distances and closest interface type
    pts = st_as_sf(cells[, .(X, Y)], coords = c('X', 'Y'))
    geos_pts = geos::as_geos_geometry(pts$geometry)
    # NOTE(review): `1:nrow(interfaces)` selects every row when nrow > 0; with zero rows it
    # would evaluate to c(1, 0) — confirm that empty interface tables never reach this function.
    geos_lines = geos::as_geos_geometry(interfaces$x[1:nrow(interfaces)])
    
    # Index (into geos_lines) of the interface geometry nearest to each cell
    nearest_interfaces_idx = geos::geos_nearest(geos_pts, geos_lines)
    
    cells$closest_interface_type = interfaces$Type_of_Interface[nearest_interfaces_idx]
    
    # Unsigned distance from each cell to its nearest interface geometry
    cells$dist_interface = geos::geos_distance(geos_pts, geos_lines[nearest_interfaces_idx])
    
    ## Assign sign to distances
    # Cells annotated 'Stromal-enriched' get a negative distance; every other annotation stays positive.
    cells$dist_interface_signed = fifelse(
        cells$tessera_annotation == 'Stromal-enriched',
        -cells$dist_interface,
        cells$dist_interface
    )
    
    ## Assign cells to 5um bins
    # Breaks every 5 units from -100 to 100; cells outside that range get an NA bin.
    dist_breaks = seq(-100, 100, by = 5)
    cells$dist_bin = cut(cells$dist_interface_signed, breaks = dist_breaks, include.lowest = TRUE)

    # --- ROBUST SUMMARIZATION --

    # Keep only cells whose tile CXCL status agrees with the type of their nearest interface
    cells = cells %>% filter(
        (closest_interface_type == 'CXCLpos tumor & CXCLpos stroma' & cxcl_pos_tile == 'CXCL_pos') | (closest_interface_type == 'CXCLneg tumor & CXCLneg stroma' & cxcl_pos_tile == 'CXCL_neg')        
    )
    
    # Drop cells that fell outside the binned range
    cells_in_range = cells[!is.na(dist_bin)]
    
    if (nrow(cells_in_range) == 0) {
        warning("No cells found within the -100 to 100µm distance range.")
        return(list())
    }

    # Levels are taken after the filter above, so at most the two retained interface types appear
    all_interface_types = unique(cells$closest_interface_type)
    cells_in_range[, closest_interface_type := factor(closest_interface_type, levels = all_interface_types)]

    counts_long = cells_in_range[, .N, by = .(closest_interface_type, dist_bin, type_lvl3)]

    # Wide table: one row per (interface type, bin), one column per cell type.
    # drop = FALSE keeps combinations with no cells; fill = 0 gives them a zero count.
    counts_wide = dcast(counts_long,
                        closest_interface_type + dist_bin ~ type_lvl3,
                        value.var = "N",
                        fill = 0,
                        drop = FALSE)

    result_list = split(counts_wide, by = "closest_interface_type")

    # Convert each per-interface table into a numeric matrix with bins as row names
    result_list = lapply(result_list, function(dt) {
        row_names = dt$dist_bin
        count_cols = setdiff(names(dt), c("closest_interface_type", "dist_bin"))
        mat = as.matrix(dt[, ..count_cols])
        rownames(mat) = row_names
        return(mat)
    })

    return(result_list)
}


### 5. Empirical Bayes Statistics
To handle the noise inherent in single-cell spatial data (especially in bins with few cells), we use an Empirical Bayes approach to estimate proportions.

* **`estimate_beta_prior`**: Estimates a Beta distribution prior from the overall data using the method of moments.
* **`empirical_bayes_summary`**: Updates the raw observed proportion (k/n) with the prior alpha and beta, resulting in a "shrunken" estimate that pulls extreme values from low-count bins towards the global mean.

In [None]:
#' Estimate a Beta prior from binned counts by the method of moments
#'
#' The between-bin variance of the observed proportions is corrected for binomial
#' sampling noise and matched to the variance of a Beta distribution.
#'
#' @param k Numeric vector of successes per bin.
#' @param n Numeric vector of trials per bin (same length as `k`); bins with `n == 0` are ignored.
#' @return A list with elements `alpha` and `beta`. When a proper prior cannot be estimated
#'   (fewer than two usable bins, no excess variance, or a non-positive implied prior
#'   strength) both are 0, so that `(k + alpha) / (n + alpha + beta)` reduces to `k / n`.
estimate_beta_prior <- function(k, n) {
    flat_prior <- list(alpha = 0, beta = 0)

    if (length(k) <= 1) {
        return(flat_prior)
    }

    valid_bins <- n > 0
    k_valid <- k[valid_bins]
    n_valid <- n[valid_bins]

    if (length(k_valid) <= 1) {
        return(flat_prior)
    }

    p_hat <- k_valid / n_valid
    mean_p <- mean(p_hat)
    var_p <- var(p_hat)
    mean_n <- mean(n_valid)

    # Observed variance minus the part expected from binomial sampling alone
    var_true <- var_p - mean_p * (1 - mean_p) / mean_n

    if (is.na(var_true) || var_true <= 0) {
        return(flat_prior)
    }

    # Implied prior strength (alpha + beta). The sample variance of proportions can exceed
    # mean_p * (1 - mean_p) (e.g. proportions 0 and 1), which makes nu negative and would
    # yield negative, invalid Beta parameters; fall back to the flat prior in that case.
    nu <- mean_p * (1 - mean_p) / var_true - 1
    if (!is.finite(nu) || nu <= 0) {
        return(flat_prior)
    }

    list(alpha = mean_p * nu, beta = (1 - mean_p) * nu)
}

#' Per-bin shrunken proportion estimates
#'
#' Estimates a Beta prior from all bins (`estimate_beta_prior`) and reports, for every bin,
#' the posterior mean and variance of the proportion under a Beta-binomial model.
#'
#' @param k Numeric vector of counts of the cell type(s) of interest per bin.
#' @param n Numeric vector of total counts per bin (same length as `k`).
#' @param bin_lvls Bin labels, in the order they should appear as factor levels.
#' @param model One of "mle", "binomial", "poisson". Only "binomial" is implemented; any
#'   other choice (including the default, "mle") stops with an error.
#' @return data.table with one row per bin: `dist_bin`, `model`, `count`, `size`, `estimate`,
#'   `variance`, `alpha`, `beta`, an upper-tail normal p-value `p` for estimate / sd,
#'   `padj` (`p.adjust` default method) and `asterisk` ("*" when `padj < 0.01`).
empirical_bayes_summary <- function(k, n, bin_lvls, model = c("mle", "binomial", "poisson")) {
    model <- match.arg(model)
    if (length(k) != length(n)) stop("Input lengths must match.")

    if (model != "binomial") {
       stop("Only binomial model is fully implemented in this notebook version.")
    }

    # Beta posterior: successes and failures each augmented by the prior pseudo-counts
    prior <- estimate_beta_prior(k, n)
    post_total <- n + prior$alpha + prior$beta
    post_mean <- (k + prior$alpha) / post_total
    post_var <- ((k + prior$alpha) * (n - k + prior$beta)) /
                (post_total^2 * (post_total + 1))

    out <- data.table(
        dist_bin = factor(bin_lvls, bin_lvls),
        model = model,
        count = k,
        size = n,
        estimate = post_mean,
        variance = post_var,
        alpha = prior$alpha,
        beta = prior$beta
    )

    # One-sided z-test of the estimate against zero, computed on the log scale
    out[, p := exp(pnorm(estimate / sqrt(variance), lower.tail = FALSE, log.p = TRUE))]
    out[, padj := p.adjust(p)]
    out[, asterisk := ifelse(padj < 0.01, "*", "")]

    return(out)
}

### 6. Meta-Analysis and Aggregation
These functions aggregate the statistical results across multiple samples (patients).
* **`meta_ashr`**: Uses the `ashr` package to perform adaptive shrinkage meta-analysis, combining estimates from multiple samples.
* **`get_stats`**: A high-level driver that computes stats for one condition, running the Empirical Bayes step per sample and then the Meta-Analysis across samples.
* **`t_test_and_lfc`**: Computes significance (Welch's t-test) and Log2 Fold Change between two conditions.

In [None]:
#' Combine per-sample estimates for one bin
#'
#' Shrinks the per-sample estimates with `ashr::ash` (normal mixture, "fdr" method) and
#' averages the posterior means with weights proportional to 1 / posterior variance.
#'
#' @param p_vec Numeric vector of per-sample estimates.
#' @param var_vec Numeric vector of their variances (same length).
#' @return One-row data.table with columns `estimate` and `variance`.
meta_ashr <- function(p_vec, var_vec) {
    fit <- ashr::ash(betahat = p_vec, sebetahat = sqrt(var_vec), method = "fdr", mixcompdist = 'normal')
    post_mean <- fit$result$PosteriorMean
    post_sd <- fit$result$PosteriorSD

    # Normalised inverse-variance weights; the 1e-8 guards against division by zero
    w <- prop.table(1 / (post_sd^2 + 1e-8))

    # NOTE(review): `variance` is the weighted mean of the posterior variances, not the
    # variance of the weighted mean (which would use w^2) — confirm this is intended.
    data.table(
        estimate = sum(w * post_mean),
        variance = sum(w * post_sd^2)
    )
}

#' Per-bin meta-analysed prevalence of a set of cell types
#'
#' Runs `empirical_bayes_summary` ("binomial") on every count matrix, then pools the
#' per-sample estimates within each distance bin with `meta_ashr`.
#'
#' @param counts_list Named list of count matrices (rows = distance bins, columns = cell types).
#' @param .types Character vector of column names whose counts are summed as the numerator;
#'   the denominator is the row total over all columns.
#' @return data.table with one row per `dist_bin`: `estimate`, `variance`, upper-tail normal
#'   p-value `p`, `padj` (`p.adjust` default method) and `asterisk` ("*" when `padj < 0.01`).
get_stats = function(counts_list, .types) {
    # Per-sample shrunken estimates, one table per matrix (list names are preserved)
    per_sample_list <- lapply(counts_list, function(counts) {
        numerator <- rowSums(counts[, .types, drop = FALSE])
        denominator <- rowSums(counts)
        empirical_bayes_summary(numerator, denominator, rownames(counts), 'binomial')
    })

    per_sample <- bind_rows(per_sample_list, .id = 'SampleID')
    per_sample <- per_sample[, .(SampleID, dist_bin, estimate, variance)]

    # Pool across samples separately for every distance bin
    pooled <- per_sample[, meta_ashr(estimate, variance), by = dist_bin]

    pooled[, p := exp(pnorm(estimate / sqrt(variance), lower.tail = FALSE, log.p = TRUE))]
    pooled[, padj := p.adjust(p)]
    pooled[, asterisk := case_when(
        is.na(padj) ~ '',
        padj < 0.01 ~ "*",
        TRUE ~ ''
    )]

    pooled[]
}

#' Welch-style two-sample test and log2 fold change from summary statistics
#'
#' @param mu1,mu2 Means of the two groups.
#' @param var1,var2 Variances of the two groups.
#' @param n1,n2 Sample sizes of the two groups.
#' @return List with `p_value` (two-sided, t distribution with Welch–Satterthwaite degrees
#'   of freedom) and `log2_fold_change` (`log2(mu1 / mu2)`).
t_test_and_lfc <- function(mu1, var1, n1, mu2, var2, n2) {
  # Squared standard error of the difference in means
  sq_se_total <- var1 / n1 + var2 / n2
  t_stat <- (mu1 - mu2) / sqrt(sq_se_total)

  # Welch–Satterthwaite approximation to the degrees of freedom
  welch_df <- sq_se_total^2 /
    (((var1 / n1)^2) / (n1 - 1) + ((var2 / n2)^2) / (n2 - 1))

  list(
    p_value = 2 * pt(-abs(t_stat), welch_df),
    log2_fold_change = log2((mu1) / (mu2))
  )
}

#' Give every matrix in a list the same, sorted set of columns
#'
#' @param mat_list List of matrices with column names.
#' @return List of the same length and names; each matrix is padded with all-zero columns
#'   for names it lacked and its columns are ordered alphabetically over the union of names.
standardize_matrix_columns <- function(mat_list) {
    col_union <- sort(unique(unlist(lapply(mat_list, colnames))))

    # Pad one matrix with zero columns where needed, then put columns in the common order
    pad_and_order <- function(mat) {
        absent <- setdiff(col_union, colnames(mat))
        if (length(absent) == 0) {
            return(mat[, col_union, drop = FALSE])
        }
        filler <- matrix(0, nrow = nrow(mat), ncol = length(absent),
                         dimnames = list(rownames(mat), absent))
        cbind(mat, filler)[, col_union, drop = FALSE]
    }

    lapply(mat_list, pad_and_order)
}

#' Plot the shrunken prevalence of a set of cell types across distance bins (single sample)
#'
#' NOTE: a second definition of `interface_plot` appears later in this notebook and replaces
#' this one once that cell is run; the two differ in the y-axis label.
#'
#' @param counts Count matrix (rows = distance bins, columns = cell types).
#' @param .types Character vector of column names summed as the numerator; the denominator
#'   is the row total over all columns of `counts`.
#' @param est_model Passed to `empirical_bayes_summary`; the default resolves to "binomial".
#' @return A ggplot object: points (sized by bin total), 95% intervals, a connecting line and
#'   "*" markers where `padj < 0.01`.
interface_plot = function(counts, .types, est_model=c('binomial', 'poisson', 'mle')) {
    est_model <- match.arg(est_model)
    df = empirical_bayes_summary(
        rowSums(counts[, .types, drop = FALSE]),
        rowSums(counts),
        rownames(counts),
        est_model
    ) 

    # Highest upper 95% bound (in percent); used to place the side labels
    ymax = 100 * max(df$estimate + 1.96 * sqrt(df$variance))
    
    # The x positions 20.5 / 0.5 / 40.5 assume 40 ordered bins with the interface between
    # bin 20 and bin 21 (as produced by 5-unit breaks from -100 to 100) — confirm if the
    # binning changes.
    ggplot(df, aes(dist_bin, 100 * estimate)) + 
        geom_vline(xintercept = c(20.5), size = 2, linetype = 1, color = 'grey') + 
        geom_point(aes(size = size)) + 
        geom_errorbar(aes(ymin = 100 * (estimate - 1.96 * sqrt(variance)), ymax = 100 * (estimate + 1.96 * sqrt(variance))), width = 0) + 
        geom_hline(yintercept = 0) + 
        # dist_bin is converted to numeric for this layer so the points are joined by one line
        geom_line(data = . %>% dplyr::mutate(dist_bin = as.numeric(dist_bin))) + 
        theme_bw(base_size = 16) + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
        labs(y = '% of all cells', x = 'Distance Window', size = '# Cells', subtitle = 'mean & 95% CI, *padj<0.01', title = paste(.types, collapse = '; ')) + 
        geom_text(aes(y = 100 * (estimate + 1.96 * sqrt(variance)), label = asterisk), size = 6, vjust = 0) + 
        annotate("text", x = 0.5, y = ymax + .05, label = 'Stromal Side', hjust = 0, size = 6) + 
        annotate("text", x = 40.5, y = ymax + .05, label = 'Epithelial Side', hjust = 1, size = 6) + 
        NULL
}

#' Compare Hub+ and Hub- interfaces for every cell type
#'
#' @param types_list Named list; only the NAMES are used, each name being passed to
#'   `get_stats` as the column (cell type) to analyse.
#' @param counts_list Named list of count matrices whose names contain 'hubPos' or 'hubNeg'.
#' @return List with `summary_stats` (per cell type and bin: estimates and variances for both
#'   groups, `p`, `log2_fold_change`, FDR-adjusted `padj_global`, `height`, `asterisk`),
#'   `hubPos_results` and `hubNeg_results` (stacked `get_stats` output per cell type).
run_global_hub_analysis <- function(types_list, counts_list) {
  # Number of matrices in each group; used as the sample sizes in the t-test
  n_hubPos = sum(grepl('hubPos', names(counts_list)))
  n_hubNeg = sum(grepl('hubNeg', names(counts_list)))

  results_by_type <- purrr::imap(types_list, function(.x, .y) {
    # The element name (.y) is the cell type; the element value (.x) is not used
    .types <- .y
    df_hubPos <- get_stats(counts_list[grepl('hubPos', names(counts_list))], .types)
    df_hubNeg <- get_stats(counts_list[grepl('hubNeg', names(counts_list))], .types)
    df <- bind_rows(list(hubPos = df_hubPos, hubNeg = df_hubNeg), .id = 'Status')
    # One row per bin with estimate_/variance_ columns for each group, then a test per bin
    df_stat <- dcast(df, dist_bin ~ Status, value.var = c('estimate', 'variance'))[
      , c('p', 'log2_fold_change') := t_test_and_lfc(estimate_hubPos, variance_hubPos, n_hubPos, estimate_hubNeg, variance_hubNeg, n_hubNeg), dist_bin
    ]
    return(list(
      hubPos_data = df_hubPos,
      hubNeg_data = df_hubNeg,
      stats_data = df_stat
    ))
  })
  
  # Regroup from "per cell type -> three tables" to "per table kind -> one entry per cell type"
  transposed_results <- purrr::transpose(results_by_type)
  all_hubPos_df <- dplyr::bind_rows(transposed_results$hubPos_data, .id = "cell_type")
  all_hubNeg_df <- dplyr::bind_rows(transposed_results$hubNeg_data, .id = "cell_type")
  summary_stats <- dplyr::bind_rows(transposed_results$stats_data, .id = "cell_type")
  
  # Multiple-testing correction across all cell types and bins at once
  summary_stats[, padj_global := p.adjust(p, method = 'fdr')]
  # Larger of the two upper 95% bounds; intended as the y position for the significance marker
  summary_stats[, height := max(estimate_hubPos + 1.96 * sqrt(variance_hubPos), estimate_hubNeg + 1.96 * sqrt(variance_hubNeg)), by = .(cell_type, dist_bin)]
  summary_stats[, asterisk := fifelse(padj_global < 0.01, "*", "")]
  
  return(list(
    summary_stats = summary_stats,
    hubPos_results = all_hubPos_df,
    hubNeg_results = all_hubNeg_df
  ))
}

#' Compare MMRd and MMRp samples for every cell type
#'
#' @param types_list Named list; only the NAMES are used, each name being passed to
#'   `get_stats` as the column (cell type) to analyse.
#' @param counts_list List of count matrices named by SampleID.
#' @param mmr_map data.table with columns `SampleID` and `MMRstatus` ('MMRd' / 'MMRp').
#' @return List with `summary_stats` (per cell type and bin: estimates and variances for both
#'   groups, `p`, `log2_fold_change`, FDR-adjusted `padj_global`, `asterisk`, `height`),
#'   `MSI_results` and `MSS_results` (stacked `get_stats` output per cell type).
run_global_mmr_analysis <- function(types_list, counts_list, mmr_map) {
  
  # Number of distinct samples per group; used as the sample sizes in the t-test.
  # In the variable names below, MSI refers to the 'MMRd' samples and MSS to the 'MMRp' samples.
  n_msi <- length(unique(mmr_map[MMRstatus == 'MMRd']$SampleID))
  n_mss <- length(unique(mmr_map[MMRstatus == 'MMRp']$SampleID))
  
  results_by_type <- purrr::imap(types_list, function(.x, .y) {
    .types <- .y # Use the name of the list element (the correct type) for subsetting
    
    # counts_list is indexed by SampleID, so its names must match mmr_map$SampleID
    df_MSI <- get_stats(counts_list[mmr_map[MMRstatus == 'MMRd']$SampleID], .types)
    df_MSS <- get_stats(counts_list[mmr_map[MMRstatus == 'MMRp']$SampleID], .types)
    
    df <- bind_rows(list(MSI = df_MSI, MSS = df_MSS), .id = 'Status')
    
    # One row per bin with estimate_/variance_ columns for each group, then a test per bin
    df_stat <- dcast(df, dist_bin ~ Status, value.var = c('estimate', 'variance'))[
      , c('p', 'log2_fold_change') := t_test_and_lfc(estimate_MSI, variance_MSI, n_msi, estimate_MSS, variance_MSS, n_mss), dist_bin
    ]
    
    return(list(MSI_data = df_MSI, MSS_data = df_MSS, stats_data = df_stat))
  })
  
  # Regroup from "per cell type -> three tables" to "per table kind -> one entry per cell type"
  transposed_results <- purrr::transpose(results_by_type)
  all_MSI_df <- dplyr::bind_rows(transposed_results$MSI_data, .id = "cell_type")
  all_MSS_df <- dplyr::bind_rows(transposed_results$MSS_data, .id = "cell_type")
  summary_stats <- dplyr::bind_rows(transposed_results$stats_data, .id = "cell_type")
  
  # Multiple-testing correction across all cell types and bins at once
  summary_stats[, padj_global := p.adjust(p, method = 'fdr')]
  summary_stats[, asterisk := fifelse(padj_global < 0.01, "*", "")]
  # Larger of the two upper 95% bounds; intended as the y position for the significance marker
  summary_stats[, height := max(estimate_MSI + 1.96 * sqrt(variance_MSI), estimate_MSS + 1.96 * sqrt(variance_MSS)), by = .(cell_type, dist_bin)]
  
  return(list(summary_stats = summary_stats, MSI_results = all_MSI_df, MSS_results = all_MSS_df))
}

### 7. Data Loading and Preprocessing
We load the tile metadata and exclude specific tiles based on quality control. Cell types are cleaned and standardized. We then group cells by lineage for easier analysis.

In [None]:
# IDs (`agg_id`) of tiles flagged for exclusion: rows of the CSV whose
# `tiles_to_exclude_from_interface_analysis` field is non-empty.
tiles_to_omit = read.csv('../Tessera tiles/Tessera processed results/tiles_to_exclude_from_interface_analysis.csv') %>%
    filter(tiles_to_exclude_from_interface_analysis != '') %>%
    pull(agg_id)
length(tiles_to_omit)
head(tiles_to_omit)

# Same RDS file as the `tile_metadata` preview earlier in the notebook
cells = readr::read_rds('../Tessera tiles/Tessera processed results/tile_metadata_2025-07-22.rds') 
# Give mast cells their own top-level lineage
cells$type_lvl1[cells$type_lvl2 == 'Mast'] = 'Mast' 

# Clean-up steps, in order: drop excluded tiles; rename 'Myeloid-ISGlow' to 'Myeloid-ISG';
# derive type_lvl3 from type_lvl2 by removing '-prolif'; collapse every type_lvl3 that
# contains 'Epi' (from that point to the end of the name) to 'Epi'; keep the listed columns.
cells <- cells %>%
    filter(!agg_id %in% tiles_to_omit) %>%
    mutate(type_lvl2 = case_when(type_lvl2 == 'Myeloid-ISGlow' ~ 'Myeloid-ISG', .default = type_lvl2)) %>%
    mutate(type_lvl3 = gsub(type_lvl2, pattern = '-prolif', replacement = '')) %>% # |high|low|-PD1
    mutate(type_lvl3 = gsub(type_lvl3, pattern = 'Epi.*', replacement = 'Epi')) %>% 
    select(c('PatientID', 'SampleID', 'MMRstatus', 'X', 'Y', 'tessera_annotation', 'type_lvl3', 'type_lvl1', 'type_lvl2', 'cell_id', 'cxcl_pos_tile'))

glimpse(cells)

# Named list: one entry per type_lvl1 lineage holding the distinct type_lvl3 states in it
lineage_list <- cells %>% 
    select(type_lvl1, type_lvl3) %>% 
    distinct %>%
    {split(.$type_lvl3, .$type_lvl1)}

In [None]:
# tidyverse supplies pivot_wider() here and fct_recode()/fct_reorder() in later cells;
# library() stops immediately if it is not installed, whereas require() would only warn.
library(tidyverse)

# Export a wide table: one row per (PatientID, SampleID), one column per
# "<type_lvl2> from <tessera_annotation>" combination, values = number of cells.
cells %>% 
    rename(merged_states = type_lvl3) %>% 
    group_by(PatientID, SampleID, MMRstatus, tessera_annotation, type_lvl1, type_lvl2, merged_states) %>%
    summarize(n = n()) %>% 
    ungroup %>%
    mutate(states = interaction(type_lvl2, tessera_annotation, sep = ' from ')) %>%
    select(states, n, PatientID, SampleID) %>%
    pivot_wider(data = ., names_from = states, values_from = n) %>%
    fwrite(., file = 'table_of_cell_states_per_tessera_region_2.csv')

# Load the pre-computed interface geometries for every MMRd sample.
ids = unique(cells$SampleID[cells$MMRstatus == 'MMRd'])
# List the candidate files once instead of twice per sample.
interface_files = list.files(
    path = '../Tessera tiles/Spatial objects for tumor-stromal interfaces in all MERFISH samples/',
    pattern = '_tumor_stromal_interfaces.rds',
    full.names = TRUE
)
interfaces = map(ids, function(.id) {
    # Match the sample ID literally against the file name only (not the directory path)
    hits = interface_files[grepl(.id, basename(interface_files), fixed = TRUE)]
    # readRDS() needs exactly one path; report missing or ambiguous matches explicitly
    if (length(hits) != 1) {
        stop(sprintf("Expected exactly one interface file for sample '%s', found %d.", .id, length(hits)))
    }
    readRDS(normalizePath(hits))
})
names(interfaces) = ids

glimpse(interfaces[[1]])

### 8. Interface Length Statistics
Calculates the total length of Hub+ vs Hub- interfaces per patient to understand the prevalence of each interface type.

In [None]:
# Load the pre-computed interface geometries for every sample (MMRd and MMRp).
all_ids = unique(cells$SampleID)
# List the candidate files once instead of twice per sample.
all_interface_files = list.files(
    path = '../Tessera tiles/Spatial objects for tumor-stromal interfaces in all MERFISH samples/',
    pattern = '_tumor_stromal_interfaces.rds',
    full.names = TRUE
)
all_interfaces = map(all_ids, function(.id) {
    # Match the sample ID literally against the file name only (not the directory path)
    hits = all_interface_files[grepl(.id, basename(all_interface_files), fixed = TRUE)]
    # readRDS() needs exactly one path; report missing or ambiguous matches explicitly
    if (length(hits) != 1) {
        stop(sprintf("Expected exactly one interface file for sample '%s', found %d.", .id, length(hits)))
    }
    readRDS(normalizePath(hits))
})
names(all_interfaces) = all_ids

# Total interface length per (interface type, sample). Interface types other than the two
# homotypic CXCL classes are pooled under 'Heterotypic'; empty geometries are dropped first.
lengthsOfInterfaces = all_interfaces %>%
    rbindlist(ignore.attr=TRUE) %>%
    st_as_sf(sf_column_name = 'x') %>%
    filter(!st_is_empty(.)) %>%
    mutate(len = st_length(x)) %>%
    as.data.frame() %>%
    mutate(Type_of_Interface = case_when(
       Type_of_Interface == 'CXCLneg tumor & CXCLneg stroma' ~ 'Hub-',
       Type_of_Interface == 'CXCLpos tumor & CXCLpos stroma' ~ 'Hub+',
        .default = 'Heterotypic'
    )) %>%
    group_by(Type_of_Interface, SampleID) %>%
    summarize(len = sum(len)) %>%
    ungroup 

In [None]:
# Stacked bar per patient showing the share of total interface length in each interface
# class, faceted by MMR status; also saved as a PDF.
fig.size(h = 2, w = 3)
# Sample-level lengths are joined to patient metadata and summed per patient before plotting
lengthsOfInterfaces_plot = ggplot(lengthsOfInterfaces %>%
    as.data.frame %>%
    left_join(., cells %>% select(SampleID, PatientID, MMRstatus) %>% distinct, by = 'SampleID') %>%
    group_by(Type_of_Interface, PatientID, MMRstatus) %>%
    summarize(len = sum(len))) +
    # position = 'fill' rescales each patient's bar to proportions of its total length
    geom_col(aes(y = PatientID, x = len, fill = Type_of_Interface ), position = 'fill') +
    facet_wrap(~MMRstatus, ncol = 2, scales = 'free_y') +
    labs(x = 'Proportion of total interface', y = 'Patient', fill = 'Type of\nInterface') +
    cowplot::theme_half_open(7) + 
    # Legend relabelling: Hub+ -> 'Hub-inside', Hub- -> 'Hub-outside', Heterotypic -> 'Hub-border'
    scale_fill_manual(
        name = 'Interface Type: ', 
        values = c('Hub+' = '#D55E00', 'Hub-' = '#009E73', 'Heterotypic' = 'lightyellow'), 
        labels = c('Hub+' = 'Hub-inside', 'Hub-' = 'Hub-outside', 'Heterotypic' = 'Hub-border')
    ) + 
theme(axis.text.x = element_text(angle = 90), legend.position = 'bottom') +
guides(fill = guide_legend(ncol = 2)) +
NULL
lengthsOfInterfaces_plot
# NOTE(review): assumes the 'figs/' directory already exists — confirm.
ggsave(plot = lengthsOfInterfaces_plot, filename = 'figs/lengthsOfInterfaces.pdf', width =3, height = 2, units = 'in')

In [None]:
# Per patient: fraction of total interface length that is Hub+, then compare MMRd vs MMRp.
# NOTE(review): patients with no Hub+ rows are removed by the final filter rather than
# counted as a proportion of 0 — confirm this is intended for the t-test and ranges.
.temp = lengthsOfInterfaces %>%
    as.data.frame %>%
    left_join(., cells %>% select(SampleID, PatientID, MMRstatus) %>% distinct, by = 'SampleID') %>%
    group_by(Type_of_Interface, PatientID, MMRstatus) %>%
    summarize(len = sum(len)) %>%
    ungroup %>%
    group_by(PatientID, MMRstatus) %>%
    mutate(total_len = sum(len)) %>%
    mutate(prop_len = len/total_len) %>%
    ungroup %>%
    filter(Type_of_Interface == 'Hub+') %>%
    select(MMRstatus, prop_len) 
MMRd_interface = .temp %>% filter(MMRstatus == 'MMRd') %>% pull(prop_len)
MMRp_interface = .temp %>% filter(MMRstatus == 'MMRp') %>% pull(prop_len)
# Two-sample t-test with t.test() defaults (unequal variances)
t.test(MMRd_interface, MMRp_interface)
# calculate the percentage of hub+ interface
(range(MMRd_interface) * 100) %>% round(digits = 1)
(range(MMRp_interface) * 100) %>% round(digits = 1)

### 9. Main Parallel Processing Loop
This section runs the spatial summarization function in parallel, using `furrr`, for all MMRd samples (the `ids` selected in the cell below).

In [None]:
# Raise the limit on the size of globals exported to each future to 1e10 bytes
options(future.globals.maxSize = 1e10)
# Only MMRd samples are processed; `interfaces` was loaded for the same set of IDs
ids = unique(cells$SampleID[cells$MMRstatus == 'MMRd'])
system.time({
    # One result per sample: a list of count matrices keyed by interface type
    # (filtered variant of the summarization function)
    counts_list = future_map(ids, function(.id) {
        summarize_cells_by_interface_proximity(cells[SampleID == .id], interfaces[[.id]])    
    }, .options = furrr::furrr_options(seed=TRUE))
    names(counts_list) = ids
})

### 10. Post-Processing and Organization
The raw counts are split into Hub+ and Hub- lists, standardized to ensure consistent columns, and then combined into a master data frame `allCounts` for easy plotting and export.

In [None]:
# Separate lists for hub positive and hub negative interfaces
hubPos_counts_list = lapply(counts_list, function(x){return(x[['CXCLpos tumor & CXCLpos stroma']])})
names(hubPos_counts_list) = paste0(names(counts_list), '_hubPos')

hubNeg_counts_list = lapply(counts_list, function(x){return(x[['CXCLneg tumor & CXCLneg stroma']])})
names(hubNeg_counts_list) = paste0(names(counts_list), '_hubNeg')

# Combine them back into a single list and standardize columns
counts_list = c(hubPos_counts_list, hubNeg_counts_list)
counts_list = standardize_matrix_columns(counts_list)

In [None]:
# Stack all count matrices into one long table: one row per (sample, interface, bin),
# one column per cell type, plus the bin midpoint.
# NOTE(review): `sample` is everything before the FIRST underscore and `interface` everything
# after the LAST one, so sample IDs containing '_' would be truncated — confirm IDs have none.
allCounts = lapply(counts_list, function(x) x %>%
    as.data.frame() %>%
    tibble::rownames_to_column('bin')) %>%
rbindlist(idcol = 'sample_interface') %>%
mutate(sample = gsub(sample_interface, pattern = '\\_.*', replacement = '')) %>%
mutate(interface = gsub(sample_interface, pattern = '.*\\_', replacement = '')) %>%
mutate(interface = interface %>% as.factor %>% fct_recode('Hub-inside' = 'hubPos', 'Hub-outside' = 'hubNeg')) %>%
select(!sample_interface) %>%
mutate(midpoint = unlist(lapply(bin, find_midpoint))) 
# Row totals over all cell-type columns, and over the TNKILC lineage columns only
allCounts$total_counts_per_bin = allCounts %>% select(!c(bin, sample, interface, midpoint)) %>% rowSums
allCounts$total_TNKILC_per_bin = allCounts %>% select(lineage_list[['TNKILC']]) %>% rowSums
allCounts$interface %>% unique
# Export the per-bin counts, merging every column whose name contains 'Epi' into a single 'Epi' column
allCounts %>% pivot_longer(cols = allCounts %>% select(!c(bin, sample, interface, midpoint, 
                                                          total_counts_per_bin, total_TNKILC_per_bin)) %>% names) %>% 
mutate(name = gsub(pattern = 'Epi.*', replacement = 'Epi', x = name)) %>%
group_by(name, bin, sample, interface, midpoint) %>%
summarize(value = sum(value)) %>%
pivot_wider(values_from = value, names_from = name) %>%
write.csv(., 'counts_in_bins.csv')

In [None]:
# List the columns of allCounts other than the bin/sample/interface/midpoint identifiers
allCounts %>% select(!c(bin, sample, interface, midpoint)) %>% names

In [None]:
# Quick look: Tcd8-CXCL13 as a percentage of all cells per bin, averaged over samples.
# NOTE(review): bins with a zero denominator give NaN, and mean() is called without
# na.rm, so one such sample makes the bin average NaN — confirm this is acceptable.
allCounts %>%
    group_by(midpoint, interface, sample) %>%
    summarize(percent_of_all_cells = 100*`Tcd8-CXCL13`/total_counts_per_bin) %>%
    group_by(midpoint, interface) %>%
    summarize(percent_of_all_cells = mean(percent_of_all_cells)) %>%
    ggplot() +
        geom_line(aes(x = midpoint, y = percent_of_all_cells, color = interface)) +
        facet_wrap(~interface)
# Same trace, but as a percentage of TNKILC-lineage cells per bin
allCounts %>%
    group_by(midpoint, interface, sample) %>%
    summarize(percent_of_TNKILC = 100*`Tcd8-CXCL13`/total_TNKILC_per_bin) %>%
    group_by(midpoint, interface) %>%
    summarize(percent_of_TNKILC = mean(percent_of_TNKILC)) %>%
    ggplot() +
        geom_line(aes(x = midpoint, y = percent_of_TNKILC, color = interface)) +
        facet_wrap(~interface)

In [None]:
#' Plot the shrunken prevalence of a set of cell types across distance bins (single sample)
#'
#' Redefines the `interface_plot` from the earlier cell; the only visible differences are
#' the y-axis label and the explicit `return()`.
#'
#' NOTE(review): the y-axis reads '% of all TNKILC', but the denominator is the row total
#' over ALL columns of `counts`. The label is only accurate if the caller passes a matrix
#' restricted to TNKILC columns — confirm against the callers.
#'
#' @param counts Count matrix (rows = distance bins, columns = cell types).
#' @param .types Character vector of column names summed as the numerator.
#' @param est_model Passed to `empirical_bayes_summary`; the default resolves to "binomial".
#' @return A ggplot object: points (sized by bin total), 95% intervals, a connecting line and
#'   "*" markers where `padj < 0.01`.
interface_plot = function(counts, .types, est_model=c('binomial', 'poisson', 'mle')) {
    est_model <- match.arg(est_model)
    df = empirical_bayes_summary(
        rowSums(counts[, .types, drop = FALSE]),
        rowSums(counts),
        rownames(counts),
        est_model
    ) 

    ## get max y value for plotting 
    ymax = 100 * max(df$estimate + 1.96 * sqrt(df$variance))
    
    # The x positions 20.5 / 0.5 / 40.5 assume 40 ordered bins with the interface between
    # bin 20 and bin 21 (as produced by 5-unit breaks from -100 to 100) — confirm if the
    # binning changes.
    p1 = ggplot(df, aes(dist_bin, 100 * estimate)) + 
        geom_vline(xintercept = c(20.5), size = 2, linetype = 1, color = 'grey') + 
        geom_point(aes(size = size)) + 
        geom_errorbar(aes(ymin = 100 * (estimate - 1.96 * sqrt(variance)), ymax = 100 * (estimate + 1.96 * sqrt(variance))), width = 0) + 
        geom_hline(yintercept = 0) + 
        # dist_bin is converted to numeric for this layer so the points are joined by one line
        geom_line(data = . %>% dplyr::mutate(dist_bin = as.numeric(dist_bin))) + 
        theme_bw(base_size = 16) + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
        labs(y = '% of all TNKILC', x = 'Distance Window', size = '# Cells', subtitle = 'mean & 95% CI, *padj<0.01', title = paste(.types, collapse = '; ')) + 
        geom_text(aes(y = 100 * (estimate + 1.96 * sqrt(variance)), label = asterisk), size = 6, vjust = 0) + 
        annotate("text", x = 0.5, y = ymax + .05, label = 'Stromal Side', hjust = 0, size = 6) + 
        annotate("text", x = 40.5, y = ymax + .05, label = 'Epithelial Side', hjust = 1, size = 6) + 
        NULL
    return(p1)
}

### 11. Plotting Per-Patient Distributions
Generates PDFs of per-patient line traces for specific cell states (e.g., T-cell subsets) to visualize spatial patterns.

In [None]:
# patchwork (wrap_plots, plot_annotation) is needed below; library() stops immediately if
# it is missing, whereas require() would only warn and fail later.
library(patchwork)
pdf('figs/per_patient_plots_TNKILC_as_prop_of_all_cells.pdf', height = 18, width = 32)
# Close the PDF device even if a plot fails, so an error does not leave the device open.
invisible(tryCatch({
    for (state in lineage_list[['TNKILC']]) {
        # Columns whose name matches `state` as a regular expression; this also picks up
        # longer column names that contain it.
        .types = grep(state, colnames(counts_list$C110_hubPos), value = TRUE) #grep('PD1', colnames(counts_list$C110_hubPos), value = TRUE)
        # One page of per-sample panels for Hub+ interfaces ...
        p1 = imap(hubPos_counts_list, function(counts, .id) {
            interface_plot(counts, .types, 'binomial') + labs(title = glue('{.id} (Hub+)'))
        }) %>% wrap_plots() + plot_annotation(title = paste0(.types, collapse =  ', '),
                                              theme = theme(plot.title = element_text(size = 20, face = "bold", color = "darkblue")))
        # ... and one for Hub- interfaces
        p2 = imap(hubNeg_counts_list, function(counts, .id) {
            interface_plot(counts, .types, 'binomial') + labs(title = glue('{.id} (Hub-)'))
        }) %>% wrap_plots() + plot_annotation(title = paste0(.types, collapse =  ', '),
                                              theme = theme(plot.title = element_text(size = 20, face = "bold", color = "darkblue")))
        print(p1)
        print(p2)
    }
}, finally = dev.off()))

In [None]:
# Inspect which count columns contain 'PD1' (column names taken from the C110 Hub+ matrix)
.types = grep('PD1', colnames(counts_list$C110_hubPos), value = TRUE)
.types

In [None]:
# patchwork (wrap_plots, plot_annotation) is needed below; library() stops immediately if
# it is missing, whereas require() would only warn and fail later.
library(patchwork)
pdf('figs/per_patient_plots_TNKILC_prop.pdf', height = 18, width = 32)
# Close the PDF device even if a plot fails, so an error does not leave the device open.
invisible(tryCatch({
    for (state in lineage_list[['TNKILC']]) {
        # Columns whose name matches `state` as a regular expression; this also picks up
        # longer column names that contain it.
        .types = grep(state, colnames(counts_list$C110_hubPos), value = TRUE) #grep('PD1', colnames(counts_list$C110_hubPos), value = TRUE)
        # One page of per-sample panels for Hub+ interfaces ...
        p1 = imap(hubPos_counts_list, function(counts, .id) {
            interface_plot(counts, .types, 'binomial') + labs(title = glue('{.id} (Hub+)'))
        }) %>% wrap_plots() + plot_annotation(title = paste0(.types, collapse = ', '),
                                              theme = theme(plot.title = element_text(size = 20, face = "bold", color = "darkblue")))
        # ... and one for Hub- interfaces
        p2 = imap(hubNeg_counts_list, function(counts, .id) {
            interface_plot(counts, .types, 'binomial') + labs(title = glue('{.id} (Hub-)'))
        }) %>% wrap_plots() + plot_annotation(title = paste0(.types, collapse = ', '),
                                              theme = theme(plot.title = element_text(size = 20, face = "bold", color = "darkblue")))
        print(p1)
        print(p2)
    }
}, finally = dev.off()))

In [None]:
# Inspect the second TNKILC state and the count columns its name matches
state = lineage_list[['TNKILC']][2]
state
.types = grep(state, colnames(counts_list$C110_hubPos), value = TRUE) #grep('PD1', colnames(counts_list$C110_hubPos), value = TRUE)
.types

In [None]:
fig.size(h = 9, w = 16)
options(repr.plot.res = 300)
lineage = 'TNKILC'
tnkilc_order = lineage_list[['TNKILC']] #c("Tcd8-CXCL13", "Tcd8-HOBIT", "Tcd8-gdlike", "Tcd8-gdlike-PD1", "Tcd8-GZMK", "Tplzf-gdlike", "Tcd4-CXCL13", "Tcd4-TFH", "Tcd4-Treg", "Tcd4-IL7R", "NK-CD16", "NK-XCL1", "ILC3")

# NOTE(review): `final_results` and `df` are not created in the visible part of this
# notebook. `df` is used here as a long table with columns cell_type, dist_bin, midpoint,
# Status, estimate, variance and ymax — confirm where both objects are built.

# Prepare the data for the geom_text layers beforehand for clarity
text_data_asterisk <- final_results$summary_stats %>% 
 filter(cell_type %in% lineage_list[[lineage]]) %>%
 mutate(cell_type = factor(cell_type, ordered = TRUE, levels = tnkilc_order))

text_data_epi <- df %>% 
 filter(cell_type %in% lineage_list[[lineage]]) %>%
 mutate(cell_type = factor(cell_type, ordered = TRUE, levels = tnkilc_order)) %>% 
 select(cell_type, ymax) %>% 
 distinct() %>% 
 mutate(ymax = ymax + ymax/3, x = 32, label = 'Epi')

text_data_stroma <- df %>% 
 filter(cell_type %in% lineage_list[[lineage]]) %>% 
 mutate(cell_type = factor(cell_type, ordered = TRUE, levels = tnkilc_order)) %>% 
 select(cell_type, ymax) %>% 
 distinct() %>% 
 mutate(ymax = ymax + ymax/3, x = 8, label = ' Stroma')

# One plot per TNKILC state; each plot is saved as a PDF and also returned in the list.
tnkilc_plots = lapply(tnkilc_order, function(mytype){
 # The plots are not faceted, so restrict every annotation layer to the cell type being
 # drawn; otherwise the labels and asterisks of all cell types are overlaid on each plot.
 asterisk_data = text_data_asterisk %>% filter(as.character(cell_type) == mytype)
 epi_label_data = text_data_epi %>% filter(as.character(cell_type) == mytype)
 stroma_label_data = text_data_stroma %>% filter(as.character(cell_type) == mytype)

 # Create the plot
 p1 = df %>% 
  filter(cell_type == mytype) %>%
  mutate(cell_type = factor(cell_type, ordered = TRUE, levels = tnkilc_order)) %>%
  mutate(dist_bin = fct_reorder(dist_bin, midpoint)) %>%
 ggplot(
  data = ., 
  aes(dist_bin, 100 * estimate, color = Status)
 ) + 
  geom_vline(xintercept = c(20.5), size = 2, linetype = 1, color = 'grey') + 
  geom_point() + 
  geom_errorbar(aes(ymin = 100 * (estimate - 1.96 * sqrt(variance)), ymax = 100 * (estimate + 1.96 * sqrt(variance))), width = 0, show.legend = FALSE) + 
  geom_hline(yintercept = 0) + 
  geom_line(data = . %>% dplyr::mutate(dist_bin = (dist_bin)), show.legend = FALSE, aes(group = Status)) + 
  cowplot::theme_half_open(7) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  labs(y = 'Percent of all cells', x = 'Distance Window', size = '# Cells', subtitle = 'IVW meta-analysis; mean & 95% CI, *padj<0.01', title = lineage) + 
  geom_text(
   data = asterisk_data,
   aes(y = 100 * height, label = asterisk), 
   size = 6, vjust = .2, show.legend = FALSE, color = 'black'
  ) + 
  geom_text(
   data = epi_label_data, 
   aes(label = label, y = ymax, x = x), 
   color = 'black', size = 3
  ) +
  geom_text(
   data = stroma_label_data, 
   aes(label = label, y = ymax, x = x), 
   color = 'black', size = 3
  ) +
  scale_color_manual(
   name = 'Interface Type: ', 
   values = c('hubPos' = '#D55E00', 'hubNeg' = '#009E73', 'MSS' = 'grey'), 
   labels = c('hubPos' = 'Hub-inside MMRd', 'hubNeg' = 'Hub-outside MMRd', 'MSS' = 'MMRp')
  ) + 
  guides(color = guide_legend(override.aes = list(size = 2, shape = 16))) + 
  theme(
   #aspect.ratio = 0.5, 
   axis.text.x = element_text(size = 7), 
   strip.background = element_rect(fill = NA), 
   strip.text = element_text(size = 10, face = 'bold', color = 'black'), 
   title = element_text(size = 7), 
   legend.position = 'right', 
   legend.text = element_text(size = 7)
  ) +
  guides(fill = guide_legend(override.aes = list(nrow = 1, shape = 16))) +
  ggtitle(mytype) +
  NULL

 # Save before returning: the original called ggsave() after return(), so it never ran
 # and no PDF was written.
 ggsave(p1, filename = glue::glue('figs/cellstates/TNKILC/', mytype, '.pdf'), height = 4, width = 4, create.dir = TRUE)
 return(p1)
})


In [None]:
# Supp fig - T cell states
# (Intended as a loop that saves one PDF per T cell state for the supplementary figure.)