# Statistical Analysis of Differential Expression

## Sample-Level Generalized Linear Mixed Models (GLMM) 
### see ../meta_analysis_GLMMs/glmm_fine_20250112.ipynb  and ../meta_analysis_GLMMs/glmm_coarse_20250703.ipynb 

To identify cluster-specific marker genes while accounting for spatial and technical sources of variation, we performed a Generalized Linear Mixed Model (GLMM) analysis separately for each biological sample. For each sample, raw gene counts were modeled using a Poisson error distribution with a canonical log link function. The model specification included the total number of UMIs (log-transformed) as an offset to normalize for sequencing depth. To capture spatial dependencies, we included random intercepts for the coarse k-nearest neighbor (k-NN) clusters (knn_coarse) and the field of view (fov), nested as (1 | fov / knn_coarse). This hierarchical structure accounts for the correlation of gene expression within spatial neighborhoods and imaging fields. Marginal effects for each cluster were estimated using the presto package, identifying genes with differential expression relative to the grand mean. Genes expressed in fewer than 3 cells per group were excluded prior to modeling.

## DerSimonian-Laird Meta-Analysis

### ../meta_analysis_cluster_markers/meta_analysis_of_fine_type_markers.ipynb 

To identify robust consensus markers across all biological replicates, sample-level summary statistics (log-transformed coefficients $\beta$ and standard errors $\sigma$) were combined using a random-effects meta-analysis. We employed the DerSimonian-Laird (DL) estimator to calculate the between-study variance ($\tau^2$). For each feature within each cluster, we computed two estimates:

- **Fixed Effects (FE) Model:** Assumes a single true effect size shared by all samples, weighting estimates by the inverse of their within-sample variance ($w_{FE} = 1/\sigma^2$).
- **Random Effects (RE) Model:** Incorporates both within-sample variance and between-sample heterogeneity ($\tau^2$), weighting estimates as $w_{RE} = 1/(\sigma^2 + \tau^2)$.

Heterogeneity was assessed using Cochran's $Q$ statistic. To avoid false positives driven by outlier samples, final feature prioritization was based on the Random Effects Z-score ($Z_{RE} = \beta_{RE} / \sigma_{RE}$), which penalizes genes with high inter-sample disagreement. P-values were adjusted for multiple hypothesis testing using the Benjamini-Hochberg (FDR) procedure. Features were considered significant if they exhibited an FDR < 0.05 in the Random Effects model.

## Key References:

- GLMM/Presto: https://rdrr.io/github/immunogenomics/presto/f/vignettes/getting-started.Rmd
- Meta-Analysis: DerSimonian, R., & Laird, N. (1986). Meta-analysis in clinical trials. Controlled clinical trials, 7(3), 177-188.

In [None]:
# Attach the packages used by this notebook. Note that require() returns FALSE
# (with a warning) instead of stopping when a package is not installed, so a
# missing dependency only surfaces later, at the first call that needs it.
require(tidyverse)
require(data.table)
require(Matrix)
require(singlecellmethods)
require(lme4)
require(presto)
require(furrr)
require(future)
# Fix the RNG seed for this session.
set.seed(1)


In [None]:
# Fit a Poisson GLMM on pseudobulked counts with presto and return the marginal
# per-cluster effects for `knn_renamed_cell_states`.
#
# Args:
#   obj:      Object that supports GetAssayData(obj, slot = 'counts'), feature
#             subsetting via obj[genes, ], and exposes obj@meta.data containing
#             the columns 'orig.ident', 'fov' and 'knn_renamed_cell_states'
#             (presumably a Seurat object -- confirm against the caller).
#   filename: Path to which the fitted presto model is written as an RDS file.
#
# Returns:
#   The contrasts table produced by contrasts.presto(), sorted by p-value, with
#   the added columns `cluster`, `logFC`, `SD`, `zscore`, `fdr`, `corr_fdr` and
#   `-log10_fdr`.
#
# Side effects:
#   Attaches the packages listed below, resets the RNG seed to 1, and writes
#   `filename`.
doGLMM = function(obj, filename) {
  # Packages are attached inside the function as well as at the top of the
  # notebook, so the function also runs where the notebook setup was not
  # executed.
  require(tidyverse)
  require(data.table)
  require(Matrix)
  require(singlecellmethods)
  require(lme4)
  require(presto)
  require(furrr)
  require(future)
  set.seed(1)

  # Keep only genes whose counts take more than 3 distinct values across cells.
  counts = GetAssayData(obj, slot = 'counts')
  is_varying = apply(counts, 1, function(x) {
    length(unique(x)) > 3
  })
  # Index the row names directly: subsetting the matrix first (counts[mask, ])
  # drops to a plain vector when exactly one gene passes, and rownames() of a
  # vector is NULL.
  varyingGenes = rownames(counts)[is_varying]
  rm(counts)
  if (length(varyingGenes) == 0) {
    stop("doGLMM: no gene has more than 3 distinct count values; nothing to model",
         call. = FALSE)
  }
  obj = obj[varyingGenes, ]

  # Pseudobulk the raw counts by sample, field of view and cell state.
  pb = presto::collapse_counts(
    GetAssayData(obj, slot = 'counts'),
    obj@meta.data,
    c('orig.ident', 'fov', 'knn_renamed_cell_states'),
    min_cells_per_group = 3
  )

  # Align exprs_norm (if present) with the rows/columns of counts_mat.
  pb$exprs_norm = pb$exprs_norm[rownames(pb$counts_mat), colnames(pb$counts_mat)]

  # NOTE: the timing returned by system.time() is not captured or printed here;
  # the assignment inside the braces still happens in this function's frame.
  system.time({
    presto_res = presto::presto.presto(
      y ~ 1 + (1 |
                 knn_renamed_cell_states) +  (1 | fov / knn_renamed_cell_states) + offset(logUMI),
      pb$meta_data,
      pb$counts_mat,
      size_varname = "logUMI",
      effects_cov = 'knn_renamed_cell_states',
      ncore = 10,
      min_sigma = .05,
      family = "poisson",
      nsim = 1000
    )
  })

  # Persist the fitted model before deriving contrasts from it.
  readr::write_rds(presto_res, filename)

  contrasts_mat = make_contrast.presto(presto_res,
                                       var_contrast = "knn_renamed_cell_states")

  effects_marginal = contrasts.presto(presto_res,
                                      contrasts_mat,
                                      one_tailed = TRUE) %>%
    dplyr::mutate(cluster = contrast) %>%
    dplyr::mutate(
      # Convert stats from natural-log to log2 units for interpretability.
      # x / log(2) equals log2(exp(x)) in exact arithmetic, but does not
      # overflow to Inf for large |beta| and keeps precision for small sigma.
      logFC = beta / log(2),
      SD = sigma / log(2),
      zscore = logFC / SD
    ) %>%
    arrange(pvalue)

  fdr = p.adjust(effects_marginal$pvalue, method = 'BH')
  effects_marginal$fdr = fdr

  # Replace exact-zero FDRs with the smallest positive FDR so that -log10 stays
  # finite. NA FDRs are ignored when computing the floor (they would otherwise
  # turn it into NA); if no positive FDR exists, fall back to the smallest
  # positive normalized double.
  positive_fdr = fdr[!is.na(fdr) & fdr > 0]
  fdr_floor = if (length(positive_fdr) > 0) {
    min(positive_fdr)
  } else {
    .Machine$double.xmin
  }
  corr_fdr = fdr
  corr_fdr[which(fdr == 0)] = fdr_floor
  effects_marginal$corr_fdr = corr_fdr
  effects_marginal$`-log10_fdr` = (-1) * log10(corr_fdr)

  # meanExp = rowMeans(GetAssayData(obj, 'data'))
  # meanExp = data.frame(feature = names(meanExp), meanExp = meanExp)
  # for (cluster in unique(effects_marginal$cluster)) {
  #   temp = GetAssayData(obj, 'counts')[, rownames(obj@meta.data)[obj@meta.data$knn_renamed_cell_states == cluster]] %>% as.data.frame()
  #   temp = temp %>%
  #     rowwise() %>%
  #     mutate(`N_zeros` = sum(c_across(everything()) == 0)) %>%
  #     select(`N_zeros`) %>% as.data.frame()
  #   rownames(temp) = rownames(GetAssayData(obj, 'counts'))
  #   meanExp[, cluster] = temp$`N_zeros` / length(rownames(obj@meta.data)[obj@meta.data$knn_renamed_cell_states == cluster])
  # }
  return(effects_marginal)
}

In [None]:
# Show the current working directory: the relative input path and the output
# file names used in the following cells are resolved against it.
getwd()

In [None]:
# Collect the annotated per-sample RDS files.
# The pattern is a regular expression: the dot before "rds" is escaped and the
# match is anchored at the end of the name, so only real "*.rds" files are
# picked up (not e.g. "annotated_x.rds.bak" or "annotated_x_rds.txt").
merfish_files = list.files(
  path = '../../../../Labeled MERFISH data/Seurat objects',
  pattern = "annotated_.*\\.rds$",
  full.names = TRUE
)
# Report how many files were found, then list them one per line.
length(merfish_files)
writeLines(merfish_files)

In [None]:
# Name each file by its sample ID: strip everything up to and including
# "annotated_" and the trailing ".rds" extension.
# The extension part is escaped and anchored ("\\.rds$"); an unescaped ".rds"
# would also delete any character followed by "rds" inside the sample ID
# itself (e.g. "birds_1" -> "b_1").
names(merfish_files) = gsub(merfish_files, pattern = '.*annotated_|\\.rds$', replacement = '')
merfish_files

In [None]:
# Run doGLMM() for every sample and every coarse lineage, one sample per
# parallel worker. For each (sample, lineage) pair this writes
# "<sample>_<lineage>_glmm.rds" (fitted model, written inside doGLMM) and
# "<sample>_<lineage>_marginal_effects.csv" to the working directory.
require(furrr)
require(future)
require(singlecellmethods)
require(Seurat)
set.seed(1)

plan(multisession, workers = 10)
future_map(names(merfish_files), function(sampleID) {
  message(sampleID)
  obj = readr::read_rds(merfish_files[[sampleID]])
  # Coerce to character: if knn_coarse is a factor, c(sampleID, lineage, ...)
  # below would otherwise put the factor's integer code in the file name
  # instead of the lineage label.
  lineages = as.character(unique(obj@meta.data$knn_coarse))
  # Mast and Plasma lineages are skipped.
  lineages = lineages[!lineages %in% c('Mast', 'Plasma')]
  lapply(lineages, function(lineage, sampleID) {
    # Log the lineage currently being processed (not the whole vector).
    message(lineage)
    obj_lineage = subset(obj, subset = knn_coarse == lineage)
    filename = paste0(c(sampleID, lineage, 'glmm.rds'), collapse = '_')
    # NOTE: the timing returned by system.time() is not captured; the
    # assignment inside the braces still happens in this function's frame.
    system.time({
      effects_marginal = doGLMM(obj = obj_lineage, filename = filename)
    })
    # Derive the CSV name by replacing only the trailing "glmm.rds" suffix.
    data.table::fwrite(
      effects_marginal,
      sub(pattern = 'glmm\\.rds$', replacement = 'marginal_effects.csv', x = filename)
    )
  }, sampleID = sampleID)
}, .progress = TRUE)