In [1]:
# Setup for Clipper R kernel

# Make sure both your personal lib and the system libs are visible.
# Entries are searched in the order given, so the personal library comes first.
.libPaths(c(
  "/mnt/home/bisholea/Rlibs",
  "/opt/gvsu/clipper/2025.05/R/4.4.3/library",
  "/opt/gvsu/clipper/2025.05/spack/apps/linux-rhel9-cascadelake/gcc-13.3.1/r-4.4.3-gz7onk2vbizgd2zxqa6ij47gygl6dsd4/rlib/R/library"
))

# Quick sanity check (optional, just to see)
.libPaths()
# system.file() returns "" when a package cannot be found on the library
# paths; this avoids scanning every installed package just to test for one.
nzchar(system.file(package = "ggplot2"))

# Load the packages you already have
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(fgsea)
library(cowplot)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [3]:
#!/usr/bin/env Rscript
suppressPackageStartupMessages({
  library(readr); library(dplyr); library(tidyr); library(tibble)
  library(purrr); library(ggplot2)
})

# Fix the RNG seed once for the session so that any randomized step that runs
# after this point starts from the same state on every fresh run.
set.seed(1)

# ---------------- paths ----------------
# Project root; the GSEA inputs/caches and the figure outputs live below it.
capstone_dir <- "/mnt/projects/debruinz_project/bisholea/capstone"
gsea_dir     <- file.path(capstone_dir, "gsea")
fig_dir      <- file.path(capstone_dir, "figures")
# Create the figure directory (and any missing parents); stay quiet if it
# already exists.
dir.create(fig_dir, showWarnings = FALSE, recursive = TRUE)

# ---------------- load centralized data/style ----------------
# `fd` is the shared figure-data object; later code reads fd$style,
# fd$palettes, fd$gsea$paths, fd$matching and fd$weights from it.
fd <- readRDS(file.path(gsea_dir, "figdata.rds"))

# Script-local replacement for the theme function stored on `fd`:
# a serif theme_classic at base size 12 with a handful of layout tweaks.
fd$style$theme <- function() {
  base_theme <- ggplot2::theme_classic(base_family = "serif", base_size = 12)

  # Small gaps between each axis title and its axis.
  x_title <- ggplot2::element_text(margin = ggplot2::margin(t = 4))
  y_title <- ggplot2::element_text(margin = ggplot2::margin(r = 4))

  tweaks <- ggplot2::theme(
    panel.grid          = ggplot2::element_blank(),
    plot.title.position = "plot",
    legend.title        = ggplot2::element_blank(),
    plot.margin         = ggplot2::margin(3, 3, 3, 3),
    axis.title.x        = x_title,
    axis.title.y        = y_title,
    axis.ticks.length   = grid::unit(2, "pt")
  )

  base_theme + tweaks
}

# Model palette: take the one shipped in figdata when present, otherwise fall
# back to a viridis colour for AE and a light grey for NMF.
pal_model <- fd$palettes$model
if (is.null(pal_model)) {
  pal_model <- c(AE = viridisLite::viridis(7)[4], NMF = "grey75")
}

# Instantiate the theme function currently stored at fd$style$theme.
plt_theme <- fd$style$theme()


# Write `plot` to a PDF at `path` (size w x h inches, white background).
# Uses the cairo_pdf device when this R build reports cairo support and the
# standard pdf device (with useDingbats = FALSE) otherwise.
save_pdf <- function(path, plot, w=7, h=5) {
  has_cairo <- isTRUE(capabilities("cairo"))
  if (!has_cairo) {
    ggsave(
      filename = path, plot = plot, device = "pdf",
      width = w, height = h, units = "in",
      bg = "white", useDingbats = FALSE
    )
  } else {
    ggsave(
      filename = path, plot = plot, device = cairo_pdf,
      width = w, height = h, units = "in",
      bg = "white"
    )
  }
}

# Read a wide CSV (one pathway per row, one numeric column per factor) into a
# numeric matrix whose row names are the pathway identifiers.
#
# Arguments:
#   path         path to the CSV file
#   pathway_col  name of the identifier column; when no column has this name
#                the first column is used instead
#
# Returns a matrix built from every column except the identifier column, each
# parsed as double, with the identifier values as row names.
read_wide_matrix <- function(path, pathway_col = "pathway") {
  # Ask readr for the column names instead of splitting the raw header line on
  # commas: quoted headers, an empty first header cell, or names that readr
  # repairs would otherwise not match the columns read_csv() produces.
  hdr <- names(readr::read_csv(
    path,
    n_max = 0,
    col_types = readr::cols(.default = readr::col_character()),
    show_col_types = FALSE
  ))
  if (length(hdr) == 0L) {
    stop("No columns found in: ", path, call. = FALSE)
  }

  pw_col <- if (pathway_col %in% hdr) pathway_col else hdr[1]

  # Identifier column as character, everything else as double.
  ct <- do.call(readr::cols, c(
    setNames(list(readr::col_character()), pw_col),
    list(.default = readr::col_double())
  ))
  df <- readr::read_csv(path, col_types = ct, show_col_types = FALSE)

  rn <- df[[pw_col]]
  df[[pw_col]] <- NULL
  m <- as.matrix(df)
  rownames(m) <- rn
  m
}

In [4]:
suppressPackageStartupMessages({
  library(readr); library(dplyr); library(tidyr); library(tibble)
  library(purrr); library(ggplot2)
  library(msigdbr); library(fgsea); library(data.table)
})

# =========================
# BLOCK 1: load GSEA + collapse pathways per pair
# =========================

# ---- load GSEA matrices and align pathway universe ----
# Each CSV becomes a numeric matrix with pathway names as row names
# (see read_wide_matrix()).
ae_nes   <- read_wide_matrix(fd$gsea$paths$ae_nes_csv)
nmf_nes  <- read_wide_matrix(fd$gsea$paths$nmf_nes_csv)
ae_padj  <- read_wide_matrix(fd$gsea$paths$ae_padj_csv)
nmf_padj <- read_wide_matrix(fd$gsea$paths$nmf_padj_csv)

# Pathways present in all four matrices, in the order they appear in ae_nes.
pw_common <- Reduce(
  intersect,
  lapply(list(ae_nes, nmf_nes, ae_padj, nmf_padj), rownames)
)

# Restrict every matrix to that shared set so rows line up across matrices.
ae_nes   <- ae_nes[pw_common, , drop = FALSE]
nmf_nes  <- nmf_nes[pw_common, , drop = FALSE]
ae_padj  <- ae_padj[pw_common, , drop = FALSE]
nmf_padj <- nmf_padj[pw_common, , drop = FALSE]

# Matched AE/NMF factor pairs; the columns used below are
# ae_index, nmf_index and cosine.
bp <- fd$matching$best_pairs
# alpha <- if (!is.null(fd$alpha)) fd$alpha else 0.01

# -log10 of p-values, with p clamped from below at `floor` so that zeros do
# not become Inf; non-finite inputs (NA, NaN, +/-Inf) map to 0.
.safe_log10p <- function(p, floor = 1e-300) {
  usable <- is.finite(p)
  ifelse(usable, -log10(pmax(p, floor)), 0)
}

# ---- build pathway2genes (C5; toggle GO:BP-only if you want) ----
USE_BP_ONLY <- FALSE  # TRUE for GO:BP only, FALSE for all C5

# One cache file per gene-set scope, stored next to the GSEA outputs.
p2g_cache <- file.path(
  gsea_dir,
  if (USE_BP_ONLY) {
    "pathway2genes_GO_BP_pwcommon.rds"
  } else {
    "pathway2genes_C5_pwcommon.rds"
  }
)

# NOTE(review): the cached list only holds pathways that were in `pw_common`
# when the file was written and is loaded without re-checking; remove the
# cache file if the GSEA inputs change.
if (!file.exists(p2g_cache)) {
  # Cache miss: pull the gene sets from msigdbr, keep pathway name and
  # upper-cased gene symbol, and restrict to the shared pathway universe.
  MSIG <- if (USE_BP_ONLY) {
    msigdbr(species = "Homo sapiens",
            category = "C5", subcategory = "GO:BP")
  } else {
    msigdbr(species = "Homo sapiens", category = "C5")
  }
  MSIG <- dplyr::transmute(MSIG, pathway = gs_name, gene = toupper(gene_symbol))
  MSIG_filt <- dplyr::filter(MSIG, pathway %in% pw_common)

  # Named list: pathway -> character vector of its genes (empty sets dropped).
  pathway2genes <- split(MSIG_filt$gene, MSIG_filt$pathway)
  pathway2genes <- pathway2genes[lengths(pathway2genes) > 0]
  saveRDS(pathway2genes, p2g_cache)
} else {
  pathway2genes <- readRDS(p2g_cache)
}

# ---- helpers for collapse ----
# Gene-level weight vector for one factor of the chosen model.
#
# Arguments:
#   model         "AE" (default) or "NMF"
#   factor_index  column of the model's weight matrix to extract
#   fd            figdata list; reads fd$weights$W_full (ae / nmf matrices and
#                 the matching genes_ae / genes_nmf name vectors)
#
# Returns a numeric vector named by upper-cased gene symbol, with non-finite
# entries removed.
get_stats_vec <- function(model = c("AE","NMF"), factor_index, fd) {
  model  <- match.arg(model)
  w_full <- fd$weights$W_full

  # Pick the weight matrix and gene labels that belong to the model.
  src <- switch(
    model,
    AE  = list(mat = w_full$ae,  genes = w_full$genes_ae),
    NMF = list(mat = w_full$nmf, genes = w_full$genes_nmf)
  )

  weights <- as.numeric(src$mat[, factor_index])
  names(weights) <- toupper(src$genes)
  weights[is.finite(weights)]
}

# Collapse the combined AE+NMF set of significant pathways for one matched pair.
#
# Arguments:
#   ai, ni             column selectors into the AE / NMF matrices; `ai` is
#                      also the factor index handed to get_stats_vec("AE", ...)
#   ae_padj, nmf_padj  p-value matrices with pathway names as row names
#   ae_nes, nmf_nes    NES matrices; values are looked up by the row names of
#                      `ae_padj`, so all four are expected to share row names
#   pathway2genes      named list: pathway -> character vector of gene symbols
#   pval_threshold     significance cutoff; also forwarded to
#                      fgsea::collapsePathways() as pval.threshold
#   fd                 figdata list, only read through get_stats_vec()
#
# Returns a character vector with the "main" (non-redundant) pathways reported
# by fgsea::collapsePathways(), or character(0) when nothing qualifies.
collapse_union_for_pair <- function(ai, ni,
                                    ae_padj, nmf_padj,
                                    ae_nes,  nmf_nes,
                                    pathway2genes,
                                    pval_threshold = 0.1,
                                    fd) {

  # per-pathway vectors for this pair
  ae_p_all    <- ae_padj[, ai, drop = TRUE]
  nmf_p_all   <- nmf_padj[, ni, drop = TRUE]
  ae_nes_all  <- ae_nes[,  ai, drop = TRUE]
  nmf_nes_all <- nmf_nes[, ni, drop = TRUE]

  # union of pathways significant in AE or NMF at p <= pval_threshold
  sig_mask <- (is.finite(ae_p_all) & ae_p_all <= pval_threshold) |
    (is.finite(nmf_p_all) & nmf_p_all <= pval_threshold)

  if (!any(sig_mask)) return(character(0))

  pw <- names(ae_p_all)[sig_mask]

  # Build a combined fgsea-like table: one row per pathway
  df_sig <- tibble::tibble(
    pathway = pw,
    AE_p    = unname(ae_p_all[pw]),
    NMF_p   = unname(nmf_p_all[pw]),
    AE_NES  = unname(ae_nes_all[pw]),
    NMF_NES = unname(nmf_nes_all[pw])
  ) |>
    dplyr::mutate(
      # best p-value across AE & NMF, handling NAs explicitly
      pval = dplyr::case_when(
        is.finite(AE_p) & is.finite(NMF_p) ~ pmin(AE_p, NMF_p),
        is.finite(AE_p)                    ~ AE_p,
        is.finite(NMF_p)                   ~ NMF_p,
        TRUE                               ~ NA_real_
      ),
      # NES from the model with larger |NES|. When only one model has a finite
      # NES, use that one; a plain ifelse() would give NA here and the pathway
      # would be dropped by the filter below even though it is significant in
      # the other model.
      NES = dplyr::case_when(
        is.finite(AE_NES) & is.finite(NMF_NES) ~
          ifelse(abs(AE_NES) >= abs(NMF_NES), AE_NES, NMF_NES),
        is.finite(AE_NES)                      ~ AE_NES,
        is.finite(NMF_NES)                     ~ NMF_NES,
        TRUE                                   ~ NA_real_
      ),
      # collapsePathways() needs an ES column; only its sign is supplied here
      ES = ifelse(NES >= 0, 1, -1)
    ) |>
    dplyr::filter(is.finite(pval), is.finite(NES)) |>
    dplyr::arrange(pval)

  if (nrow(df_sig) == 0L) return(character(0))

  dt <- data.table::as.data.table(df_sig[, c("pathway", "pval", "NES", "ES")])

  # Use AE stats as reference for collapse (could also choose NMF)
  stats_vec     <- get_stats_vec("AE", ai, fd)
  gene_universe <- names(stats_vec)

  # Only the candidate pathways are needed, so intersect gene sets for those
  # rather than for the whole collection on every pair.
  candidates <- pathway2genes[names(pathway2genes) %in% df_sig$pathway]
  p2g <- lapply(candidates, function(gg) intersect(gg, gene_universe))
  p2g <- p2g[lengths(p2g) >= 1]

  # Drop pathways that have no genes in the AE gene universe.
  dt <- dt[pathway %in% names(p2g)]
  if (nrow(dt) == 0L) return(character(0))

  # NOTE(review): collapsePathways() appears to re-run enrichment internally,
  # so results may depend on the RNG state - confirm the seed is set upstream.
  cp <- fgsea::collapsePathways(
    fgseaRes = dt,
    pathways = p2g,
    stats    = stats_vec,
    pval.threshold = pval_threshold
  )

  # return non-redundant "main" pathways from this combined set
  intersect(cp$mainPathways, df_sig$pathway)
}


# ---- which pairs to use ----
top_k_pairs <- nrow(bp)   # all matched pairs
# Row indices of `bp`, best cosine match first.
sel_idx     <- order(bp$cosine, decreasing = TRUE)[seq_len(top_k_pairs)]

# use 0.1 for this analysis (per Zach)
pval_collapse <- 0.1

# One row per (pair, collapsed pathway), labelled by the model(s) in which the
# pathway passes the threshold.
collapsed_panels <- purrr::map_dfr(
  seq_along(sel_idx),
  function(j) {
    i  <- sel_idx[j]
    ai <- bp$ae_index[i]
    ni <- bp$nmf_index[i]

    # 1) collapse the combined AE+NMF union at p <= pval_collapse
    keep_union <- collapse_union_for_pair(
      ai, ni,
      ae_padj, nmf_padj,
      ae_nes,  nmf_nes,
      pathway2genes,
      pval_threshold = pval_collapse,
      fd = fd
    )

    # Nothing survived for this pair: contribute no rows.
    if (length(keep_union) == 0L) {
      return(tibble::tibble())
    }

    # 2) classify each collapsed pathway as AE-only / NMF-only / Both
    tibble::tibble(
      pair_rank = j,         # 1..top_k_pairs
      ae_index  = ai,
      nmf_index = ni,
      pathway   = keep_union,
      in_ae     = is.finite(ae_padj[pathway, ai])  &
                  ae_padj[pathway, ai]  <= pval_collapse,
      in_nmf    = is.finite(nmf_padj[pathway, ni]) &
                  nmf_padj[pathway, ni] <= pval_collapse
    ) |>
      dplyr::mutate(
        panel = dplyr::case_when(
          in_ae & in_nmf ~ "Both",
          in_ae & !in_nmf ~ "AE",
          !in_ae & in_nmf ~ "NMF",
          TRUE ~ NA_character_
        )
      ) |>
      dplyr::filter(!is.na(panel))
  }
)

# Derive the file name from the threshold actually used, so the label cannot
# drift from `pval_collapse`; for 0.1 this yields the same
# "collapsed_panels_allPairs_alpha0.1.rds" as before.
collapse_rds <- file.path(
  gsea_dir,
  paste0(
    "collapsed_panels_allPairs_alpha",
    format(pval_collapse, scientific = FALSE),
    ".rds"
  )
)
saveRDS(collapsed_panels, collapse_rds)
message("[OK] saved collapsed_panels to: ", collapse_rds)

# (optional) save to disk if you want to reuse in another script:
# saveRDS(collapsed_panels, file.path(gsea_dir, "collapsed_panels_pairs.rds"))