In [1]:
# Setup

# Use a personal library (no sudo needed)
dir.create("~/Rlibs", recursive = TRUE, showWarnings = FALSE)
.libPaths(c("~/Rlibs", .libPaths()))

# Make sure CRAN is set (some HPC images don’t have it)
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install the missing CRAN dep
install.packages("S7")

# Install Bioconductor manager if needed
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# Install fgsea + BiocParallel from Bioconductor
BiocManager::install(c("fgsea", "BiocParallel"), ask = FALSE, update = TRUE)

# (Optional but often needed)
install.packages(c("readr","dplyr","tidyr","ggplot2"))

Installing package into ‘/mnt/home/bisholea/Rlibs’
(as ‘lib’ is unspecified)

“installation of package ‘S7’ had non-zero exit status”
'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cloud.r-project.org

Bioconductor version 3.20 (BiocManager 1.30.26), R 4.4.1 (2024-06-14)

“package(s) not installed when version(s) same as or greater than current; use
  `force = TRUE` to re-install: 'fgsea' 'BiocParallel'”
Old packages: 'magrittr', 'Matrix', 'reticulate', 'SeuratObject', 'tibble',
  'utf8'

“installation of package ‘magrittr’ had non-zero exit status”
“installation of package ‘Matrix’ had non-zero exit status”
“installation of package ‘utf8’ had non-zero exit status”
“installation of package ‘reticulate’ had non-zero exit status”
“installation of package ‘SeuratObject’ had non-zero exit status”
“installation of package ‘tibble’ had non-zero exit status”
Installi

In [2]:
#!/usr/bin/env Rscript
suppressPackageStartupMessages({
  library(readr); library(dplyr); library(tidyr); library(tibble); library(purrr)
  library(ggplot2); library(Matrix); library(clue); library(stringr)
})

set.seed(1)

# =========================
# 0) Edit me: input paths
# =========================
capstone_dir <- "/mnt/projects/debruinz_project/bisholea/capstone"
gsea_dir     <- file.path(capstone_dir, "gsea")

# W/H + common-gene preproc (has ae_W_common, nmf_W_common)
preproc_rds  <- file.path(gsea_dir, "common_gene_preproc.rds")

# GSEA matrices (pathways × F1..Fk); first col (or 'pathway') is pathway ID
ae_nes_csv   <- file.path(gsea_dir, "ae_NES_matrix_allFactors_common_genes.csv")
nmf_nes_csv  <- file.path(gsea_dir, "nmf_NES_matrix_allFactors_common_genes.csv")
ae_padj_csv  <- file.path(gsea_dir, "ae_padj_matrix_allFactors_common_genes.csv")
nmf_padj_csv <- file.path(gsea_dir, "nmf_padj_matrix_allFactors_common_genes.csv")

# Optional factor-level counts you already use for the boxplot
ae_counts_csv  <- file.path(gsea_dir, "ae_factor_gsea_counts_common_genes.csv")
nmf_counts_csv <- file.path(gsea_dir, "nmf_factor_gsea_counts_common_genes.csv")

stopifnot(file.exists(preproc_rds),
          file.exists(ae_nes_csv), file.exists(nmf_nes_csv),
          file.exists(ae_padj_csv), file.exists(nmf_padj_csv))

# =========================
# 1) Helpers
# =========================
read_wide_matrix <- function(path, pathway_col = "pathway") {
  header <- readr::read_lines(path, n_max = 1)
  cols   <- strsplit(header, ",", fixed = TRUE)[[1]]
  pw_col <- if (pathway_col %in% cols) pathway_col else cols[1]
  other  <- setdiff(cols, pw_col)
  ct <- do.call(readr::cols, c(
    setNames(list(readr::col_character()), pw_col),
    setNames(rep(list(readr::col_double()), length(other)), other)
  ))
  df <- readr::read_csv(path, col_types = ct,
                        na = c("", "NA", "NaN", "nan", "NULL", "Inf", "-Inf"),
                        guess_max = 100000, show_col_types = FALSE)
  rn <- df[[pw_col]]; df[[pw_col]] <- NULL
  m <- as.matrix(df); rownames(m) <- rn; m
}

collapse_dups <- function(M) {
  # average duplicate gene rows (by rowname)
  spl <- split(seq_len(nrow(M)), rownames(M))
  out <- t(vapply(spl, function(idx) colMeans(M[idx, , drop = FALSE]), numeric(ncol(M))))
  colnames(out) <- colnames(M); out
}

l2n <- function(M) {           # column L2 normalize (safe)
  n <- sqrt(colSums(M^2))
  n[n == 0 | !is.finite(n)] <- 1
  sweep(M, 2, n, "/")
}

mlog10 <- function(p, floor = 1e-300) -log10(pmax(p, floor))

# =========================
# 2) Load W (common-gene)
# =========================
pp <- readRDS(preproc_rds)
# Expect: pp$ae_W_common, pp$nmf_W_common, pp$ae_genes_common, pp$nmf_genes_common
ae_W  <- as.matrix(pp$ae_W_common)
nmf_W <- as.matrix(pp$nmf_W_common)
rownames(ae_W)  <- pp$ae_genes_common
rownames(nmf_W) <- pp$nmf_genes_common

# collapse duplicate gene names (rare but safer)
ae_W  <- collapse_dups(ae_W)
nmf_W <- collapse_dups(nmf_W)

# align genes
genes_common <- intersect(rownames(ae_W), rownames(nmf_W))
ae_W  <- ae_W [genes_common, , drop = FALSE]
nmf_W <- nmf_W[genes_common, , drop = FALSE]

# cosine similarity matrix (AE × NMF)
S <- t(l2n(ae_W)) %*% l2n(nmf_W)

# one-to-one matches via Hungarian (maximize cosine)
ae_n <- nrow(S); nm_n <- ncol(S)
pad_k <- max(0, ae_n - nm_n)
S_pad <- if (pad_k > 0) cbind(S, matrix(0, nrow = ae_n, ncol = pad_k)) else S
assign_col <- as.integer(clue::solve_LSAP(S_pad, maximum = TRUE))
is_real    <- assign_col <= nm_n
ae_idx     <- which(is_real)
nm_idx     <- assign_col[is_real]
match_cos  <- S[cbind(ae_idx, nm_idx)]

pairs_df <- tibble(ae_index = ae_idx,
                   nmf_index = nm_idx,
                   cosine   = as.numeric(match_cos))

# =========================
# 2A) Load FULL (pre-subset) W from CSVs + compute pre-subset norms
# =========================
# Files live in the same gsea_dir
ae_var_csv  <- file.path(gsea_dir, "ae_adata_var_metadata_unique.csv")
ae_W_csv    <- file.path(gsea_dir, "ae_adata_var_embeddings_unique.csv")
nmf_var_csv <- file.path(gsea_dir, "als_nmf_var_metadata_unique.csv")
nmf_W_csv   <- file.path(gsea_dir, "als_nmf_var_embeddings_unique.csv")

stopifnot(file.exists(ae_var_csv),  file.exists(ae_W_csv),
          file.exists(nmf_var_csv), file.exists(nmf_W_csv))

# ---- AE full ----
ae_var_full <- readr::read_csv(ae_var_csv, show_col_types = FALSE)
ae_W_fulldf <- readr::read_csv(ae_W_csv,   show_col_types = FALSE)
ae_genes_full <- toupper(stringr::str_trim(ae_var_full$feature_name))
stopifnot(nrow(ae_W_fulldf) == length(ae_genes_full))
ae_W_full <- as.matrix(ae_W_fulldf)
rownames(ae_W_full) <- ae_genes_full
ae_W_full <- collapse_dups(ae_W_full)

# ---- NMF full ----
nmf_var_full <- readr::read_csv(nmf_var_csv, show_col_types = FALSE)
nmf_W_fulldf <- readr::read_csv(nmf_W_csv,   show_col_types = FALSE)
nmf_genes_full <- toupper(stringr::str_trim(nmf_var_full$feature_name))
stopifnot(nrow(nmf_W_fulldf) == length(nmf_genes_full))
nmf_W_full <- as.matrix(nmf_W_fulldf)
rownames(nmf_W_full) <- nmf_genes_full
nmf_W_full <- collapse_dups(nmf_W_full)

# store sparse versions to save RDS space
ae_W_full_sp  <- Matrix::Matrix(ae_W_full,  sparse = TRUE)
nmf_W_full_sp <- Matrix::Matrix(nmf_W_full, sparse = TRUE)


# ---- pre-subset norms (L1/L2) ----
l1 <- function(M) colSums(abs(M))
l2 <- function(M) sqrt(colSums(M^2))

l1norm_df_pre <- dplyr::bind_rows(
  tibble(model = "AE",  factor = colnames(ae_W_full),  l1 = l1(ae_W_full)),
  tibble(model = "NMF", factor = colnames(nmf_W_full), l1 = l1(nmf_W_full))
) |> mutate(model = factor(model, levels = c("AE","NMF")))

l2norm_df_pre <- dplyr::bind_rows(
  tibble(model = "AE",  factor = colnames(ae_W_full),  l2 = l2(ae_W_full)),
  tibble(model = "NMF", factor = colnames(nmf_W_full), l2 = l2(nmf_W_full))
) |> mutate(model = factor(model, levels = c("AE","NMF")))

# =========================
# 3) Load GSEA matrices
# =========================
ae_nes  <- read_wide_matrix(ae_nes_csv)
nmf_nes <- read_wide_matrix(nmf_nes_csv)
ae_padj <- read_wide_matrix(ae_padj_csv)
nmf_padj<- read_wide_matrix(nmf_padj_csv)

# align pathway universe across NES & padj
pw_common <- Reduce(intersect, list(rownames(ae_nes), rownames(nmf_nes),
                                    rownames(ae_padj), rownames(nmf_padj)))
ae_nes   <- ae_nes [pw_common, , drop = FALSE]
nmf_nes  <- nmf_nes[pw_common, , drop = FALSE]
ae_padj  <- ae_padj[pw_common, , drop = FALSE]
nmf_padj <- nmf_padj[pw_common, , drop = FALSE]

# precompute -log10(padj) 
ae_L  <- mlog10(ae_padj)
nmf_L <- mlog10(nmf_padj)

# =========================
# 4) Per-pair R² (NES) and R²(-log10 padj) on all pathways
# =========================
r2_pearson <- function(x, y) {
  ok <- is.finite(x) & is.finite(y)
  if (sum(ok) < 3) return(NA_real_)
  r <- suppressWarnings(cor(x[ok], y[ok], method = "pearson"))
  as.numeric(r * r)
}

# R^2 over all pathways for each matched pair
r2_tbl <- tibble(
  ae_index  = pairs_df$ae_index,
  nmf_index = pairs_df$nmf_index
) |>
  mutate(
    R2_NES = map2_dbl(ae_index, nmf_index,
                      ~ r2_pearson(ae_nes[, .x],  nmf_nes[, .y])),
    R2_mlog10_all = map2_dbl(ae_index, nmf_index,
                      ~ r2_pearson(ae_L[, .x],    nmf_L[, .y]))
  ) |>
  bind_cols(select(pairs_df, cosine))

# distribution summaries (useful for captions)
r2_summary <- list(
  NES = quantile(r2_tbl$R2_NES,        probs = c(.10,.25,.50,.75,.90), na.rm = TRUE),
  mlog10_all = quantile(r2_tbl$R2_mlog10_all, probs = c(.10,.25,.50,.75,.90), na.rm = TRUE)
)

# =========================
# 5) Factor-level summaries (optional but handy)
# =========================
# If you use counts CSVs for the AE/NMF boxplot:
ae_counts <- if (file.exists(ae_counts_csv)) read_csv(ae_counts_csv, show_col_types = FALSE) else NULL
nmf_counts<- if (file.exists(nmf_counts_csv)) read_csv(nmf_counts_csv, show_col_types = FALSE) else NULL

# Sparsity (% zeros) per factor (using common-gene W)
pct_zero <- function(M) colMeans(M == 0) * 100
sparsity_df <- bind_rows(
  tibble(model = "AE",  factor = colnames(ae_W),  pct_zero = pct_zero(ae_W)),
  tibble(model = "NMF", factor = colnames(nmf_W), pct_zero = pct_zero(nmf_W))
) |>
  mutate(model = factor(model, levels = c("AE","NMF")))

# L2 norms per factor (orthonormality signal)
l2norm_df <- bind_rows(
  tibble(model = "AE",  factor = colnames(ae_W),  l2 = sqrt(colSums(ae_W^2))),
  tibble(model = "NMF", factor = colnames(nmf_W), l2 = sqrt(colSums(nmf_W^2)))
) |>
  mutate(model = factor(model, levels = c("AE","NMF")))

# =========================
# 6) Pack and save figdata
# =========================
palettes <- list(
  model = c(AE = viridisLite::viridis(7)[4], NMF = "grey65")
)

figdata <- list(
  alpha = 0.01,                      # used elsewhere if needed
  matching = list(
    best_pairs = pairs_df,           # ae_index, nmf_index, cosine
    cosine_mat = as.data.frame(as.matrix(S))  # optional; large but handy
  ),
  gsea = list(
    # Paths to big matrices (we’re not embedding them to keep RDS light)
    paths = list(
      ae_nes_csv = ae_nes_csv, nmf_nes_csv = nmf_nes_csv,
      ae_padj_csv = ae_padj_csv, nmf_padj_csv = nmf_padj_csv
    ),
    # Core summaries you’ll plot
    r2_pairs = r2_tbl,               # one row per matched pair
    r2_summary = r2_summary          # quantiles for captions
  ),
  weights = list(
    # existing common-gene summaries
    sparsity = sparsity_df,
    l2norms  = l2norm_df,

    # NEW: pre-subset summaries
    l1norms_pre = l1norm_df_pre,
    l2norms_pre = l2norm_df_pre,

    # NEW: matrices themselves (store sparse to keep file size down)
    W_full = list(
      ae  = ae_W_full_sp,
      nmf = nmf_W_full_sp,
      genes_ae  = rownames(ae_W_full),
      genes_nmf = rownames(nmf_W_full)
    )
  ),
  counts = list(                      # optional, used in your first boxplot
    ae = ae_counts, nmf = nmf_counts
  ),
  meta = list(
    created_at = format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"),
    preproc_rds = preproc_rds,
    genes_common = genes_common
  ),
  palettes = palettes
)

figdata$style <- list(
  theme = function() {
    ggplot2::theme_classic(base_size = 12, base_family = "Helvetica") +
      ggplot2::theme(
        panel.grid = ggplot2::element_blank(),
        plot.title.position = "plot",
        legend.title = ggplot2::element_blank()
      )
  }
)

figdata$meta$schema_version <- "1.0"
out_rds <- file.path(gsea_dir, "figdata.rds")
saveRDS(figdata, out_rds)
message("[OK] wrote: ", out_rds)


“package ‘purrr’ was built under R version 4.4.3”
“package ‘stringr’ was built under R version 4.4.3”
[OK] wrote: /mnt/projects/debruinz_project/bisholea/capstone/gsea/figdata.rds

