In [None]:
# Load required libraries
library(tidyverse)
library(pheatmap)
library(dplyr)
library(ggplot2)

In [None]:
# Set input/output directories
# cox_dir is searched for the per-cancer Cox result tables, swr_dir for the
# gene-set .txt files, and out_dir receives every table and figure written
# by this notebook.
cox_dir <- "~/Downloads/Annan_Project/Survival/TCGA_cancer_protcod/"
swr_dir <- "~/Downloads/Annan_Project/Survival/TCGA_cancer_protcod/HP_genesets/"
out_dir <- "~/Downloads/Annan_Project/Survival/TCGA_cancer_protcod/results"

# recursive = TRUE also creates missing parent directories. showWarnings = FALSE
# silences the "already exists" warning, so verify the outcome explicitly
# instead of failing later inside write.table() / pdf().
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
if (!dir.exists(out_dir)) {
  stop("Could not create output directory: ", out_dir, call. = FALSE)
}

In [None]:
# Load SWR gene sets (convert to uppercase)
# Every .txt file in swr_dir becomes one gene set, named after the file
# without its extension.
swr_files <- list.files(swr_dir, pattern = "\\.txt$", full.names = TRUE)
swr_sets <- lapply(swr_files, function(path) {
  # scan() splits the file on whitespace into one token per gene symbol
  symbols <- scan(path, what = "", quiet = TRUE)
  toupper(trimws(symbols))
})
names(swr_sets) <- tools::file_path_sans_ext(basename(swr_files))

In [None]:
# Fisher's exact test enrichment of one gene list against every gene set.
#
# Args:
#   gene_list:  character vector of genes of interest.
#   background: character vector defining the gene universe.
#   swr_sets:   named list of character vectors, one element per gene set.
#
# Returns:
#   The per-set results stacked with bind_rows(): one row per gene set with
#   columns GeneSet (taken from names(swr_sets)), pval (two-sided Fisher
#   p-value) and odds (conditional MLE odds ratio from fisher.test()).
compute_enrichment <- function(gene_list, background, swr_sets) {
  # Count everything inside the background universe. Genes of interest that
  # are absent from the background would otherwise inflate the
  # "in list, not in set" cell while being missing from the "neither" cell.
  gene_list <- intersect(gene_list, background)

  lapply(swr_sets, function(swr_genes) {
    swr_genes <- intersect(swr_genes, background)

    # 2x2 contingency counts (set operations also de-duplicate symbols)
    in_list_in_set <- length(intersect(gene_list, swr_genes))
    in_list_not_set <- length(setdiff(gene_list, swr_genes))
    in_set_not_list <- length(setdiff(swr_genes, gene_list))
    in_neither <- length(setdiff(background, union(gene_list, swr_genes)))

    contingency <- matrix(
      c(in_list_in_set, in_list_not_set, in_set_not_list, in_neither),
      nrow = 2
    )
    fisher <- fisher.test(contingency)

    # unname() keeps the "odds ratio" label of the estimate from becoming a
    # row name of the one-row result.
    data.frame(pval = fisher$p.value, odds = unname(fisher$estimate))
  }) %>% bind_rows(.id = "GeneSet")
}

In [None]:
# Process Cox files
# For each per-cancer Cox table: count the genes with FDR <= 0.05 and, when at
# least 50 genes pass, test the significant hazard (HR > 1) and protective
# (HR < 1) genes for enrichment in every SWR gene set.
cox_suffix <- "_protein_coding_cox.tsv"
# The dot is escaped so that it only matches a literal "."
cox_files <- list.files(cox_dir, pattern = "_protein_coding_cox\\.tsv$", full.names = TRUE)
# fixed = TRUE strips the literal suffix instead of treating it as a regex
cancer_names <- gsub(cox_suffix, "", basename(cox_files), fixed = TRUE)

# Preallocated per-file counts; the data frame is assembled once after the
# loop instead of being grown with rbind() on every iteration.
sig_n <- integer(length(cox_files))
hazard_enrichment <- list()
protective_enrichment <- list()

for (i in seq_along(cox_files)) {
  file <- cox_files[[i]]
  cancer <- cancer_names[[i]]
  df <- read.delim(file)

  missing_cols <- setdiff(c("Gene", "FDR", "HR"), names(df))
  if (length(missing_cols) > 0) {
    stop("Missing column(s) ", paste(missing_cols, collapse = ", "),
         " in ", file, call. = FALSE)
  }
  df$Gene <- toupper(trimws(df$Gene))

  sig_df <- df %>% filter(FDR <= 0.05)
  sig_n[[i]] <- nrow(sig_df)

  # Too few significant genes for a meaningful enrichment test; the count is
  # still recorded above.
  if (sig_n[[i]] < 50) next

  # All genes in the Cox table form the background universe
  background <- df$Gene

  # Strongest hazard genes first (largest HR), capped at 5000
  top_hazard <- sig_df %>%
    filter(HR > 1) %>%
    arrange(desc(HR)) %>%
    pull(Gene) %>%
    head(5000)
  hazard_enrichment[[cancer]] <- compute_enrichment(top_hazard, background, swr_sets)

  # Strongest protective genes first (smallest HR), capped at 5000
  top_protective <- sig_df %>%
    filter(HR < 1) %>%
    arrange(HR) %>%
    pull(Gene) %>%
    head(5000)
  protective_enrichment[[cancer]] <- compute_enrichment(top_protective, background, swr_sets)
}

sig_counts <- data.frame(Cancer = cancer_names, SignificantGenes = sig_n)

In [None]:
# Write the per-cancer significant-gene counts as a tab-separated table
write.table(
  sig_counts,
  file.path(out_dir, "significant_gene_counts.tsv"),
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)

In [None]:
# Convert enrichment lists to matrix
#
# Args:
#   enrichment_list: named list (one element per cancer) of data frames with
#                    a GeneSet column, a pval column and the `value` column;
#                    NULL elements are skipped.
#   value:           name of the column copied into the matrix.
#   pcut:            only rows with pval < pcut are copied.
#
# Returns:
#   Numeric matrix with one row per name in enrichment_list and one column per
#   name in the global `swr_sets`; cells that were not tested, not significant
#   or have a missing p-value stay NA.
to_matrix <- function(enrichment_list, value = "odds", pcut = 0.05) {
  all_cancers <- names(enrichment_list)
  all_genesets <- names(swr_sets)

  # NA_real_ keeps the matrix numeric even when no cell is ever filled
  mat <- matrix(
    NA_real_,
    nrow = length(all_cancers),
    ncol = length(all_genesets),
    dimnames = list(all_cancers, all_genesets)
  )

  for (i in seq_along(all_cancers)) {
    enrich <- enrichment_list[[i]]
    if (is.null(enrich)) next

    # Column position of every gene set in one vectorised lookup
    cols <- match(enrich$GeneSet, all_genesets)
    # which() drops NA comparisons, so a missing p-value leaves the cell NA
    # instead of aborting the whole conversion.
    keep <- which(enrich$pval < pcut & !is.na(cols))
    mat[i, cols[keep]] <- enrich[[value]][keep]
  }

  mat
}

In [None]:
# Heatmap plotting function (custom colors and layout)
#
# Draws `mat` as an unclustered pheatmap and writes it to a PDF inside the
# global `out_dir`.
#
# Args:
#   mat:           numeric matrix; NA cells are drawn in grey. Rows and
#                  columns that are entirely NA are removed before plotting.
#   title:         heatmap title.
#   file_name:     PDF file name, relative to out_dir.
#   order_cols:    optional character vector giving the column order; names
#                  that are not columns of the filtered matrix are ignored.
#   color_palette: colours interpolated across the unit-wide bins from 0 to 5;
#                  NULL falls back to white-to-red.
#
# Returns:
#   NULL (invisibly after a successful plot); called for its side effect.
plot_heatmap <- function(mat, title, file_name, order_cols = NULL, color_palette = NULL) {
  # drop = FALSE keeps a matrix even when only one row or column survives
  keep_rows <- rowSums(!is.na(mat)) > 0
  keep_cols <- colSums(!is.na(mat)) > 0
  mat <- mat[keep_rows, keep_cols, drop = FALSE]

  if (!is.null(order_cols)) {
    mat <- mat[, intersect(order_cols, colnames(mat)), drop = FALSE]
  }

  if (nrow(mat) < 1 || ncol(mat) < 1) {
    message("⚠️ Not enough data for heatmap: ", title)
    return(NULL)
  }

  if (is.null(color_palette)) {
    color_palette <- c("white", "red")
  }

  breaks <- c(0, 1, 2, 3, 4, 5)
  colors <- colorRampPalette(color_palette)(length(breaks) - 1)

  # Values above the last break fall outside every colour bin and would be
  # drawn in the NA colour; clamp them so they land in the top bin instead.
  mat[!is.na(mat) & mat > max(breaks)] <- max(breaks)

  pdf(file = file.path(out_dir, file_name), width = 12, height = 10)
  # Close the device even if pheatmap() fails, so no PDF device is leaked
  on.exit(dev.off(), add = TRUE)

  pheatmap(mat,
           cluster_rows = FALSE,
           cluster_cols = FALSE,
           color = colors,
           breaks = breaks,
           na_col = "grey90",
           fontsize = 14,
           main = title)

  invisible(NULL)
}

In [None]:
# Generate enrichment matrices
# One row per cancer, one column per gene set; only odds ratios whose Fisher
# p-value is below to_matrix()'s default cut-off (0.05) are kept.
# The full Fisher result tables are written at the end of the notebook, after
# save_enrichment() has been defined; calling it here would fail because the
# function does not exist yet.
hazard_mat <- to_matrix(hazard_enrichment, value = "odds")
protective_mat <- to_matrix(protective_enrichment, value = "odds")

In [None]:
# Write both odds-ratio matrices as tab-separated tables. Row names are kept,
# and col.names = NA adds a blank header cell above them.
write.table(
  hazard_mat,
  file.path(out_dir, "hazard_OR_matrix.tsv"),
  quote = FALSE,
  sep = "\t",
  col.names = NA
)
write.table(
  protective_mat,
  file.path(out_dir, "protective_OR_matrix.tsv"),
  quote = FALSE,
  sep = "\t",
  col.names = NA
)

In [None]:
# Column order used by both heatmaps; currently just the column order of the
# hazard matrix (replace with a custom character vector, or NULL, if needed)
gene_set_order <- colnames(hazard_mat)

# Colour ramps, from low to high odds ratio
hazard_colors <- c(
  "white",
  "lightblue",
  "blue"
)
protective_colors <- c(
  "white",
  "lightsalmon",
  "orange"
)

In [None]:
# Render one PDF heatmap per direction, sharing the same column order
plot_heatmap(
  hazard_mat,
  "Hazard Enrichment (OR)",
  "hazard_enrichment_heatmap.pdf",
  order_cols = gene_set_order,
  color_palette = hazard_colors
)
plot_heatmap(
  protective_mat,
  "Protective Enrichment (OR)",
  "protective_enrichment_heatmap.pdf",
  order_cols = gene_set_order,
  color_palette = protective_colors
)

In [None]:
# Save full enrichment result tables
#
# Stacks the per-cancer enrichment data frames of `list_obj` into one table,
# adding a Cancer column that holds the list name of each element, and writes
# it tab-separated to `file_name` inside the global `out_dir`.
save_enrichment <- function(list_obj, file_name) {
  # Tag one cancer's result with its name; NULL entries are passed through
  # unchanged and contribute no rows.
  tag_with_cancer <- function(cancer) {
    result <- list_obj[[cancer]]
    if (is.null(result)) {
      return(result)
    }
    result$Cancer <- cancer
    result
  }

  combined <- bind_rows(lapply(names(list_obj), tag_with_cancer))
  target <- file.path(out_dir, file_name)

  write.table(combined, file = target, sep = "\t", row.names = FALSE, quote = FALSE)
}

# Write the complete Fisher results for both directions
save_enrichment(
  list_obj = hazard_enrichment,
  file_name = "All_hazard_fisher_results.tsv"
)
save_enrichment(
  list_obj = protective_enrichment,
  file_name = "All_protective_fisher_results.tsv"
)