# Benchmarking PCA approach to filtering for joint dimensionality reduction
Author: Romana T. Pop^1^

1. Centre for Molecular Medicine Norway (NCMM), Faculty of Medicine, University of Oslo, Oslo, Norway

## Introduction
One of the challenges of multi-omics integration is the different dimensionalities of the various omics data types which can lead to the results being driven by the difference in dimensionality between omics layers, rather than by biological signal.

Therefore, omics data must generally be filtered prior to JDR. A few filtering strategies are commonly used, such as filtering to the top *n* most variable features or removing non-variable features. However, these approaches may not always lead to an equal proportion of variability being retained from each omics layer.

One approach to filtering in a more data-driven manner is to perform principal component analysis (PCA) on the data prior to JDR and use a variance threshold to filter the data.

Here, we compare the data and models of four JDR tools with and without PCA.

First, we load the libraries we will need and set some parameters for the analysis.

In [None]:
# ensure environment is clean
rm(list=ls())

# install MARMOT
# uncomment if not installed
#library(devtools)
#install_github("rtpop/MARMOT")

# load libraries
library(MARMOT)
library(tidyverse)
library(preprocessCore)
library(RColorBrewer)
library(gridExtra)
library(reshape2)
library(MOFA2)

In [None]:
# Global parameters for the analysis. Every cell below reads these names, so
# run this cell before any other.

# some intermediate files are provided for ease, set this parameter to FALSE
# if you do not wish to use them and wish to compute them again instead
# (when FALSE, the JDR factorisation cell further down is executed)
precomputed <- TRUE

# set this parameter to FALSE if you do not want any intermediate files to be saved
# note that some intermediate files are necessary and will be saved regardless
# NOTE(review): no cell visible in this part of the notebook reads this flag --
# confirm where it is consumed
intermediates <- TRUE

# setting working directory
# NOTE: setwd() changes the working directory for the rest of the session, so
# every relative path below is resolved inside `wd`. Later cells additionally
# prefix some input paths with `wd` (file.path(wd, ...)); after this setwd()
# call those paths point to a `results` folder nested inside the working
# directory -- TODO confirm this matches the on-disk layout
wd <- "results"
setwd(wd)

# specify where plots should be saved
figure_dir <- "figures"

# specify directory for logs to be saved
log_dir <- "logs"

# specify directory for the benchmarking to be saved
# (factorisations, Cox models and the combined survival table are read from here)
benchmark_dir <- "benchmark"

# specify directory for results to be saved
tcga_res <- "TCGA"
gep_res <- "GEPliver"

# defining vector of cancer names for which to do the analysis
# these strings are used verbatim to build input and output file names
cancers_tcga <- c("aml", "breast", "colon", "gbm", "kidney", "liver", "lung",
          "melanoma", "ovarian", "sarcoma")

# defining names for the JDR models that we will run
# NOTE(review): not referenced by the cells visible here; presumably used later
model <- c("nonet", "indeg", "out", "both")

We compare the number of features in each omic with and without PCA (reproducing Figures 2 & S2 from the paper). 

In [None]:
# Compare the number of features per omic with and without PCA filtering:
# one bar plot per cancer (saved individually) plus a grid of all cancers.

# Input files holding the omics data without / with PCA filtering
omics_files_no_pca <- file.path(wd, paste0("tcga_", cancers_tcga, "_omics_no_pca.Rda"))
omics_files_pca <- file.path(wd, paste0("tcga_", cancers_tcga, "_omics_pca.Rda"))

# Make sure the figure directory exists before anything is written to it
dir.create(figure_dir, showWarnings = FALSE, recursive = TRUE)
save_paths <- file.path(figure_dir, paste0(cancers_tcga, "_data_dim_compare.pdf"))

# Pair the files per cancer as c(PCA, no PCA); the order must match the
# `data_labels` passed to plot_data_dim() below
omics_files <- Map(function(pca, no_pca) c(pca, no_pca), omics_files_pca, omics_files_no_pca)

# Build and save one plot per cancer. Map() names its result after the first
# iterated argument when that is a character vector, so `p_list` is keyed by
# cancer name without any pre-allocation.
p_list <- Map(function(cancer, omics_file, save_path) {
  message("Processing: ", cancer)

  # Bar plot of feature counts, PCA vs no PCA
  p <- plot_data_dim(data = omics_file, data_labels = c("PCA", "no PCA"),
                     log_x = FALSE, title = cancer, compare = TRUE)

  # Spell out `filename`/`plot` instead of relying on partial matching of `file`
  ggsave(filename = save_path, plot = p, height = 20, width = 20)

  p
}, cancers_tcga, omics_files, save_paths)

# Combine all plots into a grid with three columns
grid_plot <- grid.arrange(grobs = p_list, ncol = 3)
print(grid_plot)
ggsave(filename = file.path(figure_dir, "data_dim_compare_all_can.pdf"),
       plot = grid_plot, height = 45, width = 40)

Next we benchmark the performance of four JDR tools (MOFA+, JIVE, MCIA and RGCCA) on the data with and without PCA. 

In [None]:
# Recompute the JDR factorisations (with and without PCA) for every cancer.
# Skipped when the precomputed intermediate files are used.
if (!precomputed) {
  # The working directory is already `wd`, and every downstream cell reads the
  # factorisations from `benchmark_dir` relative to it. Write them to that same
  # location so freshly computed results are the ones that get picked up.
  dir.create(benchmark_dir, showWarnings = FALSE, recursive = TRUE)

  # Input omics files and output factorisation files, one per cancer
  omics_files_no_pca <- file.path(wd, paste0("tcga_", cancers_tcga, "_omics_no_pca.Rda"))
  omics_files_pca <- file.path(wd, paste0("tcga_", cancers_tcga, "_omics_pca.Rda"))
  save_files_pca <- file.path(benchmark_dir, paste0(cancers_tcga, "_factorisations_pca.Rda"))
  save_files_no_pca <- file.path(benchmark_dir, paste0(cancers_tcga, "_factorisations_no_pca.Rda"))

  # Iterate over cancers; only the saved files matter, so the list returned by
  # Map() is discarded instead of being echoed
  invisible(Map(function(cancer, omics_file_pca, omics_file_no_pca, save_file_pca, save_file_no_pca) {
    print(cancer)

    # Run the factorisations with a fixed seed so both runs are reproducible
    factorizations_pca <- run_jdr(omic_list = omics_file_pca, seed = 13)
    factorizations_no_pca <- run_jdr(omic_list = omics_file_no_pca, seed = 13)

    # save() stores objects under their variable names; keep these names stable
    save(factorizations_pca, file = save_file_pca)
    save(factorizations_no_pca, file = save_file_no_pca)

  }, cancers_tcga, omics_files_pca, omics_files_no_pca, save_files_pca, save_files_no_pca))
}

We perform univariate Cox regression for each factor to assess its association with survival. The code below reproduces Figure S3 from the manuscript.

In [None]:
# Univariate Cox regression of every factor against survival, for each cancer
# and each JDR method, comparing factorisations with and without PCA.
# Writes one Cox-model file per cancer/method and one combined table.

# Input files: factorisations (with / without PCA) and survival data
pca_files <- file.path(benchmark_dir, paste0(cancers_tcga, "_factorisations_pca.Rda"))
no_pca_files <- file.path(benchmark_dir, paste0(cancers_tcga, "_factorisations_no_pca.Rda"))
surv_files <- file.path(wd, paste0(cancers_tcga, "_surv.Rda"))

# Load the single object stored in an .Rda file and return it.
# Loading into a private environment keeps the stored name from overwriting
# variables of the caller; a file with zero or several objects is rejected
# with an explicit message.
load_single_object <- function(path) {
  env <- new.env(parent = emptyenv())
  obj_names <- load(path, envir = env)
  if (length(obj_names) != 1L) {
    stop("Expected exactly one object in '", path, "', found ",
         length(obj_names), call. = FALSE)
  }
  env[[obj_names]]
}

# Process each cancer type; the result is a list keyed by cancer name, each
# element a list keyed by method holding the comparison table and Cox models
results <- Map(function(cancer, pca_file, no_pca_file, surv_file) {
  message("Processing: ", cancer)

  fct_pca <- load_single_object(pca_file)
  fct_no_pca <- load_single_object(no_pca_file)
  surv <- load_single_object(surv_file)

  result_list <- list()

  for (method in names(fct_pca)) {
    # The MOFA entry is a list; its first element is what is passed on
    if (method == "MOFA") {
      pca_fct <- fct_pca[[method]][[1]]
      nopca_fct <- fct_no_pca[[method]][[1]]
    } else {
      pca_fct <- fct_pca[[method]]
      nopca_fct <- fct_no_pca[[method]]
    }

    # Survival association of the factors
    PCA_cox <- surv_association(pca_fct, surv, univariate = TRUE)
    noPCA_cox <- surv_association(nopca_fct, surv, univariate = TRUE)

    df <- surv_compare(models = list(PCA_cox, noPCA_cox),
                       model_labels = c("PCA", "no_PCA"),
                       univariate = TRUE, method = "BH")

    df$cancer <- cancer
    df$method <- method

    result_list[[method]] <- list(df = df, cox_models = list(PCA_cox, noPCA_cox))
  }

  result_list
}, cancers_tcga, pca_files, no_pca_files, surv_files)

# Save the Cox models per cancer/method and collect the comparison tables
surv_tables <- list()
for (cancer in cancers_tcga) {
  for (method in names(results[[cancer]])) {
    res <- results[[cancer]][[method]]
    surv_tables[[length(surv_tables) + 1L]] <- res[["df"]]

    # save() stores the object under its variable name, so keep `cox_all`
    cox_all <- res[["cox_models"]]
    names(cox_all) <- c("PCA", "no_PCA")

    save(cox_all, file = file.path(benchmark_dir, paste0(cancer, "_cox_models_", method, ".Rda")))
  }
}

# Bind all tables once instead of growing the data frame row block by row block
surv_df <- if (length(surv_tables) > 0) {
  do.call(rbind, surv_tables)
} else {
  data.frame()
}

save(surv_df, file = file.path(benchmark_dir, "TCGA_surv_all_bench.Rda"))

In [None]:
# Plot the survival comparison (PCA vs no PCA) as one dot plot per JDR method.

# Restores `surv_df` from the combined benchmarking table
load(file.path(benchmark_dir, "TCGA_surv_all_bench.Rda"))

method <- unique(surv_df$method)

# palette("Dark2") would return the *previous* palette and change the global
# one as a side effect; fetch the Dark2 colours directly instead
cols <- RColorBrewer::brewer.pal(8, "Dark2")

# Make sure the figure directory exists before saving
dir.create(figure_dir, showWarnings = FALSE, recursive = TRUE)

for (i in method) {
  # Rows belonging to the current method
  surv_meth <- surv_df[which(surv_df$method == i), ]
  models <- unique(surv_meth$label)
  p <- surv_compare_dotplot(surv_df = surv_meth, models_to_compare = models,
                            colours = c(cols[8], "grey", cols[6]))

  # Spell out `filename`/`plot` instead of relying on partial matching of `file`
  ggsave(filename = file.path(figure_dir, paste0("surv_compare_PCA_", i, ".pdf")),
         plot = p)
}