# Init

In [{}]:
library(readxl)
library(NearestBalance)
library(zCompositions)
library(reshape2)
library(selbal)
library(LeyLabRMisc)
library(ggpubr)
library(data.table)
library(ggplot2)
library(tibble)
library(vegan)
library(foreach)
library(doParallel)
library(lme4) 
library(textshape)
library(mgcv)
library(MASS)
library(ggsci)
library(cluster)
library(tidyr)
library(readr)
library(broom)
library(lmerTest)
library(broom.mixed)
library(furrr)
library(testthat)
library(ggrepel)
library(pheatmap)
library(dplyr)
library(purrr)

# Notebook display setting.
# NOTE(review): df.dims() is presumably the LeyLabRMisc helper that limits how many
# data-frame rows the notebook prints (5 here) -- confirm against the package docs.
df.dims(5)

In [{}]:
# Parallel backend for the future framework, which furrr's future_*() functions run on.
# NOTE(review): `multicore` is referenced unqualified, so it presumably resolves only
# because library(furrr) attaches the future package -- confirm.
# NOTE(review): the multicore strategy relies on process forking; on platforms where
# forking is unavailable (e.g. Windows) future is expected to fall back to sequential
# evaluation -- confirm for the target environment.
future::plan(multicore)

In [{}]:
# Load the shared helper scripts that live in the r_helper_lib directory next to work_dir
source(file.path(work_dir, "..", "r_helper_lib", "nb_helpers.R"))
source(file.path(work_dir, "..", "r_helper_lib", "functions_mb.R"))

In [{}]:
# Seed R's global random number generator; RANDOM_SEED is defined outside this section.
# NOTE(review): code evaluated through futures (see the plan() call above) may need
# furrr_options(seed = TRUE) at the call site to be reproducible -- confirm where
# the future_*() functions are used.
set.seed(RANDOM_SEED)  # Set seed for reproducibility

In [{}]:
# Suffix for some output files: the compared groups joined by underscores
GROUPS_SUFFIX = paste(GROUPS_TO_COMPARE, collapse = "_")

# Output tree, from the outermost to the innermost level:
#   <work_dir>/out/<PROJECT_TAG>      project-specific folder
#   .../<GROUPS_SUFFIX>               comparison-specific subfolder
#   .../<sel_factor_coef>             config-specific subfolder
PROJ_OUTPUT_DIR = file.path(work_dir, "out", PROJECT_TAG)
tmp = file.path(PROJ_OUTPUT_DIR, paste0(GROUPS_SUFFIX))
CFG_OUTPUT_DIR = file.path(tmp, paste0(sel_factor_coef))

# Create every level that does not exist yet, outermost first
invisible(lapply(
    list(PROJ_OUTPUT_DIR, tmp, CFG_OUTPUT_DIR),
    function(out_dir) {
        if (!dir.exists(out_dir)) {
            dir.create(out_dir, recursive = TRUE)
        }
    }
))

In [{}]:
# Path of the multi-sheet .xlsx workbook that collects the statistical test results
# of the component-based approach; it is placed in the config-specific output folder
# and named after SCENARIO_COMPON.
out_stat_compon_xlsx_file = file.path(CFG_OUTPUT_DIR, paste0("stats_", SCENARIO_COMPON, ".xlsx"))

# Load

## features

In [{}]:
# Read every abundance table listed in flapro_file and reshape it from wide
# (one column per sample) to long format: one row per FlaCluster_Rep / Sample pair
# with its Abundance. The identifier column arrives as `Family` and is renamed.
flapro = lapply(flapro_file, function(tsv_path) {
    wide_tbl = read_tsv(tsv_path, col_names = TRUE)
    wide_tbl %>%
        rename(FlaCluster_Rep = Family) %>%
        pivot_longer(cols = -FlaCluster_Rep, names_to = "Sample", values_to = "Abundance")
})

## meta - features

In [{}]:
# Feature metadata table (header row expected); shown for a quick visual check
meta_fla = read_tsv(meta_fla_file)
meta_fla

# Number of distinct flagellin IDs and of distinct cluster representatives
meta_fla %>% distinct(Flagellin_ID) %>% nrow()
meta_fla %>% distinct(Cluster_c4_representative) %>% nrow()

# Annotate every row with the number of rows sharing its cluster representative
meta_fla = meta_fla %>%
    mutate(
        num_fla_per_cluster = n(),
        .by = Cluster_c4_representative
    )

# Cluster sizes, largest first
meta_fla %>%
    distinct(Cluster_c4_representative, num_fla_per_cluster) %>%
    arrange(desc(num_fla_per_cluster))

In [{}]:
# Fold missing values and the "not_checked" placeholder into one explicit
# "not_defined" label, so that table() counts them instead of dropping the NAs.
# Predicted_v3 is replaced by the cleaned Predicted column (appended last);
# Experimental is cleaned in place.
meta_fla = meta_fla %>%
    mutate(
        Predicted = ifelse(Predicted_v3 %in% c(NA, "not_checked"), "not_defined", Predicted_v3),
        Experimental = ifelse(Experimental %in% c(NA, "not_checked"), "not_defined", Experimental),
        Predicted_v3 = NULL
    )

In [{}]:
# Cluster-level annotations, computed per Cluster_c4_representative and repeated
# on every row of the cluster:
#   Cluster_Pred / Cluster_Exp  - the Predicted / Experimental label when all members
#                                 of the cluster agree, otherwise "mixed"
#   Cluster_Species / _Genus / _Family - the distinct taxon names of the members,
#                                 joined with ";"
meta_fla = meta_fla %>%
    mutate(
        Cluster_Pred = if (n_distinct(Predicted) == 1) first(Predicted) else "mixed",
        Cluster_Exp = if (n_distinct(Experimental) == 1) first(Experimental) else "mixed",
        Cluster_Species = paste(unique(Species), collapse = ";"),
        Cluster_Genus = paste(unique(Genus), collapse = ";"),
        Cluster_Family = paste(unique(Family), collapse = ";"),
        .by = Cluster_c4_representative
    )

In [{}]:
# Frequency of each cluster-level label; counts are over rows of meta_fla,
# not over distinct clusters
table(select(meta_fla, Cluster_Pred))
table(select(meta_fla, Cluster_Exp))

## sample coverage

In [{}]:
# Read every headerless coverage table listed in coverage_file, name its first two
# columns Sample and Reads1, and drop duplicated rows
sample_coverage = lapply(coverage_file, function(cov_path) {
    cov_tbl = read_tsv(cov_path, col_names = FALSE)
    distinct(rename(cov_tbl, Sample = X1, Reads1 = X2))
})