# Goal

* Flagellome analysis based on FlaPro results: HMP2 IBD dataset.


# Variables

In [{}]:
# Root directory of the project; every data path below is built from it.
work_dir <- "/ebio/abt3_projects2/human_fla_profiling/atyakht/human-fla-profiling/"
PROJECT_TAG <- "IBD"

# Omics layers to process.
AVAILABLE_OMICS <- c("MGX", "MTX")
# AVAILABLE_OMICS <- c("MTX")

# Switch for the MTX/MGX ratio.
DO_MTX_MGX_ratio <- TRUE

# Scenario for the Component-based analysis - pick one of three:
#   "MGX"           - metagenomics (MGX, meta-DNA)
#   "MTX"           - metatranscriptomics (MTX, meta-RNA)
#   "MTX_MGX_ratio" - ratio of metatranscriptomics to metagenomics
# SCENARIO_COMPON <- "MGX"
# SCENARIO_COMPON <- "MTX"
SCENARIO_COMPON <- "MTX_MGX_ratio"

# Scenario for the Compositional analysis (Nearest Balance) - one of two:
# SCENARIO_NB <- "MGX"
SCENARIO_NB <- "MTX"

# Flagellome profile files, one entry per omics layer.
# DB v. C4 (Feb 2025).
flapro_file <- list(
    MGX = file.path(work_dir, "data/IBD/merged_realcounts_MGX.txt"),
    MTX = file.path(work_dir, "data/IBD/merged_realcounts_MTX.txt")
)

# Coverage (read count) files, one entry per omics layer.
coverage_file <- list(
    MGX = file.path(work_dir, "data/IBD/full_hmp_mgx_readcount.txt"),
    MTX = file.path(work_dir, "data/IBD/full_hmp_mtx_readcount.txt")
)

# Sample meta-data, one entry per omics layer, plus the original HMP2 table.
meta_samples_file <- list(
    MGX = file.path(work_dir, "data/IBD/final_metadata_mgx-24-09-24.txt"),
    MTX = file.path(work_dir, "data/IBD/final_metadata_mtx-24-09-24.txt")
)
hmp_metadata_original_file <- file.path(
    work_dir, "data/IBD/hmp2_metadata_2018-08-20.csv"
)

# Fla meta-data (taxonomy of cluster representatives, going by the file name).
meta_fla_file <- file.path(work_dir, "data/taxonomy_cluster_repr_c4-pred3.tsv")

In [2]:
# --- Sample filtering ---
# Minimum Fla reads per sample before prevalence filtering (alternatives: 50, 30).
MIN_FLA_READS_PER_SAMPLE <- 100
# Minimum Fla reads per sample after prevalence filtering.
MIN_FLA_READS_PER_SAMPLE_ROUND_2 <- 30

# --- Feature filtering for the relative abundance table (flapro_rel) ---
REL_AB_PREVALENCE_CUTOFF <- 25  # alternative: 30
REL_AB_ABUND_CUTOFF <- 0        # alternative: 0.0001

# --- Feature filtering for NB (flapro) and its normalized version (flapro_perc) ---
PREVALENCE_CUTOFF <- 30   # alternative: 10
ABUND_CUTOFF_PERC <- 0    # alternatives: 0.0001, 0.01, 0.005

# flapro_rel: sigma multiplier used for outlier detection.
# A very large value (e.g. 1E6) effectively switches outlier filtering off.
REL_N_SIGMA_REL_OUTLIERS <- 3  # alternative: 0.5
# How many features (Fla) are drawn in the biplot (alternative: 10).
REL_N_FEATURES_BIPLOT <- 5

# --- Plot aesthetics per Fla class ---
FLA_CLASSES_COLORS <- c(
    "not_defined" = "#888888",
    "active" = "#5050ff",
    "silent" = "#ce3d32",
    "evader" = "cyan",
    "mixed" = "black"
)
# NOTE(review): "not_defined" and "evader" share shape 4 - confirm intended.
FLA_CLASSES_SHAPES <- c(
    "not_defined" = 4,
    "active" = 1,
    "silent" = 3,
    "evader" = 4,
    "mixed" = 10
)

# Optional dedicated analysis of the experimentally profiled Fla.
EXPLORE_EXPERIMENTAL_FLA <- FALSE

# --- Additional adjustment of features for factors (residuals are collected) ---
# Component-based analysis (SCENARIO_COMPON): visualization only, the LM is
# not affected.
ADD_ADJUST_FOR_FACTORS_COMPON <- FALSE
# Compositional analysis (SCENARIO_NB): affects the visualization as well as
# the produced NB values.
ADD_ADJUST_FOR_FACTORS_NB <- TRUE
# Factors to adjust for.
ADD_ADJUST_FOR <- "Age + Sex"

# --- Biplot ---
# Arrow scaling per scenario; tune as needed.
BIPLOT_ARROW_SCALING <- c(
    "MGX" = 5e3,
    "MTX" = 5e4,
    "MTX_MGX_ratio" = 2e3
)
# Whether sample points in the biplot get text labels.
BIPLOT_LABELS_SAMPLES <- FALSE

# --- Repeated measures (e.g. several time points per subject) ---
# Component-based analysis (_COMPON): use lmer instead of lm.
REPEAT_MEAS_COMPON <- TRUE
# Compositional analysis (_NB): generate 1-sample-per-subject splits instead
# of common splits.
REPEAT_MEAS_NB <- TRUE
# Name of the repeated-measures factor (used when the flags above are enabled).
REPEAT_MEAS_FACTOR <- "Participant_ID"

In [3]:
# --- Nearest Balance: cross-validation parameters ---

# Reproducibility threshold for the Nearest Balance (alternative: 0.9).
reproducibility_threshold <- 0.8

# Number of cross-validation simulations.
# NOTE(review): these two settings were labelled "leave-1-out", yet a fixed
# training proportion is used - confirm which scheme is actually applied.
n_sim <- 100
# Share of samples used for training; with repeated measures the proportion
# is defined in a custom way instead.
train_prop <- 0.67

# Seed for reproducibility.
RANDOM_SEED <- 123

In [4]:
# Clinical groups that take part in the comparisons.
GROUPS_TO_COMPARE <- c("HC", "UC", "CD")
# GROUPS_TO_COMPARE <- c("HC", "UC")
# GROUPS_TO_COMPARE <- c("HC", "CD")

# Factor of interest: "Group" or "DiseaseScore".
# sel_factor <- "Group"
sel_factor <- "DiseaseScore"

# Pairs of groups compared in the statistical plots.
STAT_PLOT_CMP <- list(
    c("HC", "UC"),
    c("HC", "CD"),
    c("UC", "CD")
)

# Color assigned to each cohort.
COHORT_COLORS <- c(
    "HC" = "#008ea0",
    "UC" = "#ff6f00",
    "CD" = "#c71000"
)

In [5]:
# Right-hand side of the model for the Component-based analysis.
# The random-intercept grouping term is taken from REPEAT_MEAS_FACTOR so that
# it follows the repeated-measures setting instead of a hard-coded column name.
# NOTE(review): the random-intercept term is added regardless of
# REPEAT_MEAS_COMPON - confirm the non-lmer (lm) path copes with it.
rel_model_formula <- paste0(
    sel_factor, " + Age + (1|", REPEAT_MEAS_FACTOR, ")"
)

# Initial formula for PERMANOVA (many factors).
init_permanova_formula <- paste0(sel_factor, " + Age + Sex")

# LM formula for Nearest Balance - factors selected based on the adonis2
# results for the formula above.
lm_nb_formula <- paste0(sel_factor, " + Age")

# Name of the LM coefficient that corresponds to the selected factor
# (the one for which the Nearest Balance is sought).
if (sel_factor == "Group") {
    # A two-level Group is required so that a single coefficient describes it.
    if (length(GROUPS_TO_COMPARE) != 2) {
        stop(
            "sel_factor == \"Group\" requires exactly two cohorts in ",
            "GROUPS_TO_COMPARE, got ", length(GROUPS_TO_COMPARE), ".",
            call. = FALSE
        )
    }
    # Categorical factor: the coefficient is named <factor><level>.
    # Assumes the last entry of GROUPS_TO_COMPARE is the non-reference level
    # - TODO confirm against how Group levels are ordered downstream.
    #sel_factor_coef = paste0("Group", sort(GROUPS_TO_COMPARE, decreasing = TRUE)[1])
    sel_factor_coef <- paste0(
        "Group", GROUPS_TO_COMPARE[length(GROUPS_TO_COMPARE)]
    )
} else {
    # Numeric factor (e.g. "DiseaseScore", "Age"): the coefficient carries the
    # factor's own name. Other categorical factors are not handled here.
    sel_factor_coef <- sel_factor
}

#sel_factor = "Age"
#sel_factor_coef = "Age"

In [6]:
# PERMANOVA permutation count.
# 999 is flagged as a testing-only value; 9999 is the alternative kept below.
N_PERMANOVA <- 999
# N_PERMANOVA <- 9999

In [7]:
# Number of CPU cores handed to the "parallel" library.
# NOTE(review): fixed value, not checked against the cores actually available.
num_rparallel_cores <- 40