## Component-wise (flapro_REL)
Computed over metagenome-coverage-normalized relative abundance.

In [{}]:
# Count the distinct samples of this scenario that fall into each clinical Group.
flapro_rel[[SCENARIO_COMPON]] %>%
    inner_join(meta_samples, by = "Sample") %>%
    distinct(Sample, Group) %>%
    select(Group) %>%
    table()

### summarized Fla measures per sample

In [{}]:
# Total *relative* Fla abundance per sample: sum `value` over every feature of
# the scenario and label the result with the feature name "sum_rel_abund_F_all".
flapro_rel_total_all = flapro_rel[[SCENARIO_COMPON]] %>%
    group_by(Sample) %>%
    summarise(value = sum(value)) %>%
    mutate(feature = "sum_rel_abund_F_all") %>%
    arrange(Sample)
flapro_rel_total_all

# Export in wide form (one column per feature) as a TSV in the project output dir.
write_tsv(
    flapro_rel_total_all %>%
        pivot_wider(names_from = feature, values_from = value),
    file.path(
        PROJ_OUTPUT_DIR,
        paste0("flapro_rel_total_all_", SCENARIO_COMPON, ".tsv")
    )
)

In [{}]:
# Count the distinct Fla clusters (features) per predicted class (Cluster_Pred).
flapro_rel[[SCENARIO_COMPON]] %>%
    inner_join(meta_fla_clus, by = c("feature" = "FlaCluster")) %>%
    distinct(feature, Cluster_Pred) %>%
    select(Cluster_Pred) %>%
    table()

In [{}]:
# Total *relative* Fla abundance per sample, stratified by Cluster_Pred.
# Each (Sample, Cluster_Pred) sum becomes one row whose feature name is
# "sum_rel_abund_F_<Cluster_Pred>"; the Cluster_Pred column itself is dropped.
flapro_rel_total_by_Cluster_Pred =
    flapro_rel[[SCENARIO_COMPON]] %>%
    inner_join(meta_fla_clus, by = c("feature" = "FlaCluster")) %>%
    group_by(Sample, Cluster_Pred) %>%
    summarise(value = sum(value)) %>%
    ungroup() %>%
    mutate(feature = paste0("sum_rel_abund", "_F_", Cluster_Pred)) %>%
    select(Sample, value, feature) %>%
    arrange(Sample)
flapro_rel_total_by_Cluster_Pred

# Export in wide form (one column per stratified feature).
write_tsv(
    flapro_rel_total_by_Cluster_Pred %>%
        pivot_wider(names_from = feature, values_from = value),
    file.path(
        PROJ_OUTPUT_DIR,
        paste0("flapro_rel_total_by_Cluster_Pred_", SCENARIO_COMPON, ".tsv")
    )
)

In [{}]:
# Rows whose stratified total is strictly positive.
flapro_rel_total_by_Cluster_Pred %>%
    filter(value > 0)

# Distinct samples per clinical Group among the stratified totals.
flapro_rel_total_by_Cluster_Pred %>%
    inner_join(meta_samples, by = "Sample") %>%
    distinct(Sample, Group) %>%
    select(Group) %>%
    table()

In [{}]:
# Stack the overall totals and the Cluster_Pred-stratified totals into one long table.
flapro_rel_total = rbind(
    flapro_rel_total_all,
    flapro_rel_total_by_Cluster_Pred
)
flapro_rel_total

In [{}]:
# Samples whose summed abundance of "active" clusters is exactly zero; they are
# excluded from the silent/active ratio to avoid dividing by zero.
samples_with_zero_Active = flapro_rel_total_by_Cluster_Pred %>%
    filter(feature == "sum_rel_abund_F_active", value == 0) %>%
    select(Sample)
samples_with_zero_Active

In [{}]:
# Per sample: ratio of the summed relative abundance of "silent" clusters to
# that of "active" clusters. The result is a long table with one row per sample,
# feature = "ratio_silent_active" and the ratio stored in `value`.
flapro_rel_Silent_Active_ratio =
    flapro_rel_total_by_Cluster_Pred %>%
    # only a few samples have zero active abundance, so they are simply removed
    anti_join(samples_with_zero_Active, by = "Sample") %>%
    separate(feature, into = c("feature", "Cluster_Pred"), sep = "_F_") %>%
    group_by(Sample) %>%
    mutate(
        ratio_silent_active =
            value[Cluster_Pred == "silent"] / value[Cluster_Pred == "active"]
    ) %>%
    ungroup() %>%
    arrange(desc(ratio_silent_active)) %>%
    # collapse the per-class rows of a sample into a single row
    distinct(Sample, feature, ratio_silent_active) %>%
    mutate(feature = "ratio_silent_active") %>%
    rename(value = ratio_silent_active)
flapro_rel_Silent_Active_ratio

# Export in wide form.
write_tsv(
    flapro_rel_Silent_Active_ratio %>%
        pivot_wider(names_from = feature, values_from = value),
    file.path(
        PROJ_OUTPUT_DIR,
        paste0("flapro_rel_Silent_Active_ratio_", SCENARIO_COMPON, ".tsv")
    )
)

In [{}]:
## for modeling, merge 2 columns into 1 to form a feature name
##flapro_rel_total_by_Cluster_Pred_for_lm = 
#flapro_rel_total_by_Cluster_Pred = flapro_rel_total_by_Cluster_Pred %>%
#    mutate(feature = paste0(feature, "_F_", Cluster_Pred)) %>% 
#    select(-Cluster_Pred)

### adjust features for Age - via residuals

In [{}]:
# When adjustment is requested, pass every feature table through the project
# helper adjust_via_residuals() together with the sample metadata and the
# ADD_ADJUST_FOR specification, overwriting the table with the helper's result.
# NOTE(review): presumably the helper returns residuals in the same long format
# (Sample / feature / value) -- confirm against its definition.
if (ADD_ADJUST_FOR_FACTORS_COMPON) {
    flapro_rel[[SCENARIO_COMPON]] = adjust_via_residuals(
        meta_samples, flapro_rel[[SCENARIO_COMPON]], ADD_ADJUST_FOR
    )
    flapro_rel_maj = adjust_via_residuals(
        meta_samples, flapro_rel_maj, ADD_ADJUST_FOR
    )
    flapro_rel_maj_log = adjust_via_residuals(
        meta_samples, flapro_rel_maj_log, ADD_ADJUST_FOR
    )
    flapro_rel_total = adjust_via_residuals(
        meta_samples, flapro_rel_total, ADD_ADJUST_FOR
    )
    # alpha diversity is not handled for the MTX/MGX ratio scenario
    if (SCENARIO_COMPON != "MTX_MGX_ratio") {
        flapro_alpha = adjust_via_residuals(
            meta_samples, flapro_alpha, ADD_ADJUST_FOR
        )
    }
    flapro_rel_Silent_Active_ratio = adjust_via_residuals(
        meta_samples, flapro_rel_Silent_Active_ratio, ADD_ADJUST_FOR
    )
}

In [{}]:
#flapro_rel_total_by_Cluster_Pred = flapro_rel_total_by_Cluster_Pred_for_lm %>%
#    separate(feature, into = c("feature", "Cluster_Pred"), sep = "_F_")

### viz summarized measures

In [{}]:
# Split the combined feature name "<measure>_F_<Cluster_Pred>" of the alpha
# diversity table into two columns for plotting (skipped for the ratio scenario).
if (SCENARIO_COMPON != "MTX_MGX_ratio") {
    flapro_alpha_to_plot = separate(
        flapro_alpha, feature,
        into = c("feature", "Cluster_Pred"), sep = "_F_"
    )
}

In [{}]:
# Violin + box + jitter plots of Fla alpha diversity per clinical Group.
#   p1a: clusters of class "all" only, one panel per diversity measure
#   p1b: every Cluster_Pred class (rows) x diversity measure (columns)
#   p1c: like p1b but restricted to the classes "all", "active", "silent"
# The y axis is log10-scaled unless ADD_ADJUST_FOR_FACTORS_COMPON is set, and
# pairwise Wilcoxon p-values are added only when REPEAT_MEAS_COMPON is FALSE.
if (SCENARIO_COMPON != "MTX_MGX_ratio") {
    p1a = flapro_alpha_to_plot %>%
        inner_join(meta_samples, by = "Sample") %>%
        filter(Cluster_Pred %in% c("all")) %>%
        mutate(Cluster_Pred = factor(Cluster_Pred, levels = c("all", "active", "silent", "mixed", "not_defined"))) %>%
        ggplot(aes(x = Group, y = value, fill = Group)) +
        geom_violin() +
        geom_boxplot(width = 0.2, fill = "white") +
        geom_jitter(width = 0.3, height = 0, alpha = 0.2, size = 0.9, color = "#000000") +
        (if (REPEAT_MEAS_COMPON) {
            NULL
        } else {
            stat_compare_means(
                comparisons = STAT_PLOT_CMP,
                method = "wilcox.test", label = "p.format",
                vjust = 0.1, size = 2, step.increase = 0.09
            )
        }) +
        (if (ADD_ADJUST_FOR_FACTORS_COMPON) NULL else scale_y_log10()) +
        scale_fill_manual(values = COHORT_COLORS) +
        facet_wrap(~ feature, scales = "free_y") +
        ylab("Flagellome alpha diversity") +
        theme_bw() +
        # no grid, no legend
        theme(
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            legend.position = "none"
        )

    p1b = flapro_alpha_to_plot %>%
        inner_join(meta_samples, by = "Sample") %>%
        mutate(Cluster_Pred = factor(Cluster_Pred, levels = c("all", "active", "silent", "mixed", "not_defined"))) %>%
        ggplot(aes(x = Group, y = value, fill = Group)) +
        geom_violin() +
        geom_boxplot(width = 0.2, fill = "white") +
        geom_jitter(width = 0.3, height = 0, alpha = 0.2, size = 0.9, color = "#000000") +
        (if (REPEAT_MEAS_COMPON) {
            NULL
        } else {
            stat_compare_means(
                comparisons = STAT_PLOT_CMP,
                method = "wilcox.test", label = "p.format",
                vjust = 0.1, size = 2, step.increase = 0.09
            )
        }) +
        (if (ADD_ADJUST_FOR_FACTORS_COMPON) NULL else scale_y_log10()) +
        scale_fill_manual(values = COHORT_COLORS) +
        facet_grid(Cluster_Pred ~ feature, scales = "free_y") +
        ylab("Flagellome alpha diversity") +
        theme_bw() +
        # no grid, no legend
        theme(
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            legend.position = "none"
        )

    p1c = flapro_alpha_to_plot %>%
        inner_join(meta_samples, by = "Sample") %>%
        filter(Cluster_Pred %in% c("all", "active", "silent")) %>%
        mutate(Cluster_Pred = factor(Cluster_Pred, levels = c("all", "active", "silent", "mixed", "not_defined"))) %>%
        ggplot(aes(x = Group, y = value, fill = Group)) +
        geom_violin() +
        geom_boxplot(width = 0.2, fill = "white") +
        geom_jitter(width = 0.3, height = 0, alpha = 0.2, size = 0.9, color = "#000000") +
        (if (REPEAT_MEAS_COMPON) {
            NULL
        } else {
            stat_compare_means(
                comparisons = STAT_PLOT_CMP,
                method = "wilcox.test", label = "p.format",
                vjust = 0.1, size = 2, step.increase = 0.09
            )
        }) +
        (if (ADD_ADJUST_FOR_FACTORS_COMPON) NULL else scale_y_log10()) +
        scale_fill_manual(values = COHORT_COLORS) +
        facet_grid(Cluster_Pred ~ feature, scales = "free_y") +
        ylab("Flagellome alpha diversity") +
        theme_bw() +
        # no grid, no legend
        theme(
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            legend.position = "none"
        )
}

In [{}]:
# Render p1a on an 8 x 3 canvas, titled with the scenario name.
if (SCENARIO_COMPON != "MTX_MGX_ratio") {
    p.dims(8, 3)
    p = ggarrange(p1a, ncol = 1)
    plot(
        annotate_figure(
            p,
            top = text_grob(SCENARIO_COMPON)
        )
    )
}

In [{}]:
# Render p1b (all Cluster_Pred classes) on an 8 x 15 canvas, titled with the scenario name.
if (SCENARIO_COMPON != "MTX_MGX_ratio") {
    p.dims(8, 15)
    p = ggarrange(p1b, ncol = 1)
    plot(
        annotate_figure(
            p,
            top = text_grob(SCENARIO_COMPON)
        )
    )
}

In [{}]:
# Render p1c (all / active / silent) on an 8 x 7.5 canvas, titled with the scenario name.
if (SCENARIO_COMPON != "MTX_MGX_ratio") {
    p.dims(8, 7.5)
    p = ggarrange(p1c, ncol = 1)
    plot(
        annotate_figure(
            p,
            top = text_grob(SCENARIO_COMPON)
        )
    )
}

In [{}]:
# Split "<measure>_F_<Cluster_Pred>" feature names into two columns for plotting.
flapro_rel_total_to_plot = separate(
    flapro_rel_total, feature,
    into = c("feature", "Cluster_Pred"), sep = "_F_"
)

In [{}]:
# Compare the summed relative Fla abundance across clinical groups.
#   p1 : one panel per Cluster_Pred class (all, active, silent, mixed, not_defined)
#   p1b: only the "active" and "silent" classes, two panels
# The y axis is log10-scaled unless ADD_ADJUST_FOR_FACTORS_COMPON is set, and
# pairwise Wilcoxon p-values are added only when REPEAT_MEAS_COMPON is FALSE.
p1 = flapro_rel_total_to_plot %>%
    inner_join(meta_samples, by = "Sample") %>%
    mutate(Cluster_Pred = factor(Cluster_Pred, levels = c("all", "active", "silent", "mixed", "not_defined"))) %>%
    ggplot(aes(x = Group, y = value, fill = Group)) +
    # log10 scale
    (if (!ADD_ADJUST_FOR_FACTORS_COMPON) scale_y_log10() else NULL) +
    geom_violin() +
    geom_boxplot(width = 0.3, fill = "white") +
    geom_jitter(width = 0.3, height = 0, alpha = 0.2, size = 0.9, color = "#000000") +
    theme_bw() +
    facet_wrap(~Cluster_Pred, ncol = 5) +
    scale_fill_manual(values = COHORT_COLORS) +
    # no grid
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    # y axis title: Abundance
    ylab("Flagellome rel. abundance") +
    # legend off
    theme(legend.position = "none") +
    (if (!REPEAT_MEAS_COMPON)
    # Pairwise Wilcoxon test with significance annotations
    stat_compare_means(comparisons = STAT_PLOT_CMP,
                                method = "wilcox.test", label = "p.format",
                                vjust = 0.1, size = 2, step.increase = 0.07)
    else NULL)

p1b = flapro_rel_total_to_plot %>%
    inner_join(meta_samples, by = "Sample") %>%
    # same level set as in p1 (the level was previously misspelled "_all")
    mutate(Cluster_Pred = factor(Cluster_Pred, levels = c("all", "active", "silent", "mixed", "not_defined"))) %>%
    filter(Cluster_Pred %in% c("active", "silent")) %>%
    ggplot(aes(x = Group, y = value, fill = Group)) +
    # log10 scale
    (if (!ADD_ADJUST_FOR_FACTORS_COMPON) scale_y_log10() else NULL) +
    geom_violin() +
    geom_boxplot(width = 0.3, fill = "white") +
    geom_jitter(width = 0.3, height = 0, alpha = 0.2, size = 0.9, color = "#000000") +
    theme_bw() +
    facet_wrap(~Cluster_Pred, ncol = 2) +
    scale_fill_manual(values = COHORT_COLORS) +
    # no grid
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    # y axis title: Abundance
    ylab("Flagellome rel. abundance") +
    # legend off
    theme(legend.position = "none") +
    (if (!REPEAT_MEAS_COMPON)
    # Pairwise Wilcoxon test with significance annotations
    stat_compare_means(comparisons = STAT_PLOT_CMP,
                                method = "wilcox.test", label = "p.format",
                                vjust = 0.1, size = 2, step.increase = 0.07)
    else NULL)



In [{}]:
# Active/silent panels on a 5.2 x 3 canvas.
p.dims(5.2, 3)
p = ggarrange(p1b, ncol = 1)
annotate_figure(
    p,
    top = text_grob(SCENARIO_COMPON)
)

# All Cluster_Pred panels on a 12 x 3 canvas.
p.dims(12, 3)
p = ggarrange(p1, ncol = 1)
annotate_figure(
    p,
    top = text_grob(SCENARIO_COMPON)
)

In [{}]:
# Silent/active abundance ratio per clinical Group (violin + box + jitter).
# log10 y axis unless ADD_ADJUST_FOR_FACTORS_COMPON is set; pairwise Wilcoxon
# p-values only when REPEAT_MEAS_COMPON is FALSE.
p1 = flapro_rel_Silent_Active_ratio %>%
    inner_join(meta_samples, by = "Sample") %>%
    ggplot(aes(x = Group, y = value, fill = Group)) +
    geom_violin() +
    geom_boxplot(width = 0.2, fill = "white") +
    geom_jitter(width = 0.3, height = 0, alpha = 0.2, size = 0.9, color = "#000000") +
    (if (REPEAT_MEAS_COMPON) {
        NULL
    } else {
        stat_compare_means(
            comparisons = STAT_PLOT_CMP,
            method = "wilcox.test", label = "p.format",
            vjust = 0.1, size = 2, step.increase = 0.07
        )
    }) +
    (if (ADD_ADJUST_FOR_FACTORS_COMPON) NULL else scale_y_log10()) +
    scale_fill_manual(values = COHORT_COLORS) +
    ylab("Silent/Active abundance ratio") +
    theme_bw() +
    # no grid, no legend
    theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "none"
    )



In [{}]:
# Render the ratio plot on a 3.5 x 3.5 canvas, titled with the scenario name.
p.dims(3.5, 3.5)
p = ggarrange(p1, ncol = 1)
annotate_figure(
    p,
    top = text_grob(SCENARIO_COMPON)
)

### Top Fla

In [{}]:
# Run the project helper check_full() on the scenario table for column "feature".
# NOTE(review): presumably verifies that every feature is present for every
# sample -- confirm against the helper's definition.
check_full(
    flapro_rel[[SCENARIO_COMPON]],
    "feature"
)

In [{}]:
# Rank Fla clusters by their mean relative abundance across samples and keep
# the (at most) 200 highest-ranked clusters with a strictly positive mean,
# annotated with the cluster metadata.
top_n_fla =
    flapro_rel[[SCENARIO_COMPON]] %>%
    summarise(mean_Abundance_rel = mean(value), .by = feature) %>%
    inner_join(meta_fla_clus, by = c("feature" = "FlaCluster")) %>%
    arrange(desc(mean_Abundance_rel)) %>%
    filter(mean_Abundance_rel > 0) %>%
    head(200)
top_n_fla

In [{}]:
# Per-sample abundance of the top Fla clusters, one horizontal strip per
# cluster ordered by mean abundance; point shape and colour encode Cluster_Exp.
# (Earlier variants mapped Cluster_Exp to colour only or to shape only.)
p.dims(8, 12)
flapro_rel[[SCENARIO_COMPON]] %>%
    inner_join(top_n_fla, by = "feature") %>%
    ggplot(aes(x = reorder(feature, mean_Abundance_rel), y = value, shape = Cluster_Exp, color = Cluster_Exp)) +
    geom_jitter(aes(shape = Cluster_Exp), size = 1, alpha = 0.4, width = 0.05, height = 0) +
    scale_shape_manual(values = FLA_CLASSES_SHAPES) +
    scale_color_manual(values = FLA_CLASSES_COLORS) +
    # log scale for the abundance axis unless values were adjusted
    (if (ADD_ADJUST_FOR_FACTORS_COMPON) NULL else scale_y_log10()) +
    coord_flip() +
    labs(x = "Flagellin cluster", y = "Relative abundance") +
    # larger symbols in the legend
    guides(color = guide_legend(override.aes = list(size = 5))) +
    theme_bw() +
    theme(
        panel.border = element_rect(size = 0.1, colour = "black"),
        # no grid
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        # hide the per-cluster labels and ticks on the vertical axis
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank()
    )

In [{}]:
# Per-sample abundance of the top Fla clusters, one horizontal strip per
# cluster ordered by mean abundance; point shape and colour encode Cluster_Pred.
# (An earlier variant mapped Cluster_Pred to colour only.)
p.dims(8, 12)
flapro_rel[[SCENARIO_COMPON]] %>%
    inner_join(top_n_fla, by = "feature") %>%
    ggplot(aes(x = reorder(feature, mean_Abundance_rel), y = value, shape = Cluster_Pred, color = Cluster_Pred)) +
    geom_jitter(aes(shape = Cluster_Pred), size = 1, alpha = 0.4, width = 0.05, height = 0) +
    scale_shape_manual(values = FLA_CLASSES_SHAPES) +
    scale_color_manual(values = FLA_CLASSES_COLORS) +
    # log scale for the abundance axis unless values were adjusted
    (if (ADD_ADJUST_FOR_FACTORS_COMPON) NULL else scale_y_log10()) +
    coord_flip() +
    labs(x = "Flagellin cluster", y = "Relative abundance") +
    # larger symbols in the legend
    guides(color = guide_legend(override.aes = list(size = 5))) +
    theme_bw() +
    theme(
        panel.border = element_rect(size = 0.1, colour = "black"),
        # no grid
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        # hide the per-cluster labels and ticks on the vertical axis
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank()
    )

### Beta diversity

In [{}]:
# Wide sample x feature abundance matrix (as a data.frame with sample names as
# row names); feature/sample combinations absent from the long table become 0.
flapro_rel_df =
    flapro_rel[[SCENARIO_COMPON]] %>%
    select(Sample, feature, value) %>%
    pivot_wider(names_from = feature, values_from = value, values_fill = 0) %>%
    column_to_rownames("Sample") %>%
    as.data.frame()
flapro_rel_df

In [{}]:
# Pairwise Euclidean distances between samples (rows of flapro_rel_df).
dist_my_rel = dist(
    flapro_rel_df,
    method = "euclidean"
)

In [{}]:
# Flag outlier samples from the distance matrix: a sample is an outlier when
# its mean distance to all samples exceeds the average of those mean distances
# by more than REL_N_SIGMA_REL_OUTLIERS standard deviations.
dist_my_rel_mat = as.matrix(dist_my_rel)

# per-sample summary distances (the median is computed alongside the mean but
# only the mean enters the threshold below)
mean_distances = rowMeans(dist_my_rel_mat)
median_distances = apply(dist_my_rel_mat, MARGIN = 1, FUN = median)

threshold = mean(mean_distances) +
    REL_N_SIGMA_REL_OUTLIERS * sd(mean_distances)
outliers = which(mean_distances > threshold)

# names and number of the flagged samples
sort(names(outliers))
length(outliers)

In [{}]:
# Remove the flagged outlier samples from the distance object and from the
# sample x feature table.
# Samples are matched by name rather than by position, so re-running this cell
# after dist_my_rel_mat has already been shrunk does not drop further samples.
# drop = FALSE keeps the matrix / data.frame shape even if a single row or
# column remains.
if (length(outliers) > 0) {
    dist_my_rel_mat = dist_my_rel_mat[
        !(rownames(dist_my_rel_mat) %in% names(outliers)),
        !(colnames(dist_my_rel_mat) %in% names(outliers)),
        drop = FALSE
    ]
    dist_my_rel = as.dist(dist_my_rel_mat)
    flapro_rel_df_noOut = flapro_rel_df[
        !(rownames(flapro_rel_df) %in% names(outliers)), ,
        drop = FALSE
    ]
} else {
    flapro_rel_df_noOut = flapro_rel_df
}
dim(flapro_rel_df_noOut)

### PCoA biplot (rel.)

In [{}]:
# Principal coordinates analysis on the sample distance object.
pcoa_res_rel = ape::pcoa(dist_my_rel)

# Axis titles "PCk (xx.x%)" from the relative eigenvalues of the first 3 axes,
# named PC1..PC3 so they can be looked up by axis name.
pve = paste0(
    "PC", 1:3,
    " (", round(pcoa_res_rel$values$Relative_eig[1:3] * 100, 1), "%)"
)
names(pve) = paste0("PC", 1:3)

# Sample coordinates; ape's "Axis.k" column names are renamed to "PCk".
pcoa_scores = data.frame(pcoa_res_rel$vectors)
colnames(pcoa_scores) = gsub("Axis.", "PC", colnames(pcoa_scores))

# Feature contributions: correlation of every feature column with the sample
# scores on each of the first `num_pcs` axes (warnings from cor() are muted).
num_pcs = 3
feature_contributions = as.data.frame(
    do.call(
        cbind,
        lapply(1:num_pcs, function(pc) {
            pc_scores = pcoa_scores[[paste0("PC", pc)]]
            apply(
                flapro_rel_df_noOut, 2,
                function(feature) suppressWarnings(cor(feature, pc_scores))
            )
        })
    )
)
colnames(feature_contributions) = paste0("PC", 1:num_pcs)
feature_contributions$Feature = colnames(flapro_rel_df_noOut)

# Scale the arrows for display, rank features by the length of their arrow
# and keep the REL_N_FEATURES_BIPLOT longest ones.
feature_contributions = feature_contributions %>%
    mutate(across(starts_with("PC"), ~ .x * BIPLOT_ARROW_SCALING[[SCENARIO_COMPON]])) %>%
    rowwise() %>%
    mutate(Magnitude = sqrt(sum(c_across(starts_with("PC"))^2))) %>%
    ungroup() %>%
    arrange(desc(Magnitude)) %>%
    slice(1:REL_N_FEATURES_BIPLOT)
feature_contributions

# Sample coordinates joined with the sample metadata, for plotting.
pcoa_vis = pcoa_scores %>%
    rownames_to_column("Sample") %>%
    inner_join(meta_samples, by = "Sample")

In [{}]:
# Attach the cluster metadata to the selected features and build a two-line
# arrow label: cluster ID, then its species. When Cluster_Species lists several
# ";"-separated entries, only the first is shown, followed by "+".
feature_contributions = feature_contributions %>%
    inner_join(meta_fla_clus, by = c("Feature" = "FlaCluster")) %>%
    mutate(
        Cluster_Species_trimmed = ifelse(
            str_detect(Cluster_Species, ";"),
            paste0(str_extract(Cluster_Species, "^[^;]+"), "+"),
            Cluster_Species
        ),
        Feature_ext = paste0(Feature, "\n", Cluster_Species_trimmed)
    ) %>%
    select(-Cluster_Species, -Cluster_Species_trimmed)
feature_contributions

In [{}]:
# Build a PCoA biplot: samples as points, feature contributions as arrows.
#
# Arguments:
#   pcoa_vis               data frame of sample coordinates; must contain the
#                          columns named by `x_axis` and `y_axis`, the column
#                          named by `color_var`, and "Sample" when the global
#                          BIPLOT_LABELS_SAMPLES is TRUE.
#   feature_contributions  data frame of arrow end points; must contain the
#                          columns named by `x_axis` and `y_axis`, plus
#                          "Cluster_Pred" (arrow line type) and "Feature_ext"
#                          (arrow label).
#   x_axis, y_axis         column names (strings) of the two axes to plot.
#   pve                    vector indexed by axis name; its entries are used as
#                          the axis titles.
#   color_var              name (string) of the pcoa_vis column mapped to the
#                          point colour.
#   scale_color            colour scale added to the plot; defaults to a manual
#                          scale built from the global COHORT_COLORS.
#
# Returns a ggplot object. Stops with an error when either axis column is
# missing from pcoa_vis or from feature_contributions.
create_pcoa_biplot = function(pcoa_vis, feature_contributions, x_axis, y_axis, pve, color_var, scale_color = scale_color_manual(values = COHORT_COLORS)) {
  # Ensure the axis names are valid (scalar conditions, hence `||`)
  if (!(x_axis %in% colnames(pcoa_vis)) || !(y_axis %in% colnames(pcoa_vis))) {
    stop("Specified x_axis or y_axis is not in the pcoa_vis data.")
  }
  if (!(x_axis %in% colnames(feature_contributions)) || !(y_axis %in% colnames(feature_contributions))) {
    stop("Specified x_axis or y_axis is not in the feature_contributions data.")
  }

  # Create biplot
  ggplot() +
    # Sample points
    geom_point(data = pcoa_vis, aes_string(x = x_axis, y = y_axis, color = color_var), size = 0.5) +

    # Optional sample labels, controlled by the global BIPLOT_LABELS_SAMPLES
    (if (BIPLOT_LABELS_SAMPLES) geom_text_repel(data = pcoa_vis, aes_string(x = x_axis, y = y_axis, label = "Sample"), size = 2, color = "#888888", segment.size = 0.1) else NULL) +

    # X and Y labels for explained variance
    xlab(pve[x_axis]) +
    ylab(pve[y_axis]) +
    # Theme and color scale
    theme_classic() +
    scale_color +
    # Increase bullet size in legend
    guides(color = guide_legend(override.aes = list(size = 4))) +

    # Feature arrows from the origin to the feature's coordinates;
    # the line type encodes Cluster_Pred
    geom_segment(
      data = feature_contributions,
      aes_string(x = 0, y = 0, xend = x_axis, yend = y_axis,
        linetype = "Cluster_Pred"
        ),
      arrow = arrow(length = unit(0.2, "cm"), type = "closed", angle = 15),
      size = 0.2, color = "#888888"
    ) +
    # Feature labels at the arrow tips
    geom_text(
      data = feature_contributions,
      aes_string(x = x_axis, y = y_axis, label = "Feature_ext"),
      vjust = "center", hjust = "middle", color = "black", size = 2
    ) +
    scale_linetype_manual(
      values = c("active" = "solid", "mixed" = "longdash", "silent" = "dashed", "not_defined" = "dotted"),
      name = "Cluster_Pred"
    )
}

# Draw the PC1-PC2 and PC1-PC3 biplots side by side with a shared legend at
# the bottom, titled with the scenario name.
p.dims(15, 8)
p1 = create_pcoa_biplot(
    pcoa_vis = pcoa_vis,
    feature_contributions = feature_contributions,
    x_axis = "PC1", y_axis = "PC2",
    pve = pve, color_var = "Group"
)
p2 = create_pcoa_biplot(
    pcoa_vis = pcoa_vis,
    feature_contributions = feature_contributions,
    x_axis = "PC1", y_axis = "PC3",
    pve = pve, color_var = "Group"
)

p = ggarrange(p1, p2, ncol = 2, common.legend = TRUE, legend = "bottom")
annotate_figure(
    p,
    top = text_grob(SCENARIO_COMPON)
)

### Stat: summarized Fla features

In [{}]:
# Empty container for the result tables; each element later becomes one sheet
# of the multi-sheet xlsx export.
df_list_stats_compon_results = vector("list", 0)

In [{}]:
# Model input for the summarized features: abundance totals plus the
# silent/active ratio, joined with sample metadata and restricted to the
# groups under comparison.
lm_in = flapro_rel_total %>%
    rbind(flapro_rel_Silent_Active_ratio) %>%
    inner_join(meta_samples, by = "Sample") %>%
    filter(Group %in% GROUPS_TO_COMPARE)
lm_in

In [{}]:
# Fit one model per feature with the project helpers: do_lmer_tidy() when
# REPEAT_MEAS_COMPON is set (its `effect` and `group` columns are dropped),
# do_lm_tidy() otherwise.
lm_res = if (REPEAT_MEAS_COMPON) {
    do_lmer_tidy(
        lm_in, rel_model_formula,
        arg_response_col = "value", arg_feature_col = "feature"
    ) %>%
        select(-effect, -group)
} else {
    do_lm_tidy(
        lm_in, rel_model_formula,
        arg_response_col = "value", arg_feature_col = "feature"
    )
}

# show the full result table with a temporarily enlarged display limit
df.dims(50)
lm_res
df.dims(5)

In [{}]:
# Show the rows for the selected model term with p < 0.1.
df.dims(50)
lm_res %>%
    filter(term == sel_factor_coef, p.value < 0.1)
df.dims(5)

# Store the same rows as a sheet of the xlsx export. The sheet name is cut to
# its first 31 characters when it is longer than that.
tmp_sheet_name = paste0(
    "SumFts_", SCENARIO_COMPON, "_", GROUPS_SUFFIX, "_", sel_factor_coef
)
if (nchar(tmp_sheet_name) > 31) {
    print("Warning: xlsx sheet name too long! Reducing to 31 leading symbols.")
    tmp_sheet_name = substr(tmp_sheet_name, 1, 31)
    print(tmp_sheet_name)
}
df_list_stats_compon_results[[tmp_sheet_name]] = lm_res %>%
    filter(term == sel_factor_coef, p.value < 0.1)



### Stat: alpha diversity

In [{}]:
# Model input for alpha diversity: join sample metadata and keep the groups
# under comparison (skipped for the ratio scenario).
if (SCENARIO_COMPON != "MTX_MGX_ratio") {
    lm_in = inner_join(flapro_alpha, meta_samples, by = "Sample") %>%
        filter(Group %in% GROUPS_TO_COMPARE)
    lm_in
}

In [{}]:
# Fit one model per alpha-diversity feature, print all terms with p < 0.1 and
# store them as a sheet of the xlsx export (skipped for the ratio scenario).
if (SCENARIO_COMPON != "MTX_MGX_ratio") {
    lm_res = if (REPEAT_MEAS_COMPON) {
        do_lmer_tidy(
            lm_in, rel_model_formula,
            arg_response_col = "value", arg_feature_col = "feature"
        ) %>%
            select(-effect, -group)
    } else {
        do_lm_tidy(
            lm_in, rel_model_formula,
            arg_response_col = "value", arg_feature_col = "feature"
        )
    }

    df.dims(50)
    print(filter(lm_res, p.value < 0.1))
    df.dims(5)

    # sheet name, cut to its first 31 characters when longer than that
    tmp_sheet_name = paste0(
        "FAlph_", SCENARIO_COMPON, "_", GROUPS_SUFFIX, "_", sel_factor_coef
    )
    if (nchar(tmp_sheet_name) > 31) {
        print("Warning: xlsx sheet name too long! Reducing to 31 leading symbols.")
        tmp_sheet_name = substr(tmp_sheet_name, 1, 31)
        print(tmp_sheet_name)
    }
    df_list_stats_compon_results[[tmp_sheet_name]] = filter(lm_res, p.value < 0.1)
}

### Stat: fla clusters (over flapro_rel_MAJ)

#### Linear model, all factors

In [{}]:
# Model input for the per-cluster analysis: the flapro_rel_maj_log table joined
# with sample metadata, restricted to the groups under comparison.
lm_in = inner_join(flapro_rel_maj_log, meta_samples, by = "Sample") %>%
    filter(Group %in% GROUPS_TO_COMPARE)
lm_in

In [{}]:
# Fit one model per Fla cluster: do_lmer_tidy() when REPEAT_MEAS_COMPON is set
# (its `effect` and `group` columns are dropped), do_lm_tidy() otherwise.
lm_res = if (REPEAT_MEAS_COMPON) {
    do_lmer_tidy(
        lm_in, rel_model_formula,
        arg_response_col = "value", arg_feature_col = "feature"
    ) %>%
        select(-effect, -group)
} else {
    do_lm_tidy(
        lm_in, rel_model_formula,
        arg_response_col = "value", arg_feature_col = "feature"
    )
}

# show every term with p < 0.1
df.dims(50)
filter(lm_res, p.value < 0.1)
df.dims(5)

In [{}]:
# Results for the selected model term only, annotated with cluster metadata
# and with FDR-adjusted p-values computed across the clusters.
lm_res_sel_factor = lm_res %>%
    filter(term == sel_factor_coef) %>%
    inner_join(meta_fla_clus, by = c("feature" = "FlaCluster")) %>%
    mutate(p.adj = p.adjust(p.value, method = "fdr"))

# Significant subset: adjusted p < 0.1.
df.dims(50)
lm_res_sel_factor_s = filter(lm_res_sel_factor, p.adj < 0.1)
lm_res_sel_factor_s
df.dims(5)

In [{}]:
# Store the full (unfiltered) selected-term table as a sheet of the xlsx
# export; the sheet name is cut to its first 31 characters when longer.
tmp_sheet_name = paste0(
    "FC_", SCENARIO_COMPON, "_", GROUPS_SUFFIX, "_", sel_factor_coef
)
if (nchar(tmp_sheet_name) > 31) {
    print("Warning: xlsx sheet name too long! Reducing to 31 leading symbols.")
    tmp_sheet_name = substr(tmp_sheet_name, 1, 31)
    print(tmp_sheet_name)
}
df_list_stats_compon_results[[tmp_sheet_name]] = lm_res_sel_factor

In [{}]:
# Plot the abundance distribution per group for every significant Fla cluster
# (adjusted p < 0.1), one facet per cluster, num_per_row facets per row.
num_finds = nrow(lm_res_sel_factor_s)
num_finds
num_per_row = 5

# Canvas size grows with the number of facets: width up to 13, height 3 per
# facet row. With zero hits the size of a single facet is used instead of 0 x 0,
# so that no zero-sized plot dimensions are handed to p.dims().
my_w = min(13, 13 * (max(num_finds, 1) / num_per_row))
my_w
my_h = 3 * ceiling(max(num_finds, 1) / num_per_row)
my_h

p.dims(my_w, my_h)
ttt = flapro_rel_maj_log %>%
    inner_join(lm_res_sel_factor_s, by = "feature") %>%
    inner_join(meta_samples, by = "Sample") %>%
    # direction of the effect, used to colour the points
    mutate(sign_estimate = factor(sign(estimate))) %>%
    # when several ";"-separated species are listed, show the first one plus "+"
    mutate(Cluster_Species_trimmed = ifelse(
            str_detect(Cluster_Species, ";"),
            paste0(str_extract(Cluster_Species, "^[^;]+"), "+"),
            Cluster_Species)
    ) %>%
    # three-line facet title: cluster ID, species, predicted class
    mutate(FlaCluster_ext = paste(feature, "\n", Cluster_Species_trimmed, "\n", Cluster_Pred, sep = "")) %>%
    # subset to groups
    filter(Group %in% GROUPS_TO_COMPARE)
#ttt

if (nrow(ttt) > 0) {
    ggplot(ttt, aes(y = value, x = Group)) +
        ## log scale y
        #scale_y_log10() +
        geom_boxplot(width = 0.1) +
        geom_violin(color = "#888888", alpha = 0.1, size = 0.5) +
        geom_jitter(aes(color = sign_estimate), alpha = 0.1, size = 3, width = 0.3, height = 0) +
        # blue = negative estimate, red = positive estimate
        scale_color_manual(values = c("-1" = "blue", "1" = "red")) +
        facet_wrap(~FlaCluster_ext, ncol = num_per_row, scales = "free") +
        theme_bw() +
        # no grid
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
        # no grey fill for facet
        theme(strip.background = element_blank()) +
        # no legend
        theme(legend.position = "none") +

        # y axis title: Abundance
        ylab("Relative abundance (log10)")
}

#### Wilcoxon (currently disabled via `go_wilcox = FALSE`; intended for exactly 2 selected clinical groups)

In [{}]:
# Wilcoxon test for each flagellin cluster between the groups (without
# discarding the outliers). The input is the long scenario table, whose
# columns are Sample / feature / value, joined with the Group of each sample.
wil_in = flapro_rel[[SCENARIO_COMPON]] %>%
    inner_join(meta_samples %>% select(Sample, Group), by = "Sample") %>%
    # subset to selected groups
    filter(Group %in% GROUPS_TO_COMPARE)

# The test is currently switched off; the commented line enables it whenever
# exactly two groups are compared.
#go_wilcox = length(GROUPS_TO_COMPARE) == 2
go_wilcox = FALSE

if (go_wilcox) {
    wilcox_results = wil_in %>%
        # one test per Fla cluster; the cluster ID lives in `feature`
        group_by(feature) %>%
        summarise(
            p_value = wilcox.test(value ~ Group)$p.value,
            .groups = 'drop'
        ) %>%
        arrange(p_value) %>%
        # adjust p-values
        mutate(p_value_adj = p.adjust(p_value, method = "BH"))
    #wilcox_results

    wilcox_results = wilcox_results %>%
        inner_join(meta_fla_clus, by = c("feature" = "FlaCluster")) %>%
        filter(p_value_adj < 0.1) %>%
        #select(Flagellin_ID, Genus, Species, p_value, p_value_adj) %>%
        arrange(p_value_adj)
}
if (go_wilcox) {
    df.dims(100)
    print(wilcox_results)
    df.dims(5)
}

In [{}]:
# Plot the abundance per group for every cluster with adjusted p < 0.1.
if (go_wilcox) {
    if (nrow(wilcox_results) > 0) {
        p.dims(6, 6)

        # flapro_rel is a list of per-scenario tables; use the current scenario
        ttt = flapro_rel[[SCENARIO_COMPON]] %>%
            semi_join(wilcox_results, by = "feature") %>%
            inner_join(meta_samples, by = "Sample") %>%
            # subset to groups
            filter(Group %in% GROUPS_TO_COMPARE)
        #ttt

        if (nrow(ttt) > 0) {
            ggplot(ttt, aes(y = value, x = Group)) +
                # log scale y
                (if (!ADD_ADJUST_FOR_FACTORS_COMPON) scale_y_log10() else NULL) +
                geom_boxplot(width = 0.1) +
                geom_violin(color = "#888888", alpha = 0.1, size = 0.5) +
                geom_jitter(color = "green", alpha = 0.2, size = 3, width = 0.3) +
                facet_wrap(~feature) +
                theme_bw()
        }
    }
}

In [{}]:
# Export every collected result table as one sheet of a single xlsx workbook,
# with automatically sized column widths.
openxlsx::write.xlsx(
    df_list_stats_compon_results,
    out_stat_compon_xlsx_file,
    colWidths = "auto"
)