## Compositional analysis (available for MTX or MGX, not their ratio)

### Init normalized abundance and CLR 

In [None]:
# Build the abundance table from the transposed counts (cmultRepl2 is a project
# helper -- presumably a zero-replacement step; confirm), then apply the CLR
# transform: log-abundance minus the per-row mean of the log-abundances.
abundance_my <- cmultRepl2(t(countData))
clr_my <- local({
  log_abund <- log(abundance_my)
  log_abund - rowMeans(log_abund)
})
dim(abundance_my)

In [None]:
# Long format of the CLR table: one row per (Sample, feature) pair.
clr_my_long <- pivot_longer(
  rownames_to_column(clr_my, "Sample"),
  cols = -c("Sample"),
  names_to = "feature",
  values_to = "value"
)
clr_my_long

In [None]:
# Optionally adjust the CLR values for the factor(s) in ADD_ADJUST_FOR
# (adjust_via_residuals is a project helper), then rebuild both the wide CLR
# table and the abundance table from the adjusted values.
if (ADD_ADJUST_FOR_FACTORS_NB) {
  clr_my_long <- adjust_via_residuals(meta_samples, clr_my_long, ADD_ADJUST_FOR)
  # back to wide format: samples in rows (as row names), features in columns
  clr_my <- as.data.frame(
    column_to_rownames(
      pivot_wider(clr_my_long, names_from = "feature", values_from = "value"),
      "Sample"
    )
  )
  # keep abundance_my consistent with the adjusted CLR values
  abundance_my <- inverse_clr(clr_my)
}

### PCoA (Aitchison)

In [None]:
# Aitchison distance: Euclidean distance between the CLR-transformed samples
dist_my <- dist(clr_my, method = "euclidean")

# share of variance captured by the first three axes, formatted as axis labels
pcoa_res <- ape::pcoa(dist_my)
pve <- round(pcoa_res$values$Relative_eig[1:3] * 100, 1)
pve <- paste0("PC", 1:3, " (", pve, "%)")

# sample coordinates on the first three principal coordinates
pcoa_my <- as.data.frame(cmdscale(dist_my, k = 3)) %>%
  rename(PCo1 = V1, PCo2 = V2, PCo3 = V3)

# Attach the metadata by sample name instead of by position: a positional
# cbind() silently mislabels points whenever meta_samples_NB is ordered
# differently from the rows of clr_my.
meta_rows <- match(rownames(pcoa_my), as.character(meta_samples_NB$Sample))
stopifnot(!anyNA(meta_rows)) # every ordinated sample must have metadata
pcoa_my <- cbind(pcoa_my, meta_samples_NB[meta_rows, , drop = FALSE])

p.dims(9, 3)
# layers shared by both panels
pcoa_layers <- list(
  geom_point(size = 2),
  theme_classic(),
  scale_color_manual(values = COHORT_COLORS),
  # larger bullets in the legend
  guides(color = guide_legend(override.aes = list(size = 4)))
)

# axes 1 vs 2
p1 <- ggplot(pcoa_my, aes(x = PCo1, y = PCo2, color = Group)) +
  pcoa_layers +
  xlab(pve[1]) +
  ylab(pve[2])

# axes 1 vs 3
p2 <- ggplot(pcoa_my, aes(x = PCo1, y = PCo3, color = Group)) +
  pcoa_layers +
  xlab(pve[1]) +
  ylab(pve[3])

# show both panels side by side
gridExtra::grid.arrange(p1, p2, ncol = 2)

### save a copy so that we can later compute the NB for the full set of samples

In [None]:
# Snapshot the unfiltered objects; the sample filter below overwrites the
# originals, and the balance is later evaluated on the full sample set.
clr_my_full <- clr_my
abundance_my_full <- abundance_my
meta_samples_full <- meta_samples
countData_full <- countData

### Create filter #2 for samples

In [None]:
# Samples to keep: those whose Group is among the groups being compared.
to_leave2 <- meta_samples_NB %>%
  filter(Group %in% GROUPS_TO_COMPARE) %>%
  pull(Sample)

length(to_leave2)

### generate a filtering preview (might visualize pre- and post-filtering data here)

In [None]:
# Metadata restricted to the samples that pass filter #2
meta_samples_preview <- filter(meta_samples_NB, Sample %in% to_leave2)
nrow(meta_samples_preview)

# number of retained samples per group
meta_samples_preview %>%
  select(Group) %>%
  table()

In [None]:
# number of samples retained by filter #2
length(to_leave2)
# debugging aid: lists retained samples that are missing from countData
#setdiff(to_leave2, colnames(countData))

In [None]:
# Count table restricted to the retained samples, columns ordered by sample name
countData_preview <- countData[, sort(to_leave2)]

### apply the filter #2

In [None]:
# Commit filter #2: replace the working objects with their filtered versions.
countData <- countData_preview
meta_samples <- meta_samples_preview

# rows of the sample-by-feature tables follow the sorted sample names
abundance_my <- abundance_my[sort(colnames(countData_preview)), ]
clr_my <- clr_my[sort(colnames(countData_preview)), ]

# drop Group levels that no longer occur after filtering
meta_samples$Group <- droplevels(meta_samples$Group)

In [None]:
# The metadata and the count table must describe exactly the same samples.
test_that("countData and meta match", {
  expect_equal(sort(pull(meta_samples, Sample)), sort(colnames(countData)))
})

### Continue: build the metadata data frame and run sanity checks

In [None]:
# Plain data.frame version of the sample metadata, keyed by Sample in the row
# names. No reordering happens here (the sort below is disabled), so the rows
# keep the order of meta_samples.
meta_df <- data.frame(column_to_rownames(meta_samples, "Sample"))
#meta_df = meta_df[order(rownames(meta_df)),]
meta_df

In [None]:
# Downstream steps pair meta_df and clr_my row by row, so their row names must
# agree in both content and order.
test_that("meta_df and matrices have the same samples in the same order", {
  expect_equal(rownames(meta_df), rownames(clr_my))
})

In [None]:
# Features passed an initial prevalence filter, but dropping samples may have
# pushed some of them below the threshold again. Collect the features (rows of
# countData) that are non-zero in fewer than 5 samples; nothing is removed
# here -- the test only asserts that no such feature exists.
low_features <- names(which(rowSums(countData > 0) < 5))
test_that("no low features", {
  expect_equal(length(low_features), 0)
})

### Nearest Balance:

### PERMANOVA

In [None]:
## check for heteroscedasticity across the factor of interest
#anova(betadisper(dist(clr_my), meta_df$sel_factor))

In [None]:
df.dims(8)
# PERMANOVA on the Aitchison distances using the initial model formula;
# terms are tested sequentially (by = "terms").
adonis2(
  reformulate(init_permanova_formula, response = "dist(clr_my)"),
  meta_df,
  permutations = N_PERMANOVA,
  by = "terms"
)

In [None]:
# Same PERMANOVA, now with the model formula used for the nearest balance
adonis2(
  reformulate(lm_nb_formula, response = "dist(clr_my)"),
  meta_df,
  permutations = N_PERMANOVA,
  by = "terms"
)

In [None]:
# Per-feature coefficients for sel_factor_coef, obtained through the project
# helper do_clr_lm_get_fac_coeffs from the CLR table and the metadata.
coef_oa_bacs <- do_clr_lm_get_fac_coeffs(
  lm_nb_formula, clr_my, meta_df, sel_factor_coef
)
coef_oa_bacs

### Prepare splits for cross-validation

In [None]:
# Build n_sim training index sets ("splits") over the rows of meta_df.
if (!REPEAT_MEAS_NB) {
  # Option A (default): partitions balanced by the factor of interest
  splits <- caret::createDataPartition(
    times = n_sim, y = meta_df[[sel_factor]], p = train_prop
  )
} else {
  # Option B (repeated measures): each split keeps one randomly chosen row per
  # level of REPEAT_MEAS_FACTOR. The remaining rows would form the test set,
  # but it is not used.
  splits <- vector(mode = "list", n_sim)
  tmp_hashes <- vector(mode = "list", n_sim)
  # seq_len() keeps the loop empty when n_sim is 0 (1:0 would iterate twice)
  for (i in seq_len(n_sim)) {
    # one random row per participant, drawn independently in every iteration;
    # rownum remembers the row position within meta_df
    subset_meta_df <- meta_df %>%
      mutate(rownum = row_number()) %>%
      dplyr::slice_sample(n = 1, by = dplyr::all_of(REPEAT_MEAS_FACTOR))
    # fingerprint of the draw, used below to verify that draws differ
    tmp_hashes[[i]] <- digest::digest(subset_meta_df)

    # p = 1 selects every row of the subset
    one_subset_split <- caret::createDataPartition(
      times = 1, y = subset_meta_df[[sel_factor]], p = 1
    )
    # translate positions within the subset back to row positions of meta_df
    splits[[i]] <- as.vector(
      unlist(subset_meta_df[unlist(one_subset_split), "rownum"])
    )
  }
  # every iteration must have produced a different subset
  test_that("randomization across iterations", {
    expect_equal(length(unique(tmp_hashes)), n_sim)
  })
  names(splits) <- paste0("Resample", seq_len(n_sim))
}

### Parallel NB generation via cross-validations

In [None]:
# Run the nearest-balance procedure over all splits via the project helper
# parallel_run_nb, using num_rparallel_cores cores.
nb_list <- parallel_run_nb(
  lm_nb_formula, sel_factor_coef, clr_my, meta_df,
  splits, n_sim, num_rparallel_cores
)

In [None]:
# Repeated-measures case: run PERMANOVA inside each one-sample-per-subject
# subset and compare the resulting p-value distribution with a threshold.
if (REPEAT_MEAS_NB) {
  # preallocated: one p-value per split
  pvs_perm <- numeric(n_sim)
  # seq_len() keeps the loop empty when n_sim is 0
  for (i in seq_len(n_sim)) {
    split <- splits[[i]]
    cur_perm <- adonis2(
      reformulate(init_permanova_formula, response = "dist(clr_my[split,])"),
      meta_df[split, , drop = FALSE],
      permutations = N_PERMANOVA,
      by = "terms"
    )
    # p-value of the term of interest
    pvs_perm[i] <- cur_perm[sel_factor, "Pr(>F)"]
  }
  print(paste0("median p: ", median(pvs_perm)))
  print(paste0("sd p: ", sd(pvs_perm)))
  # one-sample Wilcoxon test: are the p-values located above 0.1?
  print(wilcox.test(pvs_perm, mu = 0.1, alternative = "greater"))
}

### Process outputs of NB

In [None]:
# Combine the per-split balances (project helper) and unpack the two results:
# the per-iteration table and the consensus balance.
res <- aggregate_balance_iterations(nb_list, reproducibility_threshold)
#res
sbp_consensus <- res$sbp_consensus
sbp_iters <- res$sbp_iters

In [None]:
# Consensus balance in the format produced by format_consensus_balance
nb <- format_consensus_balance(sbp_consensus)
nb

In [None]:
# project helper; presumably reports how many taxa the balance contains -- TODO confirm
balance_size(nb)

In [None]:
# Value of the consensus balance for every sample
nb_vals <- compute_balance(abundance_my, nb)
#nb_vals

# attach the sample metadata to the balance values
nb_vals_meta <- inner_join(nb_vals, meta_samples, by = "Sample")
nb_vals_meta

### Inspect selected taxa within the balance

In [None]:
# Show the per-iteration rows of one taxon of interest (pattern match on
# taxName), with a temporarily enlarged table display.
df.dims(20)
filter(sbp_iters, taxName %like% "APC11219_1")
df.dims(5)

### Plot the balance values against the factors

In [None]:
# Balance values per level of the selected factor (treated as categorical)
p.dims(4, 3)
ggplot(nb_vals_meta, aes(x = as.factor(!!sym(sel_factor)), y = NB_Value)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  geom_jitter(color = "red", alpha = 0.2, size = 3, width = 0.2) +
  #geom_point(color="red", alpha=0.2, size=3) +
  #geom_smooth(method = "lm", se = FALSE) +
  #geom_text(size = 1.5) +
  stat_smooth(method = "loess", se = FALSE) +
  labs(x = sel_factor, y = "Nearest balance") +
  # the complete theme must come first: adding theme_bw() after theme()
  # would reset legend.position
  theme_bw() +
  theme(legend.position = "none")

In [None]:
# Balance values per Group
p.dims(4, 3)
ggplot(nb_vals_meta, aes(x = Group, y = NB_Value)) +
  # linewidth (not size) sets outline width for line-based geoms
  geom_violin(color = "grey", alpha = 0.7, linewidth = 0.5) +
  geom_boxplot(color = "black", alpha = 1, linewidth = 0.5, width = 0.3) +
  geom_jitter(color = "red", alpha = 0.3, size = 2, width = 0.2) +
  #geom_smooth(method = "lm", se = FALSE) +
  #geom_text(size = 1.5) +
  stat_smooth(method = "loess", se = FALSE) +
  labs(x = "Group", y = "Balance values") +
  # the complete theme must come first: adding theme_bw() after theme()
  # would reset legend.position
  theme_bw() +
  theme(legend.position = "none")

In [None]:
# Linear regression of the balance values on the selected factor,
# returned as a tidy coefficient table
nb_vals_meta %>%
  dplyr::do(
    broom::tidy(lm(reformulate(sel_factor, response = "NB_Value"), data = .))
  )


### go on preparing the balance

In [None]:
# Combine the consensus balance (with per-taxon reproducibility) and the
# per-feature coefficients, both through project helpers.
nb_reprod_coef <- join_coefs_and_sbp_reprod(
  get_sbp_consensus_with_reprod(sbp_iters),
  coef_oa_bacs
)

# show the taxa, most reproducible first
df.dims(20)
arrange(nb_reprod_coef, desc(reprod))
df.dims(5)

In [None]:
# Annotate every taxon of the balance with its cluster metadata and build a
# display label: taxon name followed by its (possibly shortened) species name.
nb_reprod_coef <- nb_reprod_coef %>%
  inner_join(meta_fla_clus, by = c("taxName" = "FlaCluster")) %>%
  mutate(
    # ";"-separated species lists are cut to the first entry and marked "+"
    Cluster_Species_trimmed = ifelse(
      str_detect(Cluster_Species, ";"),
      paste0(str_extract(Cluster_Species, "^[^;]+"), "+"),
      Cluster_Species
    ),
    taxName_ext = paste(taxName, Cluster_Species_trimmed, sep = "; ")
  )
nb_reprod_coef

### Coefficients for each taxon in NB: plot

In [None]:
p.dims(10, 5)
# Bar plot of the coefficient of every taxon in the balance, coloured by
# Cluster_Exp. Taxa are ordered by coefficient; coord_flip() puts them on the
# horizontal axis, hence the tilted tick labels.
pic <- ggplot(
  nb_reprod_coef,
  aes(x = lm_coef, y = reorder(taxName_ext, lm_coef), fill = Cluster_Exp)
) +
  geom_bar(stat = "identity", linewidth = 0.2) +
  coord_flip() +
  scale_alpha_continuous(range = c(0.1, 1)) +
  # fill palette (alternatives: scale_fill_ucscgb(), scale_fill_igv(),
  # scale_fill_manual(values = all_fams_pal))
  scale_fill_simpsons() +
  labs(x = "CLR LM coefficient", y = "") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 6),
    axis.text.x = element_text(angle = 60, hjust = 1),
    plot.margin = margin(l = 20),
    legend.text = element_text(size = 8)
  )
pic

In [None]:
#p.dims(10,5)
p.dims(8, 5)
# Same coefficient bar plot, coloured by Cluster_Pred with the fixed
# FLA_CLASSES_COLORS palette. Taxa are ordered by coefficient; coord_flip()
# puts them on the horizontal axis, hence the tilted tick labels.
pic <- ggplot(
  nb_reprod_coef,
  aes(x = lm_coef, y = reorder(taxName_ext, lm_coef), fill = Cluster_Pred)
) +
  geom_bar(stat = "identity", linewidth = 0.2) +
  coord_flip() +
  scale_alpha_continuous(range = c(0.1, 1)) +
  # alternatives: scale_fill_simpsons(), scale_fill_ucscgb(), scale_fill_igv(),
  # scale_fill_manual(values = all_fams_pal)
  scale_fill_manual(values = FLA_CLASSES_COLORS) +
  labs(x = "CLR LM coefficient", y = "") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 6),
    axis.text.x = element_text(angle = 60, hjust = 1),
    plot.margin = margin(l = 20),
    legend.text = element_text(size = 8)
  )
pic


In [None]:
p.dims(9, 5)
# Coefficient bar plot coloured by Cluster_Family, with publication-style
# theming. Taxa are ordered by coefficient; coord_flip() puts them on the
# horizontal axis.
pic <- ggplot(
  nb_reprod_coef,
  aes(x = lm_coef, y = reorder(taxName_ext, lm_coef), fill = Cluster_Family)
) +
  geom_bar(stat = "identity", linewidth = 0.2) +
  coord_flip() +
  scale_fill_igv() +
  labs(x = "Taxon coefficient", y = "") +
  theme_bw() +
  theme(
    # legend: shown on the right (use legend.position = "bottom" or "none"
    # for the other layouts)
    legend.position = "right",
    legend.text = element_text(size = 10),

    # panel: no grid, solid black frame
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(colour = "black", linewidth = 1),
    strip.background = element_blank(),
    strip.text.x = element_blank(),

    # axes
    axis.line = element_line(colour = "black", linewidth = 0),
    axis.ticks = element_line(colour = "black", linewidth = 0.1),
    # tilted tick labels (angle = 70 suits the longer MAG labels); to hide
    # them use axis.text.x = element_blank() with plot.margin = margin(l = 20)
    axis.text.x = element_text(
      size = 8, color = "black", angle = 60, hjust = 1
    ),
    axis.text.y = element_text(size = 10, color = "black"),
    axis.title.x = element_text(size = 12, face = "bold", color = "black"),
    axis.title.y = element_text(size = 12, face = "bold", color = "black"),

    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.margin = margin(r = 5, l = 40)
  )

pic


In [None]:
# Build the taxon-coefficient bar plot for one combination of factor,
# profiling and taxonomic level.
#
# Reads three objects from the enclosing environment: list_nb_reprod_coef
# (rows to plot), qfactors_plots_n_tables_inv (plot title, looked up by `fac`)
# and all_fams_pal (named fill palette for Family).
#
# fac        value matched against the `factor` column; also the title key
# prof       value matched against the `profiling` column
# tax_level  value matched against the `taxonomic_level` column
#
# Returns the ggplot object invisibly.
fig_maker <- function(fac, prof, tax_level) {
  bar_data <- list_nb_reprod_coef %>%
    filter(factor == fac, profiling == prof, taxonomic_level == tax_level)

  pic <- ggplot(
    bar_data,
    aes(x = lm_coef, y = reorder(taxName_ext, lm_coef), fill = Family)
  ) +
    geom_bar(stat = "identity") +
    # taxa end up on the horizontal axis
    coord_flip() +
    # legend entries follow the order of the palette
    scale_fill_manual(values = all_fams_pal, breaks = names(all_fams_pal)) +
    labs(x = "Taxon coefficient", y = "") +
    # title; this line can be toggled on/off
    ggtitle(qfactors_plots_n_tables_inv[[fac]]) +
    theme_bw() +
    # The three theme() calls stay in this order: later element settings are
    # merged onto earlier ones for the same element.
    theme(axis.text = element_text(size = 10)) +
    theme(
      # tilted tick labels; 70 degrees suits the longer MAG labels (default: 60)
      axis.text.x = element_text(size = 7, angle = 70, hjust = 1),
      plot.margin = margin(l = 40)
    ) +
    theme(
      # legend off (use "right" or "bottom" to show it)
      legend.position = "none",

      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_rect(colour = "black", linewidth = 1),
      strip.background = element_blank(),
      strip.text.x = element_blank(),

      axis.line = element_line(colour = "black", linewidth = 0),
      axis.ticks = element_line(colour = "black", linewidth = 0.1),
      axis.text.x = element_text(size = 8, color = "black"),
      axis.text.y = element_text(size = 9, face = "bold", color = "black"),
      axis.title.x = element_text(size = 10, face = "bold", color = "black"),
      axis.title.y = element_text(size = 10, face = "bold", color = "black"),

      plot.title = element_text(hjust = 0.5, face = "bold")
    )

  invisible(pic)
}

In [None]:
p.dims(10, 5)

# Per-sample values of every feature in the balance, annotated with the sign
# of its b1 coefficient and a multi-line facet label
# (feature / species / predicted class).
ttt <- flapro_rel[[SCENARIO_NB]] %>%
  inner_join(nb_reprod_coef, by = c("feature" = "taxName")) %>%
  select(feature, Sample, value, b1, Cluster_Species, Cluster_Pred) %>%
  inner_join(meta_samples_NB, by = "Sample") %>%
  mutate(
    Association = ifelse(b1 > 0, "positive", "negative"),
    # ";"-separated species lists are cut to the first entry and marked "+"
    Cluster_Species_trimmed = ifelse(
      str_detect(Cluster_Species, ";"),
      paste0(str_extract(Cluster_Species, "^[^;]+"), "+"),
      Cluster_Species
    ),
    FlaCluster_ext = paste0(
      feature, "\n", Cluster_Species_trimmed, "\n", Cluster_Pred
    )
  )
ttt

# NOTE(review): this cell checks ADD_ADJUST_FOR_FACTORS_COMPON, while the
# adjustment earlier in this section is controlled by
# ADD_ADJUST_FOR_FACTORS_NB -- confirm this is the intended flag.
# NOTE(review): the y label says "log10" even when the log scale is skipped.
ggplot(ttt, aes(y = value, x = Group, color = Association)) +
  # log-scaled y axis unless the adjustment flag is set
  (if (!ADD_ADJUST_FOR_FACTORS_COMPON) scale_y_log10() else NULL) +
  labs(y = "Abundance (log10)") +
  geom_boxplot(width = 0.1) +
  # linewidth (not size) sets outline width for line-based geoms
  geom_violin(color = "#888888", alpha = 0.1, linewidth = 0.5) +
  geom_jitter(alpha = 0.1, size = 3, width = 0.3) +
  facet_wrap(~FlaCluster_ext, ncol = 4) +
  scale_color_manual(values = c("negative" = "blue", "positive" = "red")) +
  theme_bw()

### Compute the NB for the full dataset
Meaningful if the above analysis was conducted for a subset of the samples.

In [None]:
# Evaluate the consensus balance on the unfiltered abundance table
nb_vals_full <- compute_balance(abundance_my_full, nb)
dim(nb_vals_full)

# attach the unfiltered sample metadata
nb_vals_meta_full <- inner_join(nb_vals_full, meta_samples_full, by = "Sample")
nb_vals_meta_full

In [None]:
p.dims(8, 4)
# Balance values of all samples per level of the selected factor
ggplot(nb_vals_meta_full, aes(x = as.factor(!!sym(sel_factor)), y = NB_Value)) +
  geom_boxplot(width = 0.3) +
  geom_jitter(
    aes(color = as.factor(!!sym(sel_factor))),
    alpha = 0.7, size = 2, width = 0.3, height = 0
  ) +
  #facet_grid(. ~ !!sym(sel_factor)) +
  #geom_point(color="red", alpha=0.2, size=3) +
  #geom_smooth(method = "lm", se = FALSE) +
  #geom_text(size = 1.5) +
  #stat_smooth(method = "loess", se = FALSE) +
  labs(x = sel_factor, y = "Nearest balance") +
  scale_color_futurama() +
  # request about 10 evenly spaced y-axis breaks
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  # The complete theme must come before the legend tweak: added last,
  # theme_bw() resets legend.position and the colour legend reappears.
  theme_bw() +
  theme(legend.position = "none")