## metadata - flagellins

In [{}]:
# Fill in undefined taxonomy labels from the next-higher rank:
#   - Genus is NA     -> "uncl. <Family>"
#   - Genus is "-"    -> "<Family> gen."
#   - Species is "-"  -> "<Genus> spp." (uses Genus as already fixed above)
meta_fla <- meta_fla %>%
  mutate(Genus = ifelse(is.na(Genus), paste0("uncl. ", Family), Genus)) %>%
  mutate(
    Genus = ifelse(Genus == "-", paste0(Family, " gen."), Genus),
    Species = ifelse(Species == "-", paste0(Genus, " spp."), Species)
  )

In [{}]:
# Build the prefixed cluster ID ("FC_" + Cluster_c4_representative) and
# drop the source column in the same step
meta_fla <- mutate(
  meta_fla,
  FlaCluster = paste0("FC_", Cluster_c4_representative),
  Cluster_c4_representative = NULL
)
meta_fla

In [{}]:
# number of distinct flagellin IDs, then number of distinct flagellin clusters
nrow(distinct(meta_fla, Flagellin_ID))
nrow(distinct(meta_fla, FlaCluster))

In [{}]:
# Per-FlaCluster version of meta_fla: keep only the cluster-level columns
# (in this order) and de-duplicate the rows
meta_fla_clus <- meta_fla %>%
  select(all_of(c(
    "FlaCluster", "Cluster_Pred", "Cluster_Exp",
    "Cluster_Family", "Cluster_Genus", "Cluster_Species"
  ))) %>%
  distinct()
meta_fla_clus

## sample coverage

## feature matrices

In [{}]:
# In every profile table: FlaCluster = "FC_" + FlaCluster_Rep, then drop FlaCluster_Rep
flapro <- lapply(flapro, function(tbl) {
  mutate(
    tbl,
    FlaCluster = paste0("FC_", FlaCluster_Rep),
    FlaCluster_Rep = NULL
  )
})

In [{}]:
# Keep FlaProfiles only for those samples that have metadata (meta_samples$Sample).
# For each omic, prints the omic name, then sample/feature counts before the
# filter and sample/feature counts after it.
# (unique_n is a helper defined elsewhere; presumably it reports the number of
#  distinct values of the given column - TODO confirm)
flapro = lapply(AVAILABLE_OMICS, function(x) {
    print(x)
    y = flapro[[x]]
    # before filtering
    print(y %>% unique_n('samples', Sample))
    print(y %>% unique_n('features', FlaCluster))
    res = y %>% filter(Sample %in% meta_samples$Sample)
    # after filtering: report on the filtered table (res), not on the input (y)
    print(res %>% unique_n('samples present in meta', Sample))
    print(res %>% unique_n('features', FlaCluster))
    res
}) %>% setNames(AVAILABLE_OMICS)

In [{}]:
### for saving: remove zero Fla and make a filtered meta_fla accordingly

In [{}]:
# Per omic: keep only the rows with a strictly positive Abundance
flapro_no_zeros <- lapply(AVAILABLE_OMICS, function(omic) {
  filter(flapro[[omic]], Abundance > 0)
})
names(flapro_no_zeros) <- AVAILABLE_OMICS
flapro_no_zeros

# Per omic: FlaCluster x Sample matrix of the non-zero abundances;
# combinations absent from the long table are filled with 0
flapro_no_zeros_mat <- lapply(AVAILABLE_OMICS, function(omic) {
  wide <- pivot_wider(
    flapro_no_zeros[[omic]],
    names_from = Sample,
    values_from = Abundance,
    values_fill = 0
  )
  as.matrix(column_to_rownames(wide, "FlaCluster"))
})
names(flapro_no_zeros_mat) <- AVAILABLE_OMICS
flapro_no_zeros_mat

In [{}]:
# Per omic: cluster-level metadata restricted to the FlaClusters that occur
# in flapro_no_zeros for that omic
meta_fla_no_zeros <- lapply(AVAILABLE_OMICS, function(omic) {
  detected_clusters <- distinct(select(flapro_no_zeros[[omic]], FlaCluster))
  cluster_meta <- meta_fla %>%
    select(all_of(c(
      "FlaCluster", "Cluster_Family", "Cluster_Genus", "Cluster_Species",
      "Cluster_Pred", "Cluster_Exp", "Content_of_the_cluster"
    ))) %>%
    distinct()
  inner_join(cluster_meta, detected_clusters, by = "FlaCluster")
})
names(meta_fla_no_zeros) <- AVAILABLE_OMICS
meta_fla_no_zeros

### identify the samples with 0 or very low Fla counts

In [{}]:
# Per omic: total Abundance per sample (sorted descending), plus a histogram
# of those totals titled with the omic name
flapro_stats <- lapply(AVAILABLE_OMICS, function(omic) {
  totals <- flapro[[omic]] %>%
    summarise(counts_per_sample = as.double(sum(Abundance)), .by = Sample) %>%
    arrange(desc(counts_per_sample))

  # histogram (p.dims is a helper defined elsewhere; presumably sets the plot size)
  p.dims(20,3)
  totals_hist <- ggplot(totals, aes(counts_per_sample)) +
    geom_histogram(bins = 150) +
    ylab("Number of samples") +
    xlab("Number of reads per sample") +
    ggtitle(omic) +
    theme_minimal()
  plot(totals_hist)

  totals
})
names(flapro_stats) <- AVAILABLE_OMICS

In [{}]:
# Attach the sample metadata; samples absent from meta_samples are dropped
flapro_stats <- lapply(flapro_stats, function(stats_tbl) {
  inner_join(stats_tbl, meta_samples, by = "Sample")
})

In [{}]:
# Add the per-sample Abundance total, sort by it (descending) and switch to the
# generic column names: FlaCluster -> feature, Abundance -> value.
# NOTE: for the component-based branch, samples with 0 or very low Fla counts
# are deliberately NOT dropped here (that would affect case-control!), i.e. no
#   filter(counts_per_sample >= MIN_FLA_READS_PER_SAMPLE)
flapro <- lapply(flapro, function(tbl) {
  with_totals <- ungroup(
    mutate(group_by(tbl, Sample), counts_per_sample = as.double(sum(Abundance)))
  )
  renamed <- with_totals %>%
    arrange(desc(counts_per_sample)) %>%
    rename(feature = FlaCluster, value = Abundance)
  print(tbl %>% unique_n('samples', Sample))
  renamed
})
flapro

In [{}]:
# number of distinct samples per omic in flapro_stats
lapply(flapro_stats, function(stats_tbl) {
  nrow(distinct(stats_tbl, Sample))
})

In [{}]:
# Per omic: the samples whose total Fla count is below MIN_FLA_READS_PER_SAMPLE
samples_with_low_fla <- lapply(flapro_stats, function(stats_tbl) {
  low <- filter(stats_tbl, counts_per_sample < MIN_FLA_READS_PER_SAMPLE)
  select(low, Sample)
})
samples_with_low_fla

In [{}]:
# 1: each value as a percentage of the sample's total Fla reads (a helper for NB branch)
flapro_perc <- lapply(flapro, function(tbl) {
  tbl %>%
    # zero values stay 0; this also avoids 0/0 for null-flagellome samples.
    # The expression is row-wise (counts_per_sample is already per row),
    # so no grouping is needed.
    mutate(value = ifelse(value == 0, 0, 100.0 * value / counts_per_sample)) %>%
    select(-counts_per_sample)
})
flapro_perc

In [{}]:
# 2: abundance relative to the metagenome coverage rather than to the total
# Fla reads (used in component-based branch): value = 1e8 * value / Reads1.
# The per-sample sum is therefore NOT 100%.
# (Reads1 presumably comes from sample_coverage[[omic]] - TODO confirm)
flapro_rel <- lapply(AVAILABLE_OMICS, function(omic) {
  with_coverage <- inner_join(flapro[[omic]], sample_coverage[[omic]], by = "Sample")
  with_coverage %>%
    mutate(value = 1e8 * value / Reads1) %>%
    arrange(feature, desc(value)) %>%
    select(-counts_per_sample, -Reads1)
})
names(flapro_rel) <- AVAILABLE_OMICS
flapro_rel

In [{}]:
# counts_per_sample is not needed in flapro any more
flapro <- lapply(flapro, function(tbl) {
  ungroup(select(tbl, -counts_per_sample))
})
#flapro

#### heatmap of the major Fla (as per flapro_rel)

In [{}]:
# Per omic: feature x Sample matrix of flapro_rel, restricted to the
# FlaClusters listed in meta_fla_no_zeros for that omic
flapro_rel_no_zeros_mat <- lapply(AVAILABLE_OMICS, function(omic) {
  kept_clusters <- select(meta_fla_no_zeros[[omic]], FlaCluster)
  wide <- flapro_rel[[omic]] %>%
    inner_join(kept_clusters, by = c("feature" = "FlaCluster")) %>%
    pivot_wider(names_from = Sample, values_from = value)
  as.matrix(column_to_rownames(wide, "feature"))
})
names(flapro_rel_no_zeros_mat) <- AVAILABLE_OMICS
flapro_rel_no_zeros_mat

In [{}]:
# One clustered heatmap per omic; the matrix is transposed so that samples are
# rows and features are columns
for (x in AVAILABLE_OMICS) {
  y <- flapro_rel_no_zeros_mat[[x]]
  p.dims(9, 5)
  pheatmap(
    t(y),
    main = x,
    # cluster both samples and features
    cluster_rows = TRUE, cluster_cols = TRUE,
    show_rownames = TRUE, show_colnames = TRUE,
    fontsize = 8, fontsize_number = 10, fontsize_row = 7, fontsize_col = 7,
    number_color = "black",
    border_color = "white",
    # margin around the heatmap 20
    margins = c(20, 20)
  )
}

#### save intermediate results to files

In [{}]:
# Per omic, write two wide tables (cluster metadata + one column per sample):
#   flapro_rel_no_zeros_<omic>.tsv    - coverage-relative abundances
#   flapro_counts_no_zeros_<omic>.tsv - values from flapro
# Only the clusters listed in meta_fla_no_zeros[[x]] are kept.
for (x in AVAILABLE_OMICS) {
  flapro_rel_no_zeros_dump <- inner_join(
    meta_fla_no_zeros[[x]],
    flapro_rel[[x]] %>%
      arrange(Sample) %>%
      pivot_wider(names_from = Sample, values_from = value),
    by = c("FlaCluster" = "feature")
  )

  flapro_no_zeros_dump <- inner_join(
    meta_fla_no_zeros[[x]],
    flapro[[x]] %>%
      arrange(Sample) %>%
      pivot_wider(names_from = Sample, values_from = value),
    by = c("FlaCluster" = "feature")
  )

  # save
  write_tsv(flapro_rel_no_zeros_dump, file.path(PROJ_OUTPUT_DIR, paste0("flapro_rel_no_zeros_", x, ".tsv")))
  write_tsv(flapro_no_zeros_dump, file.path(PROJ_OUTPUT_DIR, paste0("flapro_counts_no_zeros_", x, ".tsv")))
}

### flapro_rel: derive the MTX/MGX ratio and add it to this list

In [{}]:
if (DO_MTX_MGX_ratio) {
  # MTX/MGX ratio per (Sample, feature), stored as flapro_rel[["MTX_MGX_ratio"]]
  flapro_rel[["MTX_MGX_ratio"]] <- inner_join(
    rename(flapro_rel[["MGX"]], Abundance_rel_MGX = value),
    rename(flapro_rel[["MTX"]], Abundance_rel_MTX = value),
    by = c("Sample", "feature")
  ) %>%
    # filtering step with a large effect: keep only pairs with a non-zero MGX
    # level (this also keeps the denominator below from being 0). An extra
    # filter on very low MGX + MTX levels is not needed because of this step.
    filter(Abundance_rel_MGX > 0) %>%
    mutate(value = Abundance_rel_MTX / Abundance_rel_MGX) %>%
    select(Sample, feature, value) %>%
    # make the table full (missing Sample x feature pairs become 0) -
    # for correct calculation of feature prevalence
    pivot_wider(names_from = "feature", values_from = "value", values_fill = 0) %>%
    pivot_longer(cols = -Sample, names_to = "feature", values_to = "value")

  print(flapro_rel[["MTX_MGX_ratio"]])
}

### Fla alpha diversity

In [{}]:
if (SCENARIO_COMPON != "MTX_MGX_ratio") {
  # one table per Cluster_Pred value
  list_flapro_by_Cluster_Pred <- group_split(
    inner_join(flapro[[SCENARIO_COMPON]], meta_fla_clus, by = c("feature" = "FlaCluster")),
    Cluster_Pred
  )
  # name each table by the (single) Cluster_Pred value it holds
  names(list_flapro_by_Cluster_Pred) <- lapply(list_flapro_by_Cluster_Pred, function(grp) {
    pull(distinct(select(grp, Cluster_Pred)))
  })
  # keep only the profile columns
  list_flapro_by_Cluster_Pred <- lapply(list_flapro_by_Cluster_Pred, function(grp) {
    select(grp, Sample, value, feature)
  })

  # append the unsplit profile under the name "all"
  list_flapro_by_Cluster_Pred_and_all <- c(
    list_flapro_by_Cluster_Pred,
    list("all" = flapro[[SCENARIO_COMPON]])
  )
}

In [{}]:
if (SCENARIO_COMPON != "MTX_MGX_ratio") {
  # For each subset (per Cluster_Pred, plus "all"): build a phyloseq object and
  # compute Observed / Chao1 / Shannon per sample, returned in long format
  # (Sample, feature = measure name, value).
  list_flapro_alpha <- lapply(names(list_flapro_by_Cluster_Pred_and_all), function(subset_name) {
    long_tbl <- list_flapro_by_Cluster_Pred_and_all[[subset_name]]

    # feature x Sample matrix -> OTU table
    counts_wide <- pivot_wider(long_tbl, names_from = Sample, values_from = value)
    otu_part <- phyloseq::otu_table(
      as.matrix(column_to_rownames(counts_wide, "feature")),
      taxa_are_rows = TRUE
    )

    # cluster metadata -> taxonomy table
    tax_part <- phyloseq::tax_table(
      as.matrix(column_to_rownames(meta_fla_clus, "FlaCluster"))
    )

    # sample metadata -> sample data
    sample_part <- phyloseq::sample_data(column_to_rownames(meta_samples, "Sample"))

    psq <- phyloseq::phyloseq(otu_part, tax_part, sample_part)

    # Fla alpha diversity; the Chao1 standard error column is dropped
    phyloseq::estimate_richness(psq, split = TRUE, measures = c("Observed", "Chao1", "Shannon")) %>%
      rownames_to_column("Sample") %>%
      select(-se.chao1) %>%
      pivot_longer(cols = -Sample, names_to = "feature", values_to = "value")
  })
  names(list_flapro_alpha) <- names(list_flapro_by_Cluster_Pred_and_all)
}

In [{}]:
if (SCENARIO_COMPON != "MTX_MGX_ratio") {
  # suffix every diversity measure with "_F_<subset name>" ...
  list_flapro_alpha <- lapply(names(list_flapro_by_Cluster_Pred_and_all), function(subset_name) {
    mutate(
      list_flapro_alpha[[subset_name]],
      feature = paste0(feature, "_F_", subset_name)
    )
  })
  names(list_flapro_alpha) <- names(list_flapro_by_Cluster_Pred_and_all)

  # ... and stack all subsets into one tibble
  flapro_alpha <- bind_rows(list_flapro_alpha)
  print(flapro_alpha)
}

In [{}]:
if(SCENARIO_COMPON != "MTX_MGX_ratio") {
    #  TODO think about what to save and under which name
    ## save to file
    ##write_tsv(flapro_alpha %>% pivot_wider(names_from = feature, values_from = value), file.path(PROJ_OUTPUT_DIR, paste0("flapro_alpha_", SCENARIO_COMPON, ".tsv")))
}

### filter out the samples with low total Fla (flapro & flapro_perc only)

In [{}]:
# flapro and flapro_perc (but not flapro_rel): drop the samples listed in
# samples_with_low_fla for the respective omic
flapro <- lapply(AVAILABLE_OMICS, function(omic) {
  low_samples <- samples_with_low_fla[[omic]]$Sample
  kept <- filter(flapro[[omic]], !(Sample %in% low_samples))
  # number of samples remaining
  print(nrow(distinct(kept, Sample)))
  kept
})
names(flapro) <- AVAILABLE_OMICS

flapro_perc <- lapply(AVAILABLE_OMICS, function(omic) {
  low_samples <- samples_with_low_fla[[omic]]$Sample
  filter(flapro_perc[[omic]], !(Sample %in% low_samples))
})
names(flapro_perc) <- AVAILABLE_OMICS

### prevalence filtering (flapro & flapro_perc - one way, flapro_rel -> flapro_rel_MAJ - another!)

#### - flapro & flapro_perc

In [{}]:
# Before computing the prevalence, check that each long table is full, i.e.
# contains the same number of zeros as the equivalent wide table would
# (check_full is a helper defined elsewhere)
lapply(flapro_perc, function(tbl) check_full(tbl, arg_feature_col = "feature"))

In [{}]:
# Per omic: prevalence of each feature = % of its rows with
# value > ABUND_CUTOFF_PERC, one row per feature, sorted descending
flapro_perc_w_prev <- lapply(flapro_perc, function(tbl) {
  with_prev <- mutate(
    tbl,
    prev = sum(value > ABUND_CUTOFF_PERC) / n() * 100,
    .by = feature
  )
  with_prev %>%
    select(-c(value, Sample)) %>%
    distinct() %>%
    arrange(desc(prev))
})
flapro_perc_w_prev

In [{}]:
# Horizontal barplot of flagellin-cluster prevalence (features above
# PREVALENCE_CUTOFF only), ordered by prevalence and coloured by Cluster_Pred
#p.dims(15, 10)
p.dims(5, 8)
tt <- filter(flapro_perc_w_prev[[SCENARIO_NB]], prev > PREVALENCE_CUTOFF)
ggplot(
  inner_join(tt, meta_fla_clus, by = c("feature" = "FlaCluster")),
  aes(x = reorder(feature, prev), y = prev, fill = Cluster_Pred)
) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = FLA_CLASSES_COLORS) +
  # NOTE: labelling the axis by Family via scale_x_discrete(labels = tt$Family)
  # was tried and does NOT work properly
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() +
  labs(x = "Flagellin cluster", y = "Prevalence (%)") +
  theme(
    # smaller axis text
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 6),
    # legend on top, and smaller
    legend.position = "top",
    legend.text = element_text(size = 7),
    legend.title = element_text(size = 7),
    legend.key.size = unit(0.5, "cm")
  ) +
  ggtitle(paste0("flapro_perc: ", SCENARIO_NB))

    #flapro_perc_w_prev %>% ggplot(aes(prev)) + geom_histogram(bins = 150) + theme_minimal()

In [{}]:
# FLAPRO_PERC: keep only the features whose prevalence (% of rows with
# value > ABUND_CUTOFF_PERC) exceeds PREVALENCE_CUTOFF
flapro_perc <- lapply(flapro_perc, function(tbl) {
  print(tbl %>% unique_n('features before filtering', feature))
  prevalent <- filter(
    tbl,
    sum(value > ABUND_CUTOFF_PERC) / n() * 100 > PREVALENCE_CUTOFF,
    .by = feature
  )
  print(prevalent %>% unique_n('features after filtering', feature))
  prevalent
})

In [{}]:
# FLAPRO: keep only the features that remain in FLAPRO_PERC for the same omic
flapro <- lapply(AVAILABLE_OMICS, function(omic) {
  semi_join(flapro[[omic]], flapro_perc[[omic]], by = "feature")
})
names(flapro) <- AVAILABLE_OMICS

#### extra round of filtering: drop zero samples that might have appeared after the prevalence filtering

In [{}]:
# Per omic: total value per sample after the feature filtering, sorted descending
flapro_stats2 <- lapply(AVAILABLE_OMICS, function(omic) {
  flapro[[omic]] %>%
    summarise(counts_per_sample = as.double(sum(value)), .by = Sample) %>%
    arrange(desc(counts_per_sample))
})
names(flapro_stats2) <- AVAILABLE_OMICS

In [{}]:
# Per omic: samples whose total is below MIN_FLA_READS_PER_SAMPLE_ROUND_2
samples_with_0_fla2 <- lapply(flapro_stats2, function(stats_tbl) {
  low <- filter(stats_tbl, counts_per_sample < MIN_FLA_READS_PER_SAMPLE_ROUND_2)
  select(low, Sample)
})
samples_with_0_fla2

In [{}]:
# flapro and flapro_perc, round 2: drop the samples listed in samples_with_0_fla2
flapro <- lapply(AVAILABLE_OMICS, function(omic) {
  low_samples <- samples_with_0_fla2[[omic]]$Sample
  filter(flapro[[omic]], !(Sample %in% low_samples))
})
names(flapro) <- AVAILABLE_OMICS

flapro_perc <- lapply(AVAILABLE_OMICS, function(omic) {
  low_samples <- samples_with_0_fla2[[omic]]$Sample
  filter(flapro_perc[[omic]], !(Sample %in% low_samples))
})
names(flapro_perc) <- AVAILABLE_OMICS

#### - flapro_rel ---> flapro_rel_maj

In [{}]:
# number of distinct features in the selected flapro_rel table
nrow(distinct(flapro_rel[[SCENARIO_COMPON]], feature))

In [{}]:
# Prevalence of each feature in the selected flapro_rel table = % of its rows
# with value > REL_AB_ABUND_CUTOFF; one row per feature, sorted descending
flapro_rel_w_prev <- flapro_rel[[SCENARIO_COMPON]] %>%
  mutate(
    prev = sum(value > REL_AB_ABUND_CUTOFF) / n() * 100,
    .by = feature
  ) %>%
  select(-c(value, Sample)) %>%
  distinct() %>%
  arrange(desc(prev))
flapro_rel_w_prev

In [{}]:
# Histogram of feature prevalence (flapro_rel_w_prev), axes limited to
# 0-100 (x) and 0-50 (y)
p.dims(8, 2)
ggplot(flapro_rel_w_prev, aes(prev)) +
  geom_histogram(bins = 150) +
  theme_minimal() +
  xlab("Prevalence (%)") +
  ylab("Number of features") +
  ggtitle(SCENARIO_COMPON) +
  theme(
    # smaller axis text
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 6),
    # legend on top, and smaller
    legend.position = "top",
    legend.text = element_text(size = 7),
    legend.title = element_text(size = 7),
    legend.key.size = unit(0.5, "cm")
  ) +
  scale_x_continuous(limits = c(0, 100)) +
  scale_y_continuous(limits = c(0, 50))

In [{}]:
# (OFF - disabled via if (FALSE)) for the ratio: additionally require high
# prevalence by MTX
if (FALSE) {
  if (SCENARIO_COMPON == "MTX_MGX_ratio") {
    # features whose MTX prevalence exceeds REL_AB_PREVALENCE_CUTOFF
    prev_by_mtx <- flapro_rel[["MTX"]] %>%
      mutate(
        prev_mtx = sum(value > REL_AB_ABUND_CUTOFF) / length(value) * 100,
        .by = feature
      ) %>%
      select(-c(value, Sample)) %>%
      distinct() %>%
      filter(prev_mtx > REL_AB_PREVALENCE_CUTOFF) %>%
      arrange(desc(prev_mtx))

    print(prev_by_mtx)

    # keep only those features in the ratio prevalence table
    flapro_rel_w_prev <- semi_join(flapro_rel_w_prev, prev_by_mtx, by = "feature")
  }

  flapro_rel_w_prev
}

In [{}]:
# Horizontal barplot of feature prevalence in flapro_rel (features above
# REL_AB_PREVALENCE_CUTOFF only), ordered by prevalence and coloured by Cluster_Pred
p.dims(5, 8)
tt <- filter(flapro_rel_w_prev, prev > REL_AB_PREVALENCE_CUTOFF)
ggplot(
  inner_join(tt, meta_fla_clus, by = c("feature" = "FlaCluster")),
  aes(x = reorder(feature, prev), y = prev, fill = Cluster_Pred)
) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = FLA_CLASSES_COLORS) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() +
  labs(x = "Flagellin cluster", y = "Prevalence (%)") +
  theme(
    # smaller axis text
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 6),
    # legend on top, and smaller
    legend.position = "top",
    legend.text = element_text(size = 7),
    legend.title = element_text(size = 7),
    legend.key.size = unit(0.5, "cm")
  ) +
  ggtitle(paste0("flapro_rel: ", SCENARIO_COMPON))

In [{}]:
# flapro_rel has its own prevalence/abundance cutoffs: keep the features with
# prev > REL_AB_PREVALENCE_CUTOFF (prev itself is then dropped) as flapro_rel_maj
flapro_rel_maj <- select(
  filter(flapro_rel_w_prev, prev > REL_AB_PREVALENCE_CUTOFF),
  -prev
)
flapro_rel_maj %>% unique_n('features after filtering', feature)

In [{}]:
# bring back the per-sample values for the retained features
flapro_rel_maj <- inner_join(
  flapro_rel_maj,
  flapro_rel[[SCENARIO_COMPON]],
  by = "feature"
)
flapro_rel_maj

In [{}]:
# number of retained features per Cluster_Pred class
flapro_rel_maj %>%
  inner_join(meta_fla_clus, by = c("feature" = "FlaCluster")) %>%
  distinct(feature, Cluster_Pred) %>%
  select(Cluster_Pred) %>%
  table()

In [{}]:
# log-transform the fla profiles: log10(value + 1)
flapro_rel_maj_log <- mutate(flapro_rel_maj, value = log10(1 + value))
flapro_rel_maj_log

### prep-s for NB: init countData, for the selected omic (in line with flapro and flapro_perc)

In [{}]:
# Count table for the NB branch: features in rows, samples in columns,
# built from flapro[[SCENARIO_NB]].
countData = 
    flapro[[SCENARIO_NB]] %>%
    rename('sample' = Sample) %>%
    select(sample, feature, value) %>%
    # only leave the samples that are in the meta
    semi_join(meta_samples, by = c("sample" = "Sample")) %>%
    # missing sample x feature combinations become 0
    pivot_wider(names_from = "sample", values_from = "value", values_fill=0) %>%
    # convert countData to a data frame with name moved to rownames
    column_to_rownames("feature") %>%
    as.data.frame()

# sort columns and rows alphabetically;
# drop = FALSE keeps a data frame even when only one sample column is left
# (without it, `[` would collapse the result to a plain vector)
countData = countData[sort(rownames(countData)), sort(colnames(countData)), drop = FALSE]
countData

In [{}]:
# Per omic: list the samples (if any) whose total value in flapro is exactly 0
setNames(
  lapply(AVAILABLE_OMICS, function(omic) {
    flapro[[omic]] %>%
      summarise(counts_per_sample = as.double(sum(value)), .by = Sample) %>%
      arrange(desc(counts_per_sample)) %>%
      filter(counts_per_sample == 0)
  }),
  AVAILABLE_OMICS
)

In [{}]:
# Sample metadata restricted to the samples present in flapro[[SCENARIO_NB]],
# sorted by Sample
meta_samples_NB <- arrange(
  semi_join(meta_samples, flapro[[SCENARIO_NB]], by = "Sample"),
  Sample
)
meta_samples_NB

In [{}]:
# the columns of countData must be exactly the samples of meta_samples_NB, in the same order
test_that("flapro and meta fit", {
  expect_equal(colnames(countData), pull(meta_samples_NB, Sample))
})

# Analysis