In [11]:
library(vegan)
library(repr)
library(ggplot2)
library(dplyr)

Loading required package: permute
Loading required package: lattice
This is vegan 2.5-5

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [21]:
# Load the sample metadata and keep every sample whose CollectionAgency is
# not 'Preheim'; the surviving row names define the sample subset used below.
env_data_file = "../otu_data/WaterQualityData/matched_cleaned_data/all_mdata_with_habitat.txt"
env_data = read.delim(env_data_file, row.names=1)
env_data = env_data[env_data$CollectionAgency != 'Preheim', ]
select_samples <- rownames(env_data)

# Read the rarefied table (tsv.data) and the unrarefied table (tsv.data2);
# the first column of each file supplies the row names.
tsv.data <- read.delim("../otu_data/dispersal_selection_data/final_rarefied_table.tsv", row.names=1)
tsv.data2 <- read.delim("../otu_data/final_unrarefied_table.txt", row.names=1)
print(dim(tsv.data))
print(dim(tsv.data2))

# Restrict both tables to the selected samples (rows are looked up by name).
# tsv.data.x keeps the subset *before* empty rows/columns are dropped.
tsv.data.x = tsv.data[select_samples, ]
print(dim(tsv.data.x))
tsv.data2 = tsv.data2[select_samples, ]
print(dim(tsv.data2))

# Drop rows and columns whose counts sum to zero after subsetting.
# NOTE: tsv.data is overwritten here with the filtered subset.
tsv.data =  tsv.data.x[rowSums(tsv.data.x) > 0, colSums(tsv.data.x) > 0]
tsv.data2 =  tsv.data2[rowSums(tsv.data2) > 0, colSums(tsv.data2) > 0]
print(dim(tsv.data))
print(dim(tsv.data2))

[1]  362 1561
[1]   357 20964
[1]  204 1561
[1]   204 20964
[1]  204 1432
[1]   204 14000


In [None]:
# Read the pairwise tree-distance table (first column supplies row names).
phydf_full <- read.delim("../otu_data/dispersal_selection_data/not_full_tree_distances.tsv", row.names=1)

# Subset rows and columns to the columns retained in tsv.data, then convert
# to a plain matrix (phydist) for the Mantel correlograms further down.
# NOTE(review): this assumes every column name of tsv.data is present in
# phydf_full; a missing name would make the column lookup fail.
phydf = phydf_full[ colnames(tsv.data), colnames(tsv.data) ]
phydist = as.matrix(phydf)

print(dim(phydf))

In [None]:
taxa_file = "../otu_data/dada2_outputs/taxa_table_with_OTUs.txt"
pre_taxa_df = read.delim(taxa_file, row.names=1)
pre_taxa_df[pre_taxa_df==""] <- NA

# Prefix every non-missing label in column `child` with the label found in
# column `parent` ("<parent> <child>"), so that identical child names under
# different parents stay distinct. Rows with a missing child are left as NA;
# row order, row names and all other columns are untouched.
# The child column is returned as character.
prefix_with_parent <- function(taxa, child, parent) {
    has_child <- !is.na(taxa[[child]])
    child_vals <- as.character(taxa[[child]])
    # paste() turns a missing parent into the literal "NA", as the previous
    # split / paste / rbind implementation did.
    child_vals[has_child] <- paste(taxa[[parent]][has_child], child_vals[has_child], sep=" ")
    taxa[[child]] <- child_vals
    taxa
}

# Work from the most specific rank upwards so each rank is prefixed with the
# *unmodified* label of its parent (one level of context, not the full chain).

# Fix species
taxa_df1 = prefix_with_parent(pre_taxa_df, 'Species', 'Genus')

#Fix genus
taxa_df2 = prefix_with_parent(taxa_df1, 'Genus', 'Family')
'Cyanobiaceae Cyanobium_PCC-6307' %in% taxa_df2$Genus

#Fix family
taxa_df3 = prefix_with_parent(taxa_df2, 'Family', 'Order')
"Synechococcales Cyanobiaceae" %in% taxa_df3$Family

#Fix Order
taxa_df4 = prefix_with_parent(taxa_df3, 'Order', 'Class')
'Oxyphotobacteria Synechococcales'%in% taxa_df4$Order

#Fix Class
taxa_df5 = prefix_with_parent(taxa_df4, 'Class', 'Phylum')
'Cyanobacteria Oxyphotobacteria'%in% taxa_df5$Class

#Fix Phylum
taxa_df = prefix_with_parent(taxa_df5, 'Phylum', 'Kingdom')
'Bacteria Cyanobacteria'%in% taxa_df$Phylum

# Keep only the entries that are columns of the unrarefied table, in that order.
taxa_df = taxa_df[colnames(tsv.data2), ]
taxa_df[taxa_df == ""] <- NA

In [None]:
# For every taxonomic rank (column of taxa_df): how many distinct non-missing
# labels occur, and what percentage of rows carry a label at that rank.
count_distinct_labels <- function(labels) {
    present <- labels[!is.na(labels)]
    length(unique(present))
}
classif_df <- data.frame('num.classes' = apply(taxa_df, 2, count_distinct_labels))
labelled_fraction <- colSums(!is.na(taxa_df)) / nrow(taxa_df)
classif_df$asvs.classified <- round(labelled_fraction * 100, 2)
classif_df
# Report which of the cached distance files are already on disk.
file.exists('/Volumes/KeithSSD/CB_V4/otu_data/KLD2.RData')
file.exists('/Volumes/KeithSSD/CB_V4/otu_data/ASVEuc.RData')
file.exists('/Volumes/KeithSSD/CB_V4/otu_data/ASVCorr.RData')

In [None]:
# Add a pseudocount of 1 to every cell and convert each row of tsv.data2
# to proportions that sum to 1.
tsv.data3 = (tsv.data2+1)/rowSums((tsv.data2+1))
# Transpose so that the columns of tsv.data2 become rows.
tsv.data.prep <- t(tsv.data3)
# Compute the total relative abundance of each row and rescale so that
# every row of tsv.data.prep sums to 1.
otu.sums <- rowSums(tsv.data.prep)
tsv.data.prep <- tsv.data.prep/otu.sums

# Each block below computes one pairwise matrix between the rows of
# tsv.data.prep and caches it on disk; it is skipped when the cache file
# already exists. The in-memory copy is removed after saving.
if (!file.exists('/Volumes/KeithSSD/CB_V4/otu_data/KLD2.RData')) {
    library(philentropy)
    # Compute KL divergence
    distances <- KL(tsv.data.prep, unit='log')
    otu.dist.mat = distances
    rownames(otu.dist.mat) <- rownames(tsv.data.prep)
    colnames(otu.dist.mat) <- rownames(tsv.data.prep)
    save(otu.dist.mat, file='/Volumes/KeithSSD/CB_V4/otu_data/KLD2.RData')
    rm(otu.dist.mat)
}
if (!file.exists('/Volumes/KeithSSD/CB_V4/otu_data/ASVEuc.RData')) {
    # dist() default distance, stored as a full square matrix named otu.dist.mat
    # (same object name as the KL cache above).
    otu.dist.mat = as.matrix(dist(tsv.data.prep))
    save(otu.dist.mat, file='/Volumes/KeithSSD/CB_V4/otu_data/ASVEuc.RData')
    rm(otu.dist.mat)
}
if (!file.exists('/Volumes/KeithSSD/CB_V4/otu_data/ASVCorr.RData')) {
    # Correlation between rows of tsv.data.prep, stored as otu.dist.mat2.
    otu.dist.mat2 = cor(t(tsv.data.prep))
    save(otu.dist.mat2, file='/Volumes/KeithSSD/CB_V4/otu_data/ASVCorr.RData')
    rm(otu.dist.mat2)
}

In [None]:
# Restore the objects stored in the ASVEuc cache and convert `otu.dist.mat`
# to a `dist` object.
# NOTE(review): otu.dist_ is not referenced again in the visible cells.
load('/Volumes/KeithSSD/CB_V4/otu_data/ASVEuc.RData')
otu.dist_ = as.dist(otu.dist.mat)


In [None]:
# Restore the objects stored in the ASVEuc and ASVCorr cache files; the loop
# below reads `otu.dist.mat` and `otu.dist.mat2`.
load('/Volumes/KeithSSD/CB_V4/otu_data/ASVEuc.RData')
load('/Volumes/KeithSSD/CB_V4/otu_data/ASVCorr.RData')
# Align the taxonomy table with the rows of tsv.data.prep (lookup by row name).
full_taxa_df = taxa_df[rownames(tsv.data.prep), ]

In [None]:
# Build niche_df: one row per taxonomic group (at every rank) holding the
# group's size and the mean pairwise value, among its members, taken from
# otu.dist.mat, otu.dist.mat2 and the tree-distance table.
# The frame starts empty and is grown one row at a time inside the loop.
niche_df = data.frame('name'=c(),
                      'level'=c(),
                      'within.class.euc'=c(),
                      'within.class.phyd'=c(),
                      'within.class.cor'=c(),
                      'n.otus'=c())

counter = 0
taxa_levels = colnames(full_taxa_df)
for (a_level in taxa_levels){
    # progress: rank name and number of rows written so far
    cat(a_level, " ", counter, "\n")
    # drop rows with no label at this rank
    sub_taxa = full_taxa_df[!is.na(full_taxa_df[,a_level]),]
    # keep only labels that occur more than twice (i.e. groups of 3 or more)
    sub_taxa2 = sub_taxa[sub_taxa[,a_level] %in% names(which(table(sub_taxa[,a_level]) > 2)), ]
    # extract names at this level
    avail_types = unique(sub_taxa2[,a_level])
    for (a_t in avail_types){
        # row names of the members of this group
        otus_i = rownames(sub_taxa2[which(sub_taxa2[,a_level] == a_t),])
        # make the row counter
        counter = counter + 1
        # record the name, level, and count
        niche_df[counter, 'name'] = a_t
        niche_df[counter, 'level'] = a_level
        niche_df[counter, 'n.otus'] = length(otus_i)
        
        # mean of the upper triangle of otu.dist.mat (stored as 'within.class.euc')
        # and of otu.dist.mat2 (stored as 'within.class.cor') among the members
        sub_dist.a = otu.dist.mat[otus_i, otus_i]
        niche_df[counter, 'within.class.euc'] = mean(sub_dist.a[upper.tri(sub_dist.a)])
        sub_dist.a2 = otu.dist.mat2[otus_i, otus_i]
        niche_df[counter, 'within.class.cor'] = mean(sub_dist.a2[upper.tri(sub_dist.a2)])
        
        # restrict to members that are columns of phydf; the tree-distance mean
        # is only recorded when at least two remain (otherwise the cell stays NA)
        otus_ip = intersect(otus_i, colnames(phydf))
        if (length(otus_ip) > 1) {
            sub_dist.p = phydf_full[otus_ip, otus_ip]
            niche_df[counter, 'within.class.phyd'] = mean(sub_dist.p[upper.tri(sub_dist.p)])
        }
    }
}

# order the rank factor the same way as the taxonomy columns
niche_df$level <- factor(niche_df$level, levels=taxa_levels)

In [None]:
# Keep rows that have a group size and whose rank is neither Kingdom nor Species.
niche_df2 <- niche_df[!is.na(niche_df$n.otus) &
                      !(niche_df$level %in% c('Kingdom', 'Species')), ]

# Label every row of `df` with the quartile (Q1..Q4) of its `n.otus` value,
# computed separately within each value of `df$level`.
#
# Arguments:
#   df  data frame with a `level` column and a numeric `n.otus` column.
# Returns:
#   The rows belonging to Phylum, Class, Order, Family and Genus (in that
#   order; other levels are dropped) with an added/overwritten factor column
#   `n.ASVs` whose levels are Q1, Q2, Q3, Q4.
# Side effects:
#   Prints, per level, the row count and the number of rows in each quartile,
#   and emits a message with the NA count of every returned column.
#
# Bin edges are [1, q25], (q25, q50], (q50, q75], (q75, max]; values below 1
# get NA. Bins are assigned by comparison against the quartiles rather than
# with cut(), because cut() stops with "'breaks' are not unique" whenever two
# quartiles coincide (common for small, heavily tied counts).
quantile_by_level <- function(df){
    quartile_labels = paste('Q', 1:4, sep="")
    class_niches = list()
    for (lev in unique(df$level)){
        class_niche = df[df$level == lev, ]
        cat(lev, " ", nrow(class_niche), " ")
        sizes = class_niche$n.otus
        class_qs = unname(quantile(sizes, probs = c(.25,.5,.75)))
        # number of quartile boundaries strictly below the value, plus one
        bin = 1L + (sizes > class_qs[1]) + (sizes > class_qs[2]) + (sizes > class_qs[3])
        bin[sizes < 1] = NA
        class_niche$n.ASVs = factor(quartile_labels[bin], levels=quartile_labels)
        cat(table(class_niche$n.ASVs), "\n")
        class_niches[[lev]] = class_niche
    }
    new_df = rbind.data.frame(class_niches[['Phylum']], class_niches[['Class']],
                                 class_niches[['Order']], class_niches[['Family']],
                                 class_niches[['Genus']])
    new_df$n.ASVs <- factor(new_df$n.ASVs, levels=quartile_labels)
    na_counts = colSums(is.na(new_df))
    message("NA count per column: ",
            paste(names(na_counts), na_counts, sep="=", collapse=" "))
    return(new_df)
}

# Quartile labels for all retained groups, then again for the subset that has
# a tree-distance value (quartiles are recomputed on that subset).
niche_df3 <- quantile_by_level(niche_df2)
niche_df4 <- niche_df3[!is.na(niche_df3$within.class.phyd), ]
niche_df5 <- quantile_by_level(niche_df4)

# Presentation names for the plotting cells: subset with tree distances ...
names(niche_df5)[names(niche_df5) == 'level'] <- "Level"
names(niche_df5)[names(niche_df5) == 'within.class.euc'] <- "Within.Class.Dist"
names(niche_df5)[names(niche_df5) == 'within.class.phyd'] <- "Within.Class.PD"
names(niche_df5)[names(niche_df5) == 'within.class.cor'] <- "Within.Class.Corr"

# ... and the full set of groups.
names(niche_df3)[names(niche_df3) == 'level'] <- "Level"
names(niche_df3)[names(niche_df3) == 'within.class.euc'] <- "Within.Class.Dist"
names(niche_df3)[names(niche_df3) == 'within.class.phyd'] <- "Within.Class.PD"
names(niche_df3)[names(niche_df3) == 'within.class.cor'] <- "Within.Class.Corr"

In [None]:
options(repr.plot.width=4, repr.plot.height=3)

# `niche_df_nok` was never assigned anywhere in the notebook, so this cell
# failed on a fresh kernel. Bind it explicitly.
# NOTE(review): niche_df5 (groups that have a tree-distance value, quartiles
# recomputed on that subset) is assumed so both panels show the same groups;
# switch to niche_df3 if the first panel was meant to include every group.
niche_df_nok = niche_df5

# Within-group distance by rank, split by group-size quartile.
ggplot(data = niche_df_nok, aes(x = Level, y = Within.Class.Dist, fill=n.ASVs)) +
  geom_boxplot(position = position_dodge2(width = 0.8)) + 
  theme_classic() + coord_cartesian(ylim=c(0.0,0.4)) + scale_fill_grey()

# Within-group tree distance by rank, split by group-size quartile.
ggplot(data = niche_df_nok, aes(x = Level, y = Within.Class.PD, fill=n.ASVs)) +
  geom_boxplot(position = position_dodge2(width = 0.8)) +  
  theme_classic() + coord_cartesian(ylim=c(0.0,0.8)) + scale_fill_grey()


In [3]:
library(pvclust)
library(parallel)
# Restore the saved clustering workspace.
# NOTE(review): the cells below use `res.pv`, which is presumably one of the
# objects stored in this file — it is not created anywhere in the notebook.
load('/Volumes/KeithSSD/CB_V4/otu_data/otuclusters.RData')

In [4]:
# Collect, for each pvpick alpha cutoff, the size of every cluster it yields.
# cluster_sizes gets one row per (cutoff, cluster) with columns `sizes` and
# `cutoff`; `counter` is the number of rows written so far.
cluster_sizes = data.frame('sizes'=c(), 'cutoff'=c())

counter=0
for (cutoff_ in seq(0.94, 0.99, 0.005)){
    clusters_i <- pvpick(res.pv, alpha=cutoff_)$clusters
    clust_lens = unlist(lapply(clusters_i, FUN=length))
    cat(cutoff_, length(clust_lens), "\n")
    # Nothing to append when no cluster passes this cutoff; without the guard
    # (counter+1):counter would count backwards and touch existing rows.
    if (length(clust_lens) > 0) {
        # Advance the counter once, AFTER writing. It used to be advanced both
        # before and after, which left a block of all-NA rows per cutoff.
        rows_i = (counter+1):(counter+length(clust_lens))
        cluster_sizes[rows_i, 'sizes'] = clust_lens
        cluster_sizes[rows_i, 'cutoff'] = cutoff_
        counter = counter + length(clust_lens)
    }
}


0.94 2 
0.945 195 
0.95 180 
0.955 169 
0.96 165 
0.965 149 
0.97 135 
0.975 120 
0.98 97 
0.985 73 
0.99 47 


In [7]:
# Cross-tabulate: rows are alpha cutoffs, columns are cluster sizes, cells are
# the number of clusters of that size at that cutoff.
table(cluster_sizes$cutoff, cluster_sizes$sizes)

       
         2  3  4  5  6  7  8  9 10 11 12 18 40 57 129 1504
  0.94   0  0  0  0  0  0  0  0  0  0  0  0  0  1   0    1
  0.945 80 42 25 18 10  5  3  4  2  2  1  1  0  1   1    0
  0.95  72 37 24 18 10  4  3  4  2  2  1  1  0  1   1    0
  0.955 64 35 23 18 11  3  3  4  2  2  1  1  1  0   1    0
  0.96  66 27 28 18 10  2  3  4  2  2  1  1  0  0   1    0
  0.965 61 24 24 16  8  2  3  4  2  2  1  1  0  0   1    0
  0.97  60 18 19 15  7  2  3  4  2  2  1  1  0  0   1    0
  0.975 58 13 15 15  7  1  2  4  2  1  0  1  0  0   1    0
  0.98  49 12 11 13  6  0  0  2  2  0  0  1  0  0   1    0
  0.985 37  7 10  8  5  0  0  2  2  0  0  1  0  0   1    0
  0.99  21  6  6  5  4  1  0  0  2  0  0  1  0  0   1    0

In [29]:
# Assign every member of each pvpick cluster (default alpha) its cluster index,
# keyed by row name.
final_clusters = pvpick(res.pv)$clusters
cluster_labels = data.frame('cluster_number'=c())
# seq_along() so that an empty cluster list is simply skipped (1:0 would
# iterate over 1 and 0 and fail on final_clusters[[1]]).
for (a_clust in seq_along(final_clusters)){
    cluster_labels[final_clusters[[a_clust]], 'cluster_number'] = a_clust
}
# Columns of the pre-filter table that belong to no cluster get an NA label.
unclustered = setdiff(colnames(tsv.data.x), rownames(cluster_labels))
cluster_labels[unclustered, 'cluster_number'] <- NA
dim(cluster_labels)
# Entries that are not columns of the filtered table are blanked as well.
not_detected = setdiff(rownames(cluster_labels), colnames(tsv.data))
cluster_labels[not_detected, 'cluster_number'] <- NA
# Shift the remaining labels down by one.
is_labelled = !is.na(cluster_labels$cluster_number)
cluster_labels[is_labelled, 'cluster_number'] = cluster_labels[is_labelled, 'cluster_number'] - 1

cluster_labels$cluster_number <- factor(cluster_labels$cluster_number)

str(cluster_labels$cluster_number)
sum(is.na(cluster_labels$cluster_number))
# Persist the label vector as a tab-separated table with row and column names.
write.table(x = cluster_labels, 
            file = '/Volumes/KeithSSD/CB_V4/otu_data/otu_cluster_vector.txt', 
            row.names = TRUE,
            col.names = TRUE, 
            sep = '\t')


 Factor w/ 179 levels "1","2","3","4",..: NA NA NA NA NA NA NA NA NA NA ...


In [None]:
# For every taxonomic rank: take each label that occurs more than once,
# compute the median pairwise tree distance among its members, and store
#   dist_by_level[[rank]]  - named numeric vector, one median per label
#   level_medians[[rank]]  - median of those per-label medians
level_medians = list()
dist_by_level = list()
taxa_levels = colnames(taxa_df)
for (a_level in taxa_levels){
    print(a_level)
    sub_taxa = taxa_df[!is.na(taxa_df[,a_level]),]
    sub_taxa2 = sub_taxa[sub_taxa[,a_level] %in% names(which(table(sub_taxa[,a_level]) > 1)), ]
    print(c(dim(sub_taxa)[1], 'rows'))
    avail_types = unique(sub_taxa2[,a_level])
    print(c(length(avail_types), 'appear >1 times'))
    # numeric accumulator; seq_along() keeps the loop from running when no
    # label qualifies (1:0 would iterate over 1 and 0)
    medians_i = numeric(length(avail_types))
    for (t_i in seq_along(avail_types)){
        type_i = avail_types[t_i]
        otus_i = rownames(sub_taxa2[which(sub_taxa2[,a_level] == type_i),])
        # NOTE(review): unlike the niche loop, members are not intersected
        # with the tree table first; a member missing from phydf_full would
        # make this lookup fail — confirm every entry is on the tree.
        sub_dist = phydf_full[otus_i, otus_i]
        med_val = median(as.dist(sub_dist))
        medians_i[t_i] = med_val
    }
    names(medians_i) <- avail_types
    dist_by_level[[a_level]] = medians_i
    level_medians[[a_level]] = median(medians_i)
}
print(unlist(level_medians))
print(max(phydf_full))


In [None]:
# create simulated community
# permatswap draws random permutations, so fix the RNG state first; without a
# seed every kernel restart produced a different null community (and hence a
# different null correlogram).
set.seed(42)
# Ten "quasiswap" permutations of tsv.data are generated and the fifth is used.
sim.tsv <- permatswap(tsv.data, "quasiswap", times=10)$perm[[5]]

In [None]:
# Bray-Curtis dissimilarity between the columns of the observed table and of
# the simulated table (both are transposed before vegdist).
exp.eco.dist <- vegdist(t(tsv.data), method='bray')
sim.eco.dist <- vegdist(t(sim.tsv), method="bray")

# Full square-matrix versions for mantel.correlog.
exp.comm.dist <- as.matrix(exp.eco.dist)
sim.comm.dist <- as.matrix(sim.eco.dist)

# Dimensions of both matrices: observed first, then simulated.
matrix_dims <- c(dim(exp.comm.dist), dim(sim.comm.dist))
print(matrix_dims)

In [None]:
options(repr.plot.width=6, repr.plot.height=6)
par(mar=c(4,7,4,1)+.1)

# One horizontal box per rank, most specific rank first.
plot_levels = c('Species', 'Genus', 'Family', 'Order', 'Class', 'Phylum', 'Kingdom')
# The "(n=...)" part of each axis label is computed from the data (number of
# non-missing per-label medians at that rank) instead of being typed in by
# hand, so the labels cannot drift out of sync with dist_by_level.
level_counts = sapply(dist_by_level[plot_levels], function(x) sum(!is.na(x)))
boxplot(dist_by_level[plot_levels],
        main = "Median Phylogenetic Distance in a Taxonomic Categories by Level",
    at = 0:6*2,
    names = paste(plot_levels, " (n=", level_counts, ")", sep=""),
    las = 2,
    horizontal = TRUE
)

In [None]:
# Mantel correlogram of the observed community distances against tree distance.

exp_gram_f1 = "../otu_data/dispersal_selection_data/exp_correlog2.RData"
# Distance-class boundaries shared by the observed and null correlograms.
our_breaks = c(0, 0.001, 0.005, 0.015, 0.025, 0.03, 0.04, 0.05, 0.06, 0.07, 0.09, 0.11, 0.13, 0.15, 0.17,
               0.20, 0.22, 0.24, 0.26, 0.28, 0.30, 0.4, 0.5, 0.6, 0.7, 0.9, 1.1, 2.55, 4)


if (file.exists(exp_gram_f1)) {
    # Reuse the cached result (expected to define `exp.correlog`).
    load(exp_gram_f1)
    write("Located File 1", stdout())
} else {
    # NOTE(review): nperm=1 cannot give usable permutation p-values, yet a
    # later cell thresholds 'Pr(corrected)' at 0.05 — confirm the intended
    # permutation count before relying on a freshly computed result.
    exp.correlog = mantel.correlog(exp.comm.dist, D.geo=phydist, mult="BH", 
                                    r.type="spearman", cutoff = FALSE,
                                    nperm=1, break.pts=our_breaks)
    # Populate the cache; previously the result was computed but never saved,
    # so the file.exists() branch above could never be taken on a later run.
    save(exp.correlog, file=exp_gram_f1)
}

In [None]:
# Mantel correlogram of the simulated (null) community distances against tree
# distance, using the same distance classes as the observed correlogram.
sim_gram_f1 = "../otu_data/dispersal_selection_data/sim_correlog2.RData"
if (file.exists(sim_gram_f1)) {
    # Reuse the cached result (expected to define `sim.correlog`).
    load(sim_gram_f1)
    write("Located File 2", stdout())
} else {
    # NOTE(review): nperm=1 cannot give usable permutation p-values — confirm
    # the intended permutation count before relying on a fresh result.
    sim.correlog = mantel.correlog(sim.comm.dist, D.geo=phydist, mult="BH", 
                                   r.type="spearman", cutoff = FALSE, nperm=1, 
                                   break.pts=our_breaks)
    # Populate the cache; previously the result was computed but never saved.
    save(sim.correlog, file=sim_gram_f1)
}

In [None]:
# Per-rank medians as an unnamed numeric vector; keep the six smallest.
# NOTE(review): this presumably drops Kingdom, which assumes Kingdom has the
# largest median — the names are discarded here so that cannot be checked later.
t_level_med = unlist(level_medians)
names(t_level_med) <- NULL
t_level_med = sort(t_level_med)[1:6]
# Observed correlogram: distance-class index, Mantel correlation, and whether
# the corrected p-value is below 0.05.
correlog_real = data.frame("PhyDist" = exp.correlog$mantel.res[,'class.index'],
                           'MCorr' = exp.correlog$mantel.res[,'Mantel.cor'],
                           'Correlation.Significant' = exp.correlog$mantel.res[,'Pr(corrected)'] < 0.05)

# Same three columns for the null-model correlogram.
correlog_null = data.frame("PhyDist" = sim.correlog$mantel.res[,'class.index'],
                           'MCorr' = sim.correlog$mantel.res[,'Mantel.cor'],
                           'Correlation.Significant' = sim.correlog$mantel.res[,'Pr(corrected)'] < 0.05)

# Stack both frames, tagging each row with its origin in `modtype`.
correlog_data2 <- correlog_real %>%  mutate(modtype = 'Real') %>%
                 bind_rows(correlog_null %>% mutate(modtype = 'Null.Model'))

# Drop distance classes for which no Mantel correlation was computed.
correlog_data3 = correlog_data2[!is.na(correlog_data2[,'MCorr']),]

# A missing significance flag is treated as "not significant".
correlog_data3[is.na(correlog_data3[,'Correlation.Significant']), 'Correlation.Significant'] <- FALSE

dim(correlog_data3)
correlog_data3

In [None]:
options(repr.plot.width=6, repr.plot.height=4)
# One-letter rank labels for the dashed reference lines. The initials are read
# from the names of the sorted per-rank medians (same selection that produced
# t_level_med) so each letter always sits on its own rank's line; hard-coding
# S, G, F, O, C, P by position mislabels the lines whenever the medians are
# not strictly ordered Species < Genus < ... < Phylum.
rank_initials = substr(names(sort(unlist(level_medians)))[1:6], 1, 1)
annotation <- data.frame(
   x = sort(t_level_med)-0.013,
   y = c(-.031, -.034, -.031, -.034, -.031, -.034),
   label = rank_initials)

# Nudge the 2nd and 5th labels to the other side of their line (manual
# placement to avoid overlaps).
annotation[c(2), 'x'] = annotation[c(2), 'x'] + .029
annotation[c(5), 'x'] = annotation[c(5), 'x'] + .029

lsize=0.5
# Real vs. null-model Mantel correlogram; point opacity encodes significance,
# dashed vertical lines mark the six per-rank median tree distances.
p = ggplot(data=correlog_data3, aes(y=MCorr, x=PhyDist, group=modtype, linetype=modtype, shape=modtype)) + 
           geom_point(aes(alpha = Correlation.Significant), size=1.6) +
           scale_alpha_manual(values=c(0.4, 0.9)) +
           geom_line(size=0.6) +
           scale_linetype_manual(values=c('dotted', 'solid')) + 
           labs(x="Phylogenetic Distance", y="Mantel Correlation") + 
           coord_cartesian(xlim=c(-0.02,0.4), ylim=c(-0.035, 0.025)) +
           geom_vline(xintercept = t_level_med[1:6], color = "black", linetype="dashed", size=lsize) + 
           geom_text(data=annotation, aes( x=x, y=y, label=label), color="black", size=4, angle=0, inherit.aes = FALSE)

In [None]:
# Render the ggplot object `p`.
p