In [1]:
# Directory holding the raw environmental measurements.
env_dir <- '/Volumes/KeithSSD/ChesapeakeMicrobiome/data/environmental_raw_data'

# Tab-delimited water-quality table. The first column becomes the row names,
# which a later cell uses as the list of samples to keep.
env_data <- read.delim(
    file.path(env_dir, 'merged_paired_water_quality_data.txt'),
    sep = "\t",
    row.names = 1
)

In [2]:
# Directory holding the OTU count tables and the taxonomy table.
main_dir <- '/Volumes/KeithSSD/ChesapeakeMicrobiome/data/otu_tables'

# All three inputs are tab-delimited with the first column used as row names.
# NOTE: the data frame `otu_table` shares its name with phyloseq's otu_table()
# function; calls such as otu_table(x, ...) still resolve to the function
# because R skips non-function objects when looking up a call.
otu_table <- read.delim(
    file.path(main_dir, 'final_unrarefied_table.txt'),
    sep = "\t",
    row.names = 1
)
contam_df <- read.delim(
    file.path(main_dir, 'contaminant_table.txt'),
    sep = "\t",
    row.names = 1
)
taxa_table <- read.delim(
    file.path(main_dir, 'taxa_table_with_OTUs.txt'),
    sep = "\t",
    row.names = 1
)

In [4]:
# Build one count table that re-attaches the columns present in the
# contaminant table but absent from the main OTU table, restricted to samples
# that have environmental data.

# Columns that exist in contam_df but were dropped from otu_table.
contaminants <- setdiff(colnames(contam_df), colnames(otu_table))

# Keep only samples found in the environmental data AND in both count tables.
# Indexing a data frame with a row name it does not contain yields an all-NA
# row, which would turn every colSums() below into NA and silently drop every
# column, so the intersection has to be explicit.
common_samples <- Reduce(
    intersect,
    list(rownames(env_data), rownames(contam_df), rownames(otu_table))
)
missing_samples <- setdiff(rownames(env_data), common_samples)
if (length(missing_samples) > 0) {
    message(length(missing_samples),
            " sample(s) with environmental data are missing from the count tables and were skipped")
}
if (length(common_samples) == 0) {
    stop("No samples are shared between env_data, contam_df and otu_table")
}

# Subset both tables to the shared samples. drop = FALSE keeps a data frame
# even when only one column is selected (colSums() fails on a bare vector).
contam_df2 <- contam_df[common_samples, contaminants, drop = FALSE]
otu_table2 <- otu_table[common_samples, , drop = FALSE]

# Remove columns with no counts in the retained samples.
contam_df3 <- contam_df2[, which(colSums(contam_df2) > 0), drop = FALSE]
otu_table3 <- otu_table2[, which(colSums(otu_table2) > 0), drop = FALSE]

# Sanity check: the two tables should not share any column names.
paste("Num shared columns", length(intersect(colnames(contam_df3), colnames(otu_table3))))
paste("Num total columns", length(union(colnames(contam_df3), colnames(otu_table3))))

# Join the two tables sample-by-sample; merge() puts the key in column 1.
super_Table <- merge(x = contam_df3, y = otu_table3, by.x = 'row.names', by.y = 'row.names')
# Move the key column back into the row names of a copy.
super_Table2 <- super_Table[, -1, drop = FALSE]
rownames(super_Table2) <- super_Table[, 1]

In [5]:
# Output paths: outfile.1 receives the merged, unrarefied table in this cell;
# outfile.2 is only defined here and is written by a later cell.
outfile.1 <- "/Volumes/KeithSSD/ChesapeakeMicrobiome/data/otu_tables/final_unrarefied_table.2.txt"
outfile.2 <- "/Volumes/KeithSSD/ChesapeakeMicrobiome/data/otu_tables/final_rarefied_table.2.txt"

write.table(super_Table2, file = outfile.1, sep = "\t", row.names = TRUE)

# Quick look at what was written: first few labels and the table dimensions.
colnames(super_Table2)[1:5]
rownames(super_Table2)[1:5]
dim(super_Table2)

In [6]:
library(phyloseq)

# Taxonomy as a matrix: first rank column renamed to 'Domain', empty strings
# converted to NA.
tax_mat <- as.matrix(taxa_table)
colnames(tax_mat)[1] <- 'Domain'
tax_mat[tax_mat == ""] <- NA

# Relative abundance within each row (rows are samples, see taxa_are_rows below).
super_Table_ra <- super_Table2 / rowSums(super_Table2)

# Keep OTUs whose relative abundance exceeds 0.002 in at least one sample.
minabund_check <- which(colSums(super_Table_ra > 0.002) > 0)
OTU.clean <- as.matrix(super_Table2[, minabund_check])

# Taxonomy rows for the retained OTUs, in the same order as the count columns.
tax.clean <- tax_mat[colnames(OTU.clean), ]
TAX.phylo <- tax_table(tax.clean)

physeq2 <- phyloseq(
    otu_table(OTU.clean, taxa_are_rows = FALSE),
    TAX.phylo
)

In [7]:
# Rarefy the abundance-filtered object with a fixed seed, sampling with
# replacement and trimming OTUs left with no counts afterwards.
set.seed(42)
rare_table <- rarefy_even_depth(
    physeq2,
    rngseed = 42,
    replace = TRUE,
    trimOTUs = TRUE,
    verbose = TRUE
)

`set.seed(42)` was used to initialize repeatable random subsampling.
Please record this for your records so others can reproduce.
Try `set.seed(42); .Random.seed` for the full vector
...


In [8]:
# Extract the rarefied counts as a plain matrix, then as a data frame.
rareOTU <- as(otu_table(rare_table), "matrix")
OTUdf <- as.data.frame(rareOTU)

# outfile.2 is defined in the cell that writes the unrarefied table.
write.table(OTUdf, file = outfile.2, sep = "\t", row.names = TRUE)

# Quick look at what was written: first few labels and the table dimensions.
colnames(OTUdf)[1:5]
rownames(OTUdf)[1:5]
dim(OTUdf)

In [10]:
# Same rarefaction as for physeq2, but starting from the full merged table
# (no minimum-abundance filter applied to the OTUs).
outfile.3 <- "/Volumes/KeithSSD/ChesapeakeMicrobiome/data/otu_tables/final_rare_no_abund_filt.2.txt"

OTU.clean3 <- as.matrix(super_Table2)
# Taxonomy rows matching every OTU column of the unfiltered table.
tax.clean3 <- tax_mat[colnames(OTU.clean3), ]
TAX.phylo3 <- tax_table(tax.clean3)
physeq3 <- phyloseq(
    otu_table(OTU.clean3, taxa_are_rows = FALSE),
    TAX.phylo3
)

set.seed(42)
rare_table2 <- rarefy_even_depth(
    physeq3,
    rngseed = 42,
    replace = TRUE,
    trimOTUs = TRUE,
    verbose = TRUE
)

# Extract the rarefied counts and write them out.
rareOTU2 <- as(otu_table(rare_table2), "matrix")
OTUdf2 <- as.data.frame(rareOTU2)

write.table(OTUdf2, file = outfile.3, sep = "\t", row.names = TRUE)

# Quick look at what was written: first few labels and the table dimensions.
colnames(OTUdf2)[1:5]
rownames(OTUdf2)[1:5]
dim(OTUdf2)

`set.seed(42)` was used to initialize repeatable random subsampling.
Please record this for your records so others can reproduce.
Try `set.seed(42); .Random.seed` for the full vector
...
16296OTUs were removed because they are no longer 
present in any sample after random subsampling

...


In [14]:
library("seqinr")

# Read the unfiltered ASV sequences from the OTU-table directory (main_dir).
ncrna <- read.fasta(file = file.path(main_dir, 'ASV_Sequences_unfiltered.fa'))


In [28]:
# Write the sequences belonging to the rarefied OTU tables out as FASTA files.
tree_dir <- '/Volumes/KeithSSD/ChesapeakeMicrobiome/data/phylogenetic_tree'
treef1 <- file.path(tree_dir, 'query_high_abund.2.fasta')
treef2 <- file.path(tree_dir, 'query_all_abund.2.fasta')

# FASTA header lines of every sequence; each is matched below against ">" + OTU name.
annotation <- getAnnot(ncrna)

# Sequences whose header matches an OTU in the table rarefied without the
# abundance filter (OTUdf2).
all_abund_bool <- which(unlist(annotation, use.names = FALSE) %in% paste0(">", names(OTUdf2)))
all_abud_both <- ncrna[all_abund_bool]
length(all_abud_both)
# Strip the literal leading ">" (anchored, rather than "any first character")
# and pass a plain character vector of names to write.fasta().
write.fasta(getSequence(all_abud_both),
            sub("^>", "", unlist(getAnnot(all_abud_both), use.names = FALSE)),
            treef2)

# Sequences whose header matches an OTU in the abundance-filtered rarefied
# table (OTUdf).
high_abund_bool <- which(unlist(annotation, use.names = FALSE) %in% paste0(">", names(OTUdf)))
high_abud_both <- ncrna[high_abund_bool]
length(high_abud_both)
write.fasta(getSequence(high_abud_both),
            sub("^>", "", unlist(getAnnot(high_abud_both), use.names = FALSE)),
            treef1)


In [29]:
# Small test FASTA: at most the first 30 sequences whose header matches an OTU
# in the abundance-filtered rarefied table (OTUdf).
# head() never pads with NA, whereas which(...)[1:30] does when fewer than 30
# headers match, and the resulting NULL entries break the FASTA export.
test_bool <- head(
    which(unlist(annotation, use.names = FALSE) %in% paste0(">", names(OTUdf))),
    30
)
test_both <- ncrna[test_bool]
length(test_both)
# Strip the literal leading ">" from each header before using it as the name.
write.fasta(getSequence(test_both),
            sub("^>", "", unlist(getAnnot(test_both), use.names = FALSE)),
            file.path(tree_dir, 'test.2.fasta'))
